Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 2086
1 file changed, 1374 insertions(+), 712 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 5dbc42694477..74f169ac0773 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/capability.h> | 30 | #include <linux/capability.h> |
| 31 | #include <linux/completion.h> | 31 | #include <linux/completion.h> |
| 32 | #include <linux/kernel_stat.h> | 32 | #include <linux/kernel_stat.h> |
| 33 | #include <linux/debug_locks.h> | ||
| 33 | #include <linux/security.h> | 34 | #include <linux/security.h> |
| 34 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
| 35 | #include <linux/profile.h> | 36 | #include <linux/profile.h> |
| @@ -50,6 +51,7 @@ | |||
| 50 | #include <linux/times.h> | 51 | #include <linux/times.h> |
| 51 | #include <linux/acct.h> | 52 | #include <linux/acct.h> |
| 52 | #include <linux/kprobes.h> | 53 | #include <linux/kprobes.h> |
| 54 | #include <linux/delayacct.h> | ||
| 53 | #include <asm/tlb.h> | 55 | #include <asm/tlb.h> |
| 54 | 56 | ||
| 55 | #include <asm/unistd.h> | 57 | #include <asm/unistd.h> |
| @@ -168,29 +170,28 @@ | |||
| 168 | */ | 170 | */ |
| 169 | 171 | ||
| 170 | #define SCALE_PRIO(x, prio) \ | 172 | #define SCALE_PRIO(x, prio) \ |
| 171 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 173 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
| 172 | 174 | ||
| 173 | static unsigned int task_timeslice(task_t *p) | 175 | static unsigned int static_prio_timeslice(int static_prio) |
| 174 | { | 176 | { |
| 175 | if (p->static_prio < NICE_TO_PRIO(0)) | 177 | if (static_prio < NICE_TO_PRIO(0)) |
| 176 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 178 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
| 177 | else | 179 | else |
| 178 | return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); | 180 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
| 181 | } | ||
| 182 | |||
| 183 | static inline unsigned int task_timeslice(struct task_struct *p) | ||
| 184 | { | ||
| 185 | return static_prio_timeslice(p->static_prio); | ||
| 179 | } | 186 | } |
| 180 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | ||
| 181 | < (long long) (sd)->cache_hot_time) | ||
| 182 | 187 | ||
| 183 | /* | 188 | /* |
| 184 | * These are the runqueue data structures: | 189 | * These are the runqueue data structures: |
| 185 | */ | 190 | */ |
| 186 | 191 | ||
| 187 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | ||
| 188 | |||
| 189 | typedef struct runqueue runqueue_t; | ||
| 190 | |||
| 191 | struct prio_array { | 192 | struct prio_array { |
| 192 | unsigned int nr_active; | 193 | unsigned int nr_active; |
| 193 | unsigned long bitmap[BITMAP_SIZE]; | 194 | DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ |
| 194 | struct list_head queue[MAX_PRIO]; | 195 | struct list_head queue[MAX_PRIO]; |
| 195 | }; | 196 | }; |
| 196 | 197 | ||
| @@ -201,7 +202,7 @@ struct prio_array { | |||
| 201 | * (such as the load balancing or the thread migration code), lock | 202 | * (such as the load balancing or the thread migration code), lock |
| 202 | * acquire operations must be ordered by ascending &runqueue. | 203 | * acquire operations must be ordered by ascending &runqueue. |
| 203 | */ | 204 | */ |
| 204 | struct runqueue { | 205 | struct rq { |
| 205 | spinlock_t lock; | 206 | spinlock_t lock; |
| 206 | 207 | ||
| 207 | /* | 208 | /* |
| @@ -209,6 +210,7 @@ struct runqueue { | |||
| 209 | * remote CPUs use both these fields when doing load calculation. | 210 | * remote CPUs use both these fields when doing load calculation. |
| 210 | */ | 211 | */ |
| 211 | unsigned long nr_running; | 212 | unsigned long nr_running; |
| 213 | unsigned long raw_weighted_load; | ||
| 212 | #ifdef CONFIG_SMP | 214 | #ifdef CONFIG_SMP |
| 213 | unsigned long cpu_load[3]; | 215 | unsigned long cpu_load[3]; |
| 214 | #endif | 216 | #endif |
| @@ -224,9 +226,9 @@ struct runqueue { | |||
| 224 | 226 | ||
| 225 | unsigned long expired_timestamp; | 227 | unsigned long expired_timestamp; |
| 226 | unsigned long long timestamp_last_tick; | 228 | unsigned long long timestamp_last_tick; |
| 227 | task_t *curr, *idle; | 229 | struct task_struct *curr, *idle; |
| 228 | struct mm_struct *prev_mm; | 230 | struct mm_struct *prev_mm; |
| 229 | prio_array_t *active, *expired, arrays[2]; | 231 | struct prio_array *active, *expired, arrays[2]; |
| 230 | int best_expired_prio; | 232 | int best_expired_prio; |
| 231 | atomic_t nr_iowait; | 233 | atomic_t nr_iowait; |
| 232 | 234 | ||
| @@ -236,10 +238,10 @@ struct runqueue { | |||
| 236 | /* For active balancing */ | 238 | /* For active balancing */ |
| 237 | int active_balance; | 239 | int active_balance; |
| 238 | int push_cpu; | 240 | int push_cpu; |
| 241 | int cpu; /* cpu of this runqueue */ | ||
| 239 | 242 | ||
| 240 | task_t *migration_thread; | 243 | struct task_struct *migration_thread; |
| 241 | struct list_head migration_queue; | 244 | struct list_head migration_queue; |
| 242 | int cpu; | ||
| 243 | #endif | 245 | #endif |
| 244 | 246 | ||
| 245 | #ifdef CONFIG_SCHEDSTATS | 247 | #ifdef CONFIG_SCHEDSTATS |
| @@ -261,9 +263,19 @@ struct runqueue { | |||
| 261 | unsigned long ttwu_cnt; | 263 | unsigned long ttwu_cnt; |
| 262 | unsigned long ttwu_local; | 264 | unsigned long ttwu_local; |
| 263 | #endif | 265 | #endif |
| 266 | struct lock_class_key rq_lock_key; | ||
| 264 | }; | 267 | }; |
| 265 | 268 | ||
| 266 | static DEFINE_PER_CPU(struct runqueue, runqueues); | 269 | static DEFINE_PER_CPU(struct rq, runqueues); |
| 270 | |||
| 271 | static inline int cpu_of(struct rq *rq) | ||
| 272 | { | ||
| 273 | #ifdef CONFIG_SMP | ||
| 274 | return rq->cpu; | ||
| 275 | #else | ||
| 276 | return 0; | ||
| 277 | #endif | ||
| 278 | } | ||
| 267 | 279 | ||
| 268 | /* | 280 | /* |
| 269 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 281 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
| @@ -272,8 +284,8 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); | |||
| 272 | * The domain tree of any CPU may only be accessed from within | 284 | * The domain tree of any CPU may only be accessed from within |
| 273 | * preempt-disabled sections. | 285 | * preempt-disabled sections. |
| 274 | */ | 286 | */ |
| 275 | #define for_each_domain(cpu, domain) \ | 287 | #define for_each_domain(cpu, __sd) \ |
| 276 | for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) | 288 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
| 277 | 289 | ||
| 278 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 290 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
| 279 | #define this_rq() (&__get_cpu_var(runqueues)) | 291 | #define this_rq() (&__get_cpu_var(runqueues)) |
| @@ -288,26 +300,33 @@ for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) | |||
| 288 | #endif | 300 | #endif |
| 289 | 301 | ||
| 290 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 302 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
| 291 | static inline int task_running(runqueue_t *rq, task_t *p) | 303 | static inline int task_running(struct rq *rq, struct task_struct *p) |
| 292 | { | 304 | { |
| 293 | return rq->curr == p; | 305 | return rq->curr == p; |
| 294 | } | 306 | } |
| 295 | 307 | ||
| 296 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | 308 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
| 297 | { | 309 | { |
| 298 | } | 310 | } |
| 299 | 311 | ||
| 300 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 312 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
| 301 | { | 313 | { |
| 302 | #ifdef CONFIG_DEBUG_SPINLOCK | 314 | #ifdef CONFIG_DEBUG_SPINLOCK |
| 303 | /* this is a valid case when another task releases the spinlock */ | 315 | /* this is a valid case when another task releases the spinlock */ |
| 304 | rq->lock.owner = current; | 316 | rq->lock.owner = current; |
| 305 | #endif | 317 | #endif |
| 318 | /* | ||
| 319 | * If we are tracking spinlock dependencies then we have to | ||
| 320 | * fix up the runqueue lock - which gets 'carried over' from | ||
| 321 | * prev into current: | ||
| 322 | */ | ||
| 323 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | ||
| 324 | |||
| 306 | spin_unlock_irq(&rq->lock); | 325 | spin_unlock_irq(&rq->lock); |
| 307 | } | 326 | } |
| 308 | 327 | ||
| 309 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 328 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
| 310 | static inline int task_running(runqueue_t *rq, task_t *p) | 329 | static inline int task_running(struct rq *rq, struct task_struct *p) |
| 311 | { | 330 | { |
| 312 | #ifdef CONFIG_SMP | 331 | #ifdef CONFIG_SMP |
| 313 | return p->oncpu; | 332 | return p->oncpu; |
| @@ -316,7 +335,7 @@ static inline int task_running(runqueue_t *rq, task_t *p) | |||
| 316 | #endif | 335 | #endif |
| 317 | } | 336 | } |
| 318 | 337 | ||
| 319 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | 338 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
| 320 | { | 339 | { |
| 321 | #ifdef CONFIG_SMP | 340 | #ifdef CONFIG_SMP |
| 322 | /* | 341 | /* |
| @@ -333,7 +352,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | |||
| 333 | #endif | 352 | #endif |
| 334 | } | 353 | } |
| 335 | 354 | ||
| 336 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 355 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
| 337 | { | 356 | { |
| 338 | #ifdef CONFIG_SMP | 357 | #ifdef CONFIG_SMP |
| 339 | /* | 358 | /* |
| @@ -351,14 +370,33 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | |||
| 351 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 370 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
| 352 | 371 | ||
| 353 | /* | 372 | /* |
| 373 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
| 374 | * Must be called interrupts disabled. | ||
| 375 | */ | ||
| 376 | static inline struct rq *__task_rq_lock(struct task_struct *p) | ||
| 377 | __acquires(rq->lock) | ||
| 378 | { | ||
| 379 | struct rq *rq; | ||
| 380 | |||
| 381 | repeat_lock_task: | ||
| 382 | rq = task_rq(p); | ||
| 383 | spin_lock(&rq->lock); | ||
| 384 | if (unlikely(rq != task_rq(p))) { | ||
| 385 | spin_unlock(&rq->lock); | ||
| 386 | goto repeat_lock_task; | ||
| 387 | } | ||
| 388 | return rq; | ||
| 389 | } | ||
| 390 | |||
| 391 | /* | ||
| 354 | * task_rq_lock - lock the runqueue a given task resides on and disable | 392 | * task_rq_lock - lock the runqueue a given task resides on and disable |
| 355 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 393 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
| 356 | * explicitly disabling preemption. | 394 | * explicitly disabling preemption. |
| 357 | */ | 395 | */ |
| 358 | static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) | 396 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
| 359 | __acquires(rq->lock) | 397 | __acquires(rq->lock) |
| 360 | { | 398 | { |
| 361 | struct runqueue *rq; | 399 | struct rq *rq; |
| 362 | 400 | ||
| 363 | repeat_lock_task: | 401 | repeat_lock_task: |
| 364 | local_irq_save(*flags); | 402 | local_irq_save(*flags); |
| @@ -371,7 +409,13 @@ repeat_lock_task: | |||
| 371 | return rq; | 409 | return rq; |
| 372 | } | 410 | } |
| 373 | 411 | ||
| 374 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | 412 | static inline void __task_rq_unlock(struct rq *rq) |
| 413 | __releases(rq->lock) | ||
| 414 | { | ||
| 415 | spin_unlock(&rq->lock); | ||
| 416 | } | ||
| 417 | |||
| 418 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | ||
| 375 | __releases(rq->lock) | 419 | __releases(rq->lock) |
| 376 | { | 420 | { |
| 377 | spin_unlock_irqrestore(&rq->lock, *flags); | 421 | spin_unlock_irqrestore(&rq->lock, *flags); |
| @@ -391,7 +435,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 391 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 435 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
| 392 | seq_printf(seq, "timestamp %lu\n", jiffies); | 436 | seq_printf(seq, "timestamp %lu\n", jiffies); |
| 393 | for_each_online_cpu(cpu) { | 437 | for_each_online_cpu(cpu) { |
| 394 | runqueue_t *rq = cpu_rq(cpu); | 438 | struct rq *rq = cpu_rq(cpu); |
| 395 | #ifdef CONFIG_SMP | 439 | #ifdef CONFIG_SMP |
| 396 | struct sched_domain *sd; | 440 | struct sched_domain *sd; |
| 397 | int dcnt = 0; | 441 | int dcnt = 0; |
| @@ -468,9 +512,36 @@ struct file_operations proc_schedstat_operations = { | |||
| 468 | .release = single_release, | 512 | .release = single_release, |
| 469 | }; | 513 | }; |
| 470 | 514 | ||
| 515 | /* | ||
| 516 | * Expects runqueue lock to be held for atomicity of update | ||
| 517 | */ | ||
| 518 | static inline void | ||
| 519 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | ||
| 520 | { | ||
| 521 | if (rq) { | ||
| 522 | rq->rq_sched_info.run_delay += delta_jiffies; | ||
| 523 | rq->rq_sched_info.pcnt++; | ||
| 524 | } | ||
| 525 | } | ||
| 526 | |||
| 527 | /* | ||
| 528 | * Expects runqueue lock to be held for atomicity of update | ||
| 529 | */ | ||
| 530 | static inline void | ||
| 531 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | ||
| 532 | { | ||
| 533 | if (rq) | ||
| 534 | rq->rq_sched_info.cpu_time += delta_jiffies; | ||
| 535 | } | ||
| 471 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 536 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) |
| 472 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 537 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) |
| 473 | #else /* !CONFIG_SCHEDSTATS */ | 538 | #else /* !CONFIG_SCHEDSTATS */ |
| 539 | static inline void | ||
| 540 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | ||
| 541 | {} | ||
| 542 | static inline void | ||
| 543 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | ||
| 544 | {} | ||
| 474 | # define schedstat_inc(rq, field) do { } while (0) | 545 | # define schedstat_inc(rq, field) do { } while (0) |
| 475 | # define schedstat_add(rq, field, amt) do { } while (0) | 546 | # define schedstat_add(rq, field, amt) do { } while (0) |
| 476 | #endif | 547 | #endif |
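
The new rq_sched_info_arrive() and rq_sched_info_depart() helpers factor the per-runqueue half of the delay accounting out of sched_info_arrive()/sched_info_depart() further down, so the same bookkeeping can back both CONFIG_SCHEDSTATS and the new CONFIG_TASK_DELAY_ACCT path. A minimal user-space sketch of the per-task side of that bookkeeping, with field names chosen to mirror struct sched_info but otherwise purely illustrative, looks like this:

#include <stdio.h>

struct toy_sched_info {
    unsigned long pcnt;          /* # of times run on a cpu            */
    unsigned long run_delay;     /* time spent waiting, in jiffies     */
    unsigned long cpu_time;      /* time spent running, in jiffies     */
    unsigned long last_queued;   /* when the task last became runnable */
    unsigned long last_arrival;  /* when it last got the cpu           */
};

/* mirrors sched_info_queued(): only the first enqueue is stamped */
static void info_queued(struct toy_sched_info *t, unsigned long now)
{
    if (!t->last_queued)
        t->last_queued = now;
}

/* mirrors sched_info_arrive(): queue wait becomes run_delay */
static void info_arrive(struct toy_sched_info *t, unsigned long now)
{
    unsigned long delta = t->last_queued ? now - t->last_queued : 0;

    t->last_queued = 0;
    t->run_delay += delta;
    t->last_arrival = now;
    t->pcnt++;
}

/* mirrors sched_info_depart(): time on the cpu becomes cpu_time */
static void info_depart(struct toy_sched_info *t, unsigned long now)
{
    t->cpu_time += now - t->last_arrival;
}

int main(void)
{
    struct toy_sched_info t = { 0, 0, 0, 0, 0 };

    info_queued(&t, 100);   /* becomes runnable at jiffy 100 */
    info_arrive(&t, 104);   /* gets the cpu at jiffy 104     */
    info_depart(&t, 110);   /* switched out at jiffy 110     */

    printf("pcnt %lu, run_delay %lu, cpu_time %lu\n",
           t.pcnt, t.run_delay, t.cpu_time);
    return 0;
}

A task queued at jiffy 100 that first runs at 104 and is switched out at 110 therefore accumulates 4 jiffies of run_delay and 6 jiffies of cpu_time; the rq_sched_info_*() helpers add the same deltas to the runqueue-wide counters.
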
| @@ -478,10 +549,10 @@ struct file_operations proc_schedstat_operations = { | |||
| 478 | /* | 549 | /* |
| 479 | * rq_lock - lock a given runqueue and disable interrupts. | 550 | * rq_lock - lock a given runqueue and disable interrupts. |
| 480 | */ | 551 | */ |
| 481 | static inline runqueue_t *this_rq_lock(void) | 552 | static inline struct rq *this_rq_lock(void) |
| 482 | __acquires(rq->lock) | 553 | __acquires(rq->lock) |
| 483 | { | 554 | { |
| 484 | runqueue_t *rq; | 555 | struct rq *rq; |
| 485 | 556 | ||
| 486 | local_irq_disable(); | 557 | local_irq_disable(); |
| 487 | rq = this_rq(); | 558 | rq = this_rq(); |
| @@ -490,7 +561,7 @@ static inline runqueue_t *this_rq_lock(void) | |||
| 490 | return rq; | 561 | return rq; |
| 491 | } | 562 | } |
| 492 | 563 | ||
| 493 | #ifdef CONFIG_SCHEDSTATS | 564 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 494 | /* | 565 | /* |
| 495 | * Called when a process is dequeued from the active array and given | 566 | * Called when a process is dequeued from the active array and given |
| 496 | * the cpu. We should note that with the exception of interactive | 567 | * the cpu. We should note that with the exception of interactive |
| @@ -506,7 +577,7 @@ static inline runqueue_t *this_rq_lock(void) | |||
| 506 | * long it was from the *first* time it was queued to the time that it | 577 | * long it was from the *first* time it was queued to the time that it |
| 507 | * finally hit a cpu. | 578 | * finally hit a cpu. |
| 508 | */ | 579 | */ |
| 509 | static inline void sched_info_dequeued(task_t *t) | 580 | static inline void sched_info_dequeued(struct task_struct *t) |
| 510 | { | 581 | { |
| 511 | t->sched_info.last_queued = 0; | 582 | t->sched_info.last_queued = 0; |
| 512 | } | 583 | } |
| @@ -516,23 +587,18 @@ static inline void sched_info_dequeued(task_t *t) | |||
| 516 | * long it was waiting to run. We also note when it began so that we | 587 | * long it was waiting to run. We also note when it began so that we |
| 517 | * can keep stats on how long its timeslice is. | 588 | * can keep stats on how long its timeslice is. |
| 518 | */ | 589 | */ |
| 519 | static void sched_info_arrive(task_t *t) | 590 | static void sched_info_arrive(struct task_struct *t) |
| 520 | { | 591 | { |
| 521 | unsigned long now = jiffies, diff = 0; | 592 | unsigned long now = jiffies, delta_jiffies = 0; |
| 522 | struct runqueue *rq = task_rq(t); | ||
| 523 | 593 | ||
| 524 | if (t->sched_info.last_queued) | 594 | if (t->sched_info.last_queued) |
| 525 | diff = now - t->sched_info.last_queued; | 595 | delta_jiffies = now - t->sched_info.last_queued; |
| 526 | sched_info_dequeued(t); | 596 | sched_info_dequeued(t); |
| 527 | t->sched_info.run_delay += diff; | 597 | t->sched_info.run_delay += delta_jiffies; |
| 528 | t->sched_info.last_arrival = now; | 598 | t->sched_info.last_arrival = now; |
| 529 | t->sched_info.pcnt++; | 599 | t->sched_info.pcnt++; |
| 530 | 600 | ||
| 531 | if (!rq) | 601 | rq_sched_info_arrive(task_rq(t), delta_jiffies); |
| 532 | return; | ||
| 533 | |||
| 534 | rq->rq_sched_info.run_delay += diff; | ||
| 535 | rq->rq_sched_info.pcnt++; | ||
| 536 | } | 602 | } |
| 537 | 603 | ||
| 538 | /* | 604 | /* |
| @@ -550,25 +616,23 @@ static void sched_info_arrive(task_t *t) | |||
| 550 | * the timestamp if it is already not set. It's assumed that | 616 | * the timestamp if it is already not set. It's assumed that |
| 551 | * sched_info_dequeued() will clear that stamp when appropriate. | 617 | * sched_info_dequeued() will clear that stamp when appropriate. |
| 552 | */ | 618 | */ |
| 553 | static inline void sched_info_queued(task_t *t) | 619 | static inline void sched_info_queued(struct task_struct *t) |
| 554 | { | 620 | { |
| 555 | if (!t->sched_info.last_queued) | 621 | if (unlikely(sched_info_on())) |
| 556 | t->sched_info.last_queued = jiffies; | 622 | if (!t->sched_info.last_queued) |
| 623 | t->sched_info.last_queued = jiffies; | ||
| 557 | } | 624 | } |
| 558 | 625 | ||
| 559 | /* | 626 | /* |
| 560 | * Called when a process ceases being the active-running process, either | 627 | * Called when a process ceases being the active-running process, either |
| 561 | * voluntarily or involuntarily. Now we can calculate how long we ran. | 628 | * voluntarily or involuntarily. Now we can calculate how long we ran. |
| 562 | */ | 629 | */ |
| 563 | static inline void sched_info_depart(task_t *t) | 630 | static inline void sched_info_depart(struct task_struct *t) |
| 564 | { | 631 | { |
| 565 | struct runqueue *rq = task_rq(t); | 632 | unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; |
| 566 | unsigned long diff = jiffies - t->sched_info.last_arrival; | ||
| 567 | |||
| 568 | t->sched_info.cpu_time += diff; | ||
| 569 | 633 | ||
| 570 | if (rq) | 634 | t->sched_info.cpu_time += delta_jiffies; |
| 571 | rq->rq_sched_info.cpu_time += diff; | 635 | rq_sched_info_depart(task_rq(t), delta_jiffies); |
| 572 | } | 636 | } |
| 573 | 637 | ||
| 574 | /* | 638 | /* |
| @@ -576,9 +640,10 @@ static inline void sched_info_depart(task_t *t) | |||
| 576 | * their time slice. (This may also be called when switching to or from | 640 | * their time slice. (This may also be called when switching to or from |
| 577 | * the idle task.) We are only called when prev != next. | 641 | * the idle task.) We are only called when prev != next. |
| 578 | */ | 642 | */ |
| 579 | static inline void sched_info_switch(task_t *prev, task_t *next) | 643 | static inline void |
| 644 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
| 580 | { | 645 | { |
| 581 | struct runqueue *rq = task_rq(prev); | 646 | struct rq *rq = task_rq(prev); |
| 582 | 647 | ||
| 583 | /* | 648 | /* |
| 584 | * prev now departs the cpu. It's not interesting to record | 649 | * prev now departs the cpu. It's not interesting to record |
| @@ -591,15 +656,21 @@ static inline void sched_info_switch(task_t *prev, task_t *next) | |||
| 591 | if (next != rq->idle) | 656 | if (next != rq->idle) |
| 592 | sched_info_arrive(next); | 657 | sched_info_arrive(next); |
| 593 | } | 658 | } |
| 659 | static inline void | ||
| 660 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
| 661 | { | ||
| 662 | if (unlikely(sched_info_on())) | ||
| 663 | __sched_info_switch(prev, next); | ||
| 664 | } | ||
| 594 | #else | 665 | #else |
| 595 | #define sched_info_queued(t) do { } while (0) | 666 | #define sched_info_queued(t) do { } while (0) |
| 596 | #define sched_info_switch(t, next) do { } while (0) | 667 | #define sched_info_switch(t, next) do { } while (0) |
| 597 | #endif /* CONFIG_SCHEDSTATS */ | 668 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
| 598 | 669 | ||
| 599 | /* | 670 | /* |
| 600 | * Adding/removing a task to/from a priority array: | 671 | * Adding/removing a task to/from a priority array: |
| 601 | */ | 672 | */ |
| 602 | static void dequeue_task(struct task_struct *p, prio_array_t *array) | 673 | static void dequeue_task(struct task_struct *p, struct prio_array *array) |
| 603 | { | 674 | { |
| 604 | array->nr_active--; | 675 | array->nr_active--; |
| 605 | list_del(&p->run_list); | 676 | list_del(&p->run_list); |
| @@ -607,7 +678,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array) | |||
| 607 | __clear_bit(p->prio, array->bitmap); | 678 | __clear_bit(p->prio, array->bitmap); |
| 608 | } | 679 | } |
| 609 | 680 | ||
| 610 | static void enqueue_task(struct task_struct *p, prio_array_t *array) | 681 | static void enqueue_task(struct task_struct *p, struct prio_array *array) |
| 611 | { | 682 | { |
| 612 | sched_info_queued(p); | 683 | sched_info_queued(p); |
| 613 | list_add_tail(&p->run_list, array->queue + p->prio); | 684 | list_add_tail(&p->run_list, array->queue + p->prio); |
| @@ -620,12 +691,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) | |||
| 620 | * Put task to the end of the run list without the overhead of dequeue | 691 | * Put task to the end of the run list without the overhead of dequeue |
| 621 | * followed by enqueue. | 692 | * followed by enqueue. |
| 622 | */ | 693 | */ |
| 623 | static void requeue_task(struct task_struct *p, prio_array_t *array) | 694 | static void requeue_task(struct task_struct *p, struct prio_array *array) |
| 624 | { | 695 | { |
| 625 | list_move_tail(&p->run_list, array->queue + p->prio); | 696 | list_move_tail(&p->run_list, array->queue + p->prio); |
| 626 | } | 697 | } |
| 627 | 698 | ||
| 628 | static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | 699 | static inline void |
| 700 | enqueue_task_head(struct task_struct *p, struct prio_array *array) | ||
| 629 | { | 701 | { |
| 630 | list_add(&p->run_list, array->queue + p->prio); | 702 | list_add(&p->run_list, array->queue + p->prio); |
| 631 | __set_bit(p->prio, array->bitmap); | 703 | __set_bit(p->prio, array->bitmap); |
| @@ -634,7 +706,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
| 634 | } | 706 | } |
| 635 | 707 | ||
| 636 | /* | 708 | /* |
| 637 | * effective_prio - return the priority that is based on the static | 709 | * __normal_prio - return the priority that is based on the static |
| 638 | * priority but is modified by bonuses/penalties. | 710 | * priority but is modified by bonuses/penalties. |
| 639 | * | 711 | * |
| 640 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | 712 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
| @@ -647,13 +719,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
| 647 | * | 719 | * |
| 648 | * Both properties are important to certain workloads. | 720 | * Both properties are important to certain workloads. |
| 649 | */ | 721 | */ |
| 650 | static int effective_prio(task_t *p) | 722 | |
| 723 | static inline int __normal_prio(struct task_struct *p) | ||
| 651 | { | 724 | { |
| 652 | int bonus, prio; | 725 | int bonus, prio; |
| 653 | 726 | ||
| 654 | if (rt_task(p)) | ||
| 655 | return p->prio; | ||
| 656 | |||
| 657 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | 727 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
| 658 | 728 | ||
| 659 | prio = p->static_prio - bonus; | 729 | prio = p->static_prio - bonus; |
| @@ -665,57 +735,165 @@ static int effective_prio(task_t *p) | |||
| 665 | } | 735 | } |
| 666 | 736 | ||
| 667 | /* | 737 | /* |
| 738 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
| 739 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
| 740 | * each task makes to its run queue's load is weighted according to its | ||
| 741 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
| 742 | * scaled version of the new time slice allocation that they receive on time | ||
| 743 | * slice expiry etc. | ||
| 744 | */ | ||
| 745 | |||
| 746 | /* | ||
| 747 | * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE | ||
| 748 | * If static_prio_timeslice() is ever changed to break this assumption then | ||
| 749 | * this code will need modification | ||
| 750 | */ | ||
| 751 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
| 752 | #define LOAD_WEIGHT(lp) \ | ||
| 753 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
| 754 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
| 755 | LOAD_WEIGHT(static_prio_timeslice(prio)) | ||
| 756 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
| 757 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | ||
| 758 | |||
| 759 | static void set_load_weight(struct task_struct *p) | ||
| 760 | { | ||
| 761 | if (has_rt_policy(p)) { | ||
| 762 | #ifdef CONFIG_SMP | ||
| 763 | if (p == task_rq(p)->migration_thread) | ||
| 764 | /* | ||
| 765 | * The migration thread does the actual balancing. | ||
| 766 | * Giving its load any weight will skew balancing | ||
| 767 | * adversely. | ||
| 768 | */ | ||
| 769 | p->load_weight = 0; | ||
| 770 | else | ||
| 771 | #endif | ||
| 772 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
| 773 | } else | ||
| 774 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
| 775 | } | ||
| 776 | |||
| 777 | static inline void | ||
| 778 | inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) | ||
| 779 | { | ||
| 780 | rq->raw_weighted_load += p->load_weight; | ||
| 781 | } | ||
| 782 | |||
| 783 | static inline void | ||
| 784 | dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) | ||
| 785 | { | ||
| 786 | rq->raw_weighted_load -= p->load_weight; | ||
| 787 | } | ||
| 788 | |||
| 789 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
| 790 | { | ||
| 791 | rq->nr_running++; | ||
| 792 | inc_raw_weighted_load(rq, p); | ||
| 793 | } | ||
| 794 | |||
| 795 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq) | ||
| 796 | { | ||
| 797 | rq->nr_running--; | ||
| 798 | dec_raw_weighted_load(rq, p); | ||
| 799 | } | ||
| 800 | |||
| 801 | /* | ||
| 802 | * Calculate the expected normal priority: i.e. priority | ||
| 803 | * without taking RT-inheritance into account. Might be | ||
| 804 | * boosted by interactivity modifiers. Changes upon fork, | ||
| 805 | * setprio syscalls, and whenever the interactivity | ||
| 806 | * estimator recalculates. | ||
| 807 | */ | ||
| 808 | static inline int normal_prio(struct task_struct *p) | ||
| 809 | { | ||
| 810 | int prio; | ||
| 811 | |||
| 812 | if (has_rt_policy(p)) | ||
| 813 | prio = MAX_RT_PRIO-1 - p->rt_priority; | ||
| 814 | else | ||
| 815 | prio = __normal_prio(p); | ||
| 816 | return prio; | ||
| 817 | } | ||
| 818 | |||
| 819 | /* | ||
| 820 | * Calculate the current priority, i.e. the priority | ||
| 821 | * taken into account by the scheduler. This value might | ||
| 822 | * be boosted by RT tasks, or might be boosted by | ||
| 823 | * interactivity modifiers. Will be RT if the task got | ||
| 824 | * RT-boosted. If not then it returns p->normal_prio. | ||
| 825 | */ | ||
| 826 | static int effective_prio(struct task_struct *p) | ||
| 827 | { | ||
| 828 | p->normal_prio = normal_prio(p); | ||
| 829 | /* | ||
| 830 | * If we are RT tasks or we were boosted to RT priority, | ||
| 831 | * keep the priority unchanged. Otherwise, update priority | ||
| 832 | * to the normal priority: | ||
| 833 | */ | ||
| 834 | if (!rt_prio(p->prio)) | ||
| 835 | return p->normal_prio; | ||
| 836 | return p->prio; | ||
| 837 | } | ||
| 838 | |||
| 839 | /* | ||
| 668 | * __activate_task - move a task to the runqueue. | 840 | * __activate_task - move a task to the runqueue. |
| 669 | */ | 841 | */ |
| 670 | static void __activate_task(task_t *p, runqueue_t *rq) | 842 | static void __activate_task(struct task_struct *p, struct rq *rq) |
| 671 | { | 843 | { |
| 672 | prio_array_t *target = rq->active; | 844 | struct prio_array *target = rq->active; |
| 673 | 845 | ||
| 674 | if (batch_task(p)) | 846 | if (batch_task(p)) |
| 675 | target = rq->expired; | 847 | target = rq->expired; |
| 676 | enqueue_task(p, target); | 848 | enqueue_task(p, target); |
| 677 | rq->nr_running++; | 849 | inc_nr_running(p, rq); |
| 678 | } | 850 | } |
| 679 | 851 | ||
| 680 | /* | 852 | /* |
| 681 | * __activate_idle_task - move idle task to the _front_ of runqueue. | 853 | * __activate_idle_task - move idle task to the _front_ of runqueue. |
| 682 | */ | 854 | */ |
| 683 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 855 | static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) |
| 684 | { | 856 | { |
| 685 | enqueue_task_head(p, rq->active); | 857 | enqueue_task_head(p, rq->active); |
| 686 | rq->nr_running++; | 858 | inc_nr_running(p, rq); |
| 687 | } | 859 | } |
| 688 | 860 | ||
| 689 | static int recalc_task_prio(task_t *p, unsigned long long now) | 861 | /* |
| 862 | * Recalculate p->normal_prio and p->prio after having slept, | ||
| 863 | * updating the sleep-average too: | ||
| 864 | */ | ||
| 865 | static int recalc_task_prio(struct task_struct *p, unsigned long long now) | ||
| 690 | { | 866 | { |
| 691 | /* Caller must always ensure 'now >= p->timestamp' */ | 867 | /* Caller must always ensure 'now >= p->timestamp' */ |
| 692 | unsigned long long __sleep_time = now - p->timestamp; | 868 | unsigned long sleep_time = now - p->timestamp; |
| 693 | unsigned long sleep_time; | ||
| 694 | 869 | ||
| 695 | if (batch_task(p)) | 870 | if (batch_task(p)) |
| 696 | sleep_time = 0; | 871 | sleep_time = 0; |
| 697 | else { | ||
| 698 | if (__sleep_time > NS_MAX_SLEEP_AVG) | ||
| 699 | sleep_time = NS_MAX_SLEEP_AVG; | ||
| 700 | else | ||
| 701 | sleep_time = (unsigned long)__sleep_time; | ||
| 702 | } | ||
| 703 | 872 | ||
| 704 | if (likely(sleep_time > 0)) { | 873 | if (likely(sleep_time > 0)) { |
| 705 | /* | 874 | /* |
| 706 | * User tasks that sleep a long time are categorised as | 875 | * This ceiling is set to the lowest priority that would allow |
| 707 | * idle. They will only have their sleep_avg increased to a | 876 | * a task to be reinserted into the active array on timeslice |
| 708 | * level that makes them just interactive priority to stay | 877 | * completion. |
| 709 | * active yet prevent them suddenly becoming cpu hogs and | ||
| 710 | * starving other processes. | ||
| 711 | */ | 878 | */ |
| 712 | if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { | 879 | unsigned long ceiling = INTERACTIVE_SLEEP(p); |
| 713 | unsigned long ceiling; | ||
| 714 | 880 | ||
| 715 | ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - | 881 | if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { |
| 716 | DEF_TIMESLICE); | 882 | /* |
| 717 | if (p->sleep_avg < ceiling) | 883 | * Prevents user tasks from achieving best priority |
| 718 | p->sleep_avg = ceiling; | 884 | * with one single large enough sleep. |
| 885 | */ | ||
| 886 | p->sleep_avg = ceiling; | ||
| 887 | /* | ||
| 888 | * Using INTERACTIVE_SLEEP() as a ceiling places a | ||
| 889 | * nice(0) task 1ms sleep away from promotion, and | ||
| 890 | * gives it 700ms to round-robin with no chance of | ||
| 891 | * being demoted. This is more than generous, so | ||
| 892 | * mark this sleep as non-interactive to prevent the | ||
| 893 | * on-runqueue bonus logic from intervening should | ||
| 894 | * this task not receive cpu immediately. | ||
| 895 | */ | ||
| 896 | p->sleep_type = SLEEP_NONINTERACTIVE; | ||
| 719 | } else { | 897 | } else { |
| 720 | /* | 898 | /* |
| 721 | * Tasks waking from uninterruptible sleep are | 899 | * Tasks waking from uninterruptible sleep are |
| @@ -723,12 +901,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
| 723 | * are likely to be waiting on I/O | 901 | * are likely to be waiting on I/O |
| 724 | */ | 902 | */ |
| 725 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | 903 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
| 726 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | 904 | if (p->sleep_avg >= ceiling) |
| 727 | sleep_time = 0; | 905 | sleep_time = 0; |
| 728 | else if (p->sleep_avg + sleep_time >= | 906 | else if (p->sleep_avg + sleep_time >= |
| 729 | INTERACTIVE_SLEEP(p)) { | 907 | ceiling) { |
| 730 | p->sleep_avg = INTERACTIVE_SLEEP(p); | 908 | p->sleep_avg = ceiling; |
| 731 | sleep_time = 0; | 909 | sleep_time = 0; |
| 732 | } | 910 | } |
| 733 | } | 911 | } |
| 734 | 912 | ||
| @@ -742,9 +920,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
| 742 | */ | 920 | */ |
| 743 | p->sleep_avg += sleep_time; | 921 | p->sleep_avg += sleep_time; |
| 744 | 922 | ||
| 745 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
| 746 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
| 747 | } | 923 | } |
| 924 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
| 925 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
| 748 | } | 926 | } |
| 749 | 927 | ||
| 750 | return effective_prio(p); | 928 | return effective_prio(p); |
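
recalc_task_prio() now ends by handing the refreshed sleep average to effective_prio(), which this patch splits into normal_prio() and __normal_prio() so that an RT priority inherited through PI boosting is never overwritten by the interactivity bonus. The standalone sketch below shows that split; the bonus line stands in for the kernel's CURRENT_BONUS() macro, and the constants (MAX_BONUS 10, a 1000 ms sleep-average ceiling) are assumptions for illustration only.

#include <stdio.h>

#define MAX_RT_PRIO   100
#define MAX_PRIO      140
#define MAX_BONUS     10     /* assumed: MAX_USER_PRIO * 25 / 100    */
#define MAX_SLEEP_MS  1000   /* assumed sleep-average ceiling, in ms */

struct toy_task {
    int policy_is_rt;        /* stands in for has_rt_policy()     */
    int rt_priority;         /* 1..99 for RT tasks                */
    int static_prio;         /* 100..139, 120 == nice 0           */
    unsigned long sleep_ms;  /* stands in for p->sleep_avg        */
    int prio;                /* current prio, possibly PI-boosted */
    int normal_prio;
};

/* interactivity bonus of +/-5 around static_prio, as in __normal_prio();
 * the bonus line stands in for CURRENT_BONUS(), which is not in this hunk */
static int toy___normal_prio(const struct toy_task *p)
{
    int bonus = (int)(p->sleep_ms * MAX_BONUS / MAX_SLEEP_MS) - MAX_BONUS / 2;
    int prio = p->static_prio - bonus;

    if (prio < MAX_RT_PRIO)          /* clamp into the SCHED_NORMAL range */
        prio = MAX_RT_PRIO;
    if (prio > MAX_PRIO - 1)
        prio = MAX_PRIO - 1;
    return prio;
}

/* mirrors normal_prio(): an RT policy maps rt_priority, everything else
 * gets the interactivity-adjusted priority */
static int toy_normal_prio(const struct toy_task *p)
{
    if (p->policy_is_rt)
        return MAX_RT_PRIO - 1 - p->rt_priority;
    return toy___normal_prio(p);
}

/* mirrors effective_prio(): an RT (possibly boosted) prio is kept,
 * otherwise the task follows its normal_prio */
static int toy_effective_prio(struct toy_task *p)
{
    p->normal_prio = toy_normal_prio(p);
    if (p->prio >= MAX_RT_PRIO)      /* i.e. !rt_prio(p->prio) */
        return p->normal_prio;
    return p->prio;
}

int main(void)
{
    struct toy_task interactive = { 0, 0, 120, 900, 120, 0 };
    struct toy_task boosted     = { 0, 0, 120,   0,  98, 0 };  /* PI-boosted */
    int eff;

    printf("interactive nice-0 task: effective prio %d\n",
           toy_effective_prio(&interactive));

    eff = toy_effective_prio(&boosted);
    printf("PI-boosted task keeps prio %d (normal_prio %d)\n",
           eff, boosted.normal_prio);
    return 0;
}

Here the sleep-heavy nice-0 task settles around priority 116, while the task that was boosted into the RT range keeps its boosted prio of 98 and only records 125 as its normal_prio.
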
| @@ -756,7 +934,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
| 756 | * Update all the scheduling statistics stuff. (sleep average | 934 | * Update all the scheduling statistics stuff. (sleep average |
| 757 | * calculation, priority modifiers, etc.) | 935 | * calculation, priority modifiers, etc.) |
| 758 | */ | 936 | */ |
| 759 | static void activate_task(task_t *p, runqueue_t *rq, int local) | 937 | static void activate_task(struct task_struct *p, struct rq *rq, int local) |
| 760 | { | 938 | { |
| 761 | unsigned long long now; | 939 | unsigned long long now; |
| 762 | 940 | ||
| @@ -764,7 +942,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
| 764 | #ifdef CONFIG_SMP | 942 | #ifdef CONFIG_SMP |
| 765 | if (!local) { | 943 | if (!local) { |
| 766 | /* Compensate for drifting sched_clock */ | 944 | /* Compensate for drifting sched_clock */ |
| 767 | runqueue_t *this_rq = this_rq(); | 945 | struct rq *this_rq = this_rq(); |
| 768 | now = (now - this_rq->timestamp_last_tick) | 946 | now = (now - this_rq->timestamp_last_tick) |
| 769 | + rq->timestamp_last_tick; | 947 | + rq->timestamp_last_tick; |
| 770 | } | 948 | } |
| @@ -803,9 +981,9 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
| 803 | /* | 981 | /* |
| 804 | * deactivate_task - remove a task from the runqueue. | 982 | * deactivate_task - remove a task from the runqueue. |
| 805 | */ | 983 | */ |
| 806 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 984 | static void deactivate_task(struct task_struct *p, struct rq *rq) |
| 807 | { | 985 | { |
| 808 | rq->nr_running--; | 986 | dec_nr_running(p, rq); |
| 809 | dequeue_task(p, p->array); | 987 | dequeue_task(p, p->array); |
| 810 | p->array = NULL; | 988 | p->array = NULL; |
| 811 | } | 989 | } |
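
__activate_task(), __activate_idle_task() and deactivate_task() now funnel through inc_nr_running()/dec_nr_running(), which keep rq->raw_weighted_load in step with the load_weight that set_load_weight() derives from the LOAD_WEIGHT()/PRIO_TO_LOAD_WEIGHT() macros earlier in the diff. The toy model below walks through that bookkeeping for a SCHED_NORMAL runqueue; SCHED_LOAD_SCALE is assumed to be 128 and the timeslice numbers are the millisecond values from the earlier sketch, so the exact figures are illustrative.

#include <stdio.h>

#define SCHED_LOAD_SCALE     128UL  /* assumed 2.6.x value                   */
#define TIME_SLICE_NICE_ZERO 100UL  /* DEF_TIMESLICE in ms: the nice-0 slice */

/* same formula as the LOAD_WEIGHT() macro in the patch */
#define LOAD_WEIGHT(lp) (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)

struct toy_task {
    const char *name;
    unsigned long timeslice;   /* what static_prio_timeslice() would return, ms */
    unsigned long load_weight;
};

struct toy_rq {
    unsigned long nr_running;
    unsigned long raw_weighted_load;
};

/* mirrors set_load_weight() for a plain SCHED_NORMAL task */
static void set_load_weight(struct toy_task *p)
{
    p->load_weight = LOAD_WEIGHT(p->timeslice);
}

/* mirrors inc_nr_running(): enqueue adds the task's weight */
static void inc_nr_running(struct toy_task *p, struct toy_rq *rq)
{
    rq->nr_running++;
    rq->raw_weighted_load += p->load_weight;
}

/* mirrors dec_nr_running(): dequeue removes it again */
static void dec_nr_running(struct toy_task *p, struct toy_rq *rq)
{
    rq->nr_running--;
    rq->raw_weighted_load -= p->load_weight;
}

int main(void)
{
    struct toy_task tasks[] = {
        { "nice   0", 100, 0 },  /* DEF_TIMESLICE                */
        { "nice -20", 800, 0 },  /* 4 * DEF_TIMESLICE, scaled up */
        { "nice  19",   5, 0 },  /* MIN_TIMESLICE                */
    };
    struct toy_rq rq = { 0, 0 };
    unsigned int i;

    for (i = 0; i < 3; i++) {
        set_load_weight(&tasks[i]);
        inc_nr_running(&tasks[i], &rq);
        printf("%s: load_weight %4lu\n", tasks[i].name, tasks[i].load_weight);
    }

    /* same idea as cpu_avg_load_per_task() */
    printf("raw_weighted_load %lu, avg per task %lu\n", rq.raw_weighted_load,
           rq.nr_running ? rq.raw_weighted_load / rq.nr_running : SCHED_LOAD_SCALE);

    dec_nr_running(&tasks[1], &rq);
    printf("after dequeueing nice -20: raw_weighted_load %lu\n",
           rq.raw_weighted_load);
    return 0;
}

Under these assumptions a nice-0 task contributes SCHED_LOAD_SCALE (128) to raw_weighted_load, nice -20 roughly 1024 and nice 19 only 6, which is the property that the reworked source_load(), target_load() and cpu_avg_load_per_task() later in this diff balance on instead of bare nr_running counts.
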
| @@ -818,7 +996,12 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) | |||
| 818 | * the target CPU. | 996 | * the target CPU. |
| 819 | */ | 997 | */ |
| 820 | #ifdef CONFIG_SMP | 998 | #ifdef CONFIG_SMP |
| 821 | static void resched_task(task_t *p) | 999 | |
| 1000 | #ifndef tsk_is_polling | ||
| 1001 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | ||
| 1002 | #endif | ||
| 1003 | |||
| 1004 | static void resched_task(struct task_struct *p) | ||
| 822 | { | 1005 | { |
| 823 | int cpu; | 1006 | int cpu; |
| 824 | 1007 | ||
| @@ -833,13 +1016,13 @@ static void resched_task(task_t *p) | |||
| 833 | if (cpu == smp_processor_id()) | 1016 | if (cpu == smp_processor_id()) |
| 834 | return; | 1017 | return; |
| 835 | 1018 | ||
| 836 | /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ | 1019 | /* NEED_RESCHED must be visible before we test polling */ |
| 837 | smp_mb(); | 1020 | smp_mb(); |
| 838 | if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) | 1021 | if (!tsk_is_polling(p)) |
| 839 | smp_send_reschedule(cpu); | 1022 | smp_send_reschedule(cpu); |
| 840 | } | 1023 | } |
| 841 | #else | 1024 | #else |
| 842 | static inline void resched_task(task_t *p) | 1025 | static inline void resched_task(struct task_struct *p) |
| 843 | { | 1026 | { |
| 844 | assert_spin_locked(&task_rq(p)->lock); | 1027 | assert_spin_locked(&task_rq(p)->lock); |
| 845 | set_tsk_need_resched(p); | 1028 | set_tsk_need_resched(p); |
| @@ -850,28 +1033,35 @@ static inline void resched_task(task_t *p) | |||
| 850 | * task_curr - is this task currently executing on a CPU? | 1033 | * task_curr - is this task currently executing on a CPU? |
| 851 | * @p: the task in question. | 1034 | * @p: the task in question. |
| 852 | */ | 1035 | */ |
| 853 | inline int task_curr(const task_t *p) | 1036 | inline int task_curr(const struct task_struct *p) |
| 854 | { | 1037 | { |
| 855 | return cpu_curr(task_cpu(p)) == p; | 1038 | return cpu_curr(task_cpu(p)) == p; |
| 856 | } | 1039 | } |
| 857 | 1040 | ||
| 1041 | /* Used instead of source_load when we know the type == 0 */ | ||
| 1042 | unsigned long weighted_cpuload(const int cpu) | ||
| 1043 | { | ||
| 1044 | return cpu_rq(cpu)->raw_weighted_load; | ||
| 1045 | } | ||
| 1046 | |||
| 858 | #ifdef CONFIG_SMP | 1047 | #ifdef CONFIG_SMP |
| 859 | typedef struct { | 1048 | struct migration_req { |
| 860 | struct list_head list; | 1049 | struct list_head list; |
| 861 | 1050 | ||
| 862 | task_t *task; | 1051 | struct task_struct *task; |
| 863 | int dest_cpu; | 1052 | int dest_cpu; |
| 864 | 1053 | ||
| 865 | struct completion done; | 1054 | struct completion done; |
| 866 | } migration_req_t; | 1055 | }; |
| 867 | 1056 | ||
| 868 | /* | 1057 | /* |
| 869 | * The task's runqueue lock must be held. | 1058 | * The task's runqueue lock must be held. |
| 870 | * Returns true if you have to wait for migration thread. | 1059 | * Returns true if you have to wait for migration thread. |
| 871 | */ | 1060 | */ |
| 872 | static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | 1061 | static int |
| 1062 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | ||
| 873 | { | 1063 | { |
| 874 | runqueue_t *rq = task_rq(p); | 1064 | struct rq *rq = task_rq(p); |
| 875 | 1065 | ||
| 876 | /* | 1066 | /* |
| 877 | * If the task is not on a runqueue (and not running), then | 1067 | * If the task is not on a runqueue (and not running), then |
| @@ -886,6 +1076,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
| 886 | req->task = p; | 1076 | req->task = p; |
| 887 | req->dest_cpu = dest_cpu; | 1077 | req->dest_cpu = dest_cpu; |
| 888 | list_add(&req->list, &rq->migration_queue); | 1078 | list_add(&req->list, &rq->migration_queue); |
| 1079 | |||
| 889 | return 1; | 1080 | return 1; |
| 890 | } | 1081 | } |
| 891 | 1082 | ||
| @@ -898,10 +1089,10 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
| 898 | * smp_call_function() if an IPI is sent by the same process we are | 1089 | * smp_call_function() if an IPI is sent by the same process we are |
| 899 | * waiting to become inactive. | 1090 | * waiting to become inactive. |
| 900 | */ | 1091 | */ |
| 901 | void wait_task_inactive(task_t *p) | 1092 | void wait_task_inactive(struct task_struct *p) |
| 902 | { | 1093 | { |
| 903 | unsigned long flags; | 1094 | unsigned long flags; |
| 904 | runqueue_t *rq; | 1095 | struct rq *rq; |
| 905 | int preempted; | 1096 | int preempted; |
| 906 | 1097 | ||
| 907 | repeat: | 1098 | repeat: |
| @@ -932,7 +1123,7 @@ repeat: | |||
| 932 | * to another CPU then no harm is done and the purpose has been | 1123 | * to another CPU then no harm is done and the purpose has been |
| 933 | * achieved as well. | 1124 | * achieved as well. |
| 934 | */ | 1125 | */ |
| 935 | void kick_process(task_t *p) | 1126 | void kick_process(struct task_struct *p) |
| 936 | { | 1127 | { |
| 937 | int cpu; | 1128 | int cpu; |
| 938 | 1129 | ||
| @@ -944,32 +1135,45 @@ void kick_process(task_t *p) | |||
| 944 | } | 1135 | } |
| 945 | 1136 | ||
| 946 | /* | 1137 | /* |
| 947 | * Return a low guess at the load of a migration-source cpu. | 1138 | * Return a low guess at the load of a migration-source cpu weighted |
| 1139 | * according to the scheduling class and "nice" value. | ||
| 948 | * | 1140 | * |
| 949 | * We want to under-estimate the load of migration sources, to | 1141 | * We want to under-estimate the load of migration sources, to |
| 950 | * balance conservatively. | 1142 | * balance conservatively. |
| 951 | */ | 1143 | */ |
| 952 | static inline unsigned long source_load(int cpu, int type) | 1144 | static inline unsigned long source_load(int cpu, int type) |
| 953 | { | 1145 | { |
| 954 | runqueue_t *rq = cpu_rq(cpu); | 1146 | struct rq *rq = cpu_rq(cpu); |
| 955 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1147 | |
| 956 | if (type == 0) | 1148 | if (type == 0) |
| 957 | return load_now; | 1149 | return rq->raw_weighted_load; |
| 958 | 1150 | ||
| 959 | return min(rq->cpu_load[type-1], load_now); | 1151 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); |
| 960 | } | 1152 | } |
| 961 | 1153 | ||
| 962 | /* | 1154 | /* |
| 963 | * Return a high guess at the load of a migration-target cpu | 1155 | * Return a high guess at the load of a migration-target cpu weighted |
| 1156 | * according to the scheduling class and "nice" value. | ||
| 964 | */ | 1157 | */ |
| 965 | static inline unsigned long target_load(int cpu, int type) | 1158 | static inline unsigned long target_load(int cpu, int type) |
| 966 | { | 1159 | { |
| 967 | runqueue_t *rq = cpu_rq(cpu); | 1160 | struct rq *rq = cpu_rq(cpu); |
| 968 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1161 | |
| 969 | if (type == 0) | 1162 | if (type == 0) |
| 970 | return load_now; | 1163 | return rq->raw_weighted_load; |
| 1164 | |||
| 1165 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); | ||
| 1166 | } | ||
| 971 | 1167 | ||
| 972 | return max(rq->cpu_load[type-1], load_now); | 1168 | /* |
| 1169 | * Return the average load per task on the cpu's run queue | ||
| 1170 | */ | ||
| 1171 | static inline unsigned long cpu_avg_load_per_task(int cpu) | ||
| 1172 | { | ||
| 1173 | struct rq *rq = cpu_rq(cpu); | ||
| 1174 | unsigned long n = rq->nr_running; | ||
| 1175 | |||
| 1176 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; | ||
| 973 | } | 1177 | } |
| 974 | 1178 | ||
| 975 | /* | 1179 | /* |
| @@ -1042,7 +1246,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 1042 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1246 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
| 1043 | 1247 | ||
| 1044 | for_each_cpu_mask(i, tmp) { | 1248 | for_each_cpu_mask(i, tmp) { |
| 1045 | load = source_load(i, 0); | 1249 | load = weighted_cpuload(i); |
| 1046 | 1250 | ||
| 1047 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1251 | if (load < min_load || (load == min_load && i == this_cpu)) { |
| 1048 | min_load = load; | 1252 | min_load = load; |
| @@ -1069,9 +1273,15 @@ static int sched_balance_self(int cpu, int flag) | |||
| 1069 | struct task_struct *t = current; | 1273 | struct task_struct *t = current; |
| 1070 | struct sched_domain *tmp, *sd = NULL; | 1274 | struct sched_domain *tmp, *sd = NULL; |
| 1071 | 1275 | ||
| 1072 | for_each_domain(cpu, tmp) | 1276 | for_each_domain(cpu, tmp) { |
| 1277 | /* | ||
| 1278 | * If power savings logic is enabled for a domain, stop there. | ||
| 1279 | */ | ||
| 1280 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
| 1281 | break; | ||
| 1073 | if (tmp->flags & flag) | 1282 | if (tmp->flags & flag) |
| 1074 | sd = tmp; | 1283 | sd = tmp; |
| 1284 | } | ||
| 1075 | 1285 | ||
| 1076 | while (sd) { | 1286 | while (sd) { |
| 1077 | cpumask_t span; | 1287 | cpumask_t span; |
| @@ -1116,7 +1326,7 @@ nextlevel: | |||
| 1116 | * Returns the CPU we should wake onto. | 1326 | * Returns the CPU we should wake onto. |
| 1117 | */ | 1327 | */ |
| 1118 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | 1328 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) |
| 1119 | static int wake_idle(int cpu, task_t *p) | 1329 | static int wake_idle(int cpu, struct task_struct *p) |
| 1120 | { | 1330 | { |
| 1121 | cpumask_t tmp; | 1331 | cpumask_t tmp; |
| 1122 | struct sched_domain *sd; | 1332 | struct sched_domain *sd; |
| @@ -1139,7 +1349,7 @@ static int wake_idle(int cpu, task_t *p) | |||
| 1139 | return cpu; | 1349 | return cpu; |
| 1140 | } | 1350 | } |
| 1141 | #else | 1351 | #else |
| 1142 | static inline int wake_idle(int cpu, task_t *p) | 1352 | static inline int wake_idle(int cpu, struct task_struct *p) |
| 1143 | { | 1353 | { |
| 1144 | return cpu; | 1354 | return cpu; |
| 1145 | } | 1355 | } |
| @@ -1159,15 +1369,15 @@ static inline int wake_idle(int cpu, task_t *p) | |||
| 1159 | * | 1369 | * |
| 1160 | * returns failure only if the task is already active. | 1370 | * returns failure only if the task is already active. |
| 1161 | */ | 1371 | */ |
| 1162 | static int try_to_wake_up(task_t *p, unsigned int state, int sync) | 1372 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
| 1163 | { | 1373 | { |
| 1164 | int cpu, this_cpu, success = 0; | 1374 | int cpu, this_cpu, success = 0; |
| 1165 | unsigned long flags; | 1375 | unsigned long flags; |
| 1166 | long old_state; | 1376 | long old_state; |
| 1167 | runqueue_t *rq; | 1377 | struct rq *rq; |
| 1168 | #ifdef CONFIG_SMP | 1378 | #ifdef CONFIG_SMP |
| 1169 | unsigned long load, this_load; | ||
| 1170 | struct sched_domain *sd, *this_sd = NULL; | 1379 | struct sched_domain *sd, *this_sd = NULL; |
| 1380 | unsigned long load, this_load; | ||
| 1171 | int new_cpu; | 1381 | int new_cpu; |
| 1172 | #endif | 1382 | #endif |
| 1173 | 1383 | ||
| @@ -1221,17 +1431,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
| 1221 | 1431 | ||
| 1222 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1432 | if (this_sd->flags & SD_WAKE_AFFINE) { |
| 1223 | unsigned long tl = this_load; | 1433 | unsigned long tl = this_load; |
| 1434 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 1435 | |||
| 1224 | /* | 1436 | /* |
| 1225 | * If sync wakeup then subtract the (maximum possible) | 1437 | * If sync wakeup then subtract the (maximum possible) |
| 1226 | * effect of the currently running task from the load | 1438 | * effect of the currently running task from the load |
| 1227 | * of the current CPU: | 1439 | * of the current CPU: |
| 1228 | */ | 1440 | */ |
| 1229 | if (sync) | 1441 | if (sync) |
| 1230 | tl -= SCHED_LOAD_SCALE; | 1442 | tl -= current->load_weight; |
| 1231 | 1443 | ||
| 1232 | if ((tl <= load && | 1444 | if ((tl <= load && |
| 1233 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | 1445 | tl + target_load(cpu, idx) <= tl_per_task) || |
| 1234 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | 1446 | 100*(tl + p->load_weight) <= imbalance*load) { |
| 1235 | /* | 1447 | /* |
| 1236 | * This domain has SD_WAKE_AFFINE and | 1448 | * This domain has SD_WAKE_AFFINE and |
| 1237 | * p is cache cold in this domain, and | 1449 | * p is cache cold in this domain, and |
| @@ -1315,15 +1527,14 @@ out: | |||
| 1315 | return success; | 1527 | return success; |
| 1316 | } | 1528 | } |
| 1317 | 1529 | ||
| 1318 | int fastcall wake_up_process(task_t *p) | 1530 | int fastcall wake_up_process(struct task_struct *p) |
| 1319 | { | 1531 | { |
| 1320 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1532 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | |
| 1321 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | 1533 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); |
| 1322 | } | 1534 | } |
| 1323 | |||
| 1324 | EXPORT_SYMBOL(wake_up_process); | 1535 | EXPORT_SYMBOL(wake_up_process); |
| 1325 | 1536 | ||
| 1326 | int fastcall wake_up_state(task_t *p, unsigned int state) | 1537 | int fastcall wake_up_state(struct task_struct *p, unsigned int state) |
| 1327 | { | 1538 | { |
| 1328 | return try_to_wake_up(p, state, 0); | 1539 | return try_to_wake_up(p, state, 0); |
| 1329 | } | 1540 | } |
| @@ -1332,7 +1543,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state) | |||
| 1332 | * Perform scheduler related setup for a newly forked process p. | 1543 | * Perform scheduler related setup for a newly forked process p. |
| 1333 | * p is forked by current. | 1544 | * p is forked by current. |
| 1334 | */ | 1545 | */ |
| 1335 | void fastcall sched_fork(task_t *p, int clone_flags) | 1546 | void fastcall sched_fork(struct task_struct *p, int clone_flags) |
| 1336 | { | 1547 | { |
| 1337 | int cpu = get_cpu(); | 1548 | int cpu = get_cpu(); |
| 1338 | 1549 | ||
| @@ -1348,10 +1559,17 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
| 1348 | * event cannot wake it up and insert it on the runqueue either. | 1559 | * event cannot wake it up and insert it on the runqueue either. |
| 1349 | */ | 1560 | */ |
| 1350 | p->state = TASK_RUNNING; | 1561 | p->state = TASK_RUNNING; |
| 1562 | |||
| 1563 | /* | ||
| 1564 | * Make sure we do not leak PI boosting priority to the child: | ||
| 1565 | */ | ||
| 1566 | p->prio = current->normal_prio; | ||
| 1567 | |||
| 1351 | INIT_LIST_HEAD(&p->run_list); | 1568 | INIT_LIST_HEAD(&p->run_list); |
| 1352 | p->array = NULL; | 1569 | p->array = NULL; |
| 1353 | #ifdef CONFIG_SCHEDSTATS | 1570 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 1354 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1571 | if (unlikely(sched_info_on())) |
| 1572 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | ||
| 1355 | #endif | 1573 | #endif |
| 1356 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1574 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
| 1357 | p->oncpu = 0; | 1575 | p->oncpu = 0; |
| @@ -1394,11 +1612,11 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
| 1394 | * that must be done for every newly created context, then puts the task | 1612 | * that must be done for every newly created context, then puts the task |
| 1395 | * on the runqueue and wakes it. | 1613 | * on the runqueue and wakes it. |
| 1396 | */ | 1614 | */ |
| 1397 | void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | 1615 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
| 1398 | { | 1616 | { |
| 1617 | struct rq *rq, *this_rq; | ||
| 1399 | unsigned long flags; | 1618 | unsigned long flags; |
| 1400 | int this_cpu, cpu; | 1619 | int this_cpu, cpu; |
| 1401 | runqueue_t *rq, *this_rq; | ||
| 1402 | 1620 | ||
| 1403 | rq = task_rq_lock(p, &flags); | 1621 | rq = task_rq_lock(p, &flags); |
| 1404 | BUG_ON(p->state != TASK_RUNNING); | 1622 | BUG_ON(p->state != TASK_RUNNING); |
| @@ -1427,10 +1645,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
| 1427 | __activate_task(p, rq); | 1645 | __activate_task(p, rq); |
| 1428 | else { | 1646 | else { |
| 1429 | p->prio = current->prio; | 1647 | p->prio = current->prio; |
| 1648 | p->normal_prio = current->normal_prio; | ||
| 1430 | list_add_tail(&p->run_list, ¤t->run_list); | 1649 | list_add_tail(&p->run_list, ¤t->run_list); |
| 1431 | p->array = current->array; | 1650 | p->array = current->array; |
| 1432 | p->array->nr_active++; | 1651 | p->array->nr_active++; |
| 1433 | rq->nr_running++; | 1652 | inc_nr_running(p, rq); |
| 1434 | } | 1653 | } |
| 1435 | set_need_resched(); | 1654 | set_need_resched(); |
| 1436 | } else | 1655 | } else |
| @@ -1477,10 +1696,10 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
| 1477 | * artificially, because any timeslice recovered here | 1696 | * artificially, because any timeslice recovered here |
| 1478 | * was given away by the parent in the first place.) | 1697 | * was given away by the parent in the first place.) |
| 1479 | */ | 1698 | */ |
| 1480 | void fastcall sched_exit(task_t *p) | 1699 | void fastcall sched_exit(struct task_struct *p) |
| 1481 | { | 1700 | { |
| 1482 | unsigned long flags; | 1701 | unsigned long flags; |
| 1483 | runqueue_t *rq; | 1702 | struct rq *rq; |
| 1484 | 1703 | ||
| 1485 | /* | 1704 | /* |
| 1486 | * If the child was a (relative-) CPU hog then decrease | 1705 | * If the child was a (relative-) CPU hog then decrease |
| @@ -1511,7 +1730,7 @@ void fastcall sched_exit(task_t *p) | |||
| 1511 | * prepare_task_switch sets up locking and calls architecture specific | 1730 | * prepare_task_switch sets up locking and calls architecture specific |
| 1512 | * hooks. | 1731 | * hooks. |
| 1513 | */ | 1732 | */ |
| 1514 | static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | 1733 | static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) |
| 1515 | { | 1734 | { |
| 1516 | prepare_lock_switch(rq, next); | 1735 | prepare_lock_switch(rq, next); |
| 1517 | prepare_arch_switch(next); | 1736 | prepare_arch_switch(next); |
| @@ -1532,31 +1751,31 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | |||
| 1532 | * with the lock held can cause deadlocks; see schedule() for | 1751 | * with the lock held can cause deadlocks; see schedule() for |
| 1533 | * details.) | 1752 | * details.) |
| 1534 | */ | 1753 | */ |
| 1535 | static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | 1754 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) |
| 1536 | __releases(rq->lock) | 1755 | __releases(rq->lock) |
| 1537 | { | 1756 | { |
| 1538 | struct mm_struct *mm = rq->prev_mm; | 1757 | struct mm_struct *mm = rq->prev_mm; |
| 1539 | unsigned long prev_task_flags; | 1758 | long prev_state; |
| 1540 | 1759 | ||
| 1541 | rq->prev_mm = NULL; | 1760 | rq->prev_mm = NULL; |
| 1542 | 1761 | ||
| 1543 | /* | 1762 | /* |
| 1544 | * A task struct has one reference for the use as "current". | 1763 | * A task struct has one reference for the use as "current". |
| 1545 | * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and | 1764 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls |
| 1546 | * calls schedule one last time. The schedule call will never return, | 1765 | * schedule one last time. The schedule call will never return, and |
| 1547 | * and the scheduled task must drop that reference. | 1766 | * the scheduled task must drop that reference. |
| 1548 | * The test for EXIT_ZOMBIE must occur while the runqueue locks are | 1767 | * The test for TASK_DEAD must occur while the runqueue locks are |
| 1549 | * still held, otherwise prev could be scheduled on another cpu, die | 1768 | * still held, otherwise prev could be scheduled on another cpu, die |
| 1550 | * there before we look at prev->state, and then the reference would | 1769 | * there before we look at prev->state, and then the reference would |
| 1551 | * be dropped twice. | 1770 | * be dropped twice. |
| 1552 | * Manfred Spraul <manfred@colorfullife.com> | 1771 | * Manfred Spraul <manfred@colorfullife.com> |
| 1553 | */ | 1772 | */ |
| 1554 | prev_task_flags = prev->flags; | 1773 | prev_state = prev->state; |
| 1555 | finish_arch_switch(prev); | 1774 | finish_arch_switch(prev); |
| 1556 | finish_lock_switch(rq, prev); | 1775 | finish_lock_switch(rq, prev); |
| 1557 | if (mm) | 1776 | if (mm) |
| 1558 | mmdrop(mm); | 1777 | mmdrop(mm); |
| 1559 | if (unlikely(prev_task_flags & PF_DEAD)) { | 1778 | if (unlikely(prev_state == TASK_DEAD)) { |
| 1560 | /* | 1779 | /* |
| 1561 | * Remove function-return probe instances associated with this | 1780 | * Remove function-return probe instances associated with this |
| 1562 | * task and put them back on the free list. | 1781 | * task and put them back on the free list. |
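The ordering spelled out in the comment above is the whole point of the prev_state change: prev->state has to be sampled while the runqueue lock still pins prev, because once the lock is dropped another CPU can run the task, let it die there, and drop its own reference. A stand-alone sketch of that rule, with a toy task structure and toy_task_put() standing in for the real reference counting:

    /* Simplified model: why prev's state is read before the lock is dropped. */
    struct toy_task {
            long state;             /* set to TOY_TASK_DEAD once the task exits */
            int  usage;             /* reference count, as with task_struct     */
    };

    #define TOY_TASK_DEAD 64L

    static void toy_task_put(struct toy_task *t)    /* put_task_struct() stand-in */
    {
            if (--t->usage == 0) {
                    /* the task structure would be freed here */
            }
    }

    static void toy_finish_switch(struct toy_task *prev)
    {
            /* Sample the state while we still effectively own prev... */
            long prev_state = prev->state;

            /* ...because from here on another CPU may schedule prev, let it
             * die there, and drop its own reference concurrently. */
            /* finish_lock_switch(rq, prev) would release the lock here */

            if (prev_state == TOY_TASK_DEAD)
                    toy_task_put(prev);     /* drop the "current" reference once */
    }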
| @@ -1570,10 +1789,11 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | |||
| 1570 | * schedule_tail - first thing a freshly forked thread must call. | 1789 | * schedule_tail - first thing a freshly forked thread must call. |
| 1571 | * @prev: the thread we just switched away from. | 1790 | * @prev: the thread we just switched away from. |
| 1572 | */ | 1791 | */ |
| 1573 | asmlinkage void schedule_tail(task_t *prev) | 1792 | asmlinkage void schedule_tail(struct task_struct *prev) |
| 1574 | __releases(rq->lock) | 1793 | __releases(rq->lock) |
| 1575 | { | 1794 | { |
| 1576 | runqueue_t *rq = this_rq(); | 1795 | struct rq *rq = this_rq(); |
| 1796 | |||
| 1577 | finish_task_switch(rq, prev); | 1797 | finish_task_switch(rq, prev); |
| 1578 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 1798 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
| 1579 | /* In this case, finish_task_switch does not reenable preemption */ | 1799 | /* In this case, finish_task_switch does not reenable preemption */ |
| @@ -1587,8 +1807,9 @@ asmlinkage void schedule_tail(task_t *prev) | |||
| 1587 | * context_switch - switch to the new MM and the new | 1807 | * context_switch - switch to the new MM and the new |
| 1588 | * thread's register state. | 1808 | * thread's register state. |
| 1589 | */ | 1809 | */ |
| 1590 | static inline | 1810 | static inline struct task_struct * |
| 1591 | task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) | 1811 | context_switch(struct rq *rq, struct task_struct *prev, |
| 1812 | struct task_struct *next) | ||
| 1592 | { | 1813 | { |
| 1593 | struct mm_struct *mm = next->mm; | 1814 | struct mm_struct *mm = next->mm; |
| 1594 | struct mm_struct *oldmm = prev->active_mm; | 1815 | struct mm_struct *oldmm = prev->active_mm; |
| @@ -1605,6 +1826,15 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) | |||
| 1605 | WARN_ON(rq->prev_mm); | 1826 | WARN_ON(rq->prev_mm); |
| 1606 | rq->prev_mm = oldmm; | 1827 | rq->prev_mm = oldmm; |
| 1607 | } | 1828 | } |
| 1829 | /* | ||
| 1830 | * Since the runqueue lock will be released by the next | ||
| 1831 | * task (which is an invalid locking op but in the case | ||
| 1832 | * of the scheduler it's an obvious special-case), we | ||
| 1833 | * do an early lockdep release here: | ||
| 1834 | */ | ||
| 1835 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 1836 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | ||
| 1837 | #endif | ||
| 1608 | 1838 | ||
| 1609 | /* Here we just switch the register state and the stack. */ | 1839 | /* Here we just switch the register state and the stack. */ |
| 1610 | switch_to(prev, next, prev); | 1840 | switch_to(prev, next, prev); |
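The WARN_ON()/rq->prev_mm assignment above pairs with the mmdrop() visible in finish_task_switch(): when the outgoing task has no mm of its own and was only borrowing prev->active_mm (the guard for that case sits just outside the lines shown), the reference cannot be dropped while the old stack is still live, so it is parked on the runqueue and released after the switch. A minimal model of that hand-off, with toy types standing in for mm_struct and the runqueue:

    /* Toy model of the prev_mm hand-off: a borrowed mm reference is parked on
     * the runqueue and only dropped once we run on the next task's stack. */
    struct toy_mm { int count; };
    struct toy_rq { struct toy_mm *prev_mm; };

    static void toy_mmdrop(struct toy_mm *mm)
    {
            mm->count--;                    /* stand-in for mmdrop() refcounting */
    }

    static void toy_context_switch(struct toy_rq *rq, struct toy_mm *prev_own_mm,
                                   struct toy_mm *borrowed_mm)
    {
            if (!prev_own_mm)                   /* prev was a kernel thread: it  */
                    rq->prev_mm = borrowed_mm;  /* only borrowed this mm, park it */
            /* switch_to() happens here; the old stack stays in use until then */
    }

    static void toy_finish_task_switch(struct toy_rq *rq)
    {
            struct toy_mm *mm = rq->prev_mm;

            rq->prev_mm = NULL;
            if (mm)
                    toy_mmdrop(mm);         /* safe: we are on the new stack now */
    }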
| @@ -1648,7 +1878,8 @@ unsigned long nr_uninterruptible(void) | |||
| 1648 | 1878 | ||
| 1649 | unsigned long long nr_context_switches(void) | 1879 | unsigned long long nr_context_switches(void) |
| 1650 | { | 1880 | { |
| 1651 | unsigned long long i, sum = 0; | 1881 | int i; |
| 1882 | unsigned long long sum = 0; | ||
| 1652 | 1883 | ||
| 1653 | for_each_possible_cpu(i) | 1884 | for_each_possible_cpu(i) |
| 1654 | sum += cpu_rq(i)->nr_switches; | 1885 | sum += cpu_rq(i)->nr_switches; |
| @@ -1684,15 +1915,21 @@ unsigned long nr_active(void) | |||
| 1684 | #ifdef CONFIG_SMP | 1915 | #ifdef CONFIG_SMP |
| 1685 | 1916 | ||
| 1686 | /* | 1917 | /* |
| 1918 | * Is this task likely cache-hot: | ||
| 1919 | */ | ||
| 1920 | static inline int | ||
| 1921 | task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) | ||
| 1922 | { | ||
| 1923 | return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; | ||
| 1924 | } | ||
| 1925 | |||
| 1926 | /* | ||
| 1687 | * double_rq_lock - safely lock two runqueues | 1927 | * double_rq_lock - safely lock two runqueues |
| 1688 | * | 1928 | * |
| 1689 | * We must take them in cpu order to match code in | ||
| 1690 | * dependent_sleeper and wake_dependent_sleeper. | ||
| 1691 | * | ||
| 1692 | * Note this does not disable interrupts like task_rq_lock, | 1929 | * Note this does not disable interrupts like task_rq_lock, |
| 1693 | * you need to do so manually before calling. | 1930 | * you need to do so manually before calling. |
| 1694 | */ | 1931 | */ |
| 1695 | static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | 1932 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) |
| 1696 | __acquires(rq1->lock) | 1933 | __acquires(rq1->lock) |
| 1697 | __acquires(rq2->lock) | 1934 | __acquires(rq2->lock) |
| 1698 | { | 1935 | { |
| @@ -1700,7 +1937,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
| 1700 | spin_lock(&rq1->lock); | 1937 | spin_lock(&rq1->lock); |
| 1701 | __acquire(rq2->lock); /* Fake it out ;) */ | 1938 | __acquire(rq2->lock); /* Fake it out ;) */ |
| 1702 | } else { | 1939 | } else { |
| 1703 | if (rq1->cpu < rq2->cpu) { | 1940 | if (rq1 < rq2) { |
| 1704 | spin_lock(&rq1->lock); | 1941 | spin_lock(&rq1->lock); |
| 1705 | spin_lock(&rq2->lock); | 1942 | spin_lock(&rq2->lock); |
| 1706 | } else { | 1943 | } else { |
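The switch from rq1->cpu < rq2->cpu to rq1 < rq2 keeps the deadlock-avoidance rule intact (every path agrees on which of the two locks is taken first); it just uses the runqueue addresses as the total order instead of CPU numbers, which is why the old comment about matching dependent_sleeper's CPU ordering could go. A self-contained sketch of address-ordered double locking, with pthread mutexes standing in for the runqueue spinlocks; comparing unrelated pointers like this relies on a flat address space, exactly as the kernel does:

    #include <pthread.h>

    /* Take two locks in a globally consistent (address) order so that two
     * threads locking the same pair concurrently can never deadlock. */
    static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            if (a == b) {
                    pthread_mutex_lock(a);          /* same queue: lock it once */
                    return;
            }
            if (a < b) {
                    pthread_mutex_lock(a);
                    pthread_mutex_lock(b);
            } else {
                    pthread_mutex_lock(b);
                    pthread_mutex_lock(a);
            }
    }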
| @@ -1716,7 +1953,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
| 1716 | * Note this does not restore interrupts like task_rq_unlock, | 1953 | * Note this does not restore interrupts like task_rq_unlock, |
| 1717 | * you need to do so manually after calling. | 1954 | * you need to do so manually after calling. |
| 1718 | */ | 1955 | */ |
| 1719 | static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | 1956 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) |
| 1720 | __releases(rq1->lock) | 1957 | __releases(rq1->lock) |
| 1721 | __releases(rq2->lock) | 1958 | __releases(rq2->lock) |
| 1722 | { | 1959 | { |
| @@ -1730,13 +1967,13 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | |||
| 1730 | /* | 1967 | /* |
| 1731 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 1968 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
| 1732 | */ | 1969 | */ |
| 1733 | static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | 1970 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) |
| 1734 | __releases(this_rq->lock) | 1971 | __releases(this_rq->lock) |
| 1735 | __acquires(busiest->lock) | 1972 | __acquires(busiest->lock) |
| 1736 | __acquires(this_rq->lock) | 1973 | __acquires(this_rq->lock) |
| 1737 | { | 1974 | { |
| 1738 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1975 | if (unlikely(!spin_trylock(&busiest->lock))) { |
| 1739 | if (busiest->cpu < this_rq->cpu) { | 1976 | if (busiest < this_rq) { |
| 1740 | spin_unlock(&this_rq->lock); | 1977 | spin_unlock(&this_rq->lock); |
| 1741 | spin_lock(&busiest->lock); | 1978 | spin_lock(&busiest->lock); |
| 1742 | spin_lock(&this_rq->lock); | 1979 | spin_lock(&this_rq->lock); |
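double_lock_balance() cannot use plain address ordering directly because this_rq is already held, so it trylocks busiest and only backs off (drop this_rq, take busiest, retake this_rq) when busiest sorts lower and blocking on it would invert the order. A compact model with the same pthread stand-ins; note that after the back-off the caller has to assume this_rq's state may have changed while it was briefly unlocked:

    #include <pthread.h>

    /* 'held' is already locked by the caller; acquire 'other' too without
     * violating the address order used by double_lock() above. */
    static void double_lock_balance_model(pthread_mutex_t *held,
                                          pthread_mutex_t *other)
    {
            if (pthread_mutex_trylock(other) != 0) {        /* contended */
                    if (other < held) {
                            pthread_mutex_unlock(held);     /* back off...      */
                            pthread_mutex_lock(other);      /* ...retake both   */
                            pthread_mutex_lock(held);       /* in address order */
                    } else {
                            pthread_mutex_lock(other);      /* already in order */
                    }
            }
    }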
| @@ -1751,11 +1988,11 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
| 1751 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 1988 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
| 1752 | * the cpu_allowed mask is restored. | 1989 | * the cpu_allowed mask is restored. |
| 1753 | */ | 1990 | */ |
| 1754 | static void sched_migrate_task(task_t *p, int dest_cpu) | 1991 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) |
| 1755 | { | 1992 | { |
| 1756 | migration_req_t req; | 1993 | struct migration_req req; |
| 1757 | runqueue_t *rq; | ||
| 1758 | unsigned long flags; | 1994 | unsigned long flags; |
| 1995 | struct rq *rq; | ||
| 1759 | 1996 | ||
| 1760 | rq = task_rq_lock(p, &flags); | 1997 | rq = task_rq_lock(p, &flags); |
| 1761 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | 1998 | if (!cpu_isset(dest_cpu, p->cpus_allowed) |
| @@ -1766,11 +2003,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu) | |||
| 1766 | if (migrate_task(p, dest_cpu, &req)) { | 2003 | if (migrate_task(p, dest_cpu, &req)) { |
| 1767 | /* Need to wait for migration thread (might exit: take ref). */ | 2004 | /* Need to wait for migration thread (might exit: take ref). */ |
| 1768 | struct task_struct *mt = rq->migration_thread; | 2005 | struct task_struct *mt = rq->migration_thread; |
| 2006 | |||
| 1769 | get_task_struct(mt); | 2007 | get_task_struct(mt); |
| 1770 | task_rq_unlock(rq, &flags); | 2008 | task_rq_unlock(rq, &flags); |
| 1771 | wake_up_process(mt); | 2009 | wake_up_process(mt); |
| 1772 | put_task_struct(mt); | 2010 | put_task_struct(mt); |
| 1773 | wait_for_completion(&req.done); | 2011 | wait_for_completion(&req.done); |
| 2012 | |||
| 1774 | return; | 2013 | return; |
| 1775 | } | 2014 | } |
| 1776 | out: | 2015 | out: |
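sched_migrate_task() is the usual request/completion hand-off: the request lives on the caller's stack, a reference is taken on the migration thread so it cannot exit underneath us, the runqueue lock is dropped, the helper is woken, and the caller sleeps until the request is completed. A userspace analogue of the completion half, using a condition variable in place of struct completion (the names are invented for the illustration):

    #include <pthread.h>

    struct toy_request {
            int             dest_cpu;
            int             done;
            pthread_mutex_t lock;
            pthread_cond_t  cond;
    };

    /* Caller side: equivalent of wait_for_completion(&req.done). */
    static void toy_wait_for_completion(struct toy_request *req)
    {
            pthread_mutex_lock(&req->lock);
            while (!req->done)
                    pthread_cond_wait(&req->cond, &req->lock);
            pthread_mutex_unlock(&req->lock);
    }

    /* Worker side: perform the migration, then complete the request. */
    static void toy_complete(struct toy_request *req)
    {
            pthread_mutex_lock(&req->lock);
            req->done = 1;
            pthread_cond_signal(&req->cond);
            pthread_mutex_unlock(&req->lock);
    }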
| @@ -1794,14 +2033,14 @@ void sched_exec(void) | |||
| 1794 | * pull_task - move a task from a remote runqueue to the local runqueue. | 2033 | * pull_task - move a task from a remote runqueue to the local runqueue. |
| 1795 | * Both runqueues must be locked. | 2034 | * Both runqueues must be locked. |
| 1796 | */ | 2035 | */ |
| 1797 | static | 2036 | static void pull_task(struct rq *src_rq, struct prio_array *src_array, |
| 1798 | void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | 2037 | struct task_struct *p, struct rq *this_rq, |
| 1799 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 2038 | struct prio_array *this_array, int this_cpu) |
| 1800 | { | 2039 | { |
| 1801 | dequeue_task(p, src_array); | 2040 | dequeue_task(p, src_array); |
| 1802 | src_rq->nr_running--; | 2041 | dec_nr_running(p, src_rq); |
| 1803 | set_task_cpu(p, this_cpu); | 2042 | set_task_cpu(p, this_cpu); |
| 1804 | this_rq->nr_running++; | 2043 | inc_nr_running(p, this_rq); |
| 1805 | enqueue_task(p, this_array); | 2044 | enqueue_task(p, this_array); |
| 1806 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 2045 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
| 1807 | + this_rq->timestamp_last_tick; | 2046 | + this_rq->timestamp_last_tick; |
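The last two lines of pull_task() rebase p->timestamp between runqueues: per-CPU sched_clock() values are not synchronized, so only the offset relative to the source CPU's last tick is meaningful, and it is re-expressed against the destination CPU's last tick. In isolation:

    /* Rebase a per-CPU timestamp when a task moves between runqueues whose
     * clocks are not synchronized: keep the delta, swap the reference point. */
    static unsigned long long
    rebase_timestamp(unsigned long long ts,
                     unsigned long long src_last_tick,
                     unsigned long long dst_last_tick)
    {
            return ts - src_last_tick + dst_last_tick;
    }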
| @@ -1817,7 +2056,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
| 1817 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 2056 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
| 1818 | */ | 2057 | */ |
| 1819 | static | 2058 | static |
| 1820 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 2059 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
| 1821 | struct sched_domain *sd, enum idle_type idle, | 2060 | struct sched_domain *sd, enum idle_type idle, |
| 1822 | int *all_pinned) | 2061 | int *all_pinned) |
| 1823 | { | 2062 | { |
| @@ -1848,26 +2087,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
| 1848 | return 1; | 2087 | return 1; |
| 1849 | } | 2088 | } |
| 1850 | 2089 | ||
| 2090 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | ||
| 2091 | |||
| 1851 | /* | 2092 | /* |
| 1852 | * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, | 2093 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted |
| 1853 | * as part of a balancing operation within "domain". Returns the number of | 2094 | * load from busiest to this_rq, as part of a balancing operation within |
| 1854 | * tasks moved. | 2095 | * "domain". Returns the number of tasks moved. |
| 1855 | * | 2096 | * |
| 1856 | * Called with both runqueues locked. | 2097 | * Called with both runqueues locked. |
| 1857 | */ | 2098 | */ |
| 1858 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 2099 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1859 | unsigned long max_nr_move, struct sched_domain *sd, | 2100 | unsigned long max_nr_move, unsigned long max_load_move, |
| 1860 | enum idle_type idle, int *all_pinned) | 2101 | struct sched_domain *sd, enum idle_type idle, |
| 2102 | int *all_pinned) | ||
| 1861 | { | 2103 | { |
| 1862 | prio_array_t *array, *dst_array; | 2104 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, |
| 2105 | best_prio_seen, skip_for_load; | ||
| 2106 | struct prio_array *array, *dst_array; | ||
| 1863 | struct list_head *head, *curr; | 2107 | struct list_head *head, *curr; |
| 1864 | int idx, pulled = 0, pinned = 0; | 2108 | struct task_struct *tmp; |
| 1865 | task_t *tmp; | 2109 | long rem_load_move; |
| 1866 | 2110 | ||
| 1867 | if (max_nr_move == 0) | 2111 | if (max_nr_move == 0 || max_load_move == 0) |
| 1868 | goto out; | 2112 | goto out; |
| 1869 | 2113 | ||
| 2114 | rem_load_move = max_load_move; | ||
| 1870 | pinned = 1; | 2115 | pinned = 1; |
| 2116 | this_best_prio = rq_best_prio(this_rq); | ||
| 2117 | best_prio = rq_best_prio(busiest); | ||
| 2118 | /* | ||
| 2119 | * Enable handling of the case where there is more than one task | ||
| 2120 | * with the best priority. If the current running task is one | ||
| 2121 | * of those with prio==best_prio we know it won't be moved | ||
| 2122 | * and therefore it's safe to override the skip (based on load) of | ||
| 2123 | * any task we find with that prio. | ||
| 2124 | */ | ||
| 2125 | best_prio_seen = best_prio == busiest->curr->prio; | ||
| 1871 | 2126 | ||
| 1872 | /* | 2127 | /* |
| 1873 | * We first consider expired tasks. Those will likely not be | 2128 | * We first consider expired tasks. Those will likely not be |
| @@ -1903,11 +2158,22 @@ skip_bitmap: | |||
| 1903 | head = array->queue + idx; | 2158 | head = array->queue + idx; |
| 1904 | curr = head->prev; | 2159 | curr = head->prev; |
| 1905 | skip_queue: | 2160 | skip_queue: |
| 1906 | tmp = list_entry(curr, task_t, run_list); | 2161 | tmp = list_entry(curr, struct task_struct, run_list); |
| 1907 | 2162 | ||
| 1908 | curr = curr->prev; | 2163 | curr = curr->prev; |
| 1909 | 2164 | ||
| 1910 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2165 | /* |
| 2166 | * To help distribute high priority tasks accross CPUs we don't | ||
| 2167 | * skip a task if it will be the highest priority task (i.e. smallest | ||
| 2168 | * prio value) on its new queue regardless of its load weight | ||
| 2169 | */ | ||
| 2170 | skip_for_load = tmp->load_weight > rem_load_move; | ||
| 2171 | if (skip_for_load && idx < this_best_prio) | ||
| 2172 | skip_for_load = !best_prio_seen && idx == best_prio; | ||
| 2173 | if (skip_for_load || | ||
| 2174 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | ||
| 2175 | |||
| 2176 | best_prio_seen |= idx == best_prio; | ||
| 1911 | if (curr != head) | 2177 | if (curr != head) |
| 1912 | goto skip_queue; | 2178 | goto skip_queue; |
| 1913 | idx++; | 2179 | idx++; |
| @@ -1921,9 +2187,15 @@ skip_queue: | |||
| 1921 | 2187 | ||
| 1922 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2188 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
| 1923 | pulled++; | 2189 | pulled++; |
| 2190 | rem_load_move -= tmp->load_weight; | ||
| 1924 | 2191 | ||
| 1925 | /* We only want to steal up to the prescribed number of tasks. */ | 2192 | /* |
| 1926 | if (pulled < max_nr_move) { | 2193 | * We only want to steal up to the prescribed number of tasks |
| 2194 | * and the prescribed amount of weighted load. | ||
| 2195 | */ | ||
| 2196 | if (pulled < max_nr_move && rem_load_move > 0) { | ||
| 2197 | if (idx < this_best_prio) | ||
| 2198 | this_best_prio = idx; | ||
| 1927 | if (curr != head) | 2199 | if (curr != head) |
| 1928 | goto skip_queue; | 2200 | goto skip_queue; |
| 1929 | idx++; | 2201 | idx++; |
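With the weighted-load changes the stealing loop is bounded twice, by a task count and by weighted load, and it keeps going only while both budgets have room, which is exactly what the pulled < max_nr_move && rem_load_move > 0 test enforces. A skeleton of just that control flow, with the priority-array walk reduced to a callback placeholder:

    /* Double budget in the spirit of move_tasks(); pick_next() stands in for
     * the bitmap/priority-queue walk and pull() for pull_task(). */
    struct candidate { unsigned int load_weight; };

    static unsigned long move_up_to(unsigned long max_nr_move, long max_load_move,
                                    struct candidate *(*pick_next)(void),
                                    void (*pull)(struct candidate *))
    {
            unsigned long pulled = 0;
            long rem_load_move = max_load_move;
            struct candidate *c;

            if (max_nr_move == 0 || max_load_move == 0)
                    return 0;

            while ((c = pick_next()) != NULL) {
                    pull(c);
                    pulled++;
                    rem_load_move -= c->load_weight;

                    /* stop as soon as either budget is exhausted */
                    if (pulled >= max_nr_move || rem_load_move <= 0)
                            break;
            }
            return pulled;
    }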
| @@ -1944,19 +2216,30 @@ out: | |||
| 1944 | 2216 | ||
| 1945 | /* | 2217 | /* |
| 1946 | * find_busiest_group finds and returns the busiest CPU group within the | 2218 | * find_busiest_group finds and returns the busiest CPU group within the |
| 1947 | * domain. It calculates and returns the number of tasks which should be | 2219 | * domain. It calculates and returns the amount of weighted load which |
| 1948 | * moved to restore balance via the imbalance parameter. | 2220 | * should be moved to restore balance via the imbalance parameter. |
| 1949 | */ | 2221 | */ |
| 1950 | static struct sched_group * | 2222 | static struct sched_group * |
| 1951 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2223 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
| 1952 | unsigned long *imbalance, enum idle_type idle, int *sd_idle) | 2224 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, |
| 2225 | cpumask_t *cpus) | ||
| 1953 | { | 2226 | { |
| 1954 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2227 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
| 1955 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2228 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
| 1956 | unsigned long max_pull; | 2229 | unsigned long max_pull; |
| 2230 | unsigned long busiest_load_per_task, busiest_nr_running; | ||
| 2231 | unsigned long this_load_per_task, this_nr_running; | ||
| 1957 | int load_idx; | 2232 | int load_idx; |
| 2233 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 2234 | int power_savings_balance = 1; | ||
| 2235 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | ||
| 2236 | unsigned long min_nr_running = ULONG_MAX; | ||
| 2237 | struct sched_group *group_min = NULL, *group_leader = NULL; | ||
| 2238 | #endif | ||
| 1958 | 2239 | ||
| 1959 | max_load = this_load = total_load = total_pwr = 0; | 2240 | max_load = this_load = total_load = total_pwr = 0; |
| 2241 | busiest_load_per_task = busiest_nr_running = 0; | ||
| 2242 | this_load_per_task = this_nr_running = 0; | ||
| 1960 | if (idle == NOT_IDLE) | 2243 | if (idle == NOT_IDLE) |
| 1961 | load_idx = sd->busy_idx; | 2244 | load_idx = sd->busy_idx; |
| 1962 | else if (idle == NEWLY_IDLE) | 2245 | else if (idle == NEWLY_IDLE) |
| @@ -1965,16 +2248,24 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1965 | load_idx = sd->idle_idx; | 2248 | load_idx = sd->idle_idx; |
| 1966 | 2249 | ||
| 1967 | do { | 2250 | do { |
| 1968 | unsigned long load; | 2251 | unsigned long load, group_capacity; |
| 1969 | int local_group; | 2252 | int local_group; |
| 1970 | int i; | 2253 | int i; |
| 2254 | unsigned long sum_nr_running, sum_weighted_load; | ||
| 1971 | 2255 | ||
| 1972 | local_group = cpu_isset(this_cpu, group->cpumask); | 2256 | local_group = cpu_isset(this_cpu, group->cpumask); |
| 1973 | 2257 | ||
| 1974 | /* Tally up the load of all CPUs in the group */ | 2258 | /* Tally up the load of all CPUs in the group */ |
| 1975 | avg_load = 0; | 2259 | sum_weighted_load = sum_nr_running = avg_load = 0; |
| 1976 | 2260 | ||
| 1977 | for_each_cpu_mask(i, group->cpumask) { | 2261 | for_each_cpu_mask(i, group->cpumask) { |
| 2262 | struct rq *rq; | ||
| 2263 | |||
| 2264 | if (!cpu_isset(i, *cpus)) | ||
| 2265 | continue; | ||
| 2266 | |||
| 2267 | rq = cpu_rq(i); | ||
| 2268 | |||
| 1978 | if (*sd_idle && !idle_cpu(i)) | 2269 | if (*sd_idle && !idle_cpu(i)) |
| 1979 | *sd_idle = 0; | 2270 | *sd_idle = 0; |
| 1980 | 2271 | ||
| @@ -1985,6 +2276,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1985 | load = source_load(i, load_idx); | 2276 | load = source_load(i, load_idx); |
| 1986 | 2277 | ||
| 1987 | avg_load += load; | 2278 | avg_load += load; |
| 2279 | sum_nr_running += rq->nr_running; | ||
| 2280 | sum_weighted_load += rq->raw_weighted_load; | ||
| 1988 | } | 2281 | } |
| 1989 | 2282 | ||
| 1990 | total_load += avg_load; | 2283 | total_load += avg_load; |
| @@ -1993,17 +2286,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1993 | /* Adjust by relative CPU power of the group */ | 2286 | /* Adjust by relative CPU power of the group */ |
| 1994 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2287 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
| 1995 | 2288 | ||
| 2289 | group_capacity = group->cpu_power / SCHED_LOAD_SCALE; | ||
| 2290 | |||
| 1996 | if (local_group) { | 2291 | if (local_group) { |
| 1997 | this_load = avg_load; | 2292 | this_load = avg_load; |
| 1998 | this = group; | 2293 | this = group; |
| 1999 | } else if (avg_load > max_load) { | 2294 | this_nr_running = sum_nr_running; |
| 2295 | this_load_per_task = sum_weighted_load; | ||
| 2296 | } else if (avg_load > max_load && | ||
| 2297 | sum_nr_running > group_capacity) { | ||
| 2000 | max_load = avg_load; | 2298 | max_load = avg_load; |
| 2001 | busiest = group; | 2299 | busiest = group; |
| 2300 | busiest_nr_running = sum_nr_running; | ||
| 2301 | busiest_load_per_task = sum_weighted_load; | ||
| 2302 | } | ||
| 2303 | |||
| 2304 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 2305 | /* | ||
| 2306 | * Busy processors will not participate in power savings | ||
| 2307 | * balance. | ||
| 2308 | */ | ||
| 2309 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
| 2310 | goto group_next; | ||
| 2311 | |||
| 2312 | /* | ||
| 2313 | * If the local group is idle or completely loaded, | ||
| 2314 | * no need to do power savings balance at this domain | ||
| 2315 | */ | ||
| 2316 | if (local_group && (this_nr_running >= group_capacity || | ||
| 2317 | !this_nr_running)) | ||
| 2318 | power_savings_balance = 0; | ||
| 2319 | |||
| 2320 | /* | ||
| 2321 | * If a group is already running at full capacity or idle, | ||
| 2322 | * don't include that group in power savings calculations | ||
| 2323 | */ | ||
| 2324 | if (!power_savings_balance || sum_nr_running >= group_capacity | ||
| 2325 | || !sum_nr_running) | ||
| 2326 | goto group_next; | ||
| 2327 | |||
| 2328 | /* | ||
| 2329 | * Calculate the group which has the least non-idle load. | ||
| 2330 | * This is the group from where we need to pick up the load | ||
| 2331 | * for saving power | ||
| 2332 | */ | ||
| 2333 | if ((sum_nr_running < min_nr_running) || | ||
| 2334 | (sum_nr_running == min_nr_running && | ||
| 2335 | first_cpu(group->cpumask) < | ||
| 2336 | first_cpu(group_min->cpumask))) { | ||
| 2337 | group_min = group; | ||
| 2338 | min_nr_running = sum_nr_running; | ||
| 2339 | min_load_per_task = sum_weighted_load / | ||
| 2340 | sum_nr_running; | ||
| 2341 | } | ||
| 2342 | |||
| 2343 | /* | ||
| 2344 | * Calculate the group which is almost at its | ||
| 2345 | * capacity but still has some space to pick up some load | ||
| 2346 | * from other groups and save more power | ||
| 2347 | */ | ||
| 2348 | if (sum_nr_running <= group_capacity - 1) { | ||
| 2349 | if (sum_nr_running > leader_nr_running || | ||
| 2350 | (sum_nr_running == leader_nr_running && | ||
| 2351 | first_cpu(group->cpumask) > | ||
| 2352 | first_cpu(group_leader->cpumask))) { | ||
| 2353 | group_leader = group; | ||
| 2354 | leader_nr_running = sum_nr_running; | ||
| 2355 | } | ||
| 2002 | } | 2356 | } |
| 2357 | group_next: | ||
| 2358 | #endif | ||
| 2003 | group = group->next; | 2359 | group = group->next; |
| 2004 | } while (group != sd->groups); | 2360 | } while (group != sd->groups); |
| 2005 | 2361 | ||
| 2006 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) | 2362 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) |
| 2007 | goto out_balanced; | 2363 | goto out_balanced; |
| 2008 | 2364 | ||
| 2009 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 2365 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
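Under CONFIG_SCHED_MC or CONFIG_SCHED_SMT the group loop above also tracks two extra groups for the power-savings policy: group_min, the group with the fewest running tasks (the one worth draining), and group_leader, the busiest group that still has at least one free slot (the one meant to absorb that load); the first_cpu() comparisons only break ties so the choice is deterministic. A reduced form of that bookkeeping, leaving out the tie-breakers and the local-group/power_savings_balance guards:

    /* Reduced power-savings bookkeeping: callers seed group_min->nr_running
     * with ULONG_MAX and group_leader->nr_running with 0 before the loop. */
    struct group_stats { unsigned long nr_running, capacity; int first_cpu; };

    static void update_power_savings(const struct group_stats *g,
                                     struct group_stats *group_min,
                                     struct group_stats *group_leader)
    {
            if (g->nr_running == 0 || g->nr_running >= g->capacity)
                    return;                 /* idle or full groups are ignored */

            if (g->nr_running < group_min->nr_running)
                    *group_min = *g;        /* emptiest group: drain this one   */

            if (g->nr_running <= g->capacity - 1 &&
                g->nr_running > group_leader->nr_running)
                    *group_leader = *g;     /* near-full group: give it the load */
    }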
| @@ -2012,6 +2368,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2012 | 100*max_load <= sd->imbalance_pct*this_load) | 2368 | 100*max_load <= sd->imbalance_pct*this_load) |
| 2013 | goto out_balanced; | 2369 | goto out_balanced; |
| 2014 | 2370 | ||
| 2371 | busiest_load_per_task /= busiest_nr_running; | ||
| 2015 | /* | 2372 | /* |
| 2016 | * We're trying to get all the cpus to the average_load, so we don't | 2373 | * We're trying to get all the cpus to the average_load, so we don't |
| 2017 | * want to push ourselves above the average load, nor do we wish to | 2374 | * want to push ourselves above the average load, nor do we wish to |
| @@ -2023,21 +2380,49 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2023 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2380 | * by pulling tasks to us. Be careful of negative numbers as they'll |
| 2024 | * appear as very large values with unsigned longs. | 2381 | * appear as very large values with unsigned longs. |
| 2025 | */ | 2382 | */ |
| 2383 | if (max_load <= busiest_load_per_task) | ||
| 2384 | goto out_balanced; | ||
| 2385 | |||
| 2386 | /* | ||
| 2387 | * In the presence of smp nice balancing, certain scenarios can have | ||
| 2388 | * max load less than avg load(as we skip the groups at or below | ||
| 2389 | * its cpu_power, while calculating max_load..) | ||
| 2390 | */ | ||
| 2391 | if (max_load < avg_load) { | ||
| 2392 | *imbalance = 0; | ||
| 2393 | goto small_imbalance; | ||
| 2394 | } | ||
| 2026 | 2395 | ||
| 2027 | /* Don't want to pull so many tasks that a group would go idle */ | 2396 | /* Don't want to pull so many tasks that a group would go idle */ |
| 2028 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); | 2397 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
| 2029 | 2398 | ||
| 2030 | /* How much load to actually move to equalise the imbalance */ | 2399 | /* How much load to actually move to equalise the imbalance */ |
| 2031 | *imbalance = min(max_pull * busiest->cpu_power, | 2400 | *imbalance = min(max_pull * busiest->cpu_power, |
| 2032 | (avg_load - this_load) * this->cpu_power) | 2401 | (avg_load - this_load) * this->cpu_power) |
| 2033 | / SCHED_LOAD_SCALE; | 2402 | / SCHED_LOAD_SCALE; |
| 2034 | 2403 | ||
| 2035 | if (*imbalance < SCHED_LOAD_SCALE) { | 2404 | /* |
| 2036 | unsigned long pwr_now = 0, pwr_move = 0; | 2405 | * if *imbalance is less than the average load per runnable task |
| 2037 | unsigned long tmp; | 2406 | * there is no gaurantee that any tasks will be moved so we'll have |
| 2407 | * a think about bumping its value to force at least one task to be | ||
| 2408 | * moved | ||
| 2409 | */ | ||
| 2410 | if (*imbalance < busiest_load_per_task) { | ||
| 2411 | unsigned long tmp, pwr_now, pwr_move; | ||
| 2412 | unsigned int imbn; | ||
| 2413 | |||
| 2414 | small_imbalance: | ||
| 2415 | pwr_move = pwr_now = 0; | ||
| 2416 | imbn = 2; | ||
| 2417 | if (this_nr_running) { | ||
| 2418 | this_load_per_task /= this_nr_running; | ||
| 2419 | if (busiest_load_per_task > this_load_per_task) | ||
| 2420 | imbn = 1; | ||
| 2421 | } else | ||
| 2422 | this_load_per_task = SCHED_LOAD_SCALE; | ||
| 2038 | 2423 | ||
| 2039 | if (max_load - this_load >= SCHED_LOAD_SCALE*2) { | 2424 | if (max_load - this_load >= busiest_load_per_task * imbn) { |
| 2040 | *imbalance = 1; | 2425 | *imbalance = busiest_load_per_task; |
| 2041 | return busiest; | 2426 | return busiest; |
| 2042 | } | 2427 | } |
| 2043 | 2428 | ||
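As a concrete, made-up illustration of the small-imbalance path: if the busiest queue averages 3000 units of weighted load per task but the computed *imbalance comes out at 1800, nothing is guaranteed to move because no single task fits under the target; so when the max_load - this_load gap covers at least imbn tasks' worth of load, *imbalance is simply bumped to one task's weight (3000 here) to make exactly one task worth pulling, and otherwise the decision falls through to the pwr_now/pwr_move throughput comparison below.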
| @@ -2047,39 +2432,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2047 | * moving them. | 2432 | * moving them. |
| 2048 | */ | 2433 | */ |
| 2049 | 2434 | ||
| 2050 | pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); | 2435 | pwr_now += busiest->cpu_power * |
| 2051 | pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); | 2436 | min(busiest_load_per_task, max_load); |
| 2437 | pwr_now += this->cpu_power * | ||
| 2438 | min(this_load_per_task, this_load); | ||
| 2052 | pwr_now /= SCHED_LOAD_SCALE; | 2439 | pwr_now /= SCHED_LOAD_SCALE; |
| 2053 | 2440 | ||
| 2054 | /* Amount of load we'd subtract */ | 2441 | /* Amount of load we'd subtract */ |
| 2055 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; | 2442 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; |
| 2056 | if (max_load > tmp) | 2443 | if (max_load > tmp) |
| 2057 | pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, | 2444 | pwr_move += busiest->cpu_power * |
| 2058 | max_load - tmp); | 2445 | min(busiest_load_per_task, max_load - tmp); |
| 2059 | 2446 | ||
| 2060 | /* Amount of load we'd add */ | 2447 | /* Amount of load we'd add */ |
| 2061 | if (max_load*busiest->cpu_power < | 2448 | if (max_load*busiest->cpu_power < |
| 2062 | SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) | 2449 | busiest_load_per_task*SCHED_LOAD_SCALE) |
| 2063 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2450 | tmp = max_load*busiest->cpu_power/this->cpu_power; |
| 2064 | else | 2451 | else |
| 2065 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; | 2452 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; |
| 2066 | pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); | 2453 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); |
| 2067 | pwr_move /= SCHED_LOAD_SCALE; | 2454 | pwr_move /= SCHED_LOAD_SCALE; |
| 2068 | 2455 | ||
| 2069 | /* Move if we gain throughput */ | 2456 | /* Move if we gain throughput */ |
| 2070 | if (pwr_move <= pwr_now) | 2457 | if (pwr_move <= pwr_now) |
| 2071 | goto out_balanced; | 2458 | goto out_balanced; |
| 2072 | 2459 | ||
| 2073 | *imbalance = 1; | 2460 | *imbalance = busiest_load_per_task; |
| 2074 | return busiest; | ||
| 2075 | } | 2461 | } |
| 2076 | 2462 | ||
| 2077 | /* Get rid of the scaling factor, rounding down as we divide */ | ||
| 2078 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | ||
| 2079 | return busiest; | 2463 | return busiest; |
| 2080 | 2464 | ||
| 2081 | out_balanced: | 2465 | out_balanced: |
| 2466 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 2467 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
| 2468 | goto ret; | ||
| 2082 | 2469 | ||
| 2470 | if (this == group_leader && group_leader != group_min) { | ||
| 2471 | *imbalance = min_load_per_task; | ||
| 2472 | return group_min; | ||
| 2473 | } | ||
| 2474 | ret: | ||
| 2475 | #endif | ||
| 2083 | *imbalance = 0; | 2476 | *imbalance = 0; |
| 2084 | return NULL; | 2477 | return NULL; |
| 2085 | } | 2478 | } |
| @@ -2087,19 +2480,27 @@ out_balanced: | |||
| 2087 | /* | 2480 | /* |
| 2088 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2481 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
| 2089 | */ | 2482 | */ |
| 2090 | static runqueue_t *find_busiest_queue(struct sched_group *group, | 2483 | static struct rq * |
| 2091 | enum idle_type idle) | 2484 | find_busiest_queue(struct sched_group *group, enum idle_type idle, |
| 2485 | unsigned long imbalance, cpumask_t *cpus) | ||
| 2092 | { | 2486 | { |
| 2093 | unsigned long load, max_load = 0; | 2487 | struct rq *busiest = NULL, *rq; |
| 2094 | runqueue_t *busiest = NULL; | 2488 | unsigned long max_load = 0; |
| 2095 | int i; | 2489 | int i; |
| 2096 | 2490 | ||
| 2097 | for_each_cpu_mask(i, group->cpumask) { | 2491 | for_each_cpu_mask(i, group->cpumask) { |
| 2098 | load = source_load(i, 0); | ||
| 2099 | 2492 | ||
| 2100 | if (load > max_load) { | 2493 | if (!cpu_isset(i, *cpus)) |
| 2101 | max_load = load; | 2494 | continue; |
| 2102 | busiest = cpu_rq(i); | 2495 | |
| 2496 | rq = cpu_rq(i); | ||
| 2497 | |||
| 2498 | if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) | ||
| 2499 | continue; | ||
| 2500 | |||
| 2501 | if (rq->raw_weighted_load > max_load) { | ||
| 2502 | max_load = rq->raw_weighted_load; | ||
| 2503 | busiest = rq; | ||
| 2103 | } | 2504 | } |
| 2104 | } | 2505 | } |
| 2105 | 2506 | ||
| @@ -2112,34 +2513,41 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, | |||
| 2112 | */ | 2513 | */ |
| 2113 | #define MAX_PINNED_INTERVAL 512 | 2514 | #define MAX_PINNED_INTERVAL 512 |
| 2114 | 2515 | ||
| 2516 | static inline unsigned long minus_1_or_zero(unsigned long n) | ||
| 2517 | { | ||
| 2518 | return n > 0 ? n - 1 : 0; | ||
| 2519 | } | ||
| 2520 | |||
| 2115 | /* | 2521 | /* |
| 2116 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2522 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
| 2117 | * tasks if there is an imbalance. | 2523 | * tasks if there is an imbalance. |
| 2118 | * | 2524 | * |
| 2119 | * Called with this_rq unlocked. | 2525 | * Called with this_rq unlocked. |
| 2120 | */ | 2526 | */ |
| 2121 | static int load_balance(int this_cpu, runqueue_t *this_rq, | 2527 | static int load_balance(int this_cpu, struct rq *this_rq, |
| 2122 | struct sched_domain *sd, enum idle_type idle) | 2528 | struct sched_domain *sd, enum idle_type idle) |
| 2123 | { | 2529 | { |
| 2530 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
| 2124 | struct sched_group *group; | 2531 | struct sched_group *group; |
| 2125 | runqueue_t *busiest; | ||
| 2126 | unsigned long imbalance; | 2532 | unsigned long imbalance; |
| 2127 | int nr_moved, all_pinned = 0; | 2533 | struct rq *busiest; |
| 2128 | int active_balance = 0; | 2534 | cpumask_t cpus = CPU_MASK_ALL; |
| 2129 | int sd_idle = 0; | ||
| 2130 | 2535 | ||
| 2131 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) | 2536 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
| 2537 | !sched_smt_power_savings) | ||
| 2132 | sd_idle = 1; | 2538 | sd_idle = 1; |
| 2133 | 2539 | ||
| 2134 | schedstat_inc(sd, lb_cnt[idle]); | 2540 | schedstat_inc(sd, lb_cnt[idle]); |
| 2135 | 2541 | ||
| 2136 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); | 2542 | redo: |
| 2543 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
| 2544 | &cpus); | ||
| 2137 | if (!group) { | 2545 | if (!group) { |
| 2138 | schedstat_inc(sd, lb_nobusyg[idle]); | 2546 | schedstat_inc(sd, lb_nobusyg[idle]); |
| 2139 | goto out_balanced; | 2547 | goto out_balanced; |
| 2140 | } | 2548 | } |
| 2141 | 2549 | ||
| 2142 | busiest = find_busiest_queue(group, idle); | 2550 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); |
| 2143 | if (!busiest) { | 2551 | if (!busiest) { |
| 2144 | schedstat_inc(sd, lb_nobusyq[idle]); | 2552 | schedstat_inc(sd, lb_nobusyq[idle]); |
| 2145 | goto out_balanced; | 2553 | goto out_balanced; |
| @@ -2159,12 +2567,17 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 2159 | */ | 2567 | */ |
| 2160 | double_rq_lock(this_rq, busiest); | 2568 | double_rq_lock(this_rq, busiest); |
| 2161 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2569 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
| 2162 | imbalance, sd, idle, &all_pinned); | 2570 | minus_1_or_zero(busiest->nr_running), |
| 2571 | imbalance, sd, idle, &all_pinned); | ||
| 2163 | double_rq_unlock(this_rq, busiest); | 2572 | double_rq_unlock(this_rq, busiest); |
| 2164 | 2573 | ||
| 2165 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2574 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 2166 | if (unlikely(all_pinned)) | 2575 | if (unlikely(all_pinned)) { |
| 2576 | cpu_clear(cpu_of(busiest), cpus); | ||
| 2577 | if (!cpus_empty(cpus)) | ||
| 2578 | goto redo; | ||
| 2167 | goto out_balanced; | 2579 | goto out_balanced; |
| 2580 | } | ||
| 2168 | } | 2581 | } |
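The new redo path covers the case where every task on the chosen busiest queue is pinned by affinity: that CPU is cleared from the candidate mask and the whole group/queue search repeats over what remains, until something moves or the mask is empty. The retry shape, with the balancing itself reduced to a callback and the cpumask modelled as plain bits (assuming the CPU index fits within an unsigned long):

    /* Retry skeleton in the spirit of load_balance(): the callback returns the
     * number of tasks moved, or -1 with *stuck_cpu set when everything on the
     * busiest queue was pinned and that CPU should be dropped from the search. */
    static unsigned long balance_with_retries(unsigned long candidate_mask,
                                              long (*try_balance)(unsigned long mask,
                                                                  int *stuck_cpu))
    {
            int stuck_cpu;
            long moved;

            while (candidate_mask) {
                    moved = try_balance(candidate_mask, &stuck_cpu);
                    if (moved >= 0)
                            return moved;           /* balanced or tasks moved  */
                    candidate_mask &= ~(1UL << stuck_cpu);  /* all pinned: drop */
            }
            return 0;
    }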
| 2169 | 2582 | ||
| 2170 | if (!nr_moved) { | 2583 | if (!nr_moved) { |
| @@ -2216,7 +2629,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 2216 | sd->balance_interval *= 2; | 2629 | sd->balance_interval *= 2; |
| 2217 | } | 2630 | } |
| 2218 | 2631 | ||
| 2219 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2632 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 2633 | !sched_smt_power_savings) | ||
| 2220 | return -1; | 2634 | return -1; |
| 2221 | return nr_moved; | 2635 | return nr_moved; |
| 2222 | 2636 | ||
| @@ -2231,7 +2645,8 @@ out_one_pinned: | |||
| 2231 | (sd->balance_interval < sd->max_interval)) | 2645 | (sd->balance_interval < sd->max_interval)) |
| 2232 | sd->balance_interval *= 2; | 2646 | sd->balance_interval *= 2; |
| 2233 | 2647 | ||
| 2234 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2648 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 2649 | !sched_smt_power_savings) | ||
| 2235 | return -1; | 2650 | return -1; |
| 2236 | return 0; | 2651 | return 0; |
| 2237 | } | 2652 | } |
| @@ -2243,26 +2658,30 @@ out_one_pinned: | |||
| 2243 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). | 2658 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). |
| 2244 | * this_rq is locked. | 2659 | * this_rq is locked. |
| 2245 | */ | 2660 | */ |
| 2246 | static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | 2661 | static int |
| 2247 | struct sched_domain *sd) | 2662 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) |
| 2248 | { | 2663 | { |
| 2249 | struct sched_group *group; | 2664 | struct sched_group *group; |
| 2250 | runqueue_t *busiest = NULL; | 2665 | struct rq *busiest = NULL; |
| 2251 | unsigned long imbalance; | 2666 | unsigned long imbalance; |
| 2252 | int nr_moved = 0; | 2667 | int nr_moved = 0; |
| 2253 | int sd_idle = 0; | 2668 | int sd_idle = 0; |
| 2669 | cpumask_t cpus = CPU_MASK_ALL; | ||
| 2254 | 2670 | ||
| 2255 | if (sd->flags & SD_SHARE_CPUPOWER) | 2671 | if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
| 2256 | sd_idle = 1; | 2672 | sd_idle = 1; |
| 2257 | 2673 | ||
| 2258 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2674 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
| 2259 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); | 2675 | redo: |
| 2676 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, | ||
| 2677 | &sd_idle, &cpus); | ||
| 2260 | if (!group) { | 2678 | if (!group) { |
| 2261 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2679 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
| 2262 | goto out_balanced; | 2680 | goto out_balanced; |
| 2263 | } | 2681 | } |
| 2264 | 2682 | ||
| 2265 | busiest = find_busiest_queue(group, NEWLY_IDLE); | 2683 | busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, |
| 2684 | &cpus); | ||
| 2266 | if (!busiest) { | 2685 | if (!busiest) { |
| 2267 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2686 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
| 2268 | goto out_balanced; | 2687 | goto out_balanced; |
| @@ -2277,8 +2696,15 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
| 2277 | /* Attempt to move tasks */ | 2696 | /* Attempt to move tasks */ |
| 2278 | double_lock_balance(this_rq, busiest); | 2697 | double_lock_balance(this_rq, busiest); |
| 2279 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2698 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
| 2699 | minus_1_or_zero(busiest->nr_running), | ||
| 2280 | imbalance, sd, NEWLY_IDLE, NULL); | 2700 | imbalance, sd, NEWLY_IDLE, NULL); |
| 2281 | spin_unlock(&busiest->lock); | 2701 | spin_unlock(&busiest->lock); |
| 2702 | |||
| 2703 | if (!nr_moved) { | ||
| 2704 | cpu_clear(cpu_of(busiest), cpus); | ||
| 2705 | if (!cpus_empty(cpus)) | ||
| 2706 | goto redo; | ||
| 2707 | } | ||
| 2282 | } | 2708 | } |
| 2283 | 2709 | ||
| 2284 | if (!nr_moved) { | 2710 | if (!nr_moved) { |
| @@ -2292,9 +2718,11 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
| 2292 | 2718 | ||
| 2293 | out_balanced: | 2719 | out_balanced: |
| 2294 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2720 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
| 2295 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2721 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 2722 | !sched_smt_power_savings) | ||
| 2296 | return -1; | 2723 | return -1; |
| 2297 | sd->nr_balance_failed = 0; | 2724 | sd->nr_balance_failed = 0; |
| 2725 | |||
| 2298 | return 0; | 2726 | return 0; |
| 2299 | } | 2727 | } |
| 2300 | 2728 | ||
| @@ -2302,16 +2730,15 @@ out_balanced: | |||
| 2302 | * idle_balance is called by schedule() if this_cpu is about to become | 2730 | * idle_balance is called by schedule() if this_cpu is about to become |
| 2303 | * idle. Attempts to pull tasks from other CPUs. | 2731 | * idle. Attempts to pull tasks from other CPUs. |
| 2304 | */ | 2732 | */ |
| 2305 | static void idle_balance(int this_cpu, runqueue_t *this_rq) | 2733 | static void idle_balance(int this_cpu, struct rq *this_rq) |
| 2306 | { | 2734 | { |
| 2307 | struct sched_domain *sd; | 2735 | struct sched_domain *sd; |
| 2308 | 2736 | ||
| 2309 | for_each_domain(this_cpu, sd) { | 2737 | for_each_domain(this_cpu, sd) { |
| 2310 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 2738 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
| 2311 | if (load_balance_newidle(this_cpu, this_rq, sd)) { | 2739 | /* If we've pulled tasks over stop searching: */ |
| 2312 | /* We've pulled tasks over so stop searching */ | 2740 | if (load_balance_newidle(this_cpu, this_rq, sd)) |
| 2313 | break; | 2741 | break; |
| 2314 | } | ||
| 2315 | } | 2742 | } |
| 2316 | } | 2743 | } |
| 2317 | } | 2744 | } |
| @@ -2324,14 +2751,14 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq) | |||
| 2324 | * | 2751 | * |
| 2325 | * Called with busiest_rq locked. | 2752 | * Called with busiest_rq locked. |
| 2326 | */ | 2753 | */ |
| 2327 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | 2754 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) |
| 2328 | { | 2755 | { |
| 2329 | struct sched_domain *sd; | ||
| 2330 | runqueue_t *target_rq; | ||
| 2331 | int target_cpu = busiest_rq->push_cpu; | 2756 | int target_cpu = busiest_rq->push_cpu; |
| 2757 | struct sched_domain *sd; | ||
| 2758 | struct rq *target_rq; | ||
| 2332 | 2759 | ||
| 2760 | /* Is there any task to move? */ | ||
| 2333 | if (busiest_rq->nr_running <= 1) | 2761 | if (busiest_rq->nr_running <= 1) |
| 2334 | /* no task to move */ | ||
| 2335 | return; | 2762 | return; |
| 2336 | 2763 | ||
| 2337 | target_rq = cpu_rq(target_cpu); | 2764 | target_rq = cpu_rq(target_cpu); |
| @@ -2347,21 +2774,22 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | |||
| 2347 | double_lock_balance(busiest_rq, target_rq); | 2774 | double_lock_balance(busiest_rq, target_rq); |
| 2348 | 2775 | ||
| 2349 | /* Search for an sd spanning us and the target CPU. */ | 2776 | /* Search for an sd spanning us and the target CPU. */ |
| 2350 | for_each_domain(target_cpu, sd) | 2777 | for_each_domain(target_cpu, sd) { |
| 2351 | if ((sd->flags & SD_LOAD_BALANCE) && | 2778 | if ((sd->flags & SD_LOAD_BALANCE) && |
| 2352 | cpu_isset(busiest_cpu, sd->span)) | 2779 | cpu_isset(busiest_cpu, sd->span)) |
| 2353 | break; | 2780 | break; |
| 2781 | } | ||
| 2354 | 2782 | ||
| 2355 | if (unlikely(sd == NULL)) | 2783 | if (likely(sd)) { |
| 2356 | goto out; | 2784 | schedstat_inc(sd, alb_cnt); |
| 2357 | |||
| 2358 | schedstat_inc(sd, alb_cnt); | ||
| 2359 | 2785 | ||
| 2360 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | 2786 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, |
| 2361 | schedstat_inc(sd, alb_pushed); | 2787 | RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, |
| 2362 | else | 2788 | NULL)) |
| 2363 | schedstat_inc(sd, alb_failed); | 2789 | schedstat_inc(sd, alb_pushed); |
| 2364 | out: | 2790 | else |
| 2791 | schedstat_inc(sd, alb_failed); | ||
| 2792 | } | ||
| 2365 | spin_unlock(&target_rq->lock); | 2793 | spin_unlock(&target_rq->lock); |
| 2366 | } | 2794 | } |
| 2367 | 2795 | ||
| @@ -2374,23 +2802,27 @@ out: | |||
| 2374 | * Balancing parameters are set up in arch_init_sched_domains. | 2802 | * Balancing parameters are set up in arch_init_sched_domains. |
| 2375 | */ | 2803 | */ |
| 2376 | 2804 | ||
| 2377 | /* Don't have all balancing operations going off at once */ | 2805 | /* Don't have all balancing operations going off at once: */ |
| 2378 | #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) | 2806 | static inline unsigned long cpu_offset(int cpu) |
| 2807 | { | ||
| 2808 | return jiffies + cpu * HZ / NR_CPUS; | ||
| 2809 | } | ||
| 2379 | 2810 | ||
| 2380 | static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | 2811 | static void |
| 2381 | enum idle_type idle) | 2812 | rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) |
| 2382 | { | 2813 | { |
| 2383 | unsigned long old_load, this_load; | 2814 | unsigned long this_load, interval, j = cpu_offset(this_cpu); |
| 2384 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); | ||
| 2385 | struct sched_domain *sd; | 2815 | struct sched_domain *sd; |
| 2386 | int i; | 2816 | int i, scale; |
| 2817 | |||
| 2818 | this_load = this_rq->raw_weighted_load; | ||
| 2819 | |||
| 2820 | /* Update our load: */ | ||
| 2821 | for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { | ||
| 2822 | unsigned long old_load, new_load; | ||
| 2387 | 2823 | ||
| 2388 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | ||
| 2389 | /* Update our load */ | ||
| 2390 | for (i = 0; i < 3; i++) { | ||
| 2391 | unsigned long new_load = this_load; | ||
| 2392 | int scale = 1 << i; | ||
| 2393 | old_load = this_rq->cpu_load[i]; | 2824 | old_load = this_rq->cpu_load[i]; |
| 2825 | new_load = this_load; | ||
| 2394 | /* | 2826 | /* |
| 2395 | * Round up the averaging division if load is increasing. This | 2827 | * Round up the averaging division if load is increasing. This |
| 2396 | * prevents us from getting stuck on 9 if the load is 10, for | 2828 | * prevents us from getting stuck on 9 if the load is 10, for |
| @@ -2402,8 +2834,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
| 2402 | } | 2834 | } |
| 2403 | 2835 | ||
| 2404 | for_each_domain(this_cpu, sd) { | 2836 | for_each_domain(this_cpu, sd) { |
| 2405 | unsigned long interval; | ||
| 2406 | |||
| 2407 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2837 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 2408 | continue; | 2838 | continue; |
| 2409 | 2839 | ||
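The rounding comment above is easiest to see with numbers: averaging an old load of 9 against a new load of 10 over a window of 2 gives (9 + 10) / 2 = 9 with truncating division, so the stored average could never reach 10 however long the load stayed there; rounding the division up whenever the load is increasing lets it converge. A minimal form of that update, where the window (scale) values and the exact weighting are illustrative rather than taken from this hunk:

    /* Decaying load average with "round up when increasing", as described in
     * the comment above; the scale factor is per cpu_load[] index. */
    static unsigned long decay_load(unsigned long old_load, unsigned long new_load,
                                    unsigned long scale)
    {
            unsigned long sum = old_load * (scale - 1) + new_load;

            if (new_load > old_load)
                    sum += scale - 1;     /* round up so a rising load is reached */
            return sum / scale;
    }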
| @@ -2433,17 +2863,18 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
| 2433 | /* | 2863 | /* |
| 2434 | * on UP we do not need to balance between CPUs: | 2864 | * on UP we do not need to balance between CPUs: |
| 2435 | */ | 2865 | */ |
| 2436 | static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) | 2866 | static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) |
| 2437 | { | 2867 | { |
| 2438 | } | 2868 | } |
| 2439 | static inline void idle_balance(int cpu, runqueue_t *rq) | 2869 | static inline void idle_balance(int cpu, struct rq *rq) |
| 2440 | { | 2870 | { |
| 2441 | } | 2871 | } |
| 2442 | #endif | 2872 | #endif |
| 2443 | 2873 | ||
| 2444 | static inline int wake_priority_sleeper(runqueue_t *rq) | 2874 | static inline int wake_priority_sleeper(struct rq *rq) |
| 2445 | { | 2875 | { |
| 2446 | int ret = 0; | 2876 | int ret = 0; |
| 2877 | |||
| 2447 | #ifdef CONFIG_SCHED_SMT | 2878 | #ifdef CONFIG_SCHED_SMT |
| 2448 | spin_lock(&rq->lock); | 2879 | spin_lock(&rq->lock); |
| 2449 | /* | 2880 | /* |
| @@ -2467,25 +2898,26 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
| 2467 | * This is called on clock ticks and on context switches. | 2898 | * This is called on clock ticks and on context switches. |
| 2468 | * Bank in p->sched_time the ns elapsed since the last tick or switch. | 2899 | * Bank in p->sched_time the ns elapsed since the last tick or switch. |
| 2469 | */ | 2900 | */ |
| 2470 | static inline void update_cpu_clock(task_t *p, runqueue_t *rq, | 2901 | static inline void |
| 2471 | unsigned long long now) | 2902 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) |
| 2472 | { | 2903 | { |
| 2473 | unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); | 2904 | p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); |
| 2474 | p->sched_time += now - last; | ||
| 2475 | } | 2905 | } |
| 2476 | 2906 | ||
| 2477 | /* | 2907 | /* |
| 2478 | * Return current->sched_time plus any more ns on the sched_clock | 2908 | * Return current->sched_time plus any more ns on the sched_clock |
| 2479 | * that have not yet been banked. | 2909 | * that have not yet been banked. |
| 2480 | */ | 2910 | */ |
| 2481 | unsigned long long current_sched_time(const task_t *tsk) | 2911 | unsigned long long current_sched_time(const struct task_struct *p) |
| 2482 | { | 2912 | { |
| 2483 | unsigned long long ns; | 2913 | unsigned long long ns; |
| 2484 | unsigned long flags; | 2914 | unsigned long flags; |
| 2915 | |||
| 2485 | local_irq_save(flags); | 2916 | local_irq_save(flags); |
| 2486 | ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); | 2917 | ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); |
| 2487 | ns = tsk->sched_time + (sched_clock() - ns); | 2918 | ns = p->sched_time + sched_clock() - ns; |
| 2488 | local_irq_restore(flags); | 2919 | local_irq_restore(flags); |
| 2920 | |||
| 2489 | return ns; | 2921 | return ns; |
| 2490 | } | 2922 | } |
| 2491 | 2923 | ||
| @@ -2499,11 +2931,16 @@ unsigned long long current_sched_time(const task_t *tsk) | |||
| 2499 | * increasing number of running tasks. We also ignore the interactivity | 2931 | * increasing number of running tasks. We also ignore the interactivity |
| 2500 | * if a better static_prio task has expired: | 2932 | * if a better static_prio task has expired: |
| 2501 | */ | 2933 | */ |
| 2502 | #define EXPIRED_STARVING(rq) \ | 2934 | static inline int expired_starving(struct rq *rq) |
| 2503 | ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ | 2935 | { |
| 2504 | (jiffies - (rq)->expired_timestamp >= \ | 2936 | if (rq->curr->static_prio > rq->best_expired_prio) |
| 2505 | STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ | 2937 | return 1; |
| 2506 | ((rq)->curr->static_prio > (rq)->best_expired_prio)) | 2938 | if (!STARVATION_LIMIT || !rq->expired_timestamp) |
| 2939 | return 0; | ||
| 2940 | if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) | ||
| 2941 | return 1; | ||
| 2942 | return 0; | ||
| 2943 | } | ||
| 2507 | 2944 | ||
| 2508 | /* | 2945 | /* |
| 2509 | * Account user cpu time to a process. | 2946 | * Account user cpu time to a process. |
| @@ -2536,7 +2973,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
| 2536 | cputime_t cputime) | 2973 | cputime_t cputime) |
| 2537 | { | 2974 | { |
| 2538 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2975 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
| 2539 | runqueue_t *rq = this_rq(); | 2976 | struct rq *rq = this_rq(); |
| 2540 | cputime64_t tmp; | 2977 | cputime64_t tmp; |
| 2541 | 2978 | ||
| 2542 | p->stime = cputime_add(p->stime, cputime); | 2979 | p->stime = cputime_add(p->stime, cputime); |
| @@ -2566,7 +3003,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
| 2566 | { | 3003 | { |
| 2567 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3004 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
| 2568 | cputime64_t tmp = cputime_to_cputime64(steal); | 3005 | cputime64_t tmp = cputime_to_cputime64(steal); |
| 2569 | runqueue_t *rq = this_rq(); | 3006 | struct rq *rq = this_rq(); |
| 2570 | 3007 | ||
| 2571 | if (p == rq->idle) { | 3008 | if (p == rq->idle) { |
| 2572 | p->stime = cputime_add(p->stime, steal); | 3009 | p->stime = cputime_add(p->stime, steal); |
| @@ -2587,10 +3024,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
| 2587 | */ | 3024 | */ |
| 2588 | void scheduler_tick(void) | 3025 | void scheduler_tick(void) |
| 2589 | { | 3026 | { |
| 2590 | int cpu = smp_processor_id(); | ||
| 2591 | runqueue_t *rq = this_rq(); | ||
| 2592 | task_t *p = current; | ||
| 2593 | unsigned long long now = sched_clock(); | 3027 | unsigned long long now = sched_clock(); |
| 3028 | struct task_struct *p = current; | ||
| 3029 | int cpu = smp_processor_id(); | ||
| 3030 | struct rq *rq = cpu_rq(cpu); | ||
| 2594 | 3031 | ||
| 2595 | update_cpu_clock(p, rq, now); | 3032 | update_cpu_clock(p, rq, now); |
| 2596 | 3033 | ||
| @@ -2640,7 +3077,7 @@ void scheduler_tick(void) | |||
| 2640 | 3077 | ||
| 2641 | if (!rq->expired_timestamp) | 3078 | if (!rq->expired_timestamp) |
| 2642 | rq->expired_timestamp = jiffies; | 3079 | rq->expired_timestamp = jiffies; |
| 2643 | if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { | 3080 | if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { |
| 2644 | enqueue_task(p, rq->expired); | 3081 | enqueue_task(p, rq->expired); |
| 2645 | if (p->static_prio < rq->best_expired_prio) | 3082 | if (p->static_prio < rq->best_expired_prio) |
| 2646 | rq->best_expired_prio = p->static_prio; | 3083 | rq->best_expired_prio = p->static_prio; |
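The enqueue-on-expired branch above is the core of the O(1) design: a task that exhausts its timeslice normally goes to the expired array (unless it is interactive and nothing is starving), and when the active array eventually drains, the two arrays are swapped wholesale instead of anything being re-sorted. A stripped-down model of that swap, with the per-priority lists elided:

    /* Two-array round-robin skeleton: expired tasks collect in the inactive
     * array, and the arrays swap once the active one drains. */
    struct toy_array { unsigned int nr_active; };

    struct toy_o1_rq {
            struct toy_array *active;
            struct toy_array *expired;
            struct toy_array arrays[2];
    };

    static void toy_switch_arrays(struct toy_o1_rq *rq)
    {
            if (rq->active->nr_active == 0) {
                    struct toy_array *tmp = rq->active;

                    rq->active  = rq->expired;  /* expired tasks get new slices  */
                    rq->expired = tmp;          /* and the drained array refills */
            }
    }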
| @@ -2679,55 +3116,42 @@ out: | |||
| 2679 | } | 3116 | } |
| 2680 | 3117 | ||
| 2681 | #ifdef CONFIG_SCHED_SMT | 3118 | #ifdef CONFIG_SCHED_SMT |
| 2682 | static inline void wakeup_busy_runqueue(runqueue_t *rq) | 3119 | static inline void wakeup_busy_runqueue(struct rq *rq) |
| 2683 | { | 3120 | { |
| 2684 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ | 3121 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ |
| 2685 | if (rq->curr == rq->idle && rq->nr_running) | 3122 | if (rq->curr == rq->idle && rq->nr_running) |
| 2686 | resched_task(rq->idle); | 3123 | resched_task(rq->idle); |
| 2687 | } | 3124 | } |
| 2688 | 3125 | ||
| 2689 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3126 | /* |
| 3127 | * Called with interrupts disabled and this_rq's runqueue locked. | ||
| 3128 | */ | ||
| 3129 | static void wake_sleeping_dependent(int this_cpu) | ||
| 2690 | { | 3130 | { |
| 2691 | struct sched_domain *tmp, *sd = NULL; | 3131 | struct sched_domain *tmp, *sd = NULL; |
| 2692 | cpumask_t sibling_map; | ||
| 2693 | int i; | 3132 | int i; |
| 2694 | 3133 | ||
| 2695 | for_each_domain(this_cpu, tmp) | 3134 | for_each_domain(this_cpu, tmp) { |
| 2696 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3135 | if (tmp->flags & SD_SHARE_CPUPOWER) { |
| 2697 | sd = tmp; | 3136 | sd = tmp; |
| 3137 | break; | ||
| 3138 | } | ||
| 3139 | } | ||
| 2698 | 3140 | ||
| 2699 | if (!sd) | 3141 | if (!sd) |
| 2700 | return; | 3142 | return; |
| 2701 | 3143 | ||
| 2702 | /* | 3144 | for_each_cpu_mask(i, sd->span) { |
| 2703 | * Unlock the current runqueue because we have to lock in | 3145 | struct rq *smt_rq = cpu_rq(i); |
| 2704 | * CPU order to avoid deadlocks. Caller knows that we might | ||
| 2705 | * unlock. We keep IRQs disabled. | ||
| 2706 | */ | ||
| 2707 | spin_unlock(&this_rq->lock); | ||
| 2708 | |||
| 2709 | sibling_map = sd->span; | ||
| 2710 | |||
| 2711 | for_each_cpu_mask(i, sibling_map) | ||
| 2712 | spin_lock(&cpu_rq(i)->lock); | ||
| 2713 | /* | ||
| 2714 | * We clear this CPU from the mask. This both simplifies the | ||
| 2715 | * inner loop and keps this_rq locked when we exit: | ||
| 2716 | */ | ||
| 2717 | cpu_clear(this_cpu, sibling_map); | ||
| 2718 | 3146 | ||
| 2719 | for_each_cpu_mask(i, sibling_map) { | 3147 | if (i == this_cpu) |
| 2720 | runqueue_t *smt_rq = cpu_rq(i); | 3148 | continue; |
| 3149 | if (unlikely(!spin_trylock(&smt_rq->lock))) | ||
| 3150 | continue; | ||
| 2721 | 3151 | ||
| 2722 | wakeup_busy_runqueue(smt_rq); | 3152 | wakeup_busy_runqueue(smt_rq); |
| 3153 | spin_unlock(&smt_rq->lock); | ||
| 2723 | } | 3154 | } |
| 2724 | |||
| 2725 | for_each_cpu_mask(i, sibling_map) | ||
| 2726 | spin_unlock(&cpu_rq(i)->lock); | ||
| 2727 | /* | ||
| 2728 | * We exit with this_cpu's rq still held and IRQs | ||
| 2729 | * still disabled: | ||
| 2730 | */ | ||
| 2731 | } | 3155 | } |
| 2732 | 3156 | ||
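The rewritten wake_sleeping_dependent() no longer drops its own runqueue lock and re-takes every sibling's lock in CPU order; it simply trylocks each sibling and skips any it cannot get, which is safe because waking an idling sibling that has work queued is best-effort anyway. The same pattern in userspace terms, with pthread mutexes standing in for the runqueue spinlocks and a flag standing in for resched_task():

    #include <pthread.h>

    /* Best-effort pass over sibling queues: poke the ones whose lock we can
     * take without blocking, silently skip the rest. */
    struct toy_sibling { pthread_mutex_t lock; int needs_kick; };

    static void kick_siblings(struct toy_sibling *sib, int nr, int self)
    {
            int i;

            for (i = 0; i < nr; i++) {
                    if (i == self)
                            continue;
                    if (pthread_mutex_trylock(&sib[i].lock) != 0)
                            continue;       /* contended: not worth waiting for */
                    if (sib[i].needs_kick)
                            sib[i].needs_kick = 0;  /* resched_task() stand-in  */
                    pthread_mutex_unlock(&sib[i].lock);
            }
    }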
| 2733 | /* | 3157 | /* |
| @@ -2735,57 +3159,53 @@ static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
| 2735 | * utilize, if another task runs on a sibling. This models the | 3159 | * utilize, if another task runs on a sibling. This models the |
| 2736 | * slowdown effect of other tasks running on siblings: | 3160 | * slowdown effect of other tasks running on siblings: |
| 2737 | */ | 3161 | */ |
| 2738 | static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | 3162 | static inline unsigned long |
| 3163 | smt_slice(struct task_struct *p, struct sched_domain *sd) | ||
| 2739 | { | 3164 | { |
| 2740 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 3165 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
| 2741 | } | 3166 | } |
| 2742 | 3167 | ||
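As a worked example of smt_slice(): with sd->per_cpu_gain at 25 (a busy sibling only adds about 25% extra throughput) and a 100 ms timeslice, the slice counts as 100 * (100 - 25) / 100 = 75 ms of effective time in the comparisons below; the 25% figure is only an example, the real value comes from the sched-domain parameters.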
| 2743 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3168 | /* |
| 3169 | * To minimise lock contention and not have to drop this_rq's runlock we only | ||
| 3170 | * trylock the sibling runqueues and bypass those runqueues if we fail to | ||
| 3171 | * acquire their lock. As we only trylock, the normal locking order does not | ||
| 3172 | * need to be obeyed. | ||
| 3173 | */ | ||
| 3174 | static int | ||
| 3175 | dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) | ||
| 2744 | { | 3176 | { |
| 2745 | struct sched_domain *tmp, *sd = NULL; | 3177 | struct sched_domain *tmp, *sd = NULL; |
| 2746 | cpumask_t sibling_map; | ||
| 2747 | prio_array_t *array; | ||
| 2748 | int ret = 0, i; | 3178 | int ret = 0, i; |
| 2749 | task_t *p; | ||
| 2750 | 3179 | ||
| 2751 | for_each_domain(this_cpu, tmp) | 3180 | /* kernel/rt threads do not participate in dependent sleeping */ |
| 2752 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3181 | if (!p->mm || rt_task(p)) |
| 3182 | return 0; | ||
| 3183 | |||
| 3184 | for_each_domain(this_cpu, tmp) { | ||
| 3185 | if (tmp->flags & SD_SHARE_CPUPOWER) { | ||
| 2753 | sd = tmp; | 3186 | sd = tmp; |
| 3187 | break; | ||
| 3188 | } | ||
| 3189 | } | ||
| 2754 | 3190 | ||
| 2755 | if (!sd) | 3191 | if (!sd) |
| 2756 | return 0; | 3192 | return 0; |
| 2757 | 3193 | ||
| 2758 | /* | 3194 | for_each_cpu_mask(i, sd->span) { |
| 2759 | * The same locking rules and details apply as for | 3195 | struct task_struct *smt_curr; |
| 2760 | * wake_sleeping_dependent(): | 3196 | struct rq *smt_rq; |
| 2761 | */ | ||
| 2762 | spin_unlock(&this_rq->lock); | ||
| 2763 | sibling_map = sd->span; | ||
| 2764 | for_each_cpu_mask(i, sibling_map) | ||
| 2765 | spin_lock(&cpu_rq(i)->lock); | ||
| 2766 | cpu_clear(this_cpu, sibling_map); | ||
| 2767 | 3197 | ||
| 2768 | /* | 3198 | if (i == this_cpu) |
| 2769 | * Establish next task to be run - it might have gone away because | 3199 | continue; |
| 2770 | * we released the runqueue lock above: | ||
| 2771 | */ | ||
| 2772 | if (!this_rq->nr_running) | ||
| 2773 | goto out_unlock; | ||
| 2774 | array = this_rq->active; | ||
| 2775 | if (!array->nr_active) | ||
| 2776 | array = this_rq->expired; | ||
| 2777 | BUG_ON(!array->nr_active); | ||
| 2778 | 3200 | ||
| 2779 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | 3201 | smt_rq = cpu_rq(i); |
| 2780 | task_t, run_list); | 3202 | if (unlikely(!spin_trylock(&smt_rq->lock))) |
| 3203 | continue; | ||
| 2781 | 3204 | ||
| 2782 | for_each_cpu_mask(i, sibling_map) { | 3205 | smt_curr = smt_rq->curr; |
| 2783 | runqueue_t *smt_rq = cpu_rq(i); | ||
| 2784 | task_t *smt_curr = smt_rq->curr; | ||
| 2785 | 3206 | ||
| 2786 | /* Kernel threads do not participate in dependent sleeping */ | 3207 | if (!smt_curr->mm) |
| 2787 | if (!p->mm || !smt_curr->mm || rt_task(p)) | 3208 | goto unlock; |
| 2788 | goto check_smt_task; | ||
| 2789 | 3209 | ||
| 2790 | /* | 3210 | /* |
| 2791 | * If a user task with lower static priority than the | 3211 | * If a user task with lower static priority than the |
| @@ -2803,49 +3223,23 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
| 2803 | if ((jiffies % DEF_TIMESLICE) > | 3223 | if ((jiffies % DEF_TIMESLICE) > |
| 2804 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 3224 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
| 2805 | ret = 1; | 3225 | ret = 1; |
| 2806 | } else | 3226 | } else { |
| 2807 | if (smt_curr->static_prio < p->static_prio && | 3227 | if (smt_curr->static_prio < p->static_prio && |
| 2808 | !TASK_PREEMPTS_CURR(p, smt_rq) && | 3228 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
| 2809 | smt_slice(smt_curr, sd) > task_timeslice(p)) | 3229 | smt_slice(smt_curr, sd) > task_timeslice(p)) |
| 2810 | ret = 1; | 3230 | ret = 1; |
| 2811 | |||
| 2812 | check_smt_task: | ||
| 2813 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
| 2814 | rt_task(smt_curr)) | ||
| 2815 | continue; | ||
| 2816 | if (!p->mm) { | ||
| 2817 | wakeup_busy_runqueue(smt_rq); | ||
| 2818 | continue; | ||
| 2819 | } | ||
| 2820 | |||
| 2821 | /* | ||
| 2822 | * Reschedule a lower priority task on the SMT sibling for | ||
| 2823 | * it to be put to sleep, or wake it up if it has been put to | ||
| 2824 | * sleep for priority reasons to see if it should run now. | ||
| 2825 | */ | ||
| 2826 | if (rt_task(p)) { | ||
| 2827 | if ((jiffies % DEF_TIMESLICE) > | ||
| 2828 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
| 2829 | resched_task(smt_curr); | ||
| 2830 | } else { | ||
| 2831 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
| 2832 | smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
| 2833 | resched_task(smt_curr); | ||
| 2834 | else | ||
| 2835 | wakeup_busy_runqueue(smt_rq); | ||
| 2836 | } | 3231 | } |
| 3232 | unlock: | ||
| 3233 | spin_unlock(&smt_rq->lock); | ||
| 2837 | } | 3234 | } |
| 2838 | out_unlock: | ||
| 2839 | for_each_cpu_mask(i, sibling_map) | ||
| 2840 | spin_unlock(&cpu_rq(i)->lock); | ||
| 2841 | return ret; | 3235 | return ret; |
| 2842 | } | 3236 | } |
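Both wake_sleeping_dependent() and dependent_sleeper() now spin_trylock() each sibling runqueue and simply bypass a sibling whose lock is contended, which is why the old unlock/re-lock-in-CPU-order sequence could be deleted. A minimal userspace sketch of the same trylock-and-bypass pattern, using POSIX mutexes in place of runqueue spinlocks (the sibling array is purely illustrative):

#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 4

static pthread_mutex_t sibling_lock[NR_SIBLINGS];

/* Visit every sibling "runqueue", but never block on a contended lock:
 * a busy sibling is simply skipped, so no lock ordering is needed. */
static void poke_siblings(int this_cpu)
{
        int i;

        for (i = 0; i < NR_SIBLINGS; i++) {
                if (i == this_cpu)
                        continue;
                if (pthread_mutex_trylock(&sibling_lock[i]) != 0)
                        continue;       /* contended: bypass, don't wait */

                /* ... inspect/act on the sibling while holding its lock ... */
                printf("visited sibling %d\n", i);

                pthread_mutex_unlock(&sibling_lock[i]);
        }
}

int main(void)
{
        int i;

        for (i = 0; i < NR_SIBLINGS; i++)
                pthread_mutex_init(&sibling_lock[i], NULL);
        poke_siblings(0);
        return 0;
}

Because nothing ever blocks on a sibling lock, no global lock ordering is required and the deadlock the removed code worked around cannot arise.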
| 2843 | #else | 3237 | #else |
| 2844 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3238 | static inline void wake_sleeping_dependent(int this_cpu) |
| 2845 | { | 3239 | { |
| 2846 | } | 3240 | } |
| 2847 | 3241 | static inline int | |
| 2848 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3242 | dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) |
| 2849 | { | 3243 | { |
| 2850 | return 0; | 3244 | return 0; |
| 2851 | } | 3245 | } |
| @@ -2858,12 +3252,13 @@ void fastcall add_preempt_count(int val) | |||
| 2858 | /* | 3252 | /* |
| 2859 | * Underflow? | 3253 | * Underflow? |
| 2860 | */ | 3254 | */ |
| 2861 | BUG_ON((preempt_count() < 0)); | 3255 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
| 3256 | return; | ||
| 2862 | preempt_count() += val; | 3257 | preempt_count() += val; |
| 2863 | /* | 3258 | /* |
| 2864 | * Spinlock count overflowing soon? | 3259 | * Spinlock count overflowing soon? |
| 2865 | */ | 3260 | */ |
| 2866 | BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); | 3261 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); |
| 2867 | } | 3262 | } |
| 2868 | EXPORT_SYMBOL(add_preempt_count); | 3263 | EXPORT_SYMBOL(add_preempt_count); |
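The BUG_ON()s become DEBUG_LOCKS_WARN_ON()s, so a corrupted preempt count now warns and bails out instead of killing the box. The overflow test fires once the preempt-disable depth climbs within 10 of what the preempt field can hold; a small sketch of that arithmetic, assuming the usual 8-bit field (PREEMPT_MASK == 0xff is an assumption here — the real constant lives in hardirq.h, outside this diff):

#include <stdio.h>

#define PREEMPT_MASK 0xffUL     /* assumed 8-bit preempt-disable depth */

/* Mirror of the "spinlock count overflowing soon?" test above. */
static int near_overflow(unsigned long preempt_count)
{
        return (preempt_count & PREEMPT_MASK) >= PREEMPT_MASK - 10;
}

int main(void)
{
        printf("depth 100: %s\n", near_overflow(100) ? "warn" : "ok");
        printf("depth 246: %s\n", near_overflow(246) ? "warn" : "ok");
        return 0;
}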
| 2869 | 3264 | ||
| @@ -2872,11 +3267,15 @@ void fastcall sub_preempt_count(int val) | |||
| 2872 | /* | 3267 | /* |
| 2873 | * Underflow? | 3268 | * Underflow? |
| 2874 | */ | 3269 | */ |
| 2875 | BUG_ON(val > preempt_count()); | 3270 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) |
| 3271 | return; | ||
| 2876 | /* | 3272 | /* |
| 2877 | * Is the spinlock portion underflowing? | 3273 | * Is the spinlock portion underflowing? |
| 2878 | */ | 3274 | */ |
| 2879 | BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); | 3275 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
| 3276 | !(preempt_count() & PREEMPT_MASK))) | ||
| 3277 | return; | ||
| 3278 | |||
| 2880 | preempt_count() -= val; | 3279 | preempt_count() -= val; |
| 2881 | } | 3280 | } |
| 2882 | EXPORT_SYMBOL(sub_preempt_count); | 3281 | EXPORT_SYMBOL(sub_preempt_count); |
| @@ -2894,14 +3293,14 @@ static inline int interactive_sleep(enum sleep_type sleep_type) | |||
| 2894 | */ | 3293 | */ |
| 2895 | asmlinkage void __sched schedule(void) | 3294 | asmlinkage void __sched schedule(void) |
| 2896 | { | 3295 | { |
| 2897 | long *switch_count; | 3296 | struct task_struct *prev, *next; |
| 2898 | task_t *prev, *next; | 3297 | struct prio_array *array; |
| 2899 | runqueue_t *rq; | ||
| 2900 | prio_array_t *array; | ||
| 2901 | struct list_head *queue; | 3298 | struct list_head *queue; |
| 2902 | unsigned long long now; | 3299 | unsigned long long now; |
| 2903 | unsigned long run_time; | 3300 | unsigned long run_time; |
| 2904 | int cpu, idx, new_prio; | 3301 | int cpu, idx, new_prio; |
| 3302 | long *switch_count; | ||
| 3303 | struct rq *rq; | ||
| 2905 | 3304 | ||
| 2906 | /* | 3305 | /* |
| 2907 | * Test if we are atomic. Since do_exit() needs to call into | 3306 | * Test if we are atomic. Since do_exit() needs to call into |
| @@ -2949,9 +3348,6 @@ need_resched_nonpreemptible: | |||
| 2949 | 3348 | ||
| 2950 | spin_lock_irq(&rq->lock); | 3349 | spin_lock_irq(&rq->lock); |
| 2951 | 3350 | ||
| 2952 | if (unlikely(prev->flags & PF_DEAD)) | ||
| 2953 | prev->state = EXIT_DEAD; | ||
| 2954 | |||
| 2955 | switch_count = &prev->nivcsw; | 3351 | switch_count = &prev->nivcsw; |
| 2956 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3352 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| 2957 | switch_count = &prev->nvcsw; | 3353 | switch_count = &prev->nvcsw; |
| @@ -2967,32 +3363,13 @@ need_resched_nonpreemptible: | |||
| 2967 | 3363 | ||
| 2968 | cpu = smp_processor_id(); | 3364 | cpu = smp_processor_id(); |
| 2969 | if (unlikely(!rq->nr_running)) { | 3365 | if (unlikely(!rq->nr_running)) { |
| 2970 | go_idle: | ||
| 2971 | idle_balance(cpu, rq); | 3366 | idle_balance(cpu, rq); |
| 2972 | if (!rq->nr_running) { | 3367 | if (!rq->nr_running) { |
| 2973 | next = rq->idle; | 3368 | next = rq->idle; |
| 2974 | rq->expired_timestamp = 0; | 3369 | rq->expired_timestamp = 0; |
| 2975 | wake_sleeping_dependent(cpu, rq); | 3370 | wake_sleeping_dependent(cpu); |
| 2976 | /* | ||
| 2977 | * wake_sleeping_dependent() might have released | ||
| 2978 | * the runqueue, so break out if we got new | ||
| 2979 | * tasks meanwhile: | ||
| 2980 | */ | ||
| 2981 | if (!rq->nr_running) | ||
| 2982 | goto switch_tasks; | ||
| 2983 | } | ||
| 2984 | } else { | ||
| 2985 | if (dependent_sleeper(cpu, rq)) { | ||
| 2986 | next = rq->idle; | ||
| 2987 | goto switch_tasks; | 3371 | goto switch_tasks; |
| 2988 | } | 3372 | } |
| 2989 | /* | ||
| 2990 | * dependent_sleeper() releases and reacquires the runqueue | ||
| 2991 | * lock, hence go into the idle loop if the rq went | ||
| 2992 | * empty meanwhile: | ||
| 2993 | */ | ||
| 2994 | if (unlikely(!rq->nr_running)) | ||
| 2995 | goto go_idle; | ||
| 2996 | } | 3373 | } |
| 2997 | 3374 | ||
| 2998 | array = rq->active; | 3375 | array = rq->active; |
| @@ -3010,7 +3387,7 @@ go_idle: | |||
| 3010 | 3387 | ||
| 3011 | idx = sched_find_first_bit(array->bitmap); | 3388 | idx = sched_find_first_bit(array->bitmap); |
| 3012 | queue = array->queue + idx; | 3389 | queue = array->queue + idx; |
| 3013 | next = list_entry(queue->next, task_t, run_list); | 3390 | next = list_entry(queue->next, struct task_struct, run_list); |
| 3014 | 3391 | ||
| 3015 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { | 3392 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { |
| 3016 | unsigned long long delta = now - next->timestamp; | 3393 | unsigned long long delta = now - next->timestamp; |
| @@ -3030,6 +3407,8 @@ go_idle: | |||
| 3030 | } | 3407 | } |
| 3031 | } | 3408 | } |
| 3032 | next->sleep_type = SLEEP_NORMAL; | 3409 | next->sleep_type = SLEEP_NORMAL; |
| 3410 | if (dependent_sleeper(cpu, rq, next)) | ||
| 3411 | next = rq->idle; | ||
| 3033 | switch_tasks: | 3412 | switch_tasks: |
| 3034 | if (next == rq->idle) | 3413 | if (next == rq->idle) |
| 3035 | schedstat_inc(rq, sched_goidle); | 3414 | schedstat_inc(rq, sched_goidle); |
| @@ -3071,12 +3450,11 @@ switch_tasks: | |||
| 3071 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3450 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
| 3072 | goto need_resched; | 3451 | goto need_resched; |
| 3073 | } | 3452 | } |
| 3074 | |||
| 3075 | EXPORT_SYMBOL(schedule); | 3453 | EXPORT_SYMBOL(schedule); |
| 3076 | 3454 | ||
| 3077 | #ifdef CONFIG_PREEMPT | 3455 | #ifdef CONFIG_PREEMPT |
| 3078 | /* | 3456 | /* |
| 3079 | * this is is the entry point to schedule() from in-kernel preemption | 3457 | * this is the entry point to schedule() from in-kernel preemption |
| 3080 | * off of preempt_enable. Kernel preemptions off return from interrupt | 3458 | * off of preempt_enable. Kernel preemptions off return from interrupt |
| 3081 | * occur there and call schedule directly. | 3459 | * occur there and call schedule directly. |
| 3082 | */ | 3460 | */ |
| @@ -3116,11 +3494,10 @@ need_resched: | |||
| 3116 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3494 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
| 3117 | goto need_resched; | 3495 | goto need_resched; |
| 3118 | } | 3496 | } |
| 3119 | |||
| 3120 | EXPORT_SYMBOL(preempt_schedule); | 3497 | EXPORT_SYMBOL(preempt_schedule); |
| 3121 | 3498 | ||
| 3122 | /* | 3499 | /* |
| 3123 | * this is is the entry point to schedule() from kernel preemption | 3500 | * this is the entry point to schedule() from kernel preemption |
| 3124 | * off of irq context. | 3501 | * off of irq context. |
| 3125 | * Note, that this is called and return with irqs disabled. This will | 3502 | * Note, that this is called and return with irqs disabled. This will |
| 3126 | * protect us against recursive calling from irq. | 3503 | * protect us against recursive calling from irq. |
| @@ -3132,7 +3509,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
| 3132 | struct task_struct *task = current; | 3509 | struct task_struct *task = current; |
| 3133 | int saved_lock_depth; | 3510 | int saved_lock_depth; |
| 3134 | #endif | 3511 | #endif |
| 3135 | /* Catch callers which need to be fixed*/ | 3512 | /* Catch callers which need to be fixed */ |
| 3136 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3513 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
| 3137 | 3514 | ||
| 3138 | need_resched: | 3515 | need_resched: |
| @@ -3165,10 +3542,8 @@ need_resched: | |||
| 3165 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 3542 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, |
| 3166 | void *key) | 3543 | void *key) |
| 3167 | { | 3544 | { |
| 3168 | task_t *p = curr->private; | 3545 | return try_to_wake_up(curr->private, mode, sync); |
| 3169 | return try_to_wake_up(p, mode, sync); | ||
| 3170 | } | 3546 | } |
| 3171 | |||
| 3172 | EXPORT_SYMBOL(default_wake_function); | 3547 | EXPORT_SYMBOL(default_wake_function); |
| 3173 | 3548 | ||
| 3174 | /* | 3549 | /* |
| @@ -3186,13 +3561,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | |||
| 3186 | struct list_head *tmp, *next; | 3561 | struct list_head *tmp, *next; |
| 3187 | 3562 | ||
| 3188 | list_for_each_safe(tmp, next, &q->task_list) { | 3563 | list_for_each_safe(tmp, next, &q->task_list) { |
| 3189 | wait_queue_t *curr; | 3564 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); |
| 3190 | unsigned flags; | 3565 | unsigned flags = curr->flags; |
| 3191 | curr = list_entry(tmp, wait_queue_t, task_list); | 3566 | |
| 3192 | flags = curr->flags; | ||
| 3193 | if (curr->func(curr, mode, sync, key) && | 3567 | if (curr->func(curr, mode, sync, key) && |
| 3194 | (flags & WQ_FLAG_EXCLUSIVE) && | 3568 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
| 3195 | !--nr_exclusive) | ||
| 3196 | break; | 3569 | break; |
| 3197 | } | 3570 | } |
| 3198 | } | 3571 | } |
| @@ -3213,7 +3586,6 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | |||
| 3213 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 3586 | __wake_up_common(q, mode, nr_exclusive, 0, key); |
| 3214 | spin_unlock_irqrestore(&q->lock, flags); | 3587 | spin_unlock_irqrestore(&q->lock, flags); |
| 3215 | } | 3588 | } |
| 3216 | |||
| 3217 | EXPORT_SYMBOL(__wake_up); | 3589 | EXPORT_SYMBOL(__wake_up); |
| 3218 | 3590 | ||
| 3219 | /* | 3591 | /* |
| @@ -3282,6 +3654,7 @@ EXPORT_SYMBOL(complete_all); | |||
| 3282 | void fastcall __sched wait_for_completion(struct completion *x) | 3654 | void fastcall __sched wait_for_completion(struct completion *x) |
| 3283 | { | 3655 | { |
| 3284 | might_sleep(); | 3656 | might_sleep(); |
| 3657 | |||
| 3285 | spin_lock_irq(&x->wait.lock); | 3658 | spin_lock_irq(&x->wait.lock); |
| 3286 | if (!x->done) { | 3659 | if (!x->done) { |
| 3287 | DECLARE_WAITQUEUE(wait, current); | 3660 | DECLARE_WAITQUEUE(wait, current); |
| @@ -3426,7 +3799,6 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) | |||
| 3426 | schedule(); | 3799 | schedule(); |
| 3427 | SLEEP_ON_TAIL | 3800 | SLEEP_ON_TAIL |
| 3428 | } | 3801 | } |
| 3429 | |||
| 3430 | EXPORT_SYMBOL(interruptible_sleep_on); | 3802 | EXPORT_SYMBOL(interruptible_sleep_on); |
| 3431 | 3803 | ||
| 3432 | long fastcall __sched | 3804 | long fastcall __sched |
| @@ -3442,7 +3814,6 @@ interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
| 3442 | 3814 | ||
| 3443 | return timeout; | 3815 | return timeout; |
| 3444 | } | 3816 | } |
| 3445 | |||
| 3446 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3817 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
| 3447 | 3818 | ||
| 3448 | void fastcall __sched sleep_on(wait_queue_head_t *q) | 3819 | void fastcall __sched sleep_on(wait_queue_head_t *q) |
| @@ -3455,7 +3826,6 @@ void fastcall __sched sleep_on(wait_queue_head_t *q) | |||
| 3455 | schedule(); | 3826 | schedule(); |
| 3456 | SLEEP_ON_TAIL | 3827 | SLEEP_ON_TAIL |
| 3457 | } | 3828 | } |
| 3458 | |||
| 3459 | EXPORT_SYMBOL(sleep_on); | 3829 | EXPORT_SYMBOL(sleep_on); |
| 3460 | 3830 | ||
| 3461 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3831 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
| @@ -3473,12 +3843,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
| 3473 | 3843 | ||
| 3474 | EXPORT_SYMBOL(sleep_on_timeout); | 3844 | EXPORT_SYMBOL(sleep_on_timeout); |
| 3475 | 3845 | ||
| 3476 | void set_user_nice(task_t *p, long nice) | 3846 | #ifdef CONFIG_RT_MUTEXES |
| 3847 | |||
| 3848 | /* | ||
| 3849 | * rt_mutex_setprio - set the current priority of a task | ||
| 3850 | * @p: task | ||
| 3851 | * @prio: prio value (kernel-internal form) | ||
| 3852 | * | ||
| 3853 | * This function changes the 'effective' priority of a task. It does | ||
| 3854 | * not touch ->normal_prio like __setscheduler(). | ||
| 3855 | * | ||
| 3856 | * Used by the rt_mutex code to implement priority inheritance logic. | ||
| 3857 | */ | ||
| 3858 | void rt_mutex_setprio(struct task_struct *p, int prio) | ||
| 3859 | { | ||
| 3860 | struct prio_array *array; | ||
| 3861 | unsigned long flags; | ||
| 3862 | struct rq *rq; | ||
| 3863 | int oldprio; | ||
| 3864 | |||
| 3865 | BUG_ON(prio < 0 || prio > MAX_PRIO); | ||
| 3866 | |||
| 3867 | rq = task_rq_lock(p, &flags); | ||
| 3868 | |||
| 3869 | oldprio = p->prio; | ||
| 3870 | array = p->array; | ||
| 3871 | if (array) | ||
| 3872 | dequeue_task(p, array); | ||
| 3873 | p->prio = prio; | ||
| 3874 | |||
| 3875 | if (array) { | ||
| 3876 | /* | ||
| 3877 | * If changing to an RT priority then queue it | ||
| 3878 | * in the active array! | ||
| 3879 | */ | ||
| 3880 | if (rt_task(p)) | ||
| 3881 | array = rq->active; | ||
| 3882 | enqueue_task(p, array); | ||
| 3883 | /* | ||
| 3884 | * Reschedule if we are currently running on this runqueue and | ||
| 3885 | * our priority decreased, or if we are not currently running on | ||
| 3886 | * this runqueue and our priority is higher than the current's | ||
| 3887 | */ | ||
| 3888 | if (task_running(rq, p)) { | ||
| 3889 | if (p->prio > oldprio) | ||
| 3890 | resched_task(rq->curr); | ||
| 3891 | } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
| 3892 | resched_task(rq->curr); | ||
| 3893 | } | ||
| 3894 | task_rq_unlock(rq, &flags); | ||
| 3895 | } | ||
| 3896 | |||
| 3897 | #endif | ||
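rt_mutex_setprio() is the hook the rt_mutex code uses to apply priority inheritance: requeue the task under its boosted 'effective' priority without touching its normal priority. The nearest userspace analogue is a PTHREAD_PRIO_INHERIT mutex; a minimal sketch using the standard POSIX calls (this exercises the same PI idea from the outside, it is not the kernel path above):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

int main(void)
{
        pthread_mutexattr_t attr;
        pthread_mutex_t lock;

        pthread_mutexattr_init(&attr);
        /* Ask for priority inheritance: a low-priority owner is boosted
         * to the priority of the highest-priority blocked waiter. */
        if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT) != 0)
                fprintf(stderr, "PI mutexes not supported here\n");
        pthread_mutex_init(&lock, &attr);

        pthread_mutex_lock(&lock);
        /* ... critical section: higher-priority waiters boost the owner ... */
        pthread_mutex_unlock(&lock);

        pthread_mutex_destroy(&lock);
        pthread_mutexattr_destroy(&attr);
        return 0;
}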
| 3898 | |||
| 3899 | void set_user_nice(struct task_struct *p, long nice) | ||
| 3477 | { | 3900 | { |
| 3901 | struct prio_array *array; | ||
| 3902 | int old_prio, delta; | ||
| 3478 | unsigned long flags; | 3903 | unsigned long flags; |
| 3479 | prio_array_t *array; | 3904 | struct rq *rq; |
| 3480 | runqueue_t *rq; | ||
| 3481 | int old_prio, new_prio, delta; | ||
| 3482 | 3905 | ||
| 3483 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3906 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
| 3484 | return; | 3907 | return; |
| @@ -3493,22 +3916,25 @@ void set_user_nice(task_t *p, long nice) | |||
| 3493 | * it won't have any effect on scheduling until the task is | 3916 | * it won't have any effect on scheduling until the task is |
| 3494 | * not SCHED_NORMAL/SCHED_BATCH: | 3917 | * not SCHED_NORMAL/SCHED_BATCH: |
| 3495 | */ | 3918 | */ |
| 3496 | if (rt_task(p)) { | 3919 | if (has_rt_policy(p)) { |
| 3497 | p->static_prio = NICE_TO_PRIO(nice); | 3920 | p->static_prio = NICE_TO_PRIO(nice); |
| 3498 | goto out_unlock; | 3921 | goto out_unlock; |
| 3499 | } | 3922 | } |
| 3500 | array = p->array; | 3923 | array = p->array; |
| 3501 | if (array) | 3924 | if (array) { |
| 3502 | dequeue_task(p, array); | 3925 | dequeue_task(p, array); |
| 3926 | dec_raw_weighted_load(rq, p); | ||
| 3927 | } | ||
| 3503 | 3928 | ||
| 3504 | old_prio = p->prio; | ||
| 3505 | new_prio = NICE_TO_PRIO(nice); | ||
| 3506 | delta = new_prio - old_prio; | ||
| 3507 | p->static_prio = NICE_TO_PRIO(nice); | 3929 | p->static_prio = NICE_TO_PRIO(nice); |
| 3508 | p->prio += delta; | 3930 | set_load_weight(p); |
| 3931 | old_prio = p->prio; | ||
| 3932 | p->prio = effective_prio(p); | ||
| 3933 | delta = p->prio - old_prio; | ||
| 3509 | 3934 | ||
| 3510 | if (array) { | 3935 | if (array) { |
| 3511 | enqueue_task(p, array); | 3936 | enqueue_task(p, array); |
| 3937 | inc_raw_weighted_load(rq, p); | ||
| 3512 | /* | 3938 | /* |
| 3513 | * If the task increased its priority or is running and | 3939 | * If the task increased its priority or is running and |
| 3514 | * lowered its priority, then reschedule its CPU: | 3940 | * lowered its priority, then reschedule its CPU: |
| @@ -3519,7 +3945,6 @@ void set_user_nice(task_t *p, long nice) | |||
| 3519 | out_unlock: | 3945 | out_unlock: |
| 3520 | task_rq_unlock(rq, &flags); | 3946 | task_rq_unlock(rq, &flags); |
| 3521 | } | 3947 | } |
| 3522 | |||
| 3523 | EXPORT_SYMBOL(set_user_nice); | 3948 | EXPORT_SYMBOL(set_user_nice); |
| 3524 | 3949 | ||
| 3525 | /* | 3950 | /* |
| @@ -3527,10 +3952,11 @@ EXPORT_SYMBOL(set_user_nice); | |||
| 3527 | * @p: task | 3952 | * @p: task |
| 3528 | * @nice: nice value | 3953 | * @nice: nice value |
| 3529 | */ | 3954 | */ |
| 3530 | int can_nice(const task_t *p, const int nice) | 3955 | int can_nice(const struct task_struct *p, const int nice) |
| 3531 | { | 3956 | { |
| 3532 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3957 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
| 3533 | int nice_rlim = 20 - nice; | 3958 | int nice_rlim = 20 - nice; |
| 3959 | |||
| 3534 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 3960 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || |
| 3535 | capable(CAP_SYS_NICE)); | 3961 | capable(CAP_SYS_NICE)); |
| 3536 | } | 3962 | } |
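can_nice() maps the requested nice level onto the RLIMIT_NICE scale before comparing it with the soft limit. A standalone sketch of the same conversion and comparison, using the standard getrlimit() API (the CAP_SYS_NICE escape hatch is left out of the sketch):

#include <stdio.h>
#include <sys/resource.h>

/* Same conversion as can_nice(): nice 19..-20 maps to 1..40. */
static int nice_rlim(int nice)
{
        return 20 - nice;
}

int main(void)
{
        struct rlimit rlim;
        int nice;

        if (getrlimit(RLIMIT_NICE, &rlim) != 0)
                return 1;

        for (nice = -20; nice <= 19; nice += 13)
                printf("nice %3d -> rlimit value %2d (%s without CAP_SYS_NICE)\n",
                       nice, nice_rlim(nice),
                       (rlim_t)nice_rlim(nice) <= rlim.rlim_cur ?
                       "allowed" : "denied");
        return 0;
}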
| @@ -3546,8 +3972,7 @@ int can_nice(const task_t *p, const int nice) | |||
| 3546 | */ | 3972 | */ |
| 3547 | asmlinkage long sys_nice(int increment) | 3973 | asmlinkage long sys_nice(int increment) |
| 3548 | { | 3974 | { |
| 3549 | int retval; | 3975 | long nice, retval; |
| 3550 | long nice; | ||
| 3551 | 3976 | ||
| 3552 | /* | 3977 | /* |
| 3553 | * Setpriority might change our priority at the same moment. | 3978 | * Setpriority might change our priority at the same moment. |
| @@ -3586,7 +4011,7 @@ asmlinkage long sys_nice(int increment) | |||
| 3586 | * RT tasks are offset by -200. Normal tasks are centered | 4011 | * RT tasks are offset by -200. Normal tasks are centered |
| 3587 | * around 0, value goes from -16 to +15. | 4012 | * around 0, value goes from -16 to +15. |
| 3588 | */ | 4013 | */ |
| 3589 | int task_prio(const task_t *p) | 4014 | int task_prio(const struct task_struct *p) |
| 3590 | { | 4015 | { |
| 3591 | return p->prio - MAX_RT_PRIO; | 4016 | return p->prio - MAX_RT_PRIO; |
| 3592 | } | 4017 | } |
| @@ -3595,7 +4020,7 @@ int task_prio(const task_t *p) | |||
| 3595 | * task_nice - return the nice value of a given task. | 4020 | * task_nice - return the nice value of a given task. |
| 3596 | * @p: the task in question. | 4021 | * @p: the task in question. |
| 3597 | */ | 4022 | */ |
| 3598 | int task_nice(const task_t *p) | 4023 | int task_nice(const struct task_struct *p) |
| 3599 | { | 4024 | { |
| 3600 | return TASK_NICE(p); | 4025 | return TASK_NICE(p); |
| 3601 | } | 4026 | } |
| @@ -3614,7 +4039,7 @@ int idle_cpu(int cpu) | |||
| 3614 | * idle_task - return the idle task for a given cpu. | 4039 | * idle_task - return the idle task for a given cpu. |
| 3615 | * @cpu: the processor in question. | 4040 | * @cpu: the processor in question. |
| 3616 | */ | 4041 | */ |
| 3617 | task_t *idle_task(int cpu) | 4042 | struct task_struct *idle_task(int cpu) |
| 3618 | { | 4043 | { |
| 3619 | return cpu_rq(cpu)->idle; | 4044 | return cpu_rq(cpu)->idle; |
| 3620 | } | 4045 | } |
| @@ -3623,7 +4048,7 @@ task_t *idle_task(int cpu) | |||
| 3623 | * find_process_by_pid - find a process with a matching PID value. | 4048 | * find_process_by_pid - find a process with a matching PID value. |
| 3624 | * @pid: the pid in question. | 4049 | * @pid: the pid in question. |
| 3625 | */ | 4050 | */ |
| 3626 | static inline task_t *find_process_by_pid(pid_t pid) | 4051 | static inline struct task_struct *find_process_by_pid(pid_t pid) |
| 3627 | { | 4052 | { |
| 3628 | return pid ? find_task_by_pid(pid) : current; | 4053 | return pid ? find_task_by_pid(pid) : current; |
| 3629 | } | 4054 | } |
| @@ -3632,18 +4057,18 @@ static inline task_t *find_process_by_pid(pid_t pid) | |||
| 3632 | static void __setscheduler(struct task_struct *p, int policy, int prio) | 4057 | static void __setscheduler(struct task_struct *p, int policy, int prio) |
| 3633 | { | 4058 | { |
| 3634 | BUG_ON(p->array); | 4059 | BUG_ON(p->array); |
| 4060 | |||
| 3635 | p->policy = policy; | 4061 | p->policy = policy; |
| 3636 | p->rt_priority = prio; | 4062 | p->rt_priority = prio; |
| 3637 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { | 4063 | p->normal_prio = normal_prio(p); |
| 3638 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; | 4064 | /* we are holding p->pi_lock already */ |
| 3639 | } else { | 4065 | p->prio = rt_mutex_getprio(p); |
| 3640 | p->prio = p->static_prio; | 4066 | /* |
| 3641 | /* | 4067 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
| 3642 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | 4068 | */ |
| 3643 | */ | 4069 | if (policy == SCHED_BATCH) |
| 3644 | if (policy == SCHED_BATCH) | 4070 | p->sleep_avg = 0; |
| 3645 | p->sleep_avg = 0; | 4071 | set_load_weight(p); |
| 3646 | } | ||
| 3647 | } | 4072 | } |
| 3648 | 4073 | ||
| 3649 | /** | 4074 | /** |
| @@ -3652,16 +4077,19 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
| 3652 | * @p: the task in question. | 4077 | * @p: the task in question. |
| 3653 | * @policy: new policy. | 4078 | * @policy: new policy. |
| 3654 | * @param: structure containing the new RT priority. | 4079 | * @param: structure containing the new RT priority. |
| 4080 | * | ||
| 4081 | * NOTE: the task may be already dead | ||
| 3655 | */ | 4082 | */ |
| 3656 | int sched_setscheduler(struct task_struct *p, int policy, | 4083 | int sched_setscheduler(struct task_struct *p, int policy, |
| 3657 | struct sched_param *param) | 4084 | struct sched_param *param) |
| 3658 | { | 4085 | { |
| 3659 | int retval; | 4086 | int retval, oldprio, oldpolicy = -1; |
| 3660 | int oldprio, oldpolicy = -1; | 4087 | struct prio_array *array; |
| 3661 | prio_array_t *array; | ||
| 3662 | unsigned long flags; | 4088 | unsigned long flags; |
| 3663 | runqueue_t *rq; | 4089 | struct rq *rq; |
| 3664 | 4090 | ||
| 4091 | /* may grab non-irq protected spin_locks */ | ||
| 4092 | BUG_ON(in_interrupt()); | ||
| 3665 | recheck: | 4093 | recheck: |
| 3666 | /* double check policy once rq lock held */ | 4094 | /* double check policy once rq lock held */ |
| 3667 | if (policy < 0) | 4095 | if (policy < 0) |
| @@ -3678,28 +4106,32 @@ recheck: | |||
| 3678 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 4106 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
| 3679 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 4107 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
| 3680 | return -EINVAL; | 4108 | return -EINVAL; |
| 3681 | if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) | 4109 | if (is_rt_policy(policy) != (param->sched_priority != 0)) |
| 3682 | != (param->sched_priority == 0)) | ||
| 3683 | return -EINVAL; | 4110 | return -EINVAL; |
| 3684 | 4111 | ||
| 3685 | /* | 4112 | /* |
| 3686 | * Allow unprivileged RT tasks to decrease priority: | 4113 | * Allow unprivileged RT tasks to decrease priority: |
| 3687 | */ | 4114 | */ |
| 3688 | if (!capable(CAP_SYS_NICE)) { | 4115 | if (!capable(CAP_SYS_NICE)) { |
| 3689 | /* | 4116 | if (is_rt_policy(policy)) { |
| 3690 | * can't change policy, except between SCHED_NORMAL | 4117 | unsigned long rlim_rtprio; |
| 3691 | * and SCHED_BATCH: | 4118 | unsigned long flags; |
| 3692 | */ | 4119 | |
| 3693 | if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && | 4120 | if (!lock_task_sighand(p, &flags)) |
| 3694 | (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && | 4121 | return -ESRCH; |
| 3695 | !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | 4122 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; |
| 3696 | return -EPERM; | 4123 | unlock_task_sighand(p, &flags); |
| 3697 | /* can't increase priority */ | 4124 | |
| 3698 | if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && | 4125 | /* can't set/change the rt policy */ |
| 3699 | param->sched_priority > p->rt_priority && | 4126 | if (policy != p->policy && !rlim_rtprio) |
| 3700 | param->sched_priority > | 4127 | return -EPERM; |
| 3701 | p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | 4128 | |
| 3702 | return -EPERM; | 4129 | /* can't increase priority */ |
| 4130 | if (param->sched_priority > p->rt_priority && | ||
| 4131 | param->sched_priority > rlim_rtprio) | ||
| 4132 | return -EPERM; | ||
| 4133 | } | ||
| 4134 | |||
| 3703 | /* can't change other user's priorities */ | 4135 | /* can't change other user's priorities */ |
| 3704 | if ((current->euid != p->euid) && | 4136 | if ((current->euid != p->euid) && |
| 3705 | (current->euid != p->uid)) | 4137 | (current->euid != p->uid)) |
| @@ -3710,14 +4142,20 @@ recheck: | |||
| 3710 | if (retval) | 4142 | if (retval) |
| 3711 | return retval; | 4143 | return retval; |
| 3712 | /* | 4144 | /* |
| 4145 | * make sure no PI-waiters arrive (or leave) while we are | ||
| 4146 | * changing the priority of the task: | ||
| 4147 | */ | ||
| 4148 | spin_lock_irqsave(&p->pi_lock, flags); | ||
| 4149 | /* | ||
| 3713 | * To be able to change p->policy safely, the appropriate | 4150 | * To be able to change p->policy safely, the appropriate |
| 3714 | * runqueue lock must be held. | 4151 | * runqueue lock must be held. |
| 3715 | */ | 4152 | */ |
| 3716 | rq = task_rq_lock(p, &flags); | 4153 | rq = __task_rq_lock(p); |
| 3717 | /* recheck policy now with rq lock held */ | 4154 | /* recheck policy now with rq lock held */ |
| 3718 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4155 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
| 3719 | policy = oldpolicy = -1; | 4156 | policy = oldpolicy = -1; |
| 3720 | task_rq_unlock(rq, &flags); | 4157 | __task_rq_unlock(rq); |
| 4158 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 3721 | goto recheck; | 4159 | goto recheck; |
| 3722 | } | 4160 | } |
| 3723 | array = p->array; | 4161 | array = p->array; |
| @@ -3738,7 +4176,11 @@ recheck: | |||
| 3738 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4176 | } else if (TASK_PREEMPTS_CURR(p, rq)) |
| 3739 | resched_task(rq->curr); | 4177 | resched_task(rq->curr); |
| 3740 | } | 4178 | } |
| 3741 | task_rq_unlock(rq, &flags); | 4179 | __task_rq_unlock(rq); |
| 4180 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 4181 | |||
| 4182 | rt_mutex_adjust_pi(p); | ||
| 4183 | |||
| 3742 | return 0; | 4184 | return 0; |
| 3743 | } | 4185 | } |
| 3744 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 4186 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
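sched_setscheduler() above is where the sched_setscheduler(2) syscall ends up, including the new RLIMIT_RTPRIO check for unprivileged callers. A minimal userspace caller requesting SCHED_FIFO via the glibc wrapper (it fails with EPERM unless the task has CAP_SYS_NICE or a sufficient RLIMIT_RTPRIO — exactly the cases checked above):

#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
        struct sched_param sp;

        memset(&sp, 0, sizeof(sp));
        sp.sched_priority = 10;         /* must be non-zero for an RT policy */

        /* pid 0 means "the calling task", matching find_process_by_pid(). */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0) {
                fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
                return 1;
        }
        if (sched_getparam(0, &sp) == 0)
                printf("now SCHED_FIFO, priority %d\n", sp.sched_priority);
        return 0;
}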
| @@ -3746,22 +4188,22 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
| 3746 | static int | 4188 | static int |
| 3747 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 4189 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
| 3748 | { | 4190 | { |
| 3749 | int retval; | ||
| 3750 | struct sched_param lparam; | 4191 | struct sched_param lparam; |
| 3751 | struct task_struct *p; | 4192 | struct task_struct *p; |
| 4193 | int retval; | ||
| 3752 | 4194 | ||
| 3753 | if (!param || pid < 0) | 4195 | if (!param || pid < 0) |
| 3754 | return -EINVAL; | 4196 | return -EINVAL; |
| 3755 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) | 4197 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) |
| 3756 | return -EFAULT; | 4198 | return -EFAULT; |
| 3757 | read_lock_irq(&tasklist_lock); | 4199 | |
| 4200 | rcu_read_lock(); | ||
| 4201 | retval = -ESRCH; | ||
| 3758 | p = find_process_by_pid(pid); | 4202 | p = find_process_by_pid(pid); |
| 3759 | if (!p) { | 4203 | if (p != NULL) |
| 3760 | read_unlock_irq(&tasklist_lock); | 4204 | retval = sched_setscheduler(p, policy, &lparam); |
| 3761 | return -ESRCH; | 4205 | rcu_read_unlock(); |
| 3762 | } | 4206 | |
| 3763 | retval = sched_setscheduler(p, policy, &lparam); | ||
| 3764 | read_unlock_irq(&tasklist_lock); | ||
| 3765 | return retval; | 4207 | return retval; |
| 3766 | } | 4208 | } |
| 3767 | 4209 | ||
| @@ -3797,8 +4239,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | |||
| 3797 | */ | 4239 | */ |
| 3798 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4240 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
| 3799 | { | 4241 | { |
| 4242 | struct task_struct *p; | ||
| 3800 | int retval = -EINVAL; | 4243 | int retval = -EINVAL; |
| 3801 | task_t *p; | ||
| 3802 | 4244 | ||
| 3803 | if (pid < 0) | 4245 | if (pid < 0) |
| 3804 | goto out_nounlock; | 4246 | goto out_nounlock; |
| @@ -3825,8 +4267,8 @@ out_nounlock: | |||
| 3825 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | 4267 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) |
| 3826 | { | 4268 | { |
| 3827 | struct sched_param lp; | 4269 | struct sched_param lp; |
| 4270 | struct task_struct *p; | ||
| 3828 | int retval = -EINVAL; | 4271 | int retval = -EINVAL; |
| 3829 | task_t *p; | ||
| 3830 | 4272 | ||
| 3831 | if (!param || pid < 0) | 4273 | if (!param || pid < 0) |
| 3832 | goto out_nounlock; | 4274 | goto out_nounlock; |
| @@ -3859,9 +4301,9 @@ out_unlock: | |||
| 3859 | 4301 | ||
| 3860 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 4302 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) |
| 3861 | { | 4303 | { |
| 3862 | task_t *p; | ||
| 3863 | int retval; | ||
| 3864 | cpumask_t cpus_allowed; | 4304 | cpumask_t cpus_allowed; |
| 4305 | struct task_struct *p; | ||
| 4306 | int retval; | ||
| 3865 | 4307 | ||
| 3866 | lock_cpu_hotplug(); | 4308 | lock_cpu_hotplug(); |
| 3867 | read_lock(&tasklist_lock); | 4309 | read_lock(&tasklist_lock); |
| @@ -3947,8 +4389,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | |||
| 3947 | 4389 | ||
| 3948 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 4390 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
| 3949 | { | 4391 | { |
| 4392 | struct task_struct *p; | ||
| 3950 | int retval; | 4393 | int retval; |
| 3951 | task_t *p; | ||
| 3952 | 4394 | ||
| 3953 | lock_cpu_hotplug(); | 4395 | lock_cpu_hotplug(); |
| 3954 | read_lock(&tasklist_lock); | 4396 | read_lock(&tasklist_lock); |
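sched_setaffinity()/sched_getaffinity() above back the syscalls of the same name; from userspace the cpu_set_t macros build and inspect the mask. A short sketch using the glibc wrappers (needs _GNU_SOURCE):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t mask;
        int cpu;

        /* Pin the calling task to CPU 0, then read the mask back. */
        CPU_ZERO(&mask);
        CPU_SET(0, &mask);
        if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
                perror("sched_setaffinity");

        if (sched_getaffinity(0, sizeof(mask), &mask) == 0)
                for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                        if (CPU_ISSET(cpu, &mask))
                                printf("allowed on cpu %d\n", cpu);
        return 0;
}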
| @@ -4007,9 +4449,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
| 4007 | */ | 4449 | */ |
| 4008 | asmlinkage long sys_sched_yield(void) | 4450 | asmlinkage long sys_sched_yield(void) |
| 4009 | { | 4451 | { |
| 4010 | runqueue_t *rq = this_rq_lock(); | 4452 | struct rq *rq = this_rq_lock(); |
| 4011 | prio_array_t *array = current->array; | 4453 | struct prio_array *array = current->array, *target = rq->expired; |
| 4012 | prio_array_t *target = rq->expired; | ||
| 4013 | 4454 | ||
| 4014 | schedstat_inc(rq, yld_cnt); | 4455 | schedstat_inc(rq, yld_cnt); |
| 4015 | /* | 4456 | /* |
| @@ -4043,6 +4484,7 @@ asmlinkage long sys_sched_yield(void) | |||
| 4043 | * no need to preempt or enable interrupts: | 4484 | * no need to preempt or enable interrupts: |
| 4044 | */ | 4485 | */ |
| 4045 | __release(rq->lock); | 4486 | __release(rq->lock); |
| 4487 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | ||
| 4046 | _raw_spin_unlock(&rq->lock); | 4488 | _raw_spin_unlock(&rq->lock); |
| 4047 | preempt_enable_no_resched(); | 4489 | preempt_enable_no_resched(); |
| 4048 | 4490 | ||
| @@ -4051,7 +4493,16 @@ asmlinkage long sys_sched_yield(void) | |||
| 4051 | return 0; | 4493 | return 0; |
| 4052 | } | 4494 | } |
| 4053 | 4495 | ||
| 4054 | static inline void __cond_resched(void) | 4496 | static inline int __resched_legal(int expected_preempt_count) |
| 4497 | { | ||
| 4498 | if (unlikely(preempt_count() != expected_preempt_count)) | ||
| 4499 | return 0; | ||
| 4500 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
| 4501 | return 0; | ||
| 4502 | return 1; | ||
| 4503 | } | ||
| 4504 | |||
| 4505 | static void __cond_resched(void) | ||
| 4055 | { | 4506 | { |
| 4056 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 4507 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| 4057 | __might_sleep(__FILE__, __LINE__); | 4508 | __might_sleep(__FILE__, __LINE__); |
| @@ -4061,10 +4512,6 @@ static inline void __cond_resched(void) | |||
| 4061 | * PREEMPT_ACTIVE, which could trigger a second | 4512 | * PREEMPT_ACTIVE, which could trigger a second |
| 4062 | * cond_resched() call. | 4513 | * cond_resched() call. |
| 4063 | */ | 4514 | */ |
| 4064 | if (unlikely(preempt_count())) | ||
| 4065 | return; | ||
| 4066 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
| 4067 | return; | ||
| 4068 | do { | 4515 | do { |
| 4069 | add_preempt_count(PREEMPT_ACTIVE); | 4516 | add_preempt_count(PREEMPT_ACTIVE); |
| 4070 | schedule(); | 4517 | schedule(); |
| @@ -4074,13 +4521,12 @@ static inline void __cond_resched(void) | |||
| 4074 | 4521 | ||
| 4075 | int __sched cond_resched(void) | 4522 | int __sched cond_resched(void) |
| 4076 | { | 4523 | { |
| 4077 | if (need_resched()) { | 4524 | if (need_resched() && __resched_legal(0)) { |
| 4078 | __cond_resched(); | 4525 | __cond_resched(); |
| 4079 | return 1; | 4526 | return 1; |
| 4080 | } | 4527 | } |
| 4081 | return 0; | 4528 | return 0; |
| 4082 | } | 4529 | } |
| 4083 | |||
| 4084 | EXPORT_SYMBOL(cond_resched); | 4530 | EXPORT_SYMBOL(cond_resched); |
| 4085 | 4531 | ||
| 4086 | /* | 4532 | /* |
| @@ -4101,7 +4547,8 @@ int cond_resched_lock(spinlock_t *lock) | |||
| 4101 | ret = 1; | 4547 | ret = 1; |
| 4102 | spin_lock(lock); | 4548 | spin_lock(lock); |
| 4103 | } | 4549 | } |
| 4104 | if (need_resched()) { | 4550 | if (need_resched() && __resched_legal(1)) { |
| 4551 | spin_release(&lock->dep_map, 1, _THIS_IP_); | ||
| 4105 | _raw_spin_unlock(lock); | 4552 | _raw_spin_unlock(lock); |
| 4106 | preempt_enable_no_resched(); | 4553 | preempt_enable_no_resched(); |
| 4107 | __cond_resched(); | 4554 | __cond_resched(); |
| @@ -4110,25 +4557,24 @@ int cond_resched_lock(spinlock_t *lock) | |||
| 4110 | } | 4557 | } |
| 4111 | return ret; | 4558 | return ret; |
| 4112 | } | 4559 | } |
| 4113 | |||
| 4114 | EXPORT_SYMBOL(cond_resched_lock); | 4560 | EXPORT_SYMBOL(cond_resched_lock); |
| 4115 | 4561 | ||
| 4116 | int __sched cond_resched_softirq(void) | 4562 | int __sched cond_resched_softirq(void) |
| 4117 | { | 4563 | { |
| 4118 | BUG_ON(!in_softirq()); | 4564 | BUG_ON(!in_softirq()); |
| 4119 | 4565 | ||
| 4120 | if (need_resched()) { | 4566 | if (need_resched() && __resched_legal(0)) { |
| 4121 | __local_bh_enable(); | 4567 | raw_local_irq_disable(); |
| 4568 | _local_bh_enable(); | ||
| 4569 | raw_local_irq_enable(); | ||
| 4122 | __cond_resched(); | 4570 | __cond_resched(); |
| 4123 | local_bh_disable(); | 4571 | local_bh_disable(); |
| 4124 | return 1; | 4572 | return 1; |
| 4125 | } | 4573 | } |
| 4126 | return 0; | 4574 | return 0; |
| 4127 | } | 4575 | } |
| 4128 | |||
| 4129 | EXPORT_SYMBOL(cond_resched_softirq); | 4576 | EXPORT_SYMBOL(cond_resched_softirq); |
| 4130 | 4577 | ||
| 4131 | |||
| 4132 | /** | 4578 | /** |
| 4133 | * yield - yield the current processor to other threads. | 4579 | * yield - yield the current processor to other threads. |
| 4134 | * | 4580 | * |
| @@ -4140,7 +4586,6 @@ void __sched yield(void) | |||
| 4140 | set_current_state(TASK_RUNNING); | 4586 | set_current_state(TASK_RUNNING); |
| 4141 | sys_sched_yield(); | 4587 | sys_sched_yield(); |
| 4142 | } | 4588 | } |
| 4143 | |||
| 4144 | EXPORT_SYMBOL(yield); | 4589 | EXPORT_SYMBOL(yield); |
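yield() is the in-kernel counterpart of the sched_yield(2) syscall implemented by sys_sched_yield() above, which (for a non-RT task, as the hunk suggests) requeues the caller onto the expired array. From userspace it is simply:

#include <sched.h>
#include <stdio.h>

int main(void)
{
        int i;

        /* Repeatedly offer the CPU to other runnable tasks. */
        for (i = 0; i < 3; i++) {
                printf("yielding (%d)\n", i);
                sched_yield();
        }
        return 0;
}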
| 4145 | 4590 | ||
| 4146 | /* | 4591 | /* |
| @@ -4152,23 +4597,26 @@ EXPORT_SYMBOL(yield); | |||
| 4152 | */ | 4597 | */ |
| 4153 | void __sched io_schedule(void) | 4598 | void __sched io_schedule(void) |
| 4154 | { | 4599 | { |
| 4155 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4600 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
| 4156 | 4601 | ||
| 4602 | delayacct_blkio_start(); | ||
| 4157 | atomic_inc(&rq->nr_iowait); | 4603 | atomic_inc(&rq->nr_iowait); |
| 4158 | schedule(); | 4604 | schedule(); |
| 4159 | atomic_dec(&rq->nr_iowait); | 4605 | atomic_dec(&rq->nr_iowait); |
| 4606 | delayacct_blkio_end(); | ||
| 4160 | } | 4607 | } |
| 4161 | |||
| 4162 | EXPORT_SYMBOL(io_schedule); | 4608 | EXPORT_SYMBOL(io_schedule); |
| 4163 | 4609 | ||
| 4164 | long __sched io_schedule_timeout(long timeout) | 4610 | long __sched io_schedule_timeout(long timeout) |
| 4165 | { | 4611 | { |
| 4166 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4612 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
| 4167 | long ret; | 4613 | long ret; |
| 4168 | 4614 | ||
| 4615 | delayacct_blkio_start(); | ||
| 4169 | atomic_inc(&rq->nr_iowait); | 4616 | atomic_inc(&rq->nr_iowait); |
| 4170 | ret = schedule_timeout(timeout); | 4617 | ret = schedule_timeout(timeout); |
| 4171 | atomic_dec(&rq->nr_iowait); | 4618 | atomic_dec(&rq->nr_iowait); |
| 4619 | delayacct_blkio_end(); | ||
| 4172 | return ret; | 4620 | return ret; |
| 4173 | } | 4621 | } |
| 4174 | 4622 | ||
| @@ -4230,9 +4678,9 @@ asmlinkage long sys_sched_get_priority_min(int policy) | |||
| 4230 | asmlinkage | 4678 | asmlinkage |
| 4231 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4679 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
| 4232 | { | 4680 | { |
| 4681 | struct task_struct *p; | ||
| 4233 | int retval = -EINVAL; | 4682 | int retval = -EINVAL; |
| 4234 | struct timespec t; | 4683 | struct timespec t; |
| 4235 | task_t *p; | ||
| 4236 | 4684 | ||
| 4237 | if (pid < 0) | 4685 | if (pid < 0) |
| 4238 | goto out_nounlock; | 4686 | goto out_nounlock; |
| @@ -4247,7 +4695,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
| 4247 | if (retval) | 4695 | if (retval) |
| 4248 | goto out_unlock; | 4696 | goto out_unlock; |
| 4249 | 4697 | ||
| 4250 | jiffies_to_timespec(p->policy & SCHED_FIFO ? | 4698 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
| 4251 | 0 : task_timeslice(p), &t); | 4699 | 0 : task_timeslice(p), &t); |
| 4252 | read_unlock(&tasklist_lock); | 4700 | read_unlock(&tasklist_lock); |
| 4253 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4701 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
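The `p->policy & SCHED_FIFO` to `p->policy == SCHED_FIFO` change is a real fix: SCHED_FIFO is the small integer 1, not a flag bit, so the old bitwise test also matched other odd-valued policies (SCHED_BATCH, for instance) and reported a zero timeslice for them. The result is visible from userspace through sched_rr_get_interval(2); a quick probe using the glibc wrapper:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        /* pid 0: the calling task.  SCHED_FIFO tasks report 0 here;
         * other policies report their timeslice quantum. */
        if (sched_rr_get_interval(0, &ts) != 0) {
                perror("sched_rr_get_interval");
                return 1;
        }
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}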
| @@ -4260,35 +4708,36 @@ out_unlock: | |||
| 4260 | 4708 | ||
| 4261 | static inline struct task_struct *eldest_child(struct task_struct *p) | 4709 | static inline struct task_struct *eldest_child(struct task_struct *p) |
| 4262 | { | 4710 | { |
| 4263 | if (list_empty(&p->children)) return NULL; | 4711 | if (list_empty(&p->children)) |
| 4712 | return NULL; | ||
| 4264 | return list_entry(p->children.next,struct task_struct,sibling); | 4713 | return list_entry(p->children.next,struct task_struct,sibling); |
| 4265 | } | 4714 | } |
| 4266 | 4715 | ||
| 4267 | static inline struct task_struct *older_sibling(struct task_struct *p) | 4716 | static inline struct task_struct *older_sibling(struct task_struct *p) |
| 4268 | { | 4717 | { |
| 4269 | if (p->sibling.prev==&p->parent->children) return NULL; | 4718 | if (p->sibling.prev==&p->parent->children) |
| 4719 | return NULL; | ||
| 4270 | return list_entry(p->sibling.prev,struct task_struct,sibling); | 4720 | return list_entry(p->sibling.prev,struct task_struct,sibling); |
| 4271 | } | 4721 | } |
| 4272 | 4722 | ||
| 4273 | static inline struct task_struct *younger_sibling(struct task_struct *p) | 4723 | static inline struct task_struct *younger_sibling(struct task_struct *p) |
| 4274 | { | 4724 | { |
| 4275 | if (p->sibling.next==&p->parent->children) return NULL; | 4725 | if (p->sibling.next==&p->parent->children) |
| 4726 | return NULL; | ||
| 4276 | return list_entry(p->sibling.next,struct task_struct,sibling); | 4727 | return list_entry(p->sibling.next,struct task_struct,sibling); |
| 4277 | } | 4728 | } |
| 4278 | 4729 | ||
| 4279 | static void show_task(task_t *p) | 4730 | static const char stat_nam[] = "RSDTtZX"; |
| 4731 | |||
| 4732 | static void show_task(struct task_struct *p) | ||
| 4280 | { | 4733 | { |
| 4281 | task_t *relative; | 4734 | struct task_struct *relative; |
| 4282 | unsigned state; | ||
| 4283 | unsigned long free = 0; | 4735 | unsigned long free = 0; |
| 4284 | static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; | 4736 | unsigned state; |
| 4285 | 4737 | ||
| 4286 | printk("%-13.13s ", p->comm); | ||
| 4287 | state = p->state ? __ffs(p->state) + 1 : 0; | 4738 | state = p->state ? __ffs(p->state) + 1 : 0; |
| 4288 | if (state < ARRAY_SIZE(stat_nam)) | 4739 | printk("%-13.13s %c", p->comm, |
| 4289 | printk(stat_nam[state]); | 4740 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
| 4290 | else | ||
| 4291 | printk("?"); | ||
| 4292 | #if (BITS_PER_LONG == 32) | 4741 | #if (BITS_PER_LONG == 32) |
| 4293 | if (state == TASK_RUNNING) | 4742 | if (state == TASK_RUNNING) |
| 4294 | printk(" running "); | 4743 | printk(" running "); |
| @@ -4332,7 +4781,7 @@ static void show_task(task_t *p) | |||
| 4332 | 4781 | ||
| 4333 | void show_state(void) | 4782 | void show_state(void) |
| 4334 | { | 4783 | { |
| 4335 | task_t *g, *p; | 4784 | struct task_struct *g, *p; |
| 4336 | 4785 | ||
| 4337 | #if (BITS_PER_LONG == 32) | 4786 | #if (BITS_PER_LONG == 32) |
| 4338 | printk("\n" | 4787 | printk("\n" |
| @@ -4354,7 +4803,7 @@ void show_state(void) | |||
| 4354 | } while_each_thread(g, p); | 4803 | } while_each_thread(g, p); |
| 4355 | 4804 | ||
| 4356 | read_unlock(&tasklist_lock); | 4805 | read_unlock(&tasklist_lock); |
| 4357 | mutex_debug_show_all_locks(); | 4806 | debug_show_all_locks(); |
| 4358 | } | 4807 | } |
| 4359 | 4808 | ||
| 4360 | /** | 4809 | /** |
| @@ -4365,15 +4814,15 @@ void show_state(void) | |||
| 4365 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 4814 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
| 4366 | * flag, to make booting more robust. | 4815 | * flag, to make booting more robust. |
| 4367 | */ | 4816 | */ |
| 4368 | void __devinit init_idle(task_t *idle, int cpu) | 4817 | void __devinit init_idle(struct task_struct *idle, int cpu) |
| 4369 | { | 4818 | { |
| 4370 | runqueue_t *rq = cpu_rq(cpu); | 4819 | struct rq *rq = cpu_rq(cpu); |
| 4371 | unsigned long flags; | 4820 | unsigned long flags; |
| 4372 | 4821 | ||
| 4373 | idle->timestamp = sched_clock(); | 4822 | idle->timestamp = sched_clock(); |
| 4374 | idle->sleep_avg = 0; | 4823 | idle->sleep_avg = 0; |
| 4375 | idle->array = NULL; | 4824 | idle->array = NULL; |
| 4376 | idle->prio = MAX_PRIO; | 4825 | idle->prio = idle->normal_prio = MAX_PRIO; |
| 4377 | idle->state = TASK_RUNNING; | 4826 | idle->state = TASK_RUNNING; |
| 4378 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4827 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
| 4379 | set_task_cpu(idle, cpu); | 4828 | set_task_cpu(idle, cpu); |
| @@ -4406,7 +4855,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
| 4406 | /* | 4855 | /* |
| 4407 | * This is how migration works: | 4856 | * This is how migration works: |
| 4408 | * | 4857 | * |
| 4409 | * 1) we queue a migration_req_t structure in the source CPU's | 4858 | * 1) we queue a struct migration_req structure in the source CPU's |
| 4410 | * runqueue and wake up that CPU's migration thread. | 4859 | * runqueue and wake up that CPU's migration thread. |
| 4411 | * 2) we down() the locked semaphore => thread blocks. | 4860 | * 2) we down() the locked semaphore => thread blocks. |
| 4412 | * 3) migration thread wakes up (implicitly it forces the migrated | 4861 | * 3) migration thread wakes up (implicitly it forces the migrated |
| @@ -4428,12 +4877,12 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
| 4428 | * task must not exit() & deallocate itself prematurely. The | 4877 | * task must not exit() & deallocate itself prematurely. The |
| 4429 | * call is not atomic; no spinlocks may be held. | 4878 | * call is not atomic; no spinlocks may be held. |
| 4430 | */ | 4879 | */ |
| 4431 | int set_cpus_allowed(task_t *p, cpumask_t new_mask) | 4880 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
| 4432 | { | 4881 | { |
| 4882 | struct migration_req req; | ||
| 4433 | unsigned long flags; | 4883 | unsigned long flags; |
| 4884 | struct rq *rq; | ||
| 4434 | int ret = 0; | 4885 | int ret = 0; |
| 4435 | migration_req_t req; | ||
| 4436 | runqueue_t *rq; | ||
| 4437 | 4886 | ||
| 4438 | rq = task_rq_lock(p, &flags); | 4887 | rq = task_rq_lock(p, &flags); |
| 4439 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 4888 | if (!cpus_intersects(new_mask, cpu_online_map)) { |
| @@ -4456,9 +4905,9 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask) | |||
| 4456 | } | 4905 | } |
| 4457 | out: | 4906 | out: |
| 4458 | task_rq_unlock(rq, &flags); | 4907 | task_rq_unlock(rq, &flags); |
| 4908 | |||
| 4459 | return ret; | 4909 | return ret; |
| 4460 | } | 4910 | } |
| 4461 | |||
| 4462 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 4911 | EXPORT_SYMBOL_GPL(set_cpus_allowed); |
| 4463 | 4912 | ||
| 4464 | /* | 4913 | /* |
| @@ -4469,13 +4918,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); | |||
| 4469 | * | 4918 | * |
| 4470 | * So we race with normal scheduler movements, but that's OK, as long | 4919 | * So we race with normal scheduler movements, but that's OK, as long |
| 4471 | * as the task is no longer on this CPU. | 4920 | * as the task is no longer on this CPU. |
| 4921 | * | ||
| 4922 | * Returns non-zero if task was successfully migrated. | ||
| 4472 | */ | 4923 | */ |
| 4473 | static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4924 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
| 4474 | { | 4925 | { |
| 4475 | runqueue_t *rq_dest, *rq_src; | 4926 | struct rq *rq_dest, *rq_src; |
| 4927 | int ret = 0; | ||
| 4476 | 4928 | ||
| 4477 | if (unlikely(cpu_is_offline(dest_cpu))) | 4929 | if (unlikely(cpu_is_offline(dest_cpu))) |
| 4478 | return; | 4930 | return ret; |
| 4479 | 4931 | ||
| 4480 | rq_src = cpu_rq(src_cpu); | 4932 | rq_src = cpu_rq(src_cpu); |
| 4481 | rq_dest = cpu_rq(dest_cpu); | 4933 | rq_dest = cpu_rq(dest_cpu); |
| @@ -4499,13 +4951,14 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 4499 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick | 4951 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick |
| 4500 | + rq_dest->timestamp_last_tick; | 4952 | + rq_dest->timestamp_last_tick; |
| 4501 | deactivate_task(p, rq_src); | 4953 | deactivate_task(p, rq_src); |
| 4502 | activate_task(p, rq_dest, 0); | 4954 | __activate_task(p, rq_dest); |
| 4503 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 4955 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
| 4504 | resched_task(rq_dest->curr); | 4956 | resched_task(rq_dest->curr); |
| 4505 | } | 4957 | } |
| 4506 | 4958 | ret = 1; | |
| 4507 | out: | 4959 | out: |
| 4508 | double_rq_unlock(rq_src, rq_dest); | 4960 | double_rq_unlock(rq_src, rq_dest); |
| 4961 | return ret; | ||
| 4509 | } | 4962 | } |
| 4510 | 4963 | ||
| 4511 | /* | 4964 | /* |
| @@ -4515,16 +4968,16 @@ out: | |||
| 4515 | */ | 4968 | */ |
| 4516 | static int migration_thread(void *data) | 4969 | static int migration_thread(void *data) |
| 4517 | { | 4970 | { |
| 4518 | runqueue_t *rq; | ||
| 4519 | int cpu = (long)data; | 4971 | int cpu = (long)data; |
| 4972 | struct rq *rq; | ||
| 4520 | 4973 | ||
| 4521 | rq = cpu_rq(cpu); | 4974 | rq = cpu_rq(cpu); |
| 4522 | BUG_ON(rq->migration_thread != current); | 4975 | BUG_ON(rq->migration_thread != current); |
| 4523 | 4976 | ||
| 4524 | set_current_state(TASK_INTERRUPTIBLE); | 4977 | set_current_state(TASK_INTERRUPTIBLE); |
| 4525 | while (!kthread_should_stop()) { | 4978 | while (!kthread_should_stop()) { |
| 4979 | struct migration_req *req; | ||
| 4526 | struct list_head *head; | 4980 | struct list_head *head; |
| 4527 | migration_req_t *req; | ||
| 4528 | 4981 | ||
| 4529 | try_to_freeze(); | 4982 | try_to_freeze(); |
| 4530 | 4983 | ||
| @@ -4548,7 +5001,7 @@ static int migration_thread(void *data) | |||
| 4548 | set_current_state(TASK_INTERRUPTIBLE); | 5001 | set_current_state(TASK_INTERRUPTIBLE); |
| 4549 | continue; | 5002 | continue; |
| 4550 | } | 5003 | } |
| 4551 | req = list_entry(head->next, migration_req_t, list); | 5004 | req = list_entry(head->next, struct migration_req, list); |
| 4552 | list_del_init(head->next); | 5005 | list_del_init(head->next); |
| 4553 | 5006 | ||
| 4554 | spin_unlock(&rq->lock); | 5007 | spin_unlock(&rq->lock); |
| @@ -4573,36 +5026,42 @@ wait_to_die: | |||
| 4573 | 5026 | ||
| 4574 | #ifdef CONFIG_HOTPLUG_CPU | 5027 | #ifdef CONFIG_HOTPLUG_CPU |
| 4575 | /* Figure out where task on dead CPU should go, use force if necessary. */ | 5028 | /* Figure out where task on dead CPU should go, use force if necessary. */ |
| 4576 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | 5029 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
| 4577 | { | 5030 | { |
| 4578 | int dest_cpu; | 5031 | unsigned long flags; |
| 4579 | cpumask_t mask; | 5032 | cpumask_t mask; |
| 5033 | struct rq *rq; | ||
| 5034 | int dest_cpu; | ||
| 4580 | 5035 | ||
| 5036 | restart: | ||
| 4581 | /* On same node? */ | 5037 | /* On same node? */ |
| 4582 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 5038 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
| 4583 | cpus_and(mask, mask, tsk->cpus_allowed); | 5039 | cpus_and(mask, mask, p->cpus_allowed); |
| 4584 | dest_cpu = any_online_cpu(mask); | 5040 | dest_cpu = any_online_cpu(mask); |
| 4585 | 5041 | ||
| 4586 | /* On any allowed CPU? */ | 5042 | /* On any allowed CPU? */ |
| 4587 | if (dest_cpu == NR_CPUS) | 5043 | if (dest_cpu == NR_CPUS) |
| 4588 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 5044 | dest_cpu = any_online_cpu(p->cpus_allowed); |
| 4589 | 5045 | ||
| 4590 | /* No more Mr. Nice Guy. */ | 5046 | /* No more Mr. Nice Guy. */ |
| 4591 | if (dest_cpu == NR_CPUS) { | 5047 | if (dest_cpu == NR_CPUS) { |
| 4592 | cpus_setall(tsk->cpus_allowed); | 5048 | rq = task_rq_lock(p, &flags); |
| 4593 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 5049 | cpus_setall(p->cpus_allowed); |
| 5050 | dest_cpu = any_online_cpu(p->cpus_allowed); | ||
| 5051 | task_rq_unlock(rq, &flags); | ||
| 4594 | 5052 | ||
| 4595 | /* | 5053 | /* |
| 4596 | * Don't tell them about moving exiting tasks or | 5054 | * Don't tell them about moving exiting tasks or |
| 4597 | * kernel threads (both mm NULL), since they never | 5055 | * kernel threads (both mm NULL), since they never |
| 4598 | * leave kernel. | 5056 | * leave kernel. |
| 4599 | */ | 5057 | */ |
| 4600 | if (tsk->mm && printk_ratelimit()) | 5058 | if (p->mm && printk_ratelimit()) |
| 4601 | printk(KERN_INFO "process %d (%s) no " | 5059 | printk(KERN_INFO "process %d (%s) no " |
| 4602 | "longer affine to cpu%d\n", | 5060 | "longer affine to cpu%d\n", |
| 4603 | tsk->pid, tsk->comm, dead_cpu); | 5061 | p->pid, p->comm, dead_cpu); |
| 4604 | } | 5062 | } |
| 4605 | __migrate_task(tsk, dead_cpu, dest_cpu); | 5063 | if (!__migrate_task(p, dead_cpu, dest_cpu)) |
| 5064 | goto restart; | ||
| 4606 | } | 5065 | } |
| 4607 | 5066 | ||
| 4608 | /* | 5067 | /* |
| @@ -4612,9 +5071,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | |||
| 4612 | * their home CPUs. So we just add the counter to another CPU's counter, | 5071 | * their home CPUs. So we just add the counter to another CPU's counter, |
| 4613 | * to keep the global sum constant after CPU-down: | 5072 | * to keep the global sum constant after CPU-down: |
| 4614 | */ | 5073 | */ |
| 4615 | static void migrate_nr_uninterruptible(runqueue_t *rq_src) | 5074 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
| 4616 | { | 5075 | { |
| 4617 | runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 5076 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); |
| 4618 | unsigned long flags; | 5077 | unsigned long flags; |
| 4619 | 5078 | ||
| 4620 | local_irq_save(flags); | 5079 | local_irq_save(flags); |
| @@ -4628,48 +5087,51 @@ static void migrate_nr_uninterruptible(runqueue_t *rq_src) | |||
| 4628 | /* Run through task list and migrate tasks from the dead cpu. */ | 5087 | /* Run through task list and migrate tasks from the dead cpu. */ |
| 4629 | static void migrate_live_tasks(int src_cpu) | 5088 | static void migrate_live_tasks(int src_cpu) |
| 4630 | { | 5089 | { |
| 4631 | struct task_struct *tsk, *t; | 5090 | struct task_struct *p, *t; |
| 4632 | 5091 | ||
| 4633 | write_lock_irq(&tasklist_lock); | 5092 | write_lock_irq(&tasklist_lock); |
| 4634 | 5093 | ||
| 4635 | do_each_thread(t, tsk) { | 5094 | do_each_thread(t, p) { |
| 4636 | if (tsk == current) | 5095 | if (p == current) |
| 4637 | continue; | 5096 | continue; |
| 4638 | 5097 | ||
| 4639 | if (task_cpu(tsk) == src_cpu) | 5098 | if (task_cpu(p) == src_cpu) |
| 4640 | move_task_off_dead_cpu(src_cpu, tsk); | 5099 | move_task_off_dead_cpu(src_cpu, p); |
| 4641 | } while_each_thread(t, tsk); | 5100 | } while_each_thread(t, p); |
| 4642 | 5101 | ||
| 4643 | write_unlock_irq(&tasklist_lock); | 5102 | write_unlock_irq(&tasklist_lock); |
| 4644 | } | 5103 | } |
| 4645 | 5104 | ||
| 4646 | /* Schedules idle task to be the next runnable task on current CPU. | 5105 | /* Schedules idle task to be the next runnable task on current CPU. |
| 4647 | * It does so by boosting its priority to highest possible and adding it to | 5106 | * It does so by boosting its priority to highest possible and adding it to |
| 4648 | * the _front_ of runqueue. Used by CPU offline code. | 5107 | * the _front_ of the runqueue. Used by CPU offline code. |
| 4649 | */ | 5108 | */ |
| 4650 | void sched_idle_next(void) | 5109 | void sched_idle_next(void) |
| 4651 | { | 5110 | { |
| 4652 | int cpu = smp_processor_id(); | 5111 | int this_cpu = smp_processor_id(); |
| 4653 | runqueue_t *rq = this_rq(); | 5112 | struct rq *rq = cpu_rq(this_cpu); |
| 4654 | struct task_struct *p = rq->idle; | 5113 | struct task_struct *p = rq->idle; |
| 4655 | unsigned long flags; | 5114 | unsigned long flags; |
| 4656 | 5115 | ||
| 4657 | /* cpu has to be offline */ | 5116 | /* cpu has to be offline */ |
| 4658 | BUG_ON(cpu_online(cpu)); | 5117 | BUG_ON(cpu_online(this_cpu)); |
| 4659 | 5118 | ||
| 4660 | /* Strictly not necessary since rest of the CPUs are stopped by now | 5119 | /* |
| 4661 | * and interrupts disabled on current cpu. | 5120 | * Strictly not necessary since rest of the CPUs are stopped by now |
| 5121 | * and interrupts disabled on the current cpu. | ||
| 4662 | */ | 5122 | */ |
| 4663 | spin_lock_irqsave(&rq->lock, flags); | 5123 | spin_lock_irqsave(&rq->lock, flags); |
| 4664 | 5124 | ||
| 4665 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5125 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); |
| 4666 | /* Add idle task to _front_ of it's priority queue */ | 5126 | |
| 5127 | /* Add idle task to the _front_ of its priority queue: */ | ||
| 4667 | __activate_idle_task(p, rq); | 5128 | __activate_idle_task(p, rq); |
| 4668 | 5129 | ||
| 4669 | spin_unlock_irqrestore(&rq->lock, flags); | 5130 | spin_unlock_irqrestore(&rq->lock, flags); |
| 4670 | } | 5131 | } |
| 4671 | 5132 | ||
| 4672 | /* Ensures that the idle task is using init_mm right before its cpu goes | 5133 | /* |
| 5134 | * Ensures that the idle task is using init_mm right before its cpu goes | ||
| 4673 | * offline. | 5135 | * offline. |
| 4674 | */ | 5136 | */ |
| 4675 | void idle_task_exit(void) | 5137 | void idle_task_exit(void) |
| @@ -4683,17 +5145,17 @@ void idle_task_exit(void) | |||
| 4683 | mmdrop(mm); | 5145 | mmdrop(mm); |
| 4684 | } | 5146 | } |
| 4685 | 5147 | ||
| 4686 | static void migrate_dead(unsigned int dead_cpu, task_t *tsk) | 5148 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
| 4687 | { | 5149 | { |
| 4688 | struct runqueue *rq = cpu_rq(dead_cpu); | 5150 | struct rq *rq = cpu_rq(dead_cpu); |
| 4689 | 5151 | ||
| 4690 | /* Must be exiting, otherwise would be on tasklist. */ | 5152 | /* Must be exiting, otherwise would be on tasklist. */ |
| 4691 | BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); | 5153 | BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); |
| 4692 | 5154 | ||
| 4693 | /* Cannot have done final schedule yet: would have vanished. */ | 5155 | /* Cannot have done final schedule yet: would have vanished. */ |
| 4694 | BUG_ON(tsk->flags & PF_DEAD); | 5156 | BUG_ON(p->state == TASK_DEAD); |
| 4695 | 5157 | ||
| 4696 | get_task_struct(tsk); | 5158 | get_task_struct(p); |
| 4697 | 5159 | ||
| 4698 | /* | 5160 | /* |
| 4699 | * Drop lock around migration; if someone else moves it, | 5161 | * Drop lock around migration; if someone else moves it, |
| @@ -4701,25 +5163,25 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk) | |||
| 4701 | * fine. | 5163 | * fine. |
| 4702 | */ | 5164 | */ |
| 4703 | spin_unlock_irq(&rq->lock); | 5165 | spin_unlock_irq(&rq->lock); |
| 4704 | move_task_off_dead_cpu(dead_cpu, tsk); | 5166 | move_task_off_dead_cpu(dead_cpu, p); |
| 4705 | spin_lock_irq(&rq->lock); | 5167 | spin_lock_irq(&rq->lock); |
| 4706 | 5168 | ||
| 4707 | put_task_struct(tsk); | 5169 | put_task_struct(p); |
| 4708 | } | 5170 | } |
| 4709 | 5171 | ||
| 4710 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | 5172 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ |
| 4711 | static void migrate_dead_tasks(unsigned int dead_cpu) | 5173 | static void migrate_dead_tasks(unsigned int dead_cpu) |
| 4712 | { | 5174 | { |
| 4713 | unsigned arr, i; | 5175 | struct rq *rq = cpu_rq(dead_cpu); |
| 4714 | struct runqueue *rq = cpu_rq(dead_cpu); | 5176 | unsigned int arr, i; |
| 4715 | 5177 | ||
| 4716 | for (arr = 0; arr < 2; arr++) { | 5178 | for (arr = 0; arr < 2; arr++) { |
| 4717 | for (i = 0; i < MAX_PRIO; i++) { | 5179 | for (i = 0; i < MAX_PRIO; i++) { |
| 4718 | struct list_head *list = &rq->arrays[arr].queue[i]; | 5180 | struct list_head *list = &rq->arrays[arr].queue[i]; |
| 5181 | |||
| 4719 | while (!list_empty(list)) | 5182 | while (!list_empty(list)) |
| 4720 | migrate_dead(dead_cpu, | 5183 | migrate_dead(dead_cpu, list_entry(list->next, |
| 4721 | list_entry(list->next, task_t, | 5184 | struct task_struct, run_list)); |
| 4722 | run_list)); | ||
| 4723 | } | 5185 | } |
| 4724 | } | 5186 | } |
| 4725 | } | 5187 | } |
| @@ -4729,13 +5191,13 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
| 4729 | * migration_call - callback that gets triggered when a CPU is added. | 5191 | * migration_call - callback that gets triggered when a CPU is added. |
| 4730 | * Here we can start up the necessary migration thread for the new CPU. | 5192 | * Here we can start up the necessary migration thread for the new CPU. |
| 4731 | */ | 5193 | */ |
| 4732 | static int migration_call(struct notifier_block *nfb, unsigned long action, | 5194 | static int __cpuinit |
| 4733 | void *hcpu) | 5195 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
| 4734 | { | 5196 | { |
| 4735 | int cpu = (long)hcpu; | ||
| 4736 | struct task_struct *p; | 5197 | struct task_struct *p; |
| 4737 | struct runqueue *rq; | 5198 | int cpu = (long)hcpu; |
| 4738 | unsigned long flags; | 5199 | unsigned long flags; |
| 5200 | struct rq *rq; | ||
| 4739 | 5201 | ||
| 4740 | switch (action) { | 5202 | switch (action) { |
| 4741 | case CPU_UP_PREPARE: | 5203 | case CPU_UP_PREPARE: |
| @@ -4750,18 +5212,23 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
| 4750 | task_rq_unlock(rq, &flags); | 5212 | task_rq_unlock(rq, &flags); |
| 4751 | cpu_rq(cpu)->migration_thread = p; | 5213 | cpu_rq(cpu)->migration_thread = p; |
| 4752 | break; | 5214 | break; |
| 5215 | |||
| 4753 | case CPU_ONLINE: | 5216 | case CPU_ONLINE: |
| 4754 | /* Strictly unneccessary, as first user will wake it. */ | 5217 | /* Strictly unneccessary, as first user will wake it. */ |
| 4755 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5218 | wake_up_process(cpu_rq(cpu)->migration_thread); |
| 4756 | break; | 5219 | break; |
| 5220 | |||
| 4757 | #ifdef CONFIG_HOTPLUG_CPU | 5221 | #ifdef CONFIG_HOTPLUG_CPU |
| 4758 | case CPU_UP_CANCELED: | 5222 | case CPU_UP_CANCELED: |
| 5223 | if (!cpu_rq(cpu)->migration_thread) | ||
| 5224 | break; | ||
| 4759 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 5225 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
| 4760 | kthread_bind(cpu_rq(cpu)->migration_thread, | 5226 | kthread_bind(cpu_rq(cpu)->migration_thread, |
| 4761 | any_online_cpu(cpu_online_map)); | 5227 | any_online_cpu(cpu_online_map)); |
| 4762 | kthread_stop(cpu_rq(cpu)->migration_thread); | 5228 | kthread_stop(cpu_rq(cpu)->migration_thread); |
| 4763 | cpu_rq(cpu)->migration_thread = NULL; | 5229 | cpu_rq(cpu)->migration_thread = NULL; |
| 4764 | break; | 5230 | break; |
| 5231 | |||
| 4765 | case CPU_DEAD: | 5232 | case CPU_DEAD: |
| 4766 | migrate_live_tasks(cpu); | 5233 | migrate_live_tasks(cpu); |
| 4767 | rq = cpu_rq(cpu); | 5234 | rq = cpu_rq(cpu); |
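The CPU_UP_CANCELED case in the hunk above now bails out early when no migration thread was ever created, which happens when the CPU_UP_PREPARE step failed; without the check, kthread_bind()/kthread_stop() would be handed a NULL pointer. The sketch below models the same guard with a hypothetical per-CPU worker slot; none of these names are kernel APIs.

#include <stdio.h>
#include <stdlib.h>

struct worker { int cpu; };

static struct worker *per_cpu_worker[4];        /* hypothetical per-CPU slot */

static int up_prepare(int cpu)
{
        per_cpu_worker[cpu] = malloc(sizeof(struct worker));
        if (!per_cpu_worker[cpu])
                return -1;              /* like returning NOTIFY_BAD */
        per_cpu_worker[cpu]->cpu = cpu;
        return 0;
}

static void up_canceled(int cpu)
{
        /* UP_PREPARE may have failed before creating anything */
        if (!per_cpu_worker[cpu])
                return;
        free(per_cpu_worker[cpu]);
        per_cpu_worker[cpu] = NULL;
}

int main(void)
{
        if (up_prepare(1))
                up_canceled(1);         /* safe even though nothing exists */
        up_canceled(1);                 /* normal teardown path */
        return 0;
}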
| @@ -4782,9 +5249,10 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
| 4782 | * the requestors. */ | 5249 | * the requestors. */ |
| 4783 | spin_lock_irq(&rq->lock); | 5250 | spin_lock_irq(&rq->lock); |
| 4784 | while (!list_empty(&rq->migration_queue)) { | 5251 | while (!list_empty(&rq->migration_queue)) { |
| 4785 | migration_req_t *req; | 5252 | struct migration_req *req; |
| 5253 | |||
| 4786 | req = list_entry(rq->migration_queue.next, | 5254 | req = list_entry(rq->migration_queue.next, |
| 4787 | migration_req_t, list); | 5255 | struct migration_req, list); |
| 4788 | list_del_init(&req->list); | 5256 | list_del_init(&req->list); |
| 4789 | complete(&req->done); | 5257 | complete(&req->done); |
| 4790 | } | 5258 | } |
| @@ -4798,7 +5266,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
| 4798 | /* Register at highest priority so that task migration (migrate_all_tasks) | 5266 | /* Register at highest priority so that task migration (migrate_all_tasks) |
| 4799 | * happens before everything else. | 5267 | * happens before everything else. |
| 4800 | */ | 5268 | */ |
| 4801 | static struct notifier_block migration_notifier = { | 5269 | static struct notifier_block __cpuinitdata migration_notifier = { |
| 4802 | .notifier_call = migration_call, | 5270 | .notifier_call = migration_call, |
| 4803 | .priority = 10 | 5271 | .priority = 10 |
| 4804 | }; | 5272 | }; |
| @@ -4806,10 +5274,14 @@ static struct notifier_block migration_notifier = { | |||
| 4806 | int __init migration_init(void) | 5274 | int __init migration_init(void) |
| 4807 | { | 5275 | { |
| 4808 | void *cpu = (void *)(long)smp_processor_id(); | 5276 | void *cpu = (void *)(long)smp_processor_id(); |
| 4809 | /* Start one for boot CPU. */ | 5277 | int err; |
| 4810 | migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 5278 | |
| 5279 | /* Start one for the boot CPU: */ | ||
| 5280 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | ||
| 5281 | BUG_ON(err == NOTIFY_BAD); | ||
| 4811 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 5282 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
| 4812 | register_cpu_notifier(&migration_notifier); | 5283 | register_cpu_notifier(&migration_notifier); |
| 5284 | |||
| 4813 | return 0; | 5285 | return 0; |
| 4814 | } | 5286 | } |
| 4815 | #endif | 5287 | #endif |
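migration_init() now captures the notifier's return value for the boot CPU and BUG_ON()s if it is NOTIFY_BAD, presumably because a boot CPU without a migration thread is not a state worth continuing from. A tiny illustration of the convention, with a stand-in callback; the NOTIFY_* values are given here only for the sketch (see include/linux/notifier.h for the authoritative definitions).

#include <assert.h>
#include <stdio.h>

#define NOTIFY_OK       0x0001          /* illustrative values */
#define NOTIFY_BAD      0x8002

static int fake_up_prepare(int cpu)
{
        (void)cpu;
        /* a real callback would return NOTIFY_BAD if it could not
         * allocate its per-CPU resources */
        return NOTIFY_OK;
}

int main(void)
{
        int err = fake_up_prepare(0);

        /* the boot CPU must come up: treat failure as fatal, like BUG_ON() */
        assert(err != NOTIFY_BAD);
        printf("boot CPU prepared\n");
        return 0;
}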
| @@ -4905,7 +5377,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 4905 | } while (sd); | 5377 | } while (sd); |
| 4906 | } | 5378 | } |
| 4907 | #else | 5379 | #else |
| 4908 | #define sched_domain_debug(sd, cpu) {} | 5380 | # define sched_domain_debug(sd, cpu) do { } while (0) |
| 4909 | #endif | 5381 | #endif |
| 4910 | 5382 | ||
| 4911 | static int sd_degenerate(struct sched_domain *sd) | 5383 | static int sd_degenerate(struct sched_domain *sd) |
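The empty sched_domain_debug() stub switches from {} to do { } while (0), the standard idiom for statement-like macros: the expansion consumes the trailing semicolon and remains a single statement, so it cannot break an unbraced if/else around a call site. A self-contained example of the difference:

#include <stdio.h>

/* broken form: expands to a compound statement plus a stray ';' */
#define DEBUG_BAD(x)    {}

/* safe form: behaves like one statement and needs the ';' */
#define DEBUG_OK(x)     do { } while (0)

int main(void)
{
        int debug = 0;

        /*
         * With DEBUG_BAD this would expand to "if (debug) {} ; else ..."
         * and fail to compile: the ';' ends the if, orphaning the else.
         */
        if (debug)
                DEBUG_OK(debug);
        else
                printf("debugging off\n");

        return 0;
}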
| @@ -4931,8 +5403,8 @@ static int sd_degenerate(struct sched_domain *sd) | |||
| 4931 | return 1; | 5403 | return 1; |
| 4932 | } | 5404 | } |
| 4933 | 5405 | ||
| 4934 | static int sd_parent_degenerate(struct sched_domain *sd, | 5406 | static int |
| 4935 | struct sched_domain *parent) | 5407 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) |
| 4936 | { | 5408 | { |
| 4937 | unsigned long cflags = sd->flags, pflags = parent->flags; | 5409 | unsigned long cflags = sd->flags, pflags = parent->flags; |
| 4938 | 5410 | ||
| @@ -4965,7 +5437,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, | |||
| 4965 | */ | 5437 | */ |
| 4966 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 5438 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) |
| 4967 | { | 5439 | { |
| 4968 | runqueue_t *rq = cpu_rq(cpu); | 5440 | struct rq *rq = cpu_rq(cpu); |
| 4969 | struct sched_domain *tmp; | 5441 | struct sched_domain *tmp; |
| 4970 | 5442 | ||
| 4971 | /* Remove the sched domains which do not contribute to scheduling. */ | 5443 | /* Remove the sched domains which do not contribute to scheduling. */ |
| @@ -5227,8 +5699,8 @@ static void touch_cache(void *__cache, unsigned long __size) | |||
| 5227 | /* | 5699 | /* |
| 5228 | * Measure the cache-cost of one task migration. Returns in units of nsec. | 5700 | * Measure the cache-cost of one task migration. Returns in units of nsec. |
| 5229 | */ | 5701 | */ |
| 5230 | static unsigned long long measure_one(void *cache, unsigned long size, | 5702 | static unsigned long long |
| 5231 | int source, int target) | 5703 | measure_one(void *cache, unsigned long size, int source, int target) |
| 5232 | { | 5704 | { |
| 5233 | cpumask_t mask, saved_mask; | 5705 | cpumask_t mask, saved_mask; |
| 5234 | unsigned long long t0, t1, t2, t3, cost; | 5706 | unsigned long long t0, t1, t2, t3, cost; |
| @@ -5380,7 +5852,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
| 5380 | cache = vmalloc(max_size); | 5852 | cache = vmalloc(max_size); |
| 5381 | if (!cache) { | 5853 | if (!cache) { |
| 5382 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | 5854 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); |
| 5383 | return 1000000; // return 1 msec on very small boxen | 5855 | return 1000000; /* return 1 msec on very small boxen */ |
| 5384 | } | 5856 | } |
| 5385 | 5857 | ||
| 5386 | while (size <= max_size) { | 5858 | while (size <= max_size) { |
| @@ -5578,9 +6050,9 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
| 5578 | */ | 6050 | */ |
| 5579 | static cpumask_t sched_domain_node_span(int node) | 6051 | static cpumask_t sched_domain_node_span(int node) |
| 5580 | { | 6052 | { |
| 5581 | int i; | ||
| 5582 | cpumask_t span, nodemask; | ||
| 5583 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 6053 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); |
| 6054 | cpumask_t span, nodemask; | ||
| 6055 | int i; | ||
| 5584 | 6056 | ||
| 5585 | cpus_clear(span); | 6057 | cpus_clear(span); |
| 5586 | bitmap_zero(used_nodes, MAX_NUMNODES); | 6058 | bitmap_zero(used_nodes, MAX_NUMNODES); |
| @@ -5591,6 +6063,7 @@ static cpumask_t sched_domain_node_span(int node) | |||
| 5591 | 6063 | ||
| 5592 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6064 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
| 5593 | int next_node = find_next_best_node(node, used_nodes); | 6065 | int next_node = find_next_best_node(node, used_nodes); |
| 6066 | |||
| 5594 | nodemask = node_to_cpumask(next_node); | 6067 | nodemask = node_to_cpumask(next_node); |
| 5595 | cpus_or(span, span, nodemask); | 6068 | cpus_or(span, span, nodemask); |
| 5596 | } | 6069 | } |
| @@ -5599,22 +6072,27 @@ static cpumask_t sched_domain_node_span(int node) | |||
| 5599 | } | 6072 | } |
| 5600 | #endif | 6073 | #endif |
| 5601 | 6074 | ||
| 6075 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
| 6076 | |||
| 5602 | /* | 6077 | /* |
| 5603 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | 6078 | * SMT sched-domains: |
| 5604 | * can switch it on easily if needed. | ||
| 5605 | */ | 6079 | */ |
| 5606 | #ifdef CONFIG_SCHED_SMT | 6080 | #ifdef CONFIG_SCHED_SMT |
| 5607 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6081 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
| 5608 | static struct sched_group sched_group_cpus[NR_CPUS]; | 6082 | static struct sched_group sched_group_cpus[NR_CPUS]; |
| 6083 | |||
| 5609 | static int cpu_to_cpu_group(int cpu) | 6084 | static int cpu_to_cpu_group(int cpu) |
| 5610 | { | 6085 | { |
| 5611 | return cpu; | 6086 | return cpu; |
| 5612 | } | 6087 | } |
| 5613 | #endif | 6088 | #endif |
| 5614 | 6089 | ||
| 6090 | /* | ||
| 6091 | * multi-core sched-domains: | ||
| 6092 | */ | ||
| 5615 | #ifdef CONFIG_SCHED_MC | 6093 | #ifdef CONFIG_SCHED_MC |
| 5616 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6094 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
| 5617 | static struct sched_group sched_group_core[NR_CPUS]; | 6095 | static struct sched_group *sched_group_core_bycpu[NR_CPUS]; |
| 5618 | #endif | 6096 | #endif |
| 5619 | 6097 | ||
| 5620 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6098 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
| @@ -5630,10 +6108,11 @@ static int cpu_to_core_group(int cpu) | |||
| 5630 | #endif | 6108 | #endif |
| 5631 | 6109 | ||
| 5632 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6110 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
| 5633 | static struct sched_group sched_group_phys[NR_CPUS]; | 6111 | static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; |
| 6112 | |||
| 5634 | static int cpu_to_phys_group(int cpu) | 6113 | static int cpu_to_phys_group(int cpu) |
| 5635 | { | 6114 | { |
| 5636 | #if defined(CONFIG_SCHED_MC) | 6115 | #ifdef CONFIG_SCHED_MC |
| 5637 | cpumask_t mask = cpu_coregroup_map(cpu); | 6116 | cpumask_t mask = cpu_coregroup_map(cpu); |
| 5638 | return first_cpu(mask); | 6117 | return first_cpu(mask); |
| 5639 | #elif defined(CONFIG_SCHED_SMT) | 6118 | #elif defined(CONFIG_SCHED_SMT) |
| @@ -5687,13 +6166,74 @@ next_sg: | |||
| 5687 | } | 6166 | } |
| 5688 | #endif | 6167 | #endif |
| 5689 | 6168 | ||
| 6169 | /* Free memory allocated for various sched_group structures */ | ||
| 6170 | static void free_sched_groups(const cpumask_t *cpu_map) | ||
| 6171 | { | ||
| 6172 | int cpu; | ||
| 6173 | #ifdef CONFIG_NUMA | ||
| 6174 | int i; | ||
| 6175 | |||
| 6176 | for_each_cpu_mask(cpu, *cpu_map) { | ||
| 6177 | struct sched_group *sched_group_allnodes | ||
| 6178 | = sched_group_allnodes_bycpu[cpu]; | ||
| 6179 | struct sched_group **sched_group_nodes | ||
| 6180 | = sched_group_nodes_bycpu[cpu]; | ||
| 6181 | |||
| 6182 | if (sched_group_allnodes) { | ||
| 6183 | kfree(sched_group_allnodes); | ||
| 6184 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
| 6185 | } | ||
| 6186 | |||
| 6187 | if (!sched_group_nodes) | ||
| 6188 | continue; | ||
| 6189 | |||
| 6190 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
| 6191 | cpumask_t nodemask = node_to_cpumask(i); | ||
| 6192 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
| 6193 | |||
| 6194 | cpus_and(nodemask, nodemask, *cpu_map); | ||
| 6195 | if (cpus_empty(nodemask)) | ||
| 6196 | continue; | ||
| 6197 | |||
| 6198 | if (sg == NULL) | ||
| 6199 | continue; | ||
| 6200 | sg = sg->next; | ||
| 6201 | next_sg: | ||
| 6202 | oldsg = sg; | ||
| 6203 | sg = sg->next; | ||
| 6204 | kfree(oldsg); | ||
| 6205 | if (oldsg != sched_group_nodes[i]) | ||
| 6206 | goto next_sg; | ||
| 6207 | } | ||
| 6208 | kfree(sched_group_nodes); | ||
| 6209 | sched_group_nodes_bycpu[cpu] = NULL; | ||
| 6210 | } | ||
| 6211 | #endif | ||
| 6212 | for_each_cpu_mask(cpu, *cpu_map) { | ||
| 6213 | if (sched_group_phys_bycpu[cpu]) { | ||
| 6214 | kfree(sched_group_phys_bycpu[cpu]); | ||
| 6215 | sched_group_phys_bycpu[cpu] = NULL; | ||
| 6216 | } | ||
| 6217 | #ifdef CONFIG_SCHED_MC | ||
| 6218 | if (sched_group_core_bycpu[cpu]) { | ||
| 6219 | kfree(sched_group_core_bycpu[cpu]); | ||
| 6220 | sched_group_core_bycpu[cpu] = NULL; | ||
| 6221 | } | ||
| 6222 | #endif | ||
| 6223 | } | ||
| 6224 | } | ||
| 6225 | |||
| 5690 | /* | 6226 | /* |
| 5691 | * Build sched domains for a given set of cpus and attach the sched domains | 6227 | * Build sched domains for a given set of cpus and attach the sched domains |
| 5692 | * to the individual cpus | 6228 | * to the individual cpus |
| 5693 | */ | 6229 | */ |
| 5694 | void build_sched_domains(const cpumask_t *cpu_map) | 6230 | static int build_sched_domains(const cpumask_t *cpu_map) |
| 5695 | { | 6231 | { |
| 5696 | int i; | 6232 | int i; |
| 6233 | struct sched_group *sched_group_phys = NULL; | ||
| 6234 | #ifdef CONFIG_SCHED_MC | ||
| 6235 | struct sched_group *sched_group_core = NULL; | ||
| 6236 | #endif | ||
| 5697 | #ifdef CONFIG_NUMA | 6237 | #ifdef CONFIG_NUMA |
| 5698 | struct sched_group **sched_group_nodes = NULL; | 6238 | struct sched_group **sched_group_nodes = NULL; |
| 5699 | struct sched_group *sched_group_allnodes = NULL; | 6239 | struct sched_group *sched_group_allnodes = NULL; |
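The free_sched_groups() helper added in this hunk centralizes the teardown that used to live in arch_destroy_sched_domains(): for every CPU in the map it frees the allnodes group array, walks each node's circular, singly linked sched_group list until it comes back around to the head, and then releases the per-CPU phys/core group arrays that build_sched_domains() now allocates on demand. A minimal userspace sketch of freeing such a ring; the group type and fields are simplified stand-ins.

#include <stdio.h>
#include <stdlib.h>

struct group {                  /* simplified stand-in for struct sched_group */
        struct group *next;     /* circular, singly linked */
        int id;
};

/* free every node of a circular list given its head */
static void free_group_ring(struct group *head)
{
        struct group *sg = head->next, *oldsg;

        while (sg != head) {    /* free everything after the head ... */
                oldsg = sg;
                sg = sg->next;
                free(oldsg);
        }
        free(head);             /* ... then the head itself */
}

int main(void)
{
        struct group *a = malloc(sizeof(*a));
        struct group *b = malloc(sizeof(*b));

        if (!a || !b)
                return 1;
        a->id = 0;
        b->id = 1;
        a->next = b;            /* a -> b -> a */
        b->next = a;

        free_group_ring(a);
        printf("ring released\n");
        return 0;
}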
| @@ -5701,11 +6241,11 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5701 | /* | 6241 | /* |
| 5702 | * Allocate the per-node list of sched groups | 6242 | * Allocate the per-node list of sched groups |
| 5703 | */ | 6243 | */ |
| 5704 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 6244 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, |
| 5705 | GFP_ATOMIC); | 6245 | GFP_KERNEL); |
| 5706 | if (!sched_group_nodes) { | 6246 | if (!sched_group_nodes) { |
| 5707 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6247 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
| 5708 | return; | 6248 | return -ENOMEM; |
| 5709 | } | 6249 | } |
| 5710 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6250 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
| 5711 | #endif | 6251 | #endif |
| @@ -5731,7 +6271,7 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5731 | if (!sched_group_allnodes) { | 6271 | if (!sched_group_allnodes) { |
| 5732 | printk(KERN_WARNING | 6272 | printk(KERN_WARNING |
| 5733 | "Can not alloc allnodes sched group\n"); | 6273 | "Can not alloc allnodes sched group\n"); |
| 5734 | break; | 6274 | goto error; |
| 5735 | } | 6275 | } |
| 5736 | sched_group_allnodes_bycpu[i] | 6276 | sched_group_allnodes_bycpu[i] |
| 5737 | = sched_group_allnodes; | 6277 | = sched_group_allnodes; |
| @@ -5752,6 +6292,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5752 | cpus_and(sd->span, sd->span, *cpu_map); | 6292 | cpus_and(sd->span, sd->span, *cpu_map); |
| 5753 | #endif | 6293 | #endif |
| 5754 | 6294 | ||
| 6295 | if (!sched_group_phys) { | ||
| 6296 | sched_group_phys | ||
| 6297 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
| 6298 | GFP_KERNEL); | ||
| 6299 | if (!sched_group_phys) { | ||
| 6300 | printk (KERN_WARNING "Can not alloc phys sched" | ||
| 6301 | "group\n"); | ||
| 6302 | goto error; | ||
| 6303 | } | ||
| 6304 | sched_group_phys_bycpu[i] = sched_group_phys; | ||
| 6305 | } | ||
| 6306 | |||
| 5755 | p = sd; | 6307 | p = sd; |
| 5756 | sd = &per_cpu(phys_domains, i); | 6308 | sd = &per_cpu(phys_domains, i); |
| 5757 | group = cpu_to_phys_group(i); | 6309 | group = cpu_to_phys_group(i); |
| @@ -5761,6 +6313,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5761 | sd->groups = &sched_group_phys[group]; | 6313 | sd->groups = &sched_group_phys[group]; |
| 5762 | 6314 | ||
| 5763 | #ifdef CONFIG_SCHED_MC | 6315 | #ifdef CONFIG_SCHED_MC |
| 6316 | if (!sched_group_core) { | ||
| 6317 | sched_group_core | ||
| 6318 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
| 6319 | GFP_KERNEL); | ||
| 6320 | if (!sched_group_core) { | ||
| 6321 | printk (KERN_WARNING "Can not alloc core sched" | ||
| 6322 | "group\n"); | ||
| 6323 | goto error; | ||
| 6324 | } | ||
| 6325 | sched_group_core_bycpu[i] = sched_group_core; | ||
| 6326 | } | ||
| 6327 | |||
| 5764 | p = sd; | 6328 | p = sd; |
| 5765 | sd = &per_cpu(core_domains, i); | 6329 | sd = &per_cpu(core_domains, i); |
| 5766 | group = cpu_to_core_group(i); | 6330 | group = cpu_to_core_group(i); |
| @@ -5844,24 +6408,21 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5844 | domainspan = sched_domain_node_span(i); | 6408 | domainspan = sched_domain_node_span(i); |
| 5845 | cpus_and(domainspan, domainspan, *cpu_map); | 6409 | cpus_and(domainspan, domainspan, *cpu_map); |
| 5846 | 6410 | ||
| 5847 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6411 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
| 6412 | if (!sg) { | ||
| 6413 | printk(KERN_WARNING "Can not alloc domain group for " | ||
| 6414 | "node %d\n", i); | ||
| 6415 | goto error; | ||
| 6416 | } | ||
| 5848 | sched_group_nodes[i] = sg; | 6417 | sched_group_nodes[i] = sg; |
| 5849 | for_each_cpu_mask(j, nodemask) { | 6418 | for_each_cpu_mask(j, nodemask) { |
| 5850 | struct sched_domain *sd; | 6419 | struct sched_domain *sd; |
| 5851 | sd = &per_cpu(node_domains, j); | 6420 | sd = &per_cpu(node_domains, j); |
| 5852 | sd->groups = sg; | 6421 | sd->groups = sg; |
| 5853 | if (sd->groups == NULL) { | ||
| 5854 | /* Turn off balancing if we have no groups */ | ||
| 5855 | sd->flags = 0; | ||
| 5856 | } | ||
| 5857 | } | ||
| 5858 | if (!sg) { | ||
| 5859 | printk(KERN_WARNING | ||
| 5860 | "Can not alloc domain group for node %d\n", i); | ||
| 5861 | continue; | ||
| 5862 | } | 6422 | } |
| 5863 | sg->cpu_power = 0; | 6423 | sg->cpu_power = 0; |
| 5864 | sg->cpumask = nodemask; | 6424 | sg->cpumask = nodemask; |
| 6425 | sg->next = sg; | ||
| 5865 | cpus_or(covered, covered, nodemask); | 6426 | cpus_or(covered, covered, nodemask); |
| 5866 | prev = sg; | 6427 | prev = sg; |
| 5867 | 6428 | ||
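Two details in the hunk above are worth noting: the per-node group header is now allocated with kmalloc_node() so it lives on the node it describes, and allocation failure aborts domain construction instead of leaving a NULL sd->groups behind; and sg->next = sg makes the list circular from the moment the head exists, so the later splice (sg->next = prev->next; prev->next = sg) keeps it closed without the old end-of-loop "prev->next = sched_group_nodes[i]" fix-up. A short userspace sketch of growing a circular list that way; the types are simplified.

#include <stdio.h>
#include <stdlib.h>

struct group {
        struct group *next;
        int id;
};

static struct group *new_group(int id)
{
        struct group *sg = malloc(sizeof(*sg));

        if (!sg)
                exit(1);
        sg->id = id;
        sg->next = sg;          /* a single-entry list is already circular */
        return sg;
}

/* insert sg right after prev, preserving circularity */
static void splice_after(struct group *prev, struct group *sg)
{
        sg->next = prev->next;
        prev->next = sg;
}

int main(void)
{
        struct group *head = new_group(0), *prev = head, *sg;
        int i;

        for (i = 1; i < 4; i++) {
                sg = new_group(i);
                splice_after(prev, sg);
                prev = sg;
        }

        /* walk once around the ring: prints 0 1 2 3 */
        sg = head;
        do {
                printf("%d ", sg->id);
                sg = sg->next;
        } while (sg != head);
        printf("\n");
        return 0;
}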
| @@ -5880,54 +6441,90 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5880 | if (cpus_empty(tmp)) | 6441 | if (cpus_empty(tmp)) |
| 5881 | continue; | 6442 | continue; |
| 5882 | 6443 | ||
| 5883 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6444 | sg = kmalloc_node(sizeof(struct sched_group), |
| 6445 | GFP_KERNEL, i); | ||
| 5884 | if (!sg) { | 6446 | if (!sg) { |
| 5885 | printk(KERN_WARNING | 6447 | printk(KERN_WARNING |
| 5886 | "Can not alloc domain group for node %d\n", j); | 6448 | "Can not alloc domain group for node %d\n", j); |
| 5887 | break; | 6449 | goto error; |
| 5888 | } | 6450 | } |
| 5889 | sg->cpu_power = 0; | 6451 | sg->cpu_power = 0; |
| 5890 | sg->cpumask = tmp; | 6452 | sg->cpumask = tmp; |
| 6453 | sg->next = prev->next; | ||
| 5891 | cpus_or(covered, covered, tmp); | 6454 | cpus_or(covered, covered, tmp); |
| 5892 | prev->next = sg; | 6455 | prev->next = sg; |
| 5893 | prev = sg; | 6456 | prev = sg; |
| 5894 | } | 6457 | } |
| 5895 | prev->next = sched_group_nodes[i]; | ||
| 5896 | } | 6458 | } |
| 5897 | #endif | 6459 | #endif |
| 5898 | 6460 | ||
| 5899 | /* Calculate CPU power for physical packages and nodes */ | 6461 | /* Calculate CPU power for physical packages and nodes */ |
| 6462 | #ifdef CONFIG_SCHED_SMT | ||
| 5900 | for_each_cpu_mask(i, *cpu_map) { | 6463 | for_each_cpu_mask(i, *cpu_map) { |
| 5901 | int power; | ||
| 5902 | struct sched_domain *sd; | 6464 | struct sched_domain *sd; |
| 5903 | #ifdef CONFIG_SCHED_SMT | ||
| 5904 | sd = &per_cpu(cpu_domains, i); | 6465 | sd = &per_cpu(cpu_domains, i); |
| 5905 | power = SCHED_LOAD_SCALE; | 6466 | sd->groups->cpu_power = SCHED_LOAD_SCALE; |
| 5906 | sd->groups->cpu_power = power; | 6467 | } |
| 5907 | #endif | 6468 | #endif |
| 5908 | #ifdef CONFIG_SCHED_MC | 6469 | #ifdef CONFIG_SCHED_MC |
| 6470 | for_each_cpu_mask(i, *cpu_map) { | ||
| 6471 | int power; | ||
| 6472 | struct sched_domain *sd; | ||
| 5909 | sd = &per_cpu(core_domains, i); | 6473 | sd = &per_cpu(core_domains, i); |
| 5910 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | 6474 | if (sched_smt_power_savings) |
| 6475 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); | ||
| 6476 | else | ||
| 6477 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | ||
| 5911 | * SCHED_LOAD_SCALE / 10; | 6478 | * SCHED_LOAD_SCALE / 10; |
| 5912 | sd->groups->cpu_power = power; | 6479 | sd->groups->cpu_power = power; |
| 6480 | } | ||
| 6481 | #endif | ||
| 5913 | 6482 | ||
| 6483 | for_each_cpu_mask(i, *cpu_map) { | ||
| 6484 | struct sched_domain *sd; | ||
| 6485 | #ifdef CONFIG_SCHED_MC | ||
| 5914 | sd = &per_cpu(phys_domains, i); | 6486 | sd = &per_cpu(phys_domains, i); |
| 6487 | if (i != first_cpu(sd->groups->cpumask)) | ||
| 6488 | continue; | ||
| 5915 | 6489 | ||
| 5916 | /* | 6490 | sd->groups->cpu_power = 0; |
| 5917 | * This has to be < 2 * SCHED_LOAD_SCALE | 6491 | if (sched_mc_power_savings || sched_smt_power_savings) { |
| 5918 | * Lets keep it SCHED_LOAD_SCALE, so that | 6492 | int j; |
| 5919 | * while calculating NUMA group's cpu_power | 6493 | |
| 5920 | * we can simply do | 6494 | for_each_cpu_mask(j, sd->groups->cpumask) { |
| 5921 | * numa_group->cpu_power += phys_group->cpu_power; | 6495 | struct sched_domain *sd1; |
| 5922 | * | 6496 | sd1 = &per_cpu(core_domains, j); |
| 5923 | * See "only add power once for each physical pkg" | 6497 | /* |
| 5924 | * comment below | 6498 | * for each core we will add once |
| 5925 | */ | 6499 | * to the group in physical domain |
| 5926 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | 6500 | */ |
| 6501 | if (j != first_cpu(sd1->groups->cpumask)) | ||
| 6502 | continue; | ||
| 6503 | |||
| 6504 | if (sched_smt_power_savings) | ||
| 6505 | sd->groups->cpu_power += sd1->groups->cpu_power; | ||
| 6506 | else | ||
| 6507 | sd->groups->cpu_power += SCHED_LOAD_SCALE; | ||
| 6508 | } | ||
| 6509 | } else | ||
| 6510 | /* | ||
| 6511 | * This has to be < 2 * SCHED_LOAD_SCALE | ||
| 6512 | * Lets keep it SCHED_LOAD_SCALE, so that | ||
| 6513 | * while calculating NUMA group's cpu_power | ||
| 6514 | * we can simply do | ||
| 6515 | * numa_group->cpu_power += phys_group->cpu_power; | ||
| 6516 | * | ||
| 6517 | * See "only add power once for each physical pkg" | ||
| 6518 | * comment below | ||
| 6519 | */ | ||
| 6520 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | ||
| 5927 | #else | 6521 | #else |
| 6522 | int power; | ||
| 5928 | sd = &per_cpu(phys_domains, i); | 6523 | sd = &per_cpu(phys_domains, i); |
| 5929 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 6524 | if (sched_smt_power_savings) |
| 5930 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 6525 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); |
| 6526 | else | ||
| 6527 | power = SCHED_LOAD_SCALE; | ||
| 5931 | sd->groups->cpu_power = power; | 6528 | sd->groups->cpu_power = power; |
| 5932 | #endif | 6529 | #endif |
| 5933 | } | 6530 | } |
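The cpu_power initialisation is now split per domain level and driven by the new sched_smt_power_savings / sched_mc_power_savings knobs: in the default (performance) mode extra SMT siblings add only about 10% each, while in power-savings mode each sibling counts as a full SCHED_LOAD_SCALE so the balancer concentrates load and lets whole cores or packages idle; with MC and savings enabled, a physical package accumulates the power of its cores. A worked example with the scale value used in this era (128) and two siblings; the numbers are just to show the arithmetic.

#include <stdio.h>

#define SCHED_LOAD_SCALE 128    /* the O(1)-scheduler-era scale value */

int main(void)
{
        int siblings = 2;       /* e.g. one core with two SMT threads */

        /* default: siblings add only ~10% each, so the balancer still
         * prefers spreading load across cores */
        int perf = SCHED_LOAD_SCALE + (siblings - 1) * SCHED_LOAD_SCALE / 10;

        /* power savings: count full capacity per sibling, so one core
         * looks able to absorb the whole load and the others can idle */
        int save = SCHED_LOAD_SCALE * siblings;

        printf("performance: %d, power-savings: %d\n", perf, save); /* 140, 256 */
        return 0;
}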
| @@ -5936,7 +6533,12 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5936 | for (i = 0; i < MAX_NUMNODES; i++) | 6533 | for (i = 0; i < MAX_NUMNODES; i++) |
| 5937 | init_numa_sched_groups_power(sched_group_nodes[i]); | 6534 | init_numa_sched_groups_power(sched_group_nodes[i]); |
| 5938 | 6535 | ||
| 5939 | init_numa_sched_groups_power(sched_group_allnodes); | 6536 | if (sched_group_allnodes) { |
| 6537 | int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); | ||
| 6538 | struct sched_group *sg = &sched_group_allnodes[group]; | ||
| 6539 | |||
| 6540 | init_numa_sched_groups_power(sg); | ||
| 6541 | } | ||
| 5940 | #endif | 6542 | #endif |
| 5941 | 6543 | ||
| 5942 | /* Attach the domains */ | 6544 | /* Attach the domains */ |
| @@ -5955,13 +6557,20 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5955 | * Tune cache-hot values: | 6557 | * Tune cache-hot values: |
| 5956 | */ | 6558 | */ |
| 5957 | calibrate_migration_costs(cpu_map); | 6559 | calibrate_migration_costs(cpu_map); |
| 6560 | |||
| 6561 | return 0; | ||
| 6562 | |||
| 6563 | error: | ||
| 6564 | free_sched_groups(cpu_map); | ||
| 6565 | return -ENOMEM; | ||
| 5958 | } | 6566 | } |
| 5959 | /* | 6567 | /* |
| 5960 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6568 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
| 5961 | */ | 6569 | */ |
| 5962 | static void arch_init_sched_domains(const cpumask_t *cpu_map) | 6570 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
| 5963 | { | 6571 | { |
| 5964 | cpumask_t cpu_default_map; | 6572 | cpumask_t cpu_default_map; |
| 6573 | int err; | ||
| 5965 | 6574 | ||
| 5966 | /* | 6575 | /* |
| 5967 | * Setup mask for cpus without special case scheduling requirements. | 6576 | * Setup mask for cpus without special case scheduling requirements. |
| @@ -5970,51 +6579,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map) | |||
| 5970 | */ | 6579 | */ |
| 5971 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | 6580 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); |
| 5972 | 6581 | ||
| 5973 | build_sched_domains(&cpu_default_map); | 6582 | err = build_sched_domains(&cpu_default_map); |
| 6583 | |||
| 6584 | return err; | ||
| 5974 | } | 6585 | } |
| 5975 | 6586 | ||
| 5976 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6587 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
| 5977 | { | 6588 | { |
| 5978 | #ifdef CONFIG_NUMA | 6589 | free_sched_groups(cpu_map); |
| 5979 | int i; | ||
| 5980 | int cpu; | ||
| 5981 | |||
| 5982 | for_each_cpu_mask(cpu, *cpu_map) { | ||
| 5983 | struct sched_group *sched_group_allnodes | ||
| 5984 | = sched_group_allnodes_bycpu[cpu]; | ||
| 5985 | struct sched_group **sched_group_nodes | ||
| 5986 | = sched_group_nodes_bycpu[cpu]; | ||
| 5987 | |||
| 5988 | if (sched_group_allnodes) { | ||
| 5989 | kfree(sched_group_allnodes); | ||
| 5990 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
| 5991 | } | ||
| 5992 | |||
| 5993 | if (!sched_group_nodes) | ||
| 5994 | continue; | ||
| 5995 | |||
| 5996 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
| 5997 | cpumask_t nodemask = node_to_cpumask(i); | ||
| 5998 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
| 5999 | |||
| 6000 | cpus_and(nodemask, nodemask, *cpu_map); | ||
| 6001 | if (cpus_empty(nodemask)) | ||
| 6002 | continue; | ||
| 6003 | |||
| 6004 | if (sg == NULL) | ||
| 6005 | continue; | ||
| 6006 | sg = sg->next; | ||
| 6007 | next_sg: | ||
| 6008 | oldsg = sg; | ||
| 6009 | sg = sg->next; | ||
| 6010 | kfree(oldsg); | ||
| 6011 | if (oldsg != sched_group_nodes[i]) | ||
| 6012 | goto next_sg; | ||
| 6013 | } | ||
| 6014 | kfree(sched_group_nodes); | ||
| 6015 | sched_group_nodes_bycpu[cpu] = NULL; | ||
| 6016 | } | ||
| 6017 | #endif | ||
| 6018 | } | 6590 | } |
| 6019 | 6591 | ||
| 6020 | /* | 6592 | /* |
| @@ -6039,9 +6611,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
| 6039 | * correct sched domains | 6611 | * correct sched domains |
| 6040 | * Call with hotplug lock held | 6612 | * Call with hotplug lock held |
| 6041 | */ | 6613 | */ |
| 6042 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | 6614 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) |
| 6043 | { | 6615 | { |
| 6044 | cpumask_t change_map; | 6616 | cpumask_t change_map; |
| 6617 | int err = 0; | ||
| 6045 | 6618 | ||
| 6046 | cpus_and(*partition1, *partition1, cpu_online_map); | 6619 | cpus_and(*partition1, *partition1, cpu_online_map); |
| 6047 | cpus_and(*partition2, *partition2, cpu_online_map); | 6620 | cpus_and(*partition2, *partition2, cpu_online_map); |
| @@ -6050,10 +6623,89 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
| 6050 | /* Detach sched domains from all of the affected cpus */ | 6623 | /* Detach sched domains from all of the affected cpus */ |
| 6051 | detach_destroy_domains(&change_map); | 6624 | detach_destroy_domains(&change_map); |
| 6052 | if (!cpus_empty(*partition1)) | 6625 | if (!cpus_empty(*partition1)) |
| 6053 | build_sched_domains(partition1); | 6626 | err = build_sched_domains(partition1); |
| 6054 | if (!cpus_empty(*partition2)) | 6627 | if (!err && !cpus_empty(*partition2)) |
| 6055 | build_sched_domains(partition2); | 6628 | err = build_sched_domains(partition2); |
| 6629 | |||
| 6630 | return err; | ||
| 6631 | } | ||
| 6632 | |||
| 6633 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 6634 | int arch_reinit_sched_domains(void) | ||
| 6635 | { | ||
| 6636 | int err; | ||
| 6637 | |||
| 6638 | lock_cpu_hotplug(); | ||
| 6639 | detach_destroy_domains(&cpu_online_map); | ||
| 6640 | err = arch_init_sched_domains(&cpu_online_map); | ||
| 6641 | unlock_cpu_hotplug(); | ||
| 6642 | |||
| 6643 | return err; | ||
| 6644 | } | ||
| 6645 | |||
| 6646 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
| 6647 | { | ||
| 6648 | int ret; | ||
| 6649 | |||
| 6650 | if (buf[0] != '0' && buf[0] != '1') | ||
| 6651 | return -EINVAL; | ||
| 6652 | |||
| 6653 | if (smt) | ||
| 6654 | sched_smt_power_savings = (buf[0] == '1'); | ||
| 6655 | else | ||
| 6656 | sched_mc_power_savings = (buf[0] == '1'); | ||
| 6657 | |||
| 6658 | ret = arch_reinit_sched_domains(); | ||
| 6659 | |||
| 6660 | return ret ? ret : count; | ||
| 6661 | } | ||
| 6662 | |||
| 6663 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | ||
| 6664 | { | ||
| 6665 | int err = 0; | ||
| 6666 | |||
| 6667 | #ifdef CONFIG_SCHED_SMT | ||
| 6668 | if (smt_capable()) | ||
| 6669 | err = sysfs_create_file(&cls->kset.kobj, | ||
| 6670 | &attr_sched_smt_power_savings.attr); | ||
| 6671 | #endif | ||
| 6672 | #ifdef CONFIG_SCHED_MC | ||
| 6673 | if (!err && mc_capable()) | ||
| 6674 | err = sysfs_create_file(&cls->kset.kobj, | ||
| 6675 | &attr_sched_mc_power_savings.attr); | ||
| 6676 | #endif | ||
| 6677 | return err; | ||
| 6678 | } | ||
| 6679 | #endif | ||
| 6680 | |||
| 6681 | #ifdef CONFIG_SCHED_MC | ||
| 6682 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | ||
| 6683 | { | ||
| 6684 | return sprintf(page, "%u\n", sched_mc_power_savings); | ||
| 6685 | } | ||
| 6686 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, | ||
| 6687 | const char *buf, size_t count) | ||
| 6688 | { | ||
| 6689 | return sched_power_savings_store(buf, count, 0); | ||
| 6690 | } | ||
| 6691 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | ||
| 6692 | sched_mc_power_savings_store); | ||
| 6693 | #endif | ||
| 6694 | |||
| 6695 | #ifdef CONFIG_SCHED_SMT | ||
| 6696 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | ||
| 6697 | { | ||
| 6698 | return sprintf(page, "%u\n", sched_smt_power_savings); | ||
| 6699 | } | ||
| 6700 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, | ||
| 6701 | const char *buf, size_t count) | ||
| 6702 | { | ||
| 6703 | return sched_power_savings_store(buf, count, 1); | ||
| 6056 | } | 6704 | } |
| 6705 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | ||
| 6706 | sched_smt_power_savings_store); | ||
| 6707 | #endif | ||
| 6708 | |||
| 6057 | 6709 | ||
| 6058 | #ifdef CONFIG_HOTPLUG_CPU | 6710 | #ifdef CONFIG_HOTPLUG_CPU |
| 6059 | /* | 6711 | /* |
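The new sched_mc_power_savings / sched_smt_power_savings sysdev attributes give the policy a runtime switch: the store handler accepts only '0' or '1', flips the corresponding flag, and calls arch_reinit_sched_domains() to tear down and rebuild the domain tree under the hotplug lock, returning count on success so the write is consumed. Once an architecture wires them up via sched_create_sysfs_power_savings_entries(), they normally appear under /sys/devices/system/cpu/, so something like "echo 1 > /sys/devices/system/cpu/sched_mc_power_savings" reaches the handler as "1\n". Below is a userspace sketch of the store pattern only; the names are local stand-ins, not the kernel functions.

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int mc_power_savings;                    /* models the global flag */

static int reinit_domains(void)                 /* stand-in for the rebuild */
{
        return 0;
}

/* sysfs-style store: consume the whole write on success, else an errno */
static long power_savings_store(const char *buf, long count)
{
        int ret;

        if (buf[0] != '0' && buf[0] != '1')
                return -EINVAL;

        mc_power_savings = (buf[0] == '1');
        ret = reinit_domains();
        return ret ? ret : count;
}

int main(void)
{
        const char *input = "1\n";              /* what a shell echo delivers */
        long ret = power_savings_store(input, (long)strlen(input));

        printf("store returned %ld, flag=%d\n", ret, mc_power_savings);
        return 0;
}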
| @@ -6108,6 +6760,7 @@ int in_sched_functions(unsigned long addr) | |||
| 6108 | { | 6760 | { |
| 6109 | /* Linker adds these: start and end of __sched functions */ | 6761 | /* Linker adds these: start and end of __sched functions */ |
| 6110 | extern char __sched_text_start[], __sched_text_end[]; | 6762 | extern char __sched_text_start[], __sched_text_end[]; |
| 6763 | |||
| 6111 | return in_lock_functions(addr) || | 6764 | return in_lock_functions(addr) || |
| 6112 | (addr >= (unsigned long)__sched_text_start | 6765 | (addr >= (unsigned long)__sched_text_start |
| 6113 | && addr < (unsigned long)__sched_text_end); | 6766 | && addr < (unsigned long)__sched_text_end); |
| @@ -6115,14 +6768,15 @@ int in_sched_functions(unsigned long addr) | |||
| 6115 | 6768 | ||
| 6116 | void __init sched_init(void) | 6769 | void __init sched_init(void) |
| 6117 | { | 6770 | { |
| 6118 | runqueue_t *rq; | ||
| 6119 | int i, j, k; | 6771 | int i, j, k; |
| 6120 | 6772 | ||
| 6121 | for_each_possible_cpu(i) { | 6773 | for_each_possible_cpu(i) { |
| 6122 | prio_array_t *array; | 6774 | struct prio_array *array; |
| 6775 | struct rq *rq; | ||
| 6123 | 6776 | ||
| 6124 | rq = cpu_rq(i); | 6777 | rq = cpu_rq(i); |
| 6125 | spin_lock_init(&rq->lock); | 6778 | spin_lock_init(&rq->lock); |
| 6779 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | ||
| 6126 | rq->nr_running = 0; | 6780 | rq->nr_running = 0; |
| 6127 | rq->active = rq->arrays; | 6781 | rq->active = rq->arrays; |
| 6128 | rq->expired = rq->arrays + 1; | 6782 | rq->expired = rq->arrays + 1; |
| @@ -6134,9 +6788,9 @@ void __init sched_init(void) | |||
| 6134 | rq->cpu_load[j] = 0; | 6788 | rq->cpu_load[j] = 0; |
| 6135 | rq->active_balance = 0; | 6789 | rq->active_balance = 0; |
| 6136 | rq->push_cpu = 0; | 6790 | rq->push_cpu = 0; |
| 6791 | rq->cpu = i; | ||
| 6137 | rq->migration_thread = NULL; | 6792 | rq->migration_thread = NULL; |
| 6138 | INIT_LIST_HEAD(&rq->migration_queue); | 6793 | INIT_LIST_HEAD(&rq->migration_queue); |
| 6139 | rq->cpu = i; | ||
| 6140 | #endif | 6794 | #endif |
| 6141 | atomic_set(&rq->nr_iowait, 0); | 6795 | atomic_set(&rq->nr_iowait, 0); |
| 6142 | 6796 | ||
| @@ -6151,6 +6805,12 @@ void __init sched_init(void) | |||
| 6151 | } | 6805 | } |
| 6152 | } | 6806 | } |
| 6153 | 6807 | ||
| 6808 | set_load_weight(&init_task); | ||
| 6809 | |||
| 6810 | #ifdef CONFIG_RT_MUTEXES | ||
| 6811 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); | ||
| 6812 | #endif | ||
| 6813 | |||
| 6154 | /* | 6814 | /* |
| 6155 | * The boot idle thread does lazy MMU switching as well: | 6815 | * The boot idle thread does lazy MMU switching as well: |
| 6156 | */ | 6816 | */ |
| @@ -6169,7 +6829,7 @@ void __init sched_init(void) | |||
| 6169 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6829 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| 6170 | void __might_sleep(char *file, int line) | 6830 | void __might_sleep(char *file, int line) |
| 6171 | { | 6831 | { |
| 6172 | #if defined(in_atomic) | 6832 | #ifdef in_atomic |
| 6173 | static unsigned long prev_jiffy; /* ratelimiting */ | 6833 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 6174 | 6834 | ||
| 6175 | if ((in_atomic() || irqs_disabled()) && | 6835 | if ((in_atomic() || irqs_disabled()) && |
| @@ -6191,17 +6851,18 @@ EXPORT_SYMBOL(__might_sleep); | |||
| 6191 | #ifdef CONFIG_MAGIC_SYSRQ | 6851 | #ifdef CONFIG_MAGIC_SYSRQ |
| 6192 | void normalize_rt_tasks(void) | 6852 | void normalize_rt_tasks(void) |
| 6193 | { | 6853 | { |
| 6854 | struct prio_array *array; | ||
| 6194 | struct task_struct *p; | 6855 | struct task_struct *p; |
| 6195 | prio_array_t *array; | ||
| 6196 | unsigned long flags; | 6856 | unsigned long flags; |
| 6197 | runqueue_t *rq; | 6857 | struct rq *rq; |
| 6198 | 6858 | ||
| 6199 | read_lock_irq(&tasklist_lock); | 6859 | read_lock_irq(&tasklist_lock); |
| 6200 | for_each_process (p) { | 6860 | for_each_process(p) { |
| 6201 | if (!rt_task(p)) | 6861 | if (!rt_task(p)) |
| 6202 | continue; | 6862 | continue; |
| 6203 | 6863 | ||
| 6204 | rq = task_rq_lock(p, &flags); | 6864 | spin_lock_irqsave(&p->pi_lock, flags); |
| 6865 | rq = __task_rq_lock(p); | ||
| 6205 | 6866 | ||
| 6206 | array = p->array; | 6867 | array = p->array; |
| 6207 | if (array) | 6868 | if (array) |
| @@ -6212,7 +6873,8 @@ void normalize_rt_tasks(void) | |||
| 6212 | resched_task(rq->curr); | 6873 | resched_task(rq->curr); |
| 6213 | } | 6874 | } |
| 6214 | 6875 | ||
| 6215 | task_rq_unlock(rq, &flags); | 6876 | __task_rq_unlock(rq); |
| 6877 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 6216 | } | 6878 | } |
| 6217 | read_unlock_irq(&tasklist_lock); | 6879 | read_unlock_irq(&tasklist_lock); |
| 6218 | } | 6880 | } |
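normalize_rt_tasks() stops using task_rq_lock()/task_rq_unlock() and instead takes p->pi_lock first and then the runqueue lock via __task_rq_lock(). That appears to match the nesting introduced for rt-mutex priority inheritance (pi_lock outside the runqueue lock), so the SysRq demotion of an RT task is serialized against concurrent PI priority adjustments rather than racing or deadlocking with them. A userspace sketch of two paths agreeing on one lock order; the pthread mutexes merely stand in for p->pi_lock and rq->lock.

#include <pthread.h>
#include <stdio.h>

/* hypothetical stand-ins for p->pi_lock and rq->lock */
static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Every path that needs both locks takes them in the same order:
 * pi_lock first, then the runqueue lock, so the two paths cannot
 * deadlock against each other.
 */
static void demote_task(void)
{
        pthread_mutex_lock(&pi_lock);
        pthread_mutex_lock(&rq_lock);
        printf("requeued at normal priority\n");
        pthread_mutex_unlock(&rq_lock);
        pthread_mutex_unlock(&pi_lock);
}

static void boost_task(void)
{
        pthread_mutex_lock(&pi_lock);
        pthread_mutex_lock(&rq_lock);
        printf("boosted by a PI donor\n");
        pthread_mutex_unlock(&rq_lock);
        pthread_mutex_unlock(&pi_lock);
}

int main(void)
{
        boost_task();
        demote_task();
        return 0;
}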
| @@ -6236,7 +6898,7 @@ void normalize_rt_tasks(void) | |||
| 6236 | * | 6898 | * |
| 6237 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6899 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
| 6238 | */ | 6900 | */ |
| 6239 | task_t *curr_task(int cpu) | 6901 | struct task_struct *curr_task(int cpu) |
| 6240 | { | 6902 | { |
| 6241 | return cpu_curr(cpu); | 6903 | return cpu_curr(cpu); |
| 6242 | } | 6904 | } |
| @@ -6256,7 +6918,7 @@ task_t *curr_task(int cpu) | |||
| 6256 | * | 6918 | * |
| 6257 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6919 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
| 6258 | */ | 6920 | */ |
| 6259 | void set_curr_task(int cpu, task_t *p) | 6921 | void set_curr_task(int cpu, struct task_struct *p) |
| 6260 | { | 6922 | { |
| 6261 | cpu_curr(cpu) = p; | 6923 | cpu_curr(cpu) = p; |
| 6262 | } | 6924 | } |
