Diffstat (limited to 'kernel/sched.c')

 -rw-r--r--   kernel/sched.c   | 1444
 1 file changed, 831 insertions, 613 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 6c10fa796ca0..bba57adb9504 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -96,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
| 96 | /* | 96 | /* |
| 97 | * Some helpers for converting nanosecond timing to jiffy resolution | 97 | * Some helpers for converting nanosecond timing to jiffy resolution |
| 98 | */ | 98 | */ |
| 99 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | 99 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) |
| 100 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | 100 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
| 101 | 101 | ||
| 102 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 102 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
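Aside, for readers not fluent in these macros: a stand-alone sketch of the ns/jiffy conversion the hunk above touches, assuming HZ = 1000 (the HZ value and the sample numbers are illustrative, not taken from the patch). The cast to unsigned long mirrors the change on the new side, presumably so the result is a plain unsigned long without pulling 64-bit division helpers into callers.

/*
 * Stand-alone sketch (not kernel code) of the nanosecond/jiffy conversion
 * above, with HZ = 1000 assumed for illustration.
 */
#include <stdio.h>

#define HZ 1000
#define NS_TO_JIFFIES(t) ((unsigned long)(t) / (1000000000 / HZ))
#define JIFFIES_TO_NS(t) ((t) * (1000000000 / HZ))

int main(void)
{
    unsigned long long ns = 2500000000ULL;      /* 2.5 seconds in ns */

    printf("%llu ns -> %lu jiffies\n", ns, NS_TO_JIFFIES(ns));
    printf("5 jiffies -> %lu ns\n", (unsigned long)JIFFIES_TO_NS(5UL));
    return 0;
}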
| @@ -105,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
| 105 | /* | 105 | /* |
| 106 | * These are the 'tuning knobs' of the scheduler: | 106 | * These are the 'tuning knobs' of the scheduler: |
| 107 | * | 107 | * |
| 108 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | 108 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). |
| 109 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | ||
| 110 | * Timeslices get refilled after they expire. | 109 | * Timeslices get refilled after they expire. |
| 111 | */ | 110 | */ |
| 112 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) | ||
| 113 | #define DEF_TIMESLICE (100 * HZ / 1000) | 111 | #define DEF_TIMESLICE (100 * HZ / 1000) |
| 114 | 112 | ||
| 115 | #ifdef CONFIG_SMP | 113 | #ifdef CONFIG_SMP |
| @@ -133,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | |||
| 133 | } | 131 | } |
| 134 | #endif | 132 | #endif |
| 135 | 133 | ||
| 136 | #define SCALE_PRIO(x, prio) \ | ||
| 137 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | ||
| 138 | |||
| 139 | /* | ||
| 140 | * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | ||
| 141 | * to time slice values: [800ms ... 100ms ... 5ms] | ||
| 142 | */ | ||
| 143 | static unsigned int static_prio_timeslice(int static_prio) | ||
| 144 | { | ||
| 145 | if (static_prio == NICE_TO_PRIO(19)) | ||
| 146 | return 1; | ||
| 147 | |||
| 148 | if (static_prio < NICE_TO_PRIO(0)) | ||
| 149 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | ||
| 150 | else | ||
| 151 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); | ||
| 152 | } | ||
| 153 | |||
| 154 | static inline int rt_policy(int policy) | 134 | static inline int rt_policy(int policy) |
| 155 | { | 135 | { |
| 156 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) | 136 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) |
| @@ -171,31 +151,91 @@ struct rt_prio_array { | |||
| 171 | struct list_head queue[MAX_RT_PRIO]; | 151 | struct list_head queue[MAX_RT_PRIO]; |
| 172 | }; | 152 | }; |
| 173 | 153 | ||
| 174 | struct load_stat { | 154 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 175 | struct load_weight load; | 155 | |
| 176 | u64 load_update_start, load_update_last; | 156 | struct cfs_rq; |
| 177 | unsigned long delta_fair, delta_exec, delta_stat; | 157 | |
| 158 | /* task group related information */ | ||
| 159 | struct task_group { | ||
| 160 | /* schedulable entities of this group on each cpu */ | ||
| 161 | struct sched_entity **se; | ||
| 162 | /* runqueue "owned" by this group on each cpu */ | ||
| 163 | struct cfs_rq **cfs_rq; | ||
| 164 | unsigned long shares; | ||
| 165 | /* spinlock to serialize modification to shares */ | ||
| 166 | spinlock_t lock; | ||
| 167 | }; | ||
| 168 | |||
| 169 | /* Default task group's sched entity on each cpu */ | ||
| 170 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | ||
| 171 | /* Default task group's cfs_rq on each cpu */ | ||
| 172 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | ||
| 173 | |||
| 174 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | ||
| 175 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | ||
| 176 | |||
| 177 | /* Default task group. | ||
| 178 | * Every task in system belong to this group at bootup. | ||
| 179 | */ | ||
| 180 | struct task_group init_task_group = { | ||
| 181 | .se = init_sched_entity_p, | ||
| 182 | .cfs_rq = init_cfs_rq_p, | ||
| 178 | }; | 183 | }; |
| 179 | 184 | ||
| 185 | #ifdef CONFIG_FAIR_USER_SCHED | ||
| 186 | # define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD | ||
| 187 | #else | ||
| 188 | # define INIT_TASK_GRP_LOAD NICE_0_LOAD | ||
| 189 | #endif | ||
| 190 | |||
| 191 | static int init_task_group_load = INIT_TASK_GRP_LOAD; | ||
| 192 | |||
| 193 | /* return group to which a task belongs */ | ||
| 194 | static inline struct task_group *task_group(struct task_struct *p) | ||
| 195 | { | ||
| 196 | struct task_group *tg; | ||
| 197 | |||
| 198 | #ifdef CONFIG_FAIR_USER_SCHED | ||
| 199 | tg = p->user->tg; | ||
| 200 | #else | ||
| 201 | tg = &init_task_group; | ||
| 202 | #endif | ||
| 203 | |||
| 204 | return tg; | ||
| 205 | } | ||
| 206 | |||
| 207 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | ||
| 208 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
| 209 | { | ||
| 210 | p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)]; | ||
| 211 | p->se.parent = task_group(p)->se[task_cpu(p)]; | ||
| 212 | } | ||
| 213 | |||
| 214 | #else | ||
| 215 | |||
| 216 | static inline void set_task_cfs_rq(struct task_struct *p) { } | ||
| 217 | |||
| 218 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 219 | |||
| 180 | /* CFS-related fields in a runqueue */ | 220 | /* CFS-related fields in a runqueue */ |
| 181 | struct cfs_rq { | 221 | struct cfs_rq { |
| 182 | struct load_weight load; | 222 | struct load_weight load; |
| 183 | unsigned long nr_running; | 223 | unsigned long nr_running; |
| 184 | 224 | ||
| 185 | s64 fair_clock; | ||
| 186 | u64 exec_clock; | 225 | u64 exec_clock; |
| 187 | s64 wait_runtime; | 226 | u64 min_vruntime; |
| 188 | u64 sleeper_bonus; | ||
| 189 | unsigned long wait_runtime_overruns, wait_runtime_underruns; | ||
| 190 | 227 | ||
| 191 | struct rb_root tasks_timeline; | 228 | struct rb_root tasks_timeline; |
| 192 | struct rb_node *rb_leftmost; | 229 | struct rb_node *rb_leftmost; |
| 193 | struct rb_node *rb_load_balance_curr; | 230 | struct rb_node *rb_load_balance_curr; |
| 194 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 195 | /* 'curr' points to currently running entity on this cfs_rq. | 231 | /* 'curr' points to currently running entity on this cfs_rq. |
| 196 | * It is set to NULL otherwise (i.e when none are currently running). | 232 | * It is set to NULL otherwise (i.e when none are currently running). |
| 197 | */ | 233 | */ |
| 198 | struct sched_entity *curr; | 234 | struct sched_entity *curr; |
| 235 | |||
| 236 | unsigned long nr_spread_over; | ||
| 237 | |||
| 238 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 199 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 239 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
| 200 | 240 | ||
| 201 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 241 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
| @@ -206,6 +246,8 @@ struct cfs_rq { | |||
| 206 | * list is used during load balance. | 246 | * list is used during load balance. |
| 207 | */ | 247 | */ |
| 208 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ | 248 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ |
| 249 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
| 250 | struct rcu_head rcu; | ||
| 209 | #endif | 251 | #endif |
| 210 | }; | 252 | }; |
| 211 | 253 | ||
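To make the new per-group plumbing above easier to follow, here is a minimal user-space model of task_group()/set_task_cfs_rq(): a task resolves to the cfs_rq and sched_entity its group owns on the task's current CPU. All structures are stripped-down stand-ins, not the kernel types, and NR_CPUS = 4 is arbitrary.

/*
 * Minimal user-space model (an illustration, not the kernel structures)
 * of how a task is pointed at its group's per-CPU runqueue and entity.
 */
#include <stdio.h>

#define NR_CPUS 4

struct sched_entity { int dummy; };
struct cfs_rq       { int cpu; };

struct task_group {
    struct sched_entity *se[NR_CPUS];      /* group's entity, one per CPU   */
    struct cfs_rq       *cfs_rq[NR_CPUS];  /* group's runqueue, one per CPU */
};

struct task {
    struct task_group   *tg;        /* group the task belongs to        */
    int                  cpu;       /* CPU the task currently runs on   */
    struct cfs_rq       *my_cfs_rq; /* where this task is enqueued      */
    struct sched_entity *parent;    /* group entity it hangs off        */
};

/* Mirrors set_task_cfs_rq(): pick the group's per-CPU runqueue/entity. */
static void set_task_cfs_rq(struct task *p)
{
    p->my_cfs_rq = p->tg->cfs_rq[p->cpu];
    p->parent    = p->tg->se[p->cpu];
}

int main(void)
{
    static struct sched_entity se[NR_CPUS];
    static struct cfs_rq rq[NR_CPUS];
    struct task_group grp;
    struct task p = { .tg = &grp, .cpu = 2 };
    int i;

    for (i = 0; i < NR_CPUS; i++) {
        rq[i].cpu     = i;
        grp.se[i]     = &se[i];
        grp.cfs_rq[i] = &rq[i];
    }
    set_task_cfs_rq(&p);
    printf("task enqueued on group cfs_rq of cpu %d\n", p.my_cfs_rq->cpu);
    return 0;
}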
| @@ -237,7 +279,7 @@ struct rq { | |||
| 237 | #ifdef CONFIG_NO_HZ | 279 | #ifdef CONFIG_NO_HZ |
| 238 | unsigned char in_nohz_recently; | 280 | unsigned char in_nohz_recently; |
| 239 | #endif | 281 | #endif |
| 240 | struct load_stat ls; /* capture load from *all* tasks on this cpu */ | 282 | struct load_weight load; /* capture load from *all* tasks on this cpu */ |
| 241 | unsigned long nr_load_updates; | 283 | unsigned long nr_load_updates; |
| 242 | u64 nr_switches; | 284 | u64 nr_switches; |
| 243 | 285 | ||
| @@ -289,16 +331,19 @@ struct rq { | |||
| 289 | unsigned long yld_exp_empty; | 331 | unsigned long yld_exp_empty; |
| 290 | unsigned long yld_act_empty; | 332 | unsigned long yld_act_empty; |
| 291 | unsigned long yld_both_empty; | 333 | unsigned long yld_both_empty; |
| 292 | unsigned long yld_cnt; | 334 | unsigned long yld_count; |
| 293 | 335 | ||
| 294 | /* schedule() stats */ | 336 | /* schedule() stats */ |
| 295 | unsigned long sched_switch; | 337 | unsigned long sched_switch; |
| 296 | unsigned long sched_cnt; | 338 | unsigned long sched_count; |
| 297 | unsigned long sched_goidle; | 339 | unsigned long sched_goidle; |
| 298 | 340 | ||
| 299 | /* try_to_wake_up() stats */ | 341 | /* try_to_wake_up() stats */ |
| 300 | unsigned long ttwu_cnt; | 342 | unsigned long ttwu_count; |
| 301 | unsigned long ttwu_local; | 343 | unsigned long ttwu_local; |
| 344 | |||
| 345 | /* BKL stats */ | ||
| 346 | unsigned long bkl_count; | ||
| 302 | #endif | 347 | #endif |
| 303 | struct lock_class_key rq_lock_key; | 348 | struct lock_class_key rq_lock_key; |
| 304 | }; | 349 | }; |
| @@ -383,6 +428,37 @@ static void update_rq_clock(struct rq *rq) | |||
| 383 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 428 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 384 | 429 | ||
| 385 | /* | 430 | /* |
| 431 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
| 432 | */ | ||
| 433 | #ifdef CONFIG_SCHED_DEBUG | ||
| 434 | # define const_debug __read_mostly | ||
| 435 | #else | ||
| 436 | # define const_debug static const | ||
| 437 | #endif | ||
| 438 | |||
| 439 | /* | ||
| 440 | * Debugging: various feature bits | ||
| 441 | */ | ||
| 442 | enum { | ||
| 443 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | ||
| 444 | SCHED_FEAT_START_DEBIT = 2, | ||
| 445 | SCHED_FEAT_TREE_AVG = 4, | ||
| 446 | SCHED_FEAT_APPROX_AVG = 8, | ||
| 447 | SCHED_FEAT_WAKEUP_PREEMPT = 16, | ||
| 448 | SCHED_FEAT_PREEMPT_RESTRICT = 32, | ||
| 449 | }; | ||
| 450 | |||
| 451 | const_debug unsigned int sysctl_sched_features = | ||
| 452 | SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | | ||
| 453 | SCHED_FEAT_START_DEBIT *1 | | ||
| 454 | SCHED_FEAT_TREE_AVG *0 | | ||
| 455 | SCHED_FEAT_APPROX_AVG *0 | | ||
| 456 | SCHED_FEAT_WAKEUP_PREEMPT *1 | | ||
| 457 | SCHED_FEAT_PREEMPT_RESTRICT *1; | ||
| 458 | |||
| 459 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | ||
| 460 | |||
| 461 | /* | ||
| 386 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 462 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
| 387 | * clock constructed from sched_clock(): | 463 | * clock constructed from sched_clock(): |
| 388 | */ | 464 | */ |
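The sched_feat() machinery added above is a plain bitmask; the `*1 |` / `*0 |` lines simply include or exclude each power-of-two flag from the default. A small compilable demo of the same pattern (user-space C, the kernel's const_debug qualifier dropped):

/*
 * Demo of the feature-bit pattern: each flag is a power of two, the
 * "*1 |" / "*0 |" lines compose the default mask, and sched_feat(x)
 * is just a bit test.
 */
#include <stdio.h>

enum {
    SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
    SCHED_FEAT_START_DEBIT       = 2,
    SCHED_FEAT_TREE_AVG          = 4,
    SCHED_FEAT_APPROX_AVG        = 8,
    SCHED_FEAT_WAKEUP_PREEMPT    = 16,
    SCHED_FEAT_PREEMPT_RESTRICT  = 32,
};

static unsigned int sysctl_sched_features =
    SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
    SCHED_FEAT_START_DEBIT       * 1 |
    SCHED_FEAT_TREE_AVG          * 0 |
    SCHED_FEAT_APPROX_AVG        * 0 |
    SCHED_FEAT_WAKEUP_PREEMPT    * 1 |
    SCHED_FEAT_PREEMPT_RESTRICT  * 1;

#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)

int main(void)
{
    printf("default mask: 0x%x\n", sysctl_sched_features);   /* 0x33 */
    printf("WAKEUP_PREEMPT enabled? %s\n",
           sched_feat(WAKEUP_PREEMPT) ? "yes" : "no");
    printf("TREE_AVG enabled? %s\n",
           sched_feat(TREE_AVG) ? "yes" : "no");
    return 0;
}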
| @@ -400,18 +476,7 @@ unsigned long long cpu_clock(int cpu) | |||
| 400 | 476 | ||
| 401 | return now; | 477 | return now; |
| 402 | } | 478 | } |
| 403 | 479 | EXPORT_SYMBOL_GPL(cpu_clock); | |
| 404 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 405 | /* Change a task's ->cfs_rq if it moves across CPUs */ | ||
| 406 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
| 407 | { | ||
| 408 | p->se.cfs_rq = &task_rq(p)->cfs; | ||
| 409 | } | ||
| 410 | #else | ||
| 411 | static inline void set_task_cfs_rq(struct task_struct *p) | ||
| 412 | { | ||
| 413 | } | ||
| 414 | #endif | ||
| 415 | 480 | ||
| 416 | #ifndef prepare_arch_switch | 481 | #ifndef prepare_arch_switch |
| 417 | # define prepare_arch_switch(next) do { } while (0) | 482 | # define prepare_arch_switch(next) do { } while (0) |
| @@ -497,16 +562,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 497 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 562 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
| 498 | __acquires(rq->lock) | 563 | __acquires(rq->lock) |
| 499 | { | 564 | { |
| 500 | struct rq *rq; | 565 | for (;;) { |
| 501 | 566 | struct rq *rq = task_rq(p); | |
| 502 | repeat_lock_task: | 567 | spin_lock(&rq->lock); |
| 503 | rq = task_rq(p); | 568 | if (likely(rq == task_rq(p))) |
| 504 | spin_lock(&rq->lock); | 569 | return rq; |
| 505 | if (unlikely(rq != task_rq(p))) { | ||
| 506 | spin_unlock(&rq->lock); | 570 | spin_unlock(&rq->lock); |
| 507 | goto repeat_lock_task; | ||
| 508 | } | 571 | } |
| 509 | return rq; | ||
| 510 | } | 572 | } |
| 511 | 573 | ||
| 512 | /* | 574 | /* |
| @@ -519,18 +581,17 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
| 519 | { | 581 | { |
| 520 | struct rq *rq; | 582 | struct rq *rq; |
| 521 | 583 | ||
| 522 | repeat_lock_task: | 584 | for (;;) { |
| 523 | local_irq_save(*flags); | 585 | local_irq_save(*flags); |
| 524 | rq = task_rq(p); | 586 | rq = task_rq(p); |
| 525 | spin_lock(&rq->lock); | 587 | spin_lock(&rq->lock); |
| 526 | if (unlikely(rq != task_rq(p))) { | 588 | if (likely(rq == task_rq(p))) |
| 589 | return rq; | ||
| 527 | spin_unlock_irqrestore(&rq->lock, *flags); | 590 | spin_unlock_irqrestore(&rq->lock, *flags); |
| 528 | goto repeat_lock_task; | ||
| 529 | } | 591 | } |
| 530 | return rq; | ||
| 531 | } | 592 | } |
| 532 | 593 | ||
| 533 | static inline void __task_rq_unlock(struct rq *rq) | 594 | static void __task_rq_unlock(struct rq *rq) |
| 534 | __releases(rq->lock) | 595 | __releases(rq->lock) |
| 535 | { | 596 | { |
| 536 | spin_unlock(&rq->lock); | 597 | spin_unlock(&rq->lock); |
| @@ -545,7 +606,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | |||
| 545 | /* | 606 | /* |
| 546 | * this_rq_lock - lock this runqueue and disable interrupts. | 607 | * this_rq_lock - lock this runqueue and disable interrupts. |
| 547 | */ | 608 | */ |
| 548 | static inline struct rq *this_rq_lock(void) | 609 | static struct rq *this_rq_lock(void) |
| 549 | __acquires(rq->lock) | 610 | __acquires(rq->lock) |
| 550 | { | 611 | { |
| 551 | struct rq *rq; | 612 | struct rq *rq; |
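The rewritten __task_rq_lock()/task_rq_lock() above use a lock-then-revalidate loop: read the task's runqueue pointer, lock it, and retry if the task migrated in between. A hypothetical pthread analogue of the pattern is sketched below; it ignores the memory-ordering details the real kernel code relies on, and names such as item_queue_lock are invented for the illustration.

/*
 * Lock-then-revalidate: lock the queue an item currently belongs to,
 * rechecking the association after the lock is held.
 */
#include <pthread.h>
#include <stdio.h>

struct queue {
    pthread_mutex_t lock;
    int id;
};

struct item {
    struct queue *home;       /* may be changed by other threads */
};

static struct queue *item_queue_lock(struct item *it)
{
    for (;;) {
        struct queue *q = it->home;     /* unlocked snapshot          */

        pthread_mutex_lock(&q->lock);
        if (q == it->home)              /* still the right queue?     */
            return q;                   /* yes: return it locked      */
        pthread_mutex_unlock(&q->lock); /* no: it moved, try again    */
    }
}

int main(void)
{
    struct queue q = { .lock = PTHREAD_MUTEX_INITIALIZER, .id = 7 };
    struct item it = { &q };
    struct queue *locked = item_queue_lock(&it);

    printf("locked queue %d\n", locked->id);
    pthread_mutex_unlock(&locked->lock);
    return 0;
}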
| @@ -645,19 +706,6 @@ static inline void resched_task(struct task_struct *p) | |||
| 645 | } | 706 | } |
| 646 | #endif | 707 | #endif |
| 647 | 708 | ||
| 648 | static u64 div64_likely32(u64 divident, unsigned long divisor) | ||
| 649 | { | ||
| 650 | #if BITS_PER_LONG == 32 | ||
| 651 | if (likely(divident <= 0xffffffffULL)) | ||
| 652 | return (u32)divident / divisor; | ||
| 653 | do_div(divident, divisor); | ||
| 654 | |||
| 655 | return divident; | ||
| 656 | #else | ||
| 657 | return divident / divisor; | ||
| 658 | #endif | ||
| 659 | } | ||
| 660 | |||
| 661 | #if BITS_PER_LONG == 32 | 709 | #if BITS_PER_LONG == 32 |
| 662 | # define WMULT_CONST (~0UL) | 710 | # define WMULT_CONST (~0UL) |
| 663 | #else | 711 | #else |
| @@ -699,16 +747,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | |||
| 699 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | 747 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); |
| 700 | } | 748 | } |
| 701 | 749 | ||
| 702 | static void update_load_add(struct load_weight *lw, unsigned long inc) | 750 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
| 703 | { | 751 | { |
| 704 | lw->weight += inc; | 752 | lw->weight += inc; |
| 705 | lw->inv_weight = 0; | ||
| 706 | } | 753 | } |
| 707 | 754 | ||
| 708 | static void update_load_sub(struct load_weight *lw, unsigned long dec) | 755 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) |
| 709 | { | 756 | { |
| 710 | lw->weight -= dec; | 757 | lw->weight -= dec; |
| 711 | lw->inv_weight = 0; | ||
| 712 | } | 758 | } |
| 713 | 759 | ||
| 714 | /* | 760 | /* |
| @@ -784,29 +830,20 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 784 | int *this_best_prio, struct rq_iterator *iterator); | 830 | int *this_best_prio, struct rq_iterator *iterator); |
| 785 | 831 | ||
| 786 | #include "sched_stats.h" | 832 | #include "sched_stats.h" |
| 787 | #include "sched_rt.c" | ||
| 788 | #include "sched_fair.c" | ||
| 789 | #include "sched_idletask.c" | 833 | #include "sched_idletask.c" |
| 834 | #include "sched_fair.c" | ||
| 835 | #include "sched_rt.c" | ||
| 790 | #ifdef CONFIG_SCHED_DEBUG | 836 | #ifdef CONFIG_SCHED_DEBUG |
| 791 | # include "sched_debug.c" | 837 | # include "sched_debug.c" |
| 792 | #endif | 838 | #endif |
| 793 | 839 | ||
| 794 | #define sched_class_highest (&rt_sched_class) | 840 | #define sched_class_highest (&rt_sched_class) |
| 795 | 841 | ||
| 796 | static void __update_curr_load(struct rq *rq, struct load_stat *ls) | ||
| 797 | { | ||
| 798 | if (rq->curr != rq->idle && ls->load.weight) { | ||
| 799 | ls->delta_exec += ls->delta_stat; | ||
| 800 | ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); | ||
| 801 | ls->delta_stat = 0; | ||
| 802 | } | ||
| 803 | } | ||
| 804 | |||
| 805 | /* | 842 | /* |
| 806 | * Update delta_exec, delta_fair fields for rq. | 843 | * Update delta_exec, delta_fair fields for rq. |
| 807 | * | 844 | * |
| 808 | * delta_fair clock advances at a rate inversely proportional to | 845 | * delta_fair clock advances at a rate inversely proportional to |
| 809 | * total load (rq->ls.load.weight) on the runqueue, while | 846 | * total load (rq->load.weight) on the runqueue, while |
| 810 | * delta_exec advances at the same rate as wall-clock (provided | 847 | * delta_exec advances at the same rate as wall-clock (provided |
| 811 | * cpu is not idle). | 848 | * cpu is not idle). |
| 812 | * | 849 | * |
| @@ -814,35 +851,17 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls) | |||
| 814 | * runqueue over any given interval. This (smoothened) load is used | 851 | * runqueue over any given interval. This (smoothened) load is used |
| 815 | * during load balance. | 852 | * during load balance. |
| 816 | * | 853 | * |
| 817 | * This function is called /before/ updating rq->ls.load | 854 | * This function is called /before/ updating rq->load |
| 818 | * and when switching tasks. | 855 | * and when switching tasks. |
| 819 | */ | 856 | */ |
| 820 | static void update_curr_load(struct rq *rq) | ||
| 821 | { | ||
| 822 | struct load_stat *ls = &rq->ls; | ||
| 823 | u64 start; | ||
| 824 | |||
| 825 | start = ls->load_update_start; | ||
| 826 | ls->load_update_start = rq->clock; | ||
| 827 | ls->delta_stat += rq->clock - start; | ||
| 828 | /* | ||
| 829 | * Stagger updates to ls->delta_fair. Very frequent updates | ||
| 830 | * can be expensive. | ||
| 831 | */ | ||
| 832 | if (ls->delta_stat >= sysctl_sched_stat_granularity) | ||
| 833 | __update_curr_load(rq, ls); | ||
| 834 | } | ||
| 835 | |||
| 836 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | 857 | static inline void inc_load(struct rq *rq, const struct task_struct *p) |
| 837 | { | 858 | { |
| 838 | update_curr_load(rq); | 859 | update_load_add(&rq->load, p->se.load.weight); |
| 839 | update_load_add(&rq->ls.load, p->se.load.weight); | ||
| 840 | } | 860 | } |
| 841 | 861 | ||
| 842 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | 862 | static inline void dec_load(struct rq *rq, const struct task_struct *p) |
| 843 | { | 863 | { |
| 844 | update_curr_load(rq); | 864 | update_load_sub(&rq->load, p->se.load.weight); |
| 845 | update_load_sub(&rq->ls.load, p->se.load.weight); | ||
| 846 | } | 865 | } |
| 847 | 866 | ||
| 848 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | 867 | static void inc_nr_running(struct task_struct *p, struct rq *rq) |
| @@ -859,8 +878,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq) | |||
| 859 | 878 | ||
| 860 | static void set_load_weight(struct task_struct *p) | 879 | static void set_load_weight(struct task_struct *p) |
| 861 | { | 880 | { |
| 862 | p->se.wait_runtime = 0; | ||
| 863 | |||
| 864 | if (task_has_rt_policy(p)) { | 881 | if (task_has_rt_policy(p)) { |
| 865 | p->se.load.weight = prio_to_weight[0] * 2; | 882 | p->se.load.weight = prio_to_weight[0] * 2; |
| 866 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; | 883 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; |
| @@ -952,20 +969,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
| 952 | } | 969 | } |
| 953 | 970 | ||
| 954 | /* | 971 | /* |
| 955 | * activate_idle_task - move idle task to the _front_ of runqueue. | ||
| 956 | */ | ||
| 957 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) | ||
| 958 | { | ||
| 959 | update_rq_clock(rq); | ||
| 960 | |||
| 961 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
| 962 | rq->nr_uninterruptible--; | ||
| 963 | |||
| 964 | enqueue_task(rq, p, 0); | ||
| 965 | inc_nr_running(p, rq); | ||
| 966 | } | ||
| 967 | |||
| 968 | /* | ||
| 969 | * deactivate_task - remove a task from the runqueue. | 972 | * deactivate_task - remove a task from the runqueue. |
| 970 | */ | 973 | */ |
| 971 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | 974 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
| @@ -989,32 +992,50 @@ inline int task_curr(const struct task_struct *p) | |||
| 989 | /* Used instead of source_load when we know the type == 0 */ | 992 | /* Used instead of source_load when we know the type == 0 */ |
| 990 | unsigned long weighted_cpuload(const int cpu) | 993 | unsigned long weighted_cpuload(const int cpu) |
| 991 | { | 994 | { |
| 992 | return cpu_rq(cpu)->ls.load.weight; | 995 | return cpu_rq(cpu)->load.weight; |
| 993 | } | 996 | } |
| 994 | 997 | ||
| 995 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 998 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
| 996 | { | 999 | { |
| 997 | #ifdef CONFIG_SMP | 1000 | #ifdef CONFIG_SMP |
| 998 | task_thread_info(p)->cpu = cpu; | 1001 | task_thread_info(p)->cpu = cpu; |
| 999 | set_task_cfs_rq(p); | ||
| 1000 | #endif | 1002 | #endif |
| 1003 | set_task_cfs_rq(p); | ||
| 1001 | } | 1004 | } |
| 1002 | 1005 | ||
| 1003 | #ifdef CONFIG_SMP | 1006 | #ifdef CONFIG_SMP |
| 1004 | 1007 | ||
| 1008 | /* | ||
| 1009 | * Is this task likely cache-hot: | ||
| 1010 | */ | ||
| 1011 | static inline int | ||
| 1012 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
| 1013 | { | ||
| 1014 | s64 delta; | ||
| 1015 | |||
| 1016 | if (p->sched_class != &fair_sched_class) | ||
| 1017 | return 0; | ||
| 1018 | |||
| 1019 | if (sysctl_sched_migration_cost == -1) | ||
| 1020 | return 1; | ||
| 1021 | if (sysctl_sched_migration_cost == 0) | ||
| 1022 | return 0; | ||
| 1023 | |||
| 1024 | delta = now - p->se.exec_start; | ||
| 1025 | |||
| 1026 | return delta < (s64)sysctl_sched_migration_cost; | ||
| 1027 | } | ||
| 1028 | |||
| 1029 | |||
| 1005 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1030 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
| 1006 | { | 1031 | { |
| 1007 | int old_cpu = task_cpu(p); | 1032 | int old_cpu = task_cpu(p); |
| 1008 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | 1033 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); |
| 1009 | u64 clock_offset, fair_clock_offset; | 1034 | struct cfs_rq *old_cfsrq = task_cfs_rq(p), |
| 1035 | *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); | ||
| 1036 | u64 clock_offset; | ||
| 1010 | 1037 | ||
| 1011 | clock_offset = old_rq->clock - new_rq->clock; | 1038 | clock_offset = old_rq->clock - new_rq->clock; |
| 1012 | fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; | ||
| 1013 | |||
| 1014 | if (p->se.wait_start_fair) | ||
| 1015 | p->se.wait_start_fair -= fair_clock_offset; | ||
| 1016 | if (p->se.sleep_start_fair) | ||
| 1017 | p->se.sleep_start_fair -= fair_clock_offset; | ||
| 1018 | 1039 | ||
| 1019 | #ifdef CONFIG_SCHEDSTATS | 1040 | #ifdef CONFIG_SCHEDSTATS |
| 1020 | if (p->se.wait_start) | 1041 | if (p->se.wait_start) |
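The task_hot() helper added in the hunk above decides whether a task is still cache-hot: it is hot if it last ran less than sysctl_sched_migration_cost nanoseconds ago, with -1 meaning "always hot" and 0 meaning "never hot". A stand-alone rendering of that check (the 0.5 ms cost and the timestamps are made-up demo values):

/*
 * Illustration of the cache-hot heuristic; not the kernel implementation.
 */
#include <stdio.h>

static long long sched_migration_cost = 500000LL;  /* 0.5 ms, illustrative */

static int task_hot(long long now, long long exec_start)
{
    long long delta;

    if (sched_migration_cost == -1)     /* tuning: treat everything as hot  */
        return 1;
    if (sched_migration_cost == 0)      /* tuning: treat everything as cold */
        return 0;

    delta = now - exec_start;
    return delta < sched_migration_cost;
}

int main(void)
{
    long long now = 10000000LL;                       /* 10 ms */

    printf("ran 0.1 ms ago: %s\n", task_hot(now, now - 100000)  ? "hot" : "cold");
    printf("ran 2 ms ago  : %s\n", task_hot(now, now - 2000000) ? "hot" : "cold");
    return 0;
}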
| @@ -1023,7 +1044,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1023 | p->se.sleep_start -= clock_offset; | 1044 | p->se.sleep_start -= clock_offset; |
| 1024 | if (p->se.block_start) | 1045 | if (p->se.block_start) |
| 1025 | p->se.block_start -= clock_offset; | 1046 | p->se.block_start -= clock_offset; |
| 1047 | if (old_cpu != new_cpu) { | ||
| 1048 | schedstat_inc(p, se.nr_migrations); | ||
| 1049 | if (task_hot(p, old_rq->clock, NULL)) | ||
| 1050 | schedstat_inc(p, se.nr_forced2_migrations); | ||
| 1051 | } | ||
| 1026 | #endif | 1052 | #endif |
| 1053 | p->se.vruntime -= old_cfsrq->min_vruntime - | ||
| 1054 | new_cfsrq->min_vruntime; | ||
| 1027 | 1055 | ||
| 1028 | __set_task_cpu(p, new_cpu); | 1056 | __set_task_cpu(p, new_cpu); |
| 1029 | } | 1057 | } |
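The vruntime adjustment in the hunk above re-bases a migrating task's virtual runtime from the old cfs_rq's min_vruntime to the new one's, so its relative lead or lag is preserved; unsigned wraparound makes the single subtraction work whether the new queue is ahead of or behind the old one. A tiny worked example with invented numbers:

/*
 * p->se.vruntime -= old_min - new_min, relying on modular (wrapping)
 * unsigned arithmetic; the task stays the same distance from min_vruntime.
 */
#include <stdio.h>

int main(void)
{
    unsigned long long old_min = 1000000, new_min = 4000000;
    unsigned long long vruntime = 1200000;     /* 200000 ahead of old queue */

    vruntime -= old_min - new_min;             /* wraps, i.e. adds 3000000  */

    printf("new vruntime %llu (still %llu ahead of new queue's min)\n",
           vruntime, vruntime - new_min);
    return 0;
}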
| @@ -1078,69 +1106,71 @@ void wait_task_inactive(struct task_struct *p) | |||
| 1078 | int running, on_rq; | 1106 | int running, on_rq; |
| 1079 | struct rq *rq; | 1107 | struct rq *rq; |
| 1080 | 1108 | ||
| 1081 | repeat: | 1109 | for (;;) { |
| 1082 | /* | 1110 | /* |
| 1083 | * We do the initial early heuristics without holding | 1111 | * We do the initial early heuristics without holding |
| 1084 | * any task-queue locks at all. We'll only try to get | 1112 | * any task-queue locks at all. We'll only try to get |
| 1085 | * the runqueue lock when things look like they will | 1113 | * the runqueue lock when things look like they will |
| 1086 | * work out! | 1114 | * work out! |
| 1087 | */ | 1115 | */ |
| 1088 | rq = task_rq(p); | 1116 | rq = task_rq(p); |
| 1089 | 1117 | ||
| 1090 | /* | 1118 | /* |
| 1091 | * If the task is actively running on another CPU | 1119 | * If the task is actively running on another CPU |
| 1092 | * still, just relax and busy-wait without holding | 1120 | * still, just relax and busy-wait without holding |
| 1093 | * any locks. | 1121 | * any locks. |
| 1094 | * | 1122 | * |
| 1095 | * NOTE! Since we don't hold any locks, it's not | 1123 | * NOTE! Since we don't hold any locks, it's not |
| 1096 | * even sure that "rq" stays as the right runqueue! | 1124 | * even sure that "rq" stays as the right runqueue! |
| 1097 | * But we don't care, since "task_running()" will | 1125 | * But we don't care, since "task_running()" will |
| 1098 | * return false if the runqueue has changed and p | 1126 | * return false if the runqueue has changed and p |
| 1099 | * is actually now running somewhere else! | 1127 | * is actually now running somewhere else! |
| 1100 | */ | 1128 | */ |
| 1101 | while (task_running(rq, p)) | 1129 | while (task_running(rq, p)) |
| 1102 | cpu_relax(); | 1130 | cpu_relax(); |
| 1103 | 1131 | ||
| 1104 | /* | 1132 | /* |
| 1105 | * Ok, time to look more closely! We need the rq | 1133 | * Ok, time to look more closely! We need the rq |
| 1106 | * lock now, to be *sure*. If we're wrong, we'll | 1134 | * lock now, to be *sure*. If we're wrong, we'll |
| 1107 | * just go back and repeat. | 1135 | * just go back and repeat. |
| 1108 | */ | 1136 | */ |
| 1109 | rq = task_rq_lock(p, &flags); | 1137 | rq = task_rq_lock(p, &flags); |
| 1110 | running = task_running(rq, p); | 1138 | running = task_running(rq, p); |
| 1111 | on_rq = p->se.on_rq; | 1139 | on_rq = p->se.on_rq; |
| 1112 | task_rq_unlock(rq, &flags); | 1140 | task_rq_unlock(rq, &flags); |
| 1113 | 1141 | ||
| 1114 | /* | 1142 | /* |
| 1115 | * Was it really running after all now that we | 1143 | * Was it really running after all now that we |
| 1116 | * checked with the proper locks actually held? | 1144 | * checked with the proper locks actually held? |
| 1117 | * | 1145 | * |
| 1118 | * Oops. Go back and try again.. | 1146 | * Oops. Go back and try again.. |
| 1119 | */ | 1147 | */ |
| 1120 | if (unlikely(running)) { | 1148 | if (unlikely(running)) { |
| 1121 | cpu_relax(); | 1149 | cpu_relax(); |
| 1122 | goto repeat; | 1150 | continue; |
| 1123 | } | 1151 | } |
| 1124 | 1152 | ||
| 1125 | /* | 1153 | /* |
| 1126 | * It's not enough that it's not actively running, | 1154 | * It's not enough that it's not actively running, |
| 1127 | * it must be off the runqueue _entirely_, and not | 1155 | * it must be off the runqueue _entirely_, and not |
| 1128 | * preempted! | 1156 | * preempted! |
| 1129 | * | 1157 | * |
| 1130 | * So if it wa still runnable (but just not actively | 1158 | * So if it wa still runnable (but just not actively |
| 1131 | * running right now), it's preempted, and we should | 1159 | * running right now), it's preempted, and we should |
| 1132 | * yield - it could be a while. | 1160 | * yield - it could be a while. |
| 1133 | */ | 1161 | */ |
| 1134 | if (unlikely(on_rq)) { | 1162 | if (unlikely(on_rq)) { |
| 1135 | yield(); | 1163 | schedule_timeout_uninterruptible(1); |
| 1136 | goto repeat; | 1164 | continue; |
| 1137 | } | 1165 | } |
| 1138 | 1166 | ||
| 1139 | /* | 1167 | /* |
| 1140 | * Ahh, all good. It wasn't running, and it wasn't | 1168 | * Ahh, all good. It wasn't running, and it wasn't |
| 1141 | * runnable, which means that it will never become | 1169 | * runnable, which means that it will never become |
| 1142 | * running in the future either. We're all done! | 1170 | * running in the future either. We're all done! |
| 1143 | */ | 1171 | */ |
| 1172 | break; | ||
| 1173 | } | ||
| 1144 | } | 1174 | } |
| 1145 | 1175 | ||
| 1146 | /*** | 1176 | /*** |
| @@ -1174,7 +1204,7 @@ void kick_process(struct task_struct *p) | |||
| 1174 | * We want to under-estimate the load of migration sources, to | 1204 | * We want to under-estimate the load of migration sources, to |
| 1175 | * balance conservatively. | 1205 | * balance conservatively. |
| 1176 | */ | 1206 | */ |
| 1177 | static inline unsigned long source_load(int cpu, int type) | 1207 | static unsigned long source_load(int cpu, int type) |
| 1178 | { | 1208 | { |
| 1179 | struct rq *rq = cpu_rq(cpu); | 1209 | struct rq *rq = cpu_rq(cpu); |
| 1180 | unsigned long total = weighted_cpuload(cpu); | 1210 | unsigned long total = weighted_cpuload(cpu); |
| @@ -1189,7 +1219,7 @@ static inline unsigned long source_load(int cpu, int type) | |||
| 1189 | * Return a high guess at the load of a migration-target cpu weighted | 1219 | * Return a high guess at the load of a migration-target cpu weighted |
| 1190 | * according to the scheduling class and "nice" value. | 1220 | * according to the scheduling class and "nice" value. |
| 1191 | */ | 1221 | */ |
| 1192 | static inline unsigned long target_load(int cpu, int type) | 1222 | static unsigned long target_load(int cpu, int type) |
| 1193 | { | 1223 | { |
| 1194 | struct rq *rq = cpu_rq(cpu); | 1224 | struct rq *rq = cpu_rq(cpu); |
| 1195 | unsigned long total = weighted_cpuload(cpu); | 1225 | unsigned long total = weighted_cpuload(cpu); |
| @@ -1231,7 +1261,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 1231 | 1261 | ||
| 1232 | /* Skip over this group if it has no CPUs allowed */ | 1262 | /* Skip over this group if it has no CPUs allowed */ |
| 1233 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) | 1263 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) |
| 1234 | goto nextgroup; | 1264 | continue; |
| 1235 | 1265 | ||
| 1236 | local_group = cpu_isset(this_cpu, group->cpumask); | 1266 | local_group = cpu_isset(this_cpu, group->cpumask); |
| 1237 | 1267 | ||
| @@ -1259,9 +1289,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
| 1259 | min_load = avg_load; | 1289 | min_load = avg_load; |
| 1260 | idlest = group; | 1290 | idlest = group; |
| 1261 | } | 1291 | } |
| 1262 | nextgroup: | 1292 | } while (group = group->next, group != sd->groups); |
| 1263 | group = group->next; | ||
| 1264 | } while (group != sd->groups); | ||
| 1265 | 1293 | ||
| 1266 | if (!idlest || 100*this_load < imbalance*min_load) | 1294 | if (!idlest || 100*this_load < imbalance*min_load) |
| 1267 | return NULL; | 1295 | return NULL; |
| @@ -1393,8 +1421,13 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
| 1393 | if (sd->flags & SD_WAKE_IDLE) { | 1421 | if (sd->flags & SD_WAKE_IDLE) { |
| 1394 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1422 | cpus_and(tmp, sd->span, p->cpus_allowed); |
| 1395 | for_each_cpu_mask(i, tmp) { | 1423 | for_each_cpu_mask(i, tmp) { |
| 1396 | if (idle_cpu(i)) | 1424 | if (idle_cpu(i)) { |
| 1425 | if (i != task_cpu(p)) { | ||
| 1426 | schedstat_inc(p, | ||
| 1427 | se.nr_wakeups_idle); | ||
| 1428 | } | ||
| 1397 | return i; | 1429 | return i; |
| 1430 | } | ||
| 1398 | } | 1431 | } |
| 1399 | } else { | 1432 | } else { |
| 1400 | break; | 1433 | break; |
| @@ -1425,7 +1458,7 @@ static inline int wake_idle(int cpu, struct task_struct *p) | |||
| 1425 | */ | 1458 | */ |
| 1426 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 1459 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
| 1427 | { | 1460 | { |
| 1428 | int cpu, this_cpu, success = 0; | 1461 | int cpu, orig_cpu, this_cpu, success = 0; |
| 1429 | unsigned long flags; | 1462 | unsigned long flags; |
| 1430 | long old_state; | 1463 | long old_state; |
| 1431 | struct rq *rq; | 1464 | struct rq *rq; |
| @@ -1444,6 +1477,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1444 | goto out_running; | 1477 | goto out_running; |
| 1445 | 1478 | ||
| 1446 | cpu = task_cpu(p); | 1479 | cpu = task_cpu(p); |
| 1480 | orig_cpu = cpu; | ||
| 1447 | this_cpu = smp_processor_id(); | 1481 | this_cpu = smp_processor_id(); |
| 1448 | 1482 | ||
| 1449 | #ifdef CONFIG_SMP | 1483 | #ifdef CONFIG_SMP |
| @@ -1452,7 +1486,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1452 | 1486 | ||
| 1453 | new_cpu = cpu; | 1487 | new_cpu = cpu; |
| 1454 | 1488 | ||
| 1455 | schedstat_inc(rq, ttwu_cnt); | 1489 | schedstat_inc(rq, ttwu_count); |
| 1456 | if (cpu == this_cpu) { | 1490 | if (cpu == this_cpu) { |
| 1457 | schedstat_inc(rq, ttwu_local); | 1491 | schedstat_inc(rq, ttwu_local); |
| 1458 | goto out_set_cpu; | 1492 | goto out_set_cpu; |
| @@ -1487,6 +1521,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1487 | unsigned long tl = this_load; | 1521 | unsigned long tl = this_load; |
| 1488 | unsigned long tl_per_task; | 1522 | unsigned long tl_per_task; |
| 1489 | 1523 | ||
| 1524 | /* | ||
| 1525 | * Attract cache-cold tasks on sync wakeups: | ||
| 1526 | */ | ||
| 1527 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
| 1528 | goto out_set_cpu; | ||
| 1529 | |||
| 1530 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
| 1490 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1531 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
| 1491 | 1532 | ||
| 1492 | /* | 1533 | /* |
| @@ -1506,6 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1506 | * there is no bad imbalance. | 1547 | * there is no bad imbalance. |
| 1507 | */ | 1548 | */ |
| 1508 | schedstat_inc(this_sd, ttwu_move_affine); | 1549 | schedstat_inc(this_sd, ttwu_move_affine); |
| 1550 | schedstat_inc(p, se.nr_wakeups_affine); | ||
| 1509 | goto out_set_cpu; | 1551 | goto out_set_cpu; |
| 1510 | } | 1552 | } |
| 1511 | } | 1553 | } |
| @@ -1517,6 +1559,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1517 | if (this_sd->flags & SD_WAKE_BALANCE) { | 1559 | if (this_sd->flags & SD_WAKE_BALANCE) { |
| 1518 | if (imbalance*this_load <= 100*load) { | 1560 | if (imbalance*this_load <= 100*load) { |
| 1519 | schedstat_inc(this_sd, ttwu_move_balance); | 1561 | schedstat_inc(this_sd, ttwu_move_balance); |
| 1562 | schedstat_inc(p, se.nr_wakeups_passive); | ||
| 1520 | goto out_set_cpu; | 1563 | goto out_set_cpu; |
| 1521 | } | 1564 | } |
| 1522 | } | 1565 | } |
| @@ -1542,18 +1585,18 @@ out_set_cpu: | |||
| 1542 | 1585 | ||
| 1543 | out_activate: | 1586 | out_activate: |
| 1544 | #endif /* CONFIG_SMP */ | 1587 | #endif /* CONFIG_SMP */ |
| 1588 | schedstat_inc(p, se.nr_wakeups); | ||
| 1589 | if (sync) | ||
| 1590 | schedstat_inc(p, se.nr_wakeups_sync); | ||
| 1591 | if (orig_cpu != cpu) | ||
| 1592 | schedstat_inc(p, se.nr_wakeups_migrate); | ||
| 1593 | if (cpu == this_cpu) | ||
| 1594 | schedstat_inc(p, se.nr_wakeups_local); | ||
| 1595 | else | ||
| 1596 | schedstat_inc(p, se.nr_wakeups_remote); | ||
| 1545 | update_rq_clock(rq); | 1597 | update_rq_clock(rq); |
| 1546 | activate_task(rq, p, 1); | 1598 | activate_task(rq, p, 1); |
| 1547 | /* | 1599 | check_preempt_curr(rq, p); |
| 1548 | * Sync wakeups (i.e. those types of wakeups where the waker | ||
| 1549 | * has indicated that it will leave the CPU in short order) | ||
| 1550 | * don't trigger a preemption, if the woken up task will run on | ||
| 1551 | * this cpu. (in this case the 'I will reschedule' promise of | ||
| 1552 | * the waker guarantees that the freshly woken up task is going | ||
| 1553 | * to be considered on this CPU.) | ||
| 1554 | */ | ||
| 1555 | if (!sync || cpu != this_cpu) | ||
| 1556 | check_preempt_curr(rq, p); | ||
| 1557 | success = 1; | 1600 | success = 1; |
| 1558 | 1601 | ||
| 1559 | out_running: | 1602 | out_running: |
| @@ -1584,28 +1627,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) | |||
| 1584 | */ | 1627 | */ |
| 1585 | static void __sched_fork(struct task_struct *p) | 1628 | static void __sched_fork(struct task_struct *p) |
| 1586 | { | 1629 | { |
| 1587 | p->se.wait_start_fair = 0; | ||
| 1588 | p->se.exec_start = 0; | 1630 | p->se.exec_start = 0; |
| 1589 | p->se.sum_exec_runtime = 0; | 1631 | p->se.sum_exec_runtime = 0; |
| 1590 | p->se.prev_sum_exec_runtime = 0; | 1632 | p->se.prev_sum_exec_runtime = 0; |
| 1591 | p->se.delta_exec = 0; | ||
| 1592 | p->se.delta_fair_run = 0; | ||
| 1593 | p->se.delta_fair_sleep = 0; | ||
| 1594 | p->se.wait_runtime = 0; | ||
| 1595 | p->se.sleep_start_fair = 0; | ||
| 1596 | 1633 | ||
| 1597 | #ifdef CONFIG_SCHEDSTATS | 1634 | #ifdef CONFIG_SCHEDSTATS |
| 1598 | p->se.wait_start = 0; | 1635 | p->se.wait_start = 0; |
| 1599 | p->se.sum_wait_runtime = 0; | ||
| 1600 | p->se.sum_sleep_runtime = 0; | 1636 | p->se.sum_sleep_runtime = 0; |
| 1601 | p->se.sleep_start = 0; | 1637 | p->se.sleep_start = 0; |
| 1602 | p->se.block_start = 0; | 1638 | p->se.block_start = 0; |
| 1603 | p->se.sleep_max = 0; | 1639 | p->se.sleep_max = 0; |
| 1604 | p->se.block_max = 0; | 1640 | p->se.block_max = 0; |
| 1605 | p->se.exec_max = 0; | 1641 | p->se.exec_max = 0; |
| 1642 | p->se.slice_max = 0; | ||
| 1606 | p->se.wait_max = 0; | 1643 | p->se.wait_max = 0; |
| 1607 | p->se.wait_runtime_overruns = 0; | ||
| 1608 | p->se.wait_runtime_underruns = 0; | ||
| 1609 | #endif | 1644 | #endif |
| 1610 | 1645 | ||
| 1611 | INIT_LIST_HEAD(&p->run_list); | 1646 | INIT_LIST_HEAD(&p->run_list); |
| @@ -1636,12 +1671,14 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 1636 | #ifdef CONFIG_SMP | 1671 | #ifdef CONFIG_SMP |
| 1637 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1672 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); |
| 1638 | #endif | 1673 | #endif |
| 1639 | __set_task_cpu(p, cpu); | 1674 | set_task_cpu(p, cpu); |
| 1640 | 1675 | ||
| 1641 | /* | 1676 | /* |
| 1642 | * Make sure we do not leak PI boosting priority to the child: | 1677 | * Make sure we do not leak PI boosting priority to the child: |
| 1643 | */ | 1678 | */ |
| 1644 | p->prio = current->normal_prio; | 1679 | p->prio = current->normal_prio; |
| 1680 | if (!rt_prio(p->prio)) | ||
| 1681 | p->sched_class = &fair_sched_class; | ||
| 1645 | 1682 | ||
| 1646 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1683 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 1647 | if (likely(sched_info_on())) | 1684 | if (likely(sched_info_on())) |
| @@ -1658,12 +1695,6 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 1658 | } | 1695 | } |
| 1659 | 1696 | ||
| 1660 | /* | 1697 | /* |
| 1661 | * After fork, child runs first. (default) If set to 0 then | ||
| 1662 | * parent will (try to) run first. | ||
| 1663 | */ | ||
| 1664 | unsigned int __read_mostly sysctl_sched_child_runs_first = 1; | ||
| 1665 | |||
| 1666 | /* | ||
| 1667 | * wake_up_new_task - wake up a newly created task for the first time. | 1698 | * wake_up_new_task - wake up a newly created task for the first time. |
| 1668 | * | 1699 | * |
| 1669 | * This function will do some initial scheduler statistics housekeeping | 1700 | * This function will do some initial scheduler statistics housekeeping |
| @@ -1674,24 +1705,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 1674 | { | 1705 | { |
| 1675 | unsigned long flags; | 1706 | unsigned long flags; |
| 1676 | struct rq *rq; | 1707 | struct rq *rq; |
| 1677 | int this_cpu; | ||
| 1678 | 1708 | ||
| 1679 | rq = task_rq_lock(p, &flags); | 1709 | rq = task_rq_lock(p, &flags); |
| 1680 | BUG_ON(p->state != TASK_RUNNING); | 1710 | BUG_ON(p->state != TASK_RUNNING); |
| 1681 | this_cpu = smp_processor_id(); /* parent's CPU */ | ||
| 1682 | update_rq_clock(rq); | 1711 | update_rq_clock(rq); |
| 1683 | 1712 | ||
| 1684 | p->prio = effective_prio(p); | 1713 | p->prio = effective_prio(p); |
| 1685 | 1714 | ||
| 1686 | if (rt_prio(p->prio)) | 1715 | if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) { |
| 1687 | p->sched_class = &rt_sched_class; | ||
| 1688 | else | ||
| 1689 | p->sched_class = &fair_sched_class; | ||
| 1690 | |||
| 1691 | if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || | ||
| 1692 | (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || | ||
| 1693 | !current->se.on_rq) { | ||
| 1694 | |||
| 1695 | activate_task(rq, p, 0); | 1716 | activate_task(rq, p, 0); |
| 1696 | } else { | 1717 | } else { |
| 1697 | /* | 1718 | /* |
| @@ -1800,7 +1821,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
| 1800 | * with the lock held can cause deadlocks; see schedule() for | 1821 | * with the lock held can cause deadlocks; see schedule() for |
| 1801 | * details.) | 1822 | * details.) |
| 1802 | */ | 1823 | */ |
| 1803 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) | 1824 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) |
| 1804 | __releases(rq->lock) | 1825 | __releases(rq->lock) |
| 1805 | { | 1826 | { |
| 1806 | struct mm_struct *mm = rq->prev_mm; | 1827 | struct mm_struct *mm = rq->prev_mm; |
| @@ -1982,42 +2003,10 @@ unsigned long nr_active(void) | |||
| 1982 | */ | 2003 | */ |
| 1983 | static void update_cpu_load(struct rq *this_rq) | 2004 | static void update_cpu_load(struct rq *this_rq) |
| 1984 | { | 2005 | { |
| 1985 | u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; | 2006 | unsigned long this_load = this_rq->load.weight; |
| 1986 | unsigned long total_load = this_rq->ls.load.weight; | ||
| 1987 | unsigned long this_load = total_load; | ||
| 1988 | struct load_stat *ls = &this_rq->ls; | ||
| 1989 | int i, scale; | 2007 | int i, scale; |
| 1990 | 2008 | ||
| 1991 | this_rq->nr_load_updates++; | 2009 | this_rq->nr_load_updates++; |
| 1992 | if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) | ||
| 1993 | goto do_avg; | ||
| 1994 | |||
| 1995 | /* Update delta_fair/delta_exec fields first */ | ||
| 1996 | update_curr_load(this_rq); | ||
| 1997 | |||
| 1998 | fair_delta64 = ls->delta_fair + 1; | ||
| 1999 | ls->delta_fair = 0; | ||
| 2000 | |||
| 2001 | exec_delta64 = ls->delta_exec + 1; | ||
| 2002 | ls->delta_exec = 0; | ||
| 2003 | |||
| 2004 | sample_interval64 = this_rq->clock - ls->load_update_last; | ||
| 2005 | ls->load_update_last = this_rq->clock; | ||
| 2006 | |||
| 2007 | if ((s64)sample_interval64 < (s64)TICK_NSEC) | ||
| 2008 | sample_interval64 = TICK_NSEC; | ||
| 2009 | |||
| 2010 | if (exec_delta64 > sample_interval64) | ||
| 2011 | exec_delta64 = sample_interval64; | ||
| 2012 | |||
| 2013 | idle_delta64 = sample_interval64 - exec_delta64; | ||
| 2014 | |||
| 2015 | tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); | ||
| 2016 | tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); | ||
| 2017 | |||
| 2018 | this_load = (unsigned long)tmp64; | ||
| 2019 | |||
| 2020 | do_avg: | ||
| 2021 | 2010 | ||
| 2022 | /* Update our load: */ | 2011 | /* Update our load: */ |
| 2023 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2012 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
| @@ -2027,7 +2016,13 @@ do_avg: | |||
| 2027 | 2016 | ||
| 2028 | old_load = this_rq->cpu_load[i]; | 2017 | old_load = this_rq->cpu_load[i]; |
| 2029 | new_load = this_load; | 2018 | new_load = this_load; |
| 2030 | 2019 | /* | |
| 2020 | * Round up the averaging division if load is increasing. This | ||
| 2021 | * prevents us from getting stuck on 9 if the load is 10, for | ||
| 2022 | * example. | ||
| 2023 | */ | ||
| 2024 | if (new_load > old_load) | ||
| 2025 | new_load += scale-1; | ||
| 2031 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 2026 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; |
| 2032 | } | 2027 | } |
| 2033 | } | 2028 | } |
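The cpu_load[] update above blends each new sample as (old*(2^i - 1) + new) >> i; the added round-up matters because pure truncation can leave an increasing average stuck one below its target, exactly the 9-vs-10 case the new comment mentions. A small demo of both variants (values are arbitrary):

/*
 * Exponential decay with and without the round-up: feed a constant load
 * of 10 into an average that starts at 9 and watch which one converges.
 */
#include <stdio.h>

static unsigned long blend(unsigned long old, unsigned long new,
                           int i, int round_up)
{
    unsigned long scale = 1UL << i;

    if (round_up && new > old)
        new += scale - 1;
    return (old * (scale - 1) + new) >> i;
}

int main(void)
{
    unsigned long a = 9, b = 9;
    int step;

    for (step = 0; step < 5; step++) {
        a = blend(a, 10, 2, 0);   /* plain truncating average */
        b = blend(b, 10, 2, 1);   /* with the round-up fix    */
    }
    printf("without round-up: %lu, with round-up: %lu\n", a, b);
    return 0;
}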
| @@ -2179,13 +2174,38 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
| 2179 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 2174 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
| 2180 | * 3) are cache-hot on their current CPU. | 2175 | * 3) are cache-hot on their current CPU. |
| 2181 | */ | 2176 | */ |
| 2182 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | 2177 | if (!cpu_isset(this_cpu, p->cpus_allowed)) { |
| 2178 | schedstat_inc(p, se.nr_failed_migrations_affine); | ||
| 2183 | return 0; | 2179 | return 0; |
| 2180 | } | ||
| 2184 | *all_pinned = 0; | 2181 | *all_pinned = 0; |
| 2185 | 2182 | ||
| 2186 | if (task_running(rq, p)) | 2183 | if (task_running(rq, p)) { |
| 2184 | schedstat_inc(p, se.nr_failed_migrations_running); | ||
| 2187 | return 0; | 2185 | return 0; |
| 2186 | } | ||
| 2187 | |||
| 2188 | /* | ||
| 2189 | * Aggressive migration if: | ||
| 2190 | * 1) task is cache cold, or | ||
| 2191 | * 2) too many balance attempts have failed. | ||
| 2192 | */ | ||
| 2193 | |||
| 2194 | if (!task_hot(p, rq->clock, sd) || | ||
| 2195 | sd->nr_balance_failed > sd->cache_nice_tries) { | ||
| 2196 | #ifdef CONFIG_SCHEDSTATS | ||
| 2197 | if (task_hot(p, rq->clock, sd)) { | ||
| 2198 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
| 2199 | schedstat_inc(p, se.nr_forced_migrations); | ||
| 2200 | } | ||
| 2201 | #endif | ||
| 2202 | return 1; | ||
| 2203 | } | ||
| 2188 | 2204 | ||
| 2205 | if (task_hot(p, rq->clock, sd)) { | ||
| 2206 | schedstat_inc(p, se.nr_failed_migrations_hot); | ||
| 2207 | return 0; | ||
| 2208 | } | ||
| 2189 | return 1; | 2209 | return 1; |
| 2190 | } | 2210 | } |
| 2191 | 2211 | ||
| @@ -2264,7 +2284,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2264 | struct sched_domain *sd, enum cpu_idle_type idle, | 2284 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 2265 | int *all_pinned) | 2285 | int *all_pinned) |
| 2266 | { | 2286 | { |
| 2267 | struct sched_class *class = sched_class_highest; | 2287 | const struct sched_class *class = sched_class_highest; |
| 2268 | unsigned long total_load_moved = 0; | 2288 | unsigned long total_load_moved = 0; |
| 2269 | int this_best_prio = this_rq->curr->prio; | 2289 | int this_best_prio = this_rq->curr->prio; |
| 2270 | 2290 | ||
| @@ -2289,7 +2309,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2289 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2309 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 2290 | struct sched_domain *sd, enum cpu_idle_type idle) | 2310 | struct sched_domain *sd, enum cpu_idle_type idle) |
| 2291 | { | 2311 | { |
| 2292 | struct sched_class *class; | 2312 | const struct sched_class *class; |
| 2293 | int this_best_prio = MAX_PRIO; | 2313 | int this_best_prio = MAX_PRIO; |
| 2294 | 2314 | ||
| 2295 | for (class = sched_class_highest; class; class = class->next) | 2315 | for (class = sched_class_highest; class; class = class->next) |
| @@ -2653,7 +2673,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 2653 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2673 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
| 2654 | sd_idle = 1; | 2674 | sd_idle = 1; |
| 2655 | 2675 | ||
| 2656 | schedstat_inc(sd, lb_cnt[idle]); | 2676 | schedstat_inc(sd, lb_count[idle]); |
| 2657 | 2677 | ||
| 2658 | redo: | 2678 | redo: |
| 2659 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 2679 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
| @@ -2806,7 +2826,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
| 2806 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2826 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
| 2807 | sd_idle = 1; | 2827 | sd_idle = 1; |
| 2808 | 2828 | ||
| 2809 | schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); | 2829 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
| 2810 | redo: | 2830 | redo: |
| 2811 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 2831 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
| 2812 | &sd_idle, &cpus, NULL); | 2832 | &sd_idle, &cpus, NULL); |
| @@ -2940,7 +2960,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
| 2940 | } | 2960 | } |
| 2941 | 2961 | ||
| 2942 | if (likely(sd)) { | 2962 | if (likely(sd)) { |
| 2943 | schedstat_inc(sd, alb_cnt); | 2963 | schedstat_inc(sd, alb_count); |
| 2944 | 2964 | ||
| 2945 | if (move_one_task(target_rq, target_cpu, busiest_rq, | 2965 | if (move_one_task(target_rq, target_cpu, busiest_rq, |
| 2946 | sd, CPU_IDLE)) | 2966 | sd, CPU_IDLE)) |
| @@ -3033,7 +3053,7 @@ static DEFINE_SPINLOCK(balancing); | |||
| 3033 | * | 3053 | * |
| 3034 | * Balancing parameters are set up in arch_init_sched_domains. | 3054 | * Balancing parameters are set up in arch_init_sched_domains. |
| 3035 | */ | 3055 | */ |
| 3036 | static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) | 3056 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
| 3037 | { | 3057 | { |
| 3038 | int balance = 1; | 3058 | int balance = 1; |
| 3039 | struct rq *rq = cpu_rq(cpu); | 3059 | struct rq *rq = cpu_rq(cpu); |
| @@ -3280,6 +3300,25 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
| 3280 | } | 3300 | } |
| 3281 | 3301 | ||
| 3282 | /* | 3302 | /* |
| 3303 | * Account guest cpu time to a process. | ||
| 3304 | * @p: the process that the cpu time gets accounted to | ||
| 3305 | * @cputime: the cpu time spent in virtual machine since the last update | ||
| 3306 | */ | ||
| 3307 | void account_guest_time(struct task_struct *p, cputime_t cputime) | ||
| 3308 | { | ||
| 3309 | cputime64_t tmp; | ||
| 3310 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
| 3311 | |||
| 3312 | tmp = cputime_to_cputime64(cputime); | ||
| 3313 | |||
| 3314 | p->utime = cputime_add(p->utime, cputime); | ||
| 3315 | p->gtime = cputime_add(p->gtime, cputime); | ||
| 3316 | |||
| 3317 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
| 3318 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | ||
| 3319 | } | ||
| 3320 | |||
| 3321 | /* | ||
| 3283 | * Account system cpu time to a process. | 3322 | * Account system cpu time to a process. |
| 3284 | * @p: the process that the cpu time gets accounted to | 3323 | * @p: the process that the cpu time gets accounted to |
| 3285 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3324 | * @hardirq_offset: the offset to subtract from hardirq_count() |
| @@ -3292,6 +3331,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
| 3292 | struct rq *rq = this_rq(); | 3331 | struct rq *rq = this_rq(); |
| 3293 | cputime64_t tmp; | 3332 | cputime64_t tmp; |
| 3294 | 3333 | ||
| 3334 | if (p->flags & PF_VCPU) { | ||
| 3335 | account_guest_time(p, cputime); | ||
| 3336 | p->flags &= ~PF_VCPU; | ||
| 3337 | return; | ||
| 3338 | } | ||
| 3339 | |||
| 3295 | p->stime = cputime_add(p->stime, cputime); | 3340 | p->stime = cputime_add(p->stime, cputime); |
| 3296 | 3341 | ||
| 3297 | /* Add system time to cpustat. */ | 3342 | /* Add system time to cpustat. */ |
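The PF_VCPU branch above routes a tick that lands while the task was running guest code into account_guest_time(), which charges both the user and guest buckets and clears the flag. A toy user-space model of that flow, with cputime reduced to a plain tick counter:

/*
 * Toy model only: real cputime types and the per-cpu cpustat buckets
 * are simplified to plain counters on the task.
 */
#include <stdio.h>

#define PF_VCPU 0x1

struct task {
    unsigned int flags;
    unsigned long long utime, stime, gtime;   /* ticks, for the demo */
};

static void account_tick_system(struct task *p, unsigned long long cputime)
{
    if (p->flags & PF_VCPU) {
        /* guest path: charge user + guest time, drop the flag */
        p->utime += cputime;
        p->gtime += cputime;
        p->flags &= ~PF_VCPU;
        return;
    }
    p->stime += cputime;                      /* ordinary system time */
}

int main(void)
{
    struct task t = { .flags = PF_VCPU };

    account_tick_system(&t, 1);   /* tick lands while in guest mode */
    account_tick_system(&t, 1);   /* ordinary kernel-mode tick      */
    printf("utime=%llu stime=%llu gtime=%llu\n", t.utime, t.stime, t.gtime);
    return 0;
}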
| @@ -3430,7 +3475,13 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 3430 | 3475 | ||
| 3431 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3476 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
| 3432 | 3477 | ||
| 3433 | schedstat_inc(this_rq(), sched_cnt); | 3478 | schedstat_inc(this_rq(), sched_count); |
| 3479 | #ifdef CONFIG_SCHEDSTATS | ||
| 3480 | if (unlikely(prev->lock_depth >= 0)) { | ||
| 3481 | schedstat_inc(this_rq(), bkl_count); | ||
| 3482 | schedstat_inc(prev, sched_info.bkl_count); | ||
| 3483 | } | ||
| 3484 | #endif | ||
| 3434 | } | 3485 | } |
| 3435 | 3486 | ||
| 3436 | /* | 3487 | /* |
| @@ -3439,7 +3490,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 3439 | static inline struct task_struct * | 3490 | static inline struct task_struct * |
| 3440 | pick_next_task(struct rq *rq, struct task_struct *prev) | 3491 | pick_next_task(struct rq *rq, struct task_struct *prev) |
| 3441 | { | 3492 | { |
| 3442 | struct sched_class *class; | 3493 | const struct sched_class *class; |
| 3443 | struct task_struct *p; | 3494 | struct task_struct *p; |
| 3444 | 3495 | ||
| 3445 | /* | 3496 | /* |
| @@ -3488,9 +3539,13 @@ need_resched_nonpreemptible: | |||
| 3488 | 3539 | ||
| 3489 | schedule_debug(prev); | 3540 | schedule_debug(prev); |
| 3490 | 3541 | ||
| 3491 | spin_lock_irq(&rq->lock); | 3542 | /* |
| 3492 | clear_tsk_need_resched(prev); | 3543 | * Do the rq-clock update outside the rq lock: |
| 3544 | */ | ||
| 3545 | local_irq_disable(); | ||
| 3493 | __update_rq_clock(rq); | 3546 | __update_rq_clock(rq); |
| 3547 | spin_lock(&rq->lock); | ||
| 3548 | clear_tsk_need_resched(prev); | ||
| 3494 | 3549 | ||
| 3495 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3550 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| 3496 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3551 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
| @@ -3550,27 +3605,30 @@ asmlinkage void __sched preempt_schedule(void) | |||
| 3550 | if (likely(ti->preempt_count || irqs_disabled())) | 3605 | if (likely(ti->preempt_count || irqs_disabled())) |
| 3551 | return; | 3606 | return; |
| 3552 | 3607 | ||
| 3553 | need_resched: | 3608 | do { |
| 3554 | add_preempt_count(PREEMPT_ACTIVE); | 3609 | add_preempt_count(PREEMPT_ACTIVE); |
| 3555 | /* | 3610 | |
| 3556 | * We keep the big kernel semaphore locked, but we | 3611 | /* |
| 3557 | * clear ->lock_depth so that schedule() doesnt | 3612 | * We keep the big kernel semaphore locked, but we |
| 3558 | * auto-release the semaphore: | 3613 | * clear ->lock_depth so that schedule() doesnt |
| 3559 | */ | 3614 | * auto-release the semaphore: |
| 3615 | */ | ||
| 3560 | #ifdef CONFIG_PREEMPT_BKL | 3616 | #ifdef CONFIG_PREEMPT_BKL |
| 3561 | saved_lock_depth = task->lock_depth; | 3617 | saved_lock_depth = task->lock_depth; |
| 3562 | task->lock_depth = -1; | 3618 | task->lock_depth = -1; |
| 3563 | #endif | 3619 | #endif |
| 3564 | schedule(); | 3620 | schedule(); |
| 3565 | #ifdef CONFIG_PREEMPT_BKL | 3621 | #ifdef CONFIG_PREEMPT_BKL |
| 3566 | task->lock_depth = saved_lock_depth; | 3622 | task->lock_depth = saved_lock_depth; |
| 3567 | #endif | 3623 | #endif |
| 3568 | sub_preempt_count(PREEMPT_ACTIVE); | 3624 | sub_preempt_count(PREEMPT_ACTIVE); |
| 3569 | 3625 | ||
| 3570 | /* we could miss a preemption opportunity between schedule and now */ | 3626 | /* |
| 3571 | barrier(); | 3627 | * Check again in case we missed a preemption opportunity |
| 3572 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3628 | * between schedule and now. |
| 3573 | goto need_resched; | 3629 | */ |
| 3630 | barrier(); | ||
| 3631 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); | ||
| 3574 | } | 3632 | } |
| 3575 | EXPORT_SYMBOL(preempt_schedule); | 3633 | EXPORT_SYMBOL(preempt_schedule); |
| 3576 | 3634 | ||
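The preempt_schedule() rework above only converts the label/goto retry into a do { } while (need_resched) loop; the behaviour is intended to be identical. A minimal stand-in for that control flow, with the need-resched source simulated by a counter:

/*
 * Control-flow sketch only; preempt counts and BKL handling are noted as
 * comments where they would sit in the kernel version.
 */
#include <stdio.h>

static int fake_need_resched_source = 2;   /* pretend two wakeups race in */

static int test_need_resched(void)
{
    return fake_need_resched_source > 0;
}

static void fake_schedule(void)
{
    fake_need_resched_source--;
    printf("schedule() pass, %d preemption(s) still pending\n",
           fake_need_resched_source);
}

int main(void)
{
    do {
        /* add_preempt_count(PREEMPT_ACTIVE) would go here */
        fake_schedule();
        /* sub_preempt_count(PREEMPT_ACTIVE) would go here */
    } while (test_need_resched());   /* missed a preemption? go again */
    return 0;
}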
| @@ -3590,29 +3648,32 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
| 3590 | /* Catch callers which need to be fixed */ | 3648 | /* Catch callers which need to be fixed */ |
| 3591 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3649 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
| 3592 | 3650 | ||
| 3593 | need_resched: | 3651 | do { |
| 3594 | add_preempt_count(PREEMPT_ACTIVE); | 3652 | add_preempt_count(PREEMPT_ACTIVE); |
| 3595 | /* | 3653 | |
| 3596 | * We keep the big kernel semaphore locked, but we | 3654 | /* |
| 3597 | * clear ->lock_depth so that schedule() doesnt | 3655 | * We keep the big kernel semaphore locked, but we |
| 3598 | * auto-release the semaphore: | 3656 | * clear ->lock_depth so that schedule() doesnt |
| 3599 | */ | 3657 | * auto-release the semaphore: |
| 3658 | */ | ||
| 3600 | #ifdef CONFIG_PREEMPT_BKL | 3659 | #ifdef CONFIG_PREEMPT_BKL |
| 3601 | saved_lock_depth = task->lock_depth; | 3660 | saved_lock_depth = task->lock_depth; |
| 3602 | task->lock_depth = -1; | 3661 | task->lock_depth = -1; |
| 3603 | #endif | 3662 | #endif |
| 3604 | local_irq_enable(); | 3663 | local_irq_enable(); |
| 3605 | schedule(); | 3664 | schedule(); |
| 3606 | local_irq_disable(); | 3665 | local_irq_disable(); |
| 3607 | #ifdef CONFIG_PREEMPT_BKL | 3666 | #ifdef CONFIG_PREEMPT_BKL |
| 3608 | task->lock_depth = saved_lock_depth; | 3667 | task->lock_depth = saved_lock_depth; |
| 3609 | #endif | 3668 | #endif |
| 3610 | sub_preempt_count(PREEMPT_ACTIVE); | 3669 | sub_preempt_count(PREEMPT_ACTIVE); |
| 3611 | 3670 | ||
| 3612 | /* we could miss a preemption opportunity between schedule and now */ | 3671 | /* |
| 3613 | barrier(); | 3672 | * Check again in case we missed a preemption opportunity |
| 3614 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3673 | * between schedule and now. |
| 3615 | goto need_resched; | 3674 | */ |
| 3675 | barrier(); | ||
| 3676 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); | ||
| 3616 | } | 3677 | } |
| 3617 | 3678 | ||
| 3618 | #endif /* CONFIG_PREEMPT */ | 3679 | #endif /* CONFIG_PREEMPT */ |
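The two hunks above replace a backwards "goto need_resched" with a do/while loop that re-tests TIF_NEED_RESCHED after every pass through schedule(). A minimal userspace sketch of that retry shape, where needs_work() and do_one_pass() are made-up stand-ins for the flag test and for schedule():

#include <stdbool.h>
#include <stdio.h>

static int pending = 3;

/* stand-in for test_thread_flag(TIF_NEED_RESCHED) */
static bool needs_work(void)
{
        return pending > 0;
}

/* stand-in for schedule(): consume one unit of pending work */
static void do_one_pass(void)
{
        pending--;
        printf("pass done, %d left\n", pending);
}

int main(void)
{
        /*
         * Same shape as the rewritten preempt_schedule(): run one
         * pass, then re-check the condition and loop instead of
         * jumping back with a goto.
         */
        do {
                do_one_pass();
        } while (needs_work());

        return 0;
}

The loop form makes the "check again in case we missed a preemption opportunity" comment structural rather than relying on a label.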
| @@ -3636,10 +3697,9 @@ EXPORT_SYMBOL(default_wake_function); | |||
| 3636 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 3697 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
| 3637 | int nr_exclusive, int sync, void *key) | 3698 | int nr_exclusive, int sync, void *key) |
| 3638 | { | 3699 | { |
| 3639 | struct list_head *tmp, *next; | 3700 | wait_queue_t *curr, *next; |
| 3640 | 3701 | ||
| 3641 | list_for_each_safe(tmp, next, &q->task_list) { | 3702 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
| 3642 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | ||
| 3643 | unsigned flags = curr->flags; | 3703 | unsigned flags = curr->flags; |
| 3644 | 3704 | ||
| 3645 | if (curr->func(curr, mode, sync, key) && | 3705 | if (curr->func(curr, mode, sync, key) && |
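__wake_up_common() now iterates with list_for_each_entry_safe(), so the current waiter may be unlinked by its wakeup callback without breaking the walk. A small userspace sketch of the same "grab the next node before touching the current one" idea, using a hand-rolled singly linked list rather than the kernel's list.h:

#include <stdio.h>
#include <stdlib.h>

struct node {
        int val;
        struct node *next;
};

int main(void)
{
        struct node *head = NULL, *curr, *next;
        int i;

        /* build a short list: 4 -> 3 -> 2 -> 1 -> 0 */
        for (i = 0; i < 5; i++) {
                curr = malloc(sizeof(*curr));
                curr->val = i;
                curr->next = head;
                head = curr;
        }

        /*
         * Safe traversal: fetch 'next' before freeing 'curr' --
         * the same reason the wakeup path uses the _safe iterator.
         */
        for (curr = head; curr; curr = next) {
                next = curr->next;
                printf("visiting %d\n", curr->val);
                free(curr);
        }

        return 0;
}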
| @@ -3729,206 +3789,116 @@ void fastcall complete_all(struct completion *x) | |||
| 3729 | } | 3789 | } |
| 3730 | EXPORT_SYMBOL(complete_all); | 3790 | EXPORT_SYMBOL(complete_all); |
| 3731 | 3791 | ||
| 3732 | void fastcall __sched wait_for_completion(struct completion *x) | 3792 | static inline long __sched |
| 3733 | { | 3793 | do_wait_for_common(struct completion *x, long timeout, int state) |
| 3734 | might_sleep(); | ||
| 3735 | |||
| 3736 | spin_lock_irq(&x->wait.lock); | ||
| 3737 | if (!x->done) { | ||
| 3738 | DECLARE_WAITQUEUE(wait, current); | ||
| 3739 | |||
| 3740 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
| 3741 | __add_wait_queue_tail(&x->wait, &wait); | ||
| 3742 | do { | ||
| 3743 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 3744 | spin_unlock_irq(&x->wait.lock); | ||
| 3745 | schedule(); | ||
| 3746 | spin_lock_irq(&x->wait.lock); | ||
| 3747 | } while (!x->done); | ||
| 3748 | __remove_wait_queue(&x->wait, &wait); | ||
| 3749 | } | ||
| 3750 | x->done--; | ||
| 3751 | spin_unlock_irq(&x->wait.lock); | ||
| 3752 | } | ||
| 3753 | EXPORT_SYMBOL(wait_for_completion); | ||
| 3754 | |||
| 3755 | unsigned long fastcall __sched | ||
| 3756 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
| 3757 | { | 3794 | { |
| 3758 | might_sleep(); | ||
| 3759 | |||
| 3760 | spin_lock_irq(&x->wait.lock); | ||
| 3761 | if (!x->done) { | 3795 | if (!x->done) { |
| 3762 | DECLARE_WAITQUEUE(wait, current); | 3796 | DECLARE_WAITQUEUE(wait, current); |
| 3763 | 3797 | ||
| 3764 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3798 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
| 3765 | __add_wait_queue_tail(&x->wait, &wait); | 3799 | __add_wait_queue_tail(&x->wait, &wait); |
| 3766 | do { | 3800 | do { |
| 3767 | __set_current_state(TASK_UNINTERRUPTIBLE); | 3801 | if (state == TASK_INTERRUPTIBLE && |
| 3802 | signal_pending(current)) { | ||
| 3803 | __remove_wait_queue(&x->wait, &wait); | ||
| 3804 | return -ERESTARTSYS; | ||
| 3805 | } | ||
| 3806 | __set_current_state(state); | ||
| 3768 | spin_unlock_irq(&x->wait.lock); | 3807 | spin_unlock_irq(&x->wait.lock); |
| 3769 | timeout = schedule_timeout(timeout); | 3808 | timeout = schedule_timeout(timeout); |
| 3770 | spin_lock_irq(&x->wait.lock); | 3809 | spin_lock_irq(&x->wait.lock); |
| 3771 | if (!timeout) { | 3810 | if (!timeout) { |
| 3772 | __remove_wait_queue(&x->wait, &wait); | 3811 | __remove_wait_queue(&x->wait, &wait); |
| 3773 | goto out; | 3812 | return timeout; |
| 3774 | } | 3813 | } |
| 3775 | } while (!x->done); | 3814 | } while (!x->done); |
| 3776 | __remove_wait_queue(&x->wait, &wait); | 3815 | __remove_wait_queue(&x->wait, &wait); |
| 3777 | } | 3816 | } |
| 3778 | x->done--; | 3817 | x->done--; |
| 3779 | out: | ||
| 3780 | spin_unlock_irq(&x->wait.lock); | ||
| 3781 | return timeout; | 3818 | return timeout; |
| 3782 | } | 3819 | } |
| 3783 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
| 3784 | 3820 | ||
| 3785 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) | 3821 | static long __sched |
| 3822 | wait_for_common(struct completion *x, long timeout, int state) | ||
| 3786 | { | 3823 | { |
| 3787 | int ret = 0; | ||
| 3788 | |||
| 3789 | might_sleep(); | 3824 | might_sleep(); |
| 3790 | 3825 | ||
| 3791 | spin_lock_irq(&x->wait.lock); | 3826 | spin_lock_irq(&x->wait.lock); |
| 3792 | if (!x->done) { | 3827 | timeout = do_wait_for_common(x, timeout, state); |
| 3793 | DECLARE_WAITQUEUE(wait, current); | ||
| 3794 | |||
| 3795 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
| 3796 | __add_wait_queue_tail(&x->wait, &wait); | ||
| 3797 | do { | ||
| 3798 | if (signal_pending(current)) { | ||
| 3799 | ret = -ERESTARTSYS; | ||
| 3800 | __remove_wait_queue(&x->wait, &wait); | ||
| 3801 | goto out; | ||
| 3802 | } | ||
| 3803 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 3804 | spin_unlock_irq(&x->wait.lock); | ||
| 3805 | schedule(); | ||
| 3806 | spin_lock_irq(&x->wait.lock); | ||
| 3807 | } while (!x->done); | ||
| 3808 | __remove_wait_queue(&x->wait, &wait); | ||
| 3809 | } | ||
| 3810 | x->done--; | ||
| 3811 | out: | ||
| 3812 | spin_unlock_irq(&x->wait.lock); | 3828 | spin_unlock_irq(&x->wait.lock); |
| 3829 | return timeout; | ||
| 3830 | } | ||
| 3813 | 3831 | ||
| 3814 | return ret; | 3832 | void fastcall __sched wait_for_completion(struct completion *x) |
| 3833 | { | ||
| 3834 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
| 3815 | } | 3835 | } |
| 3816 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 3836 | EXPORT_SYMBOL(wait_for_completion); |
| 3817 | 3837 | ||
| 3818 | unsigned long fastcall __sched | 3838 | unsigned long fastcall __sched |
| 3819 | wait_for_completion_interruptible_timeout(struct completion *x, | 3839 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
| 3820 | unsigned long timeout) | ||
| 3821 | { | 3840 | { |
| 3822 | might_sleep(); | 3841 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); |
| 3823 | |||
| 3824 | spin_lock_irq(&x->wait.lock); | ||
| 3825 | if (!x->done) { | ||
| 3826 | DECLARE_WAITQUEUE(wait, current); | ||
| 3827 | |||
| 3828 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
| 3829 | __add_wait_queue_tail(&x->wait, &wait); | ||
| 3830 | do { | ||
| 3831 | if (signal_pending(current)) { | ||
| 3832 | timeout = -ERESTARTSYS; | ||
| 3833 | __remove_wait_queue(&x->wait, &wait); | ||
| 3834 | goto out; | ||
| 3835 | } | ||
| 3836 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 3837 | spin_unlock_irq(&x->wait.lock); | ||
| 3838 | timeout = schedule_timeout(timeout); | ||
| 3839 | spin_lock_irq(&x->wait.lock); | ||
| 3840 | if (!timeout) { | ||
| 3841 | __remove_wait_queue(&x->wait, &wait); | ||
| 3842 | goto out; | ||
| 3843 | } | ||
| 3844 | } while (!x->done); | ||
| 3845 | __remove_wait_queue(&x->wait, &wait); | ||
| 3846 | } | ||
| 3847 | x->done--; | ||
| 3848 | out: | ||
| 3849 | spin_unlock_irq(&x->wait.lock); | ||
| 3850 | return timeout; | ||
| 3851 | } | 3842 | } |
| 3852 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 3843 | EXPORT_SYMBOL(wait_for_completion_timeout); |
| 3853 | 3844 | ||
| 3854 | static inline void | 3845 | int __sched wait_for_completion_interruptible(struct completion *x) |
| 3855 | sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) | ||
| 3856 | { | 3846 | { |
| 3857 | spin_lock_irqsave(&q->lock, *flags); | 3847 | return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
| 3858 | __add_wait_queue(q, wait); | ||
| 3859 | spin_unlock(&q->lock); | ||
| 3860 | } | 3848 | } |
| 3849 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
| 3861 | 3850 | ||
| 3862 | static inline void | 3851 | unsigned long fastcall __sched |
| 3863 | sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) | 3852 | wait_for_completion_interruptible_timeout(struct completion *x, |
| 3853 | unsigned long timeout) | ||
| 3864 | { | 3854 | { |
| 3865 | spin_lock_irq(&q->lock); | 3855 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); |
| 3866 | __remove_wait_queue(q, wait); | ||
| 3867 | spin_unlock_irqrestore(&q->lock, *flags); | ||
| 3868 | } | 3856 | } |
| 3857 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
| 3869 | 3858 | ||
| 3870 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | 3859 | static long __sched |
| 3860 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | ||
| 3871 | { | 3861 | { |
| 3872 | unsigned long flags; | 3862 | unsigned long flags; |
| 3873 | wait_queue_t wait; | 3863 | wait_queue_t wait; |
| 3874 | 3864 | ||
| 3875 | init_waitqueue_entry(&wait, current); | 3865 | init_waitqueue_entry(&wait, current); |
| 3876 | 3866 | ||
| 3877 | current->state = TASK_INTERRUPTIBLE; | 3867 | __set_current_state(state); |
| 3878 | 3868 | ||
| 3879 | sleep_on_head(q, &wait, &flags); | 3869 | spin_lock_irqsave(&q->lock, flags); |
| 3880 | schedule(); | 3870 | __add_wait_queue(q, &wait); |
| 3881 | sleep_on_tail(q, &wait, &flags); | 3871 | spin_unlock(&q->lock); |
| 3872 | timeout = schedule_timeout(timeout); | ||
| 3873 | spin_lock_irq(&q->lock); | ||
| 3874 | __remove_wait_queue(q, &wait); | ||
| 3875 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 3876 | |||
| 3877 | return timeout; | ||
| 3878 | } | ||
| 3879 | |||
| 3880 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | ||
| 3881 | { | ||
| 3882 | sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
| 3882 | } | 3883 | } |
| 3883 | EXPORT_SYMBOL(interruptible_sleep_on); | 3884 | EXPORT_SYMBOL(interruptible_sleep_on); |
| 3884 | 3885 | ||
| 3885 | long __sched | 3886 | long __sched |
| 3886 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3887 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
| 3887 | { | 3888 | { |
| 3888 | unsigned long flags; | 3889 | return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); |
| 3889 | wait_queue_t wait; | ||
| 3890 | |||
| 3891 | init_waitqueue_entry(&wait, current); | ||
| 3892 | |||
| 3893 | current->state = TASK_INTERRUPTIBLE; | ||
| 3894 | |||
| 3895 | sleep_on_head(q, &wait, &flags); | ||
| 3896 | timeout = schedule_timeout(timeout); | ||
| 3897 | sleep_on_tail(q, &wait, &flags); | ||
| 3898 | |||
| 3899 | return timeout; | ||
| 3900 | } | 3890 | } |
| 3901 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3891 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
| 3902 | 3892 | ||
| 3903 | void __sched sleep_on(wait_queue_head_t *q) | 3893 | void __sched sleep_on(wait_queue_head_t *q) |
| 3904 | { | 3894 | { |
| 3905 | unsigned long flags; | 3895 | sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
| 3906 | wait_queue_t wait; | ||
| 3907 | |||
| 3908 | init_waitqueue_entry(&wait, current); | ||
| 3909 | |||
| 3910 | current->state = TASK_UNINTERRUPTIBLE; | ||
| 3911 | |||
| 3912 | sleep_on_head(q, &wait, &flags); | ||
| 3913 | schedule(); | ||
| 3914 | sleep_on_tail(q, &wait, &flags); | ||
| 3915 | } | 3896 | } |
| 3916 | EXPORT_SYMBOL(sleep_on); | 3897 | EXPORT_SYMBOL(sleep_on); |
| 3917 | 3898 | ||
| 3918 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3899 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
| 3919 | { | 3900 | { |
| 3920 | unsigned long flags; | 3901 | return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); |
| 3921 | wait_queue_t wait; | ||
| 3922 | |||
| 3923 | init_waitqueue_entry(&wait, current); | ||
| 3924 | |||
| 3925 | current->state = TASK_UNINTERRUPTIBLE; | ||
| 3926 | |||
| 3927 | sleep_on_head(q, &wait, &flags); | ||
| 3928 | timeout = schedule_timeout(timeout); | ||
| 3929 | sleep_on_tail(q, &wait, &flags); | ||
| 3930 | |||
| 3931 | return timeout; | ||
| 3932 | } | 3902 | } |
| 3933 | EXPORT_SYMBOL(sleep_on_timeout); | 3903 | EXPORT_SYMBOL(sleep_on_timeout); |
| 3934 | 3904 | ||
| @@ -3947,7 +3917,7 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
| 3947 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3917 | void rt_mutex_setprio(struct task_struct *p, int prio) |
| 3948 | { | 3918 | { |
| 3949 | unsigned long flags; | 3919 | unsigned long flags; |
| 3950 | int oldprio, on_rq; | 3920 | int oldprio, on_rq, running; |
| 3951 | struct rq *rq; | 3921 | struct rq *rq; |
| 3952 | 3922 | ||
| 3953 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 3923 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
| @@ -3957,8 +3927,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3957 | 3927 | ||
| 3958 | oldprio = p->prio; | 3928 | oldprio = p->prio; |
| 3959 | on_rq = p->se.on_rq; | 3929 | on_rq = p->se.on_rq; |
| 3960 | if (on_rq) | 3930 | running = task_running(rq, p); |
| 3931 | if (on_rq) { | ||
| 3961 | dequeue_task(rq, p, 0); | 3932 | dequeue_task(rq, p, 0); |
| 3933 | if (running) | ||
| 3934 | p->sched_class->put_prev_task(rq, p); | ||
| 3935 | } | ||
| 3962 | 3936 | ||
| 3963 | if (rt_prio(prio)) | 3937 | if (rt_prio(prio)) |
| 3964 | p->sched_class = &rt_sched_class; | 3938 | p->sched_class = &rt_sched_class; |
| @@ -3968,13 +3942,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3968 | p->prio = prio; | 3942 | p->prio = prio; |
| 3969 | 3943 | ||
| 3970 | if (on_rq) { | 3944 | if (on_rq) { |
| 3945 | if (running) | ||
| 3946 | p->sched_class->set_curr_task(rq); | ||
| 3971 | enqueue_task(rq, p, 0); | 3947 | enqueue_task(rq, p, 0); |
| 3972 | /* | 3948 | /* |
| 3973 | * Reschedule if we are currently running on this runqueue and | 3949 | * Reschedule if we are currently running on this runqueue and |
| 3974 | * our priority decreased, or if we are not currently running on | 3950 | * our priority decreased, or if we are not currently running on |
| 3975 | * this runqueue and our priority is higher than the current's | 3951 | * this runqueue and our priority is higher than the current's |
| 3976 | */ | 3952 | */ |
| 3977 | if (task_running(rq, p)) { | 3953 | if (running) { |
| 3978 | if (p->prio > oldprio) | 3954 | if (p->prio > oldprio) |
| 3979 | resched_task(rq->curr); | 3955 | resched_task(rq->curr); |
| 3980 | } else { | 3956 | } else { |
| @@ -4138,7 +4114,7 @@ struct task_struct *idle_task(int cpu) | |||
| 4138 | * find_process_by_pid - find a process with a matching PID value. | 4114 | * find_process_by_pid - find a process with a matching PID value. |
| 4139 | * @pid: the pid in question. | 4115 | * @pid: the pid in question. |
| 4140 | */ | 4116 | */ |
| 4141 | static inline struct task_struct *find_process_by_pid(pid_t pid) | 4117 | static struct task_struct *find_process_by_pid(pid_t pid) |
| 4142 | { | 4118 | { |
| 4143 | return pid ? find_task_by_pid(pid) : current; | 4119 | return pid ? find_task_by_pid(pid) : current; |
| 4144 | } | 4120 | } |
| @@ -4180,7 +4156,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
| 4180 | int sched_setscheduler(struct task_struct *p, int policy, | 4156 | int sched_setscheduler(struct task_struct *p, int policy, |
| 4181 | struct sched_param *param) | 4157 | struct sched_param *param) |
| 4182 | { | 4158 | { |
| 4183 | int retval, oldprio, oldpolicy = -1, on_rq; | 4159 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
| 4184 | unsigned long flags; | 4160 | unsigned long flags; |
| 4185 | struct rq *rq; | 4161 | struct rq *rq; |
| 4186 | 4162 | ||
| @@ -4262,18 +4238,26 @@ recheck: | |||
| 4262 | } | 4238 | } |
| 4263 | update_rq_clock(rq); | 4239 | update_rq_clock(rq); |
| 4264 | on_rq = p->se.on_rq; | 4240 | on_rq = p->se.on_rq; |
| 4265 | if (on_rq) | 4241 | running = task_running(rq, p); |
| 4242 | if (on_rq) { | ||
| 4266 | deactivate_task(rq, p, 0); | 4243 | deactivate_task(rq, p, 0); |
| 4244 | if (running) | ||
| 4245 | p->sched_class->put_prev_task(rq, p); | ||
| 4246 | } | ||
| 4247 | |||
| 4267 | oldprio = p->prio; | 4248 | oldprio = p->prio; |
| 4268 | __setscheduler(rq, p, policy, param->sched_priority); | 4249 | __setscheduler(rq, p, policy, param->sched_priority); |
| 4250 | |||
| 4269 | if (on_rq) { | 4251 | if (on_rq) { |
| 4252 | if (running) | ||
| 4253 | p->sched_class->set_curr_task(rq); | ||
| 4270 | activate_task(rq, p, 0); | 4254 | activate_task(rq, p, 0); |
| 4271 | /* | 4255 | /* |
| 4272 | * Reschedule if we are currently running on this runqueue and | 4256 | * Reschedule if we are currently running on this runqueue and |
| 4273 | * our priority decreased, or if we are not currently running on | 4257 | * our priority decreased, or if we are not currently running on |
| 4274 | * this runqueue and our priority is higher than the current's | 4258 | * this runqueue and our priority is higher than the current's |
| 4275 | */ | 4259 | */ |
| 4276 | if (task_running(rq, p)) { | 4260 | if (running) { |
| 4277 | if (p->prio > oldprio) | 4261 | if (p->prio > oldprio) |
| 4278 | resched_task(rq->curr); | 4262 | resched_task(rq->curr); |
| 4279 | } else { | 4263 | } else { |
| @@ -4344,10 +4328,10 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | |||
| 4344 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4328 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
| 4345 | { | 4329 | { |
| 4346 | struct task_struct *p; | 4330 | struct task_struct *p; |
| 4347 | int retval = -EINVAL; | 4331 | int retval; |
| 4348 | 4332 | ||
| 4349 | if (pid < 0) | 4333 | if (pid < 0) |
| 4350 | goto out_nounlock; | 4334 | return -EINVAL; |
| 4351 | 4335 | ||
| 4352 | retval = -ESRCH; | 4336 | retval = -ESRCH; |
| 4353 | read_lock(&tasklist_lock); | 4337 | read_lock(&tasklist_lock); |
| @@ -4358,8 +4342,6 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) | |||
| 4358 | retval = p->policy; | 4342 | retval = p->policy; |
| 4359 | } | 4343 | } |
| 4360 | read_unlock(&tasklist_lock); | 4344 | read_unlock(&tasklist_lock); |
| 4361 | |||
| 4362 | out_nounlock: | ||
| 4363 | return retval; | 4345 | return retval; |
| 4364 | } | 4346 | } |
| 4365 | 4347 | ||
| @@ -4372,10 +4354,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | |||
| 4372 | { | 4354 | { |
| 4373 | struct sched_param lp; | 4355 | struct sched_param lp; |
| 4374 | struct task_struct *p; | 4356 | struct task_struct *p; |
| 4375 | int retval = -EINVAL; | 4357 | int retval; |
| 4376 | 4358 | ||
| 4377 | if (!param || pid < 0) | 4359 | if (!param || pid < 0) |
| 4378 | goto out_nounlock; | 4360 | return -EINVAL; |
| 4379 | 4361 | ||
| 4380 | read_lock(&tasklist_lock); | 4362 | read_lock(&tasklist_lock); |
| 4381 | p = find_process_by_pid(pid); | 4363 | p = find_process_by_pid(pid); |
| @@ -4395,7 +4377,6 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | |||
| 4395 | */ | 4377 | */ |
| 4396 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | 4378 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; |
| 4397 | 4379 | ||
| 4398 | out_nounlock: | ||
| 4399 | return retval; | 4380 | return retval; |
| 4400 | 4381 | ||
| 4401 | out_unlock: | 4382 | out_unlock: |
| @@ -4555,8 +4536,8 @@ asmlinkage long sys_sched_yield(void) | |||
| 4555 | { | 4536 | { |
| 4556 | struct rq *rq = this_rq_lock(); | 4537 | struct rq *rq = this_rq_lock(); |
| 4557 | 4538 | ||
| 4558 | schedstat_inc(rq, yld_cnt); | 4539 | schedstat_inc(rq, yld_count); |
| 4559 | current->sched_class->yield_task(rq, current); | 4540 | current->sched_class->yield_task(rq); |
| 4560 | 4541 | ||
| 4561 | /* | 4542 | /* |
| 4562 | * Since we are going to call schedule() anyway, there's | 4543 | * Since we are going to call schedule() anyway, there's |
| @@ -4750,11 +4731,12 @@ asmlinkage | |||
| 4750 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4731 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
| 4751 | { | 4732 | { |
| 4752 | struct task_struct *p; | 4733 | struct task_struct *p; |
| 4753 | int retval = -EINVAL; | 4734 | unsigned int time_slice; |
| 4735 | int retval; | ||
| 4754 | struct timespec t; | 4736 | struct timespec t; |
| 4755 | 4737 | ||
| 4756 | if (pid < 0) | 4738 | if (pid < 0) |
| 4757 | goto out_nounlock; | 4739 | return -EINVAL; |
| 4758 | 4740 | ||
| 4759 | retval = -ESRCH; | 4741 | retval = -ESRCH; |
| 4760 | read_lock(&tasklist_lock); | 4742 | read_lock(&tasklist_lock); |
| @@ -4766,12 +4748,24 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
| 4766 | if (retval) | 4748 | if (retval) |
| 4767 | goto out_unlock; | 4749 | goto out_unlock; |
| 4768 | 4750 | ||
| 4769 | jiffies_to_timespec(p->policy == SCHED_FIFO ? | 4751 | if (p->policy == SCHED_FIFO) |
| 4770 | 0 : static_prio_timeslice(p->static_prio), &t); | 4752 | time_slice = 0; |
| 4753 | else if (p->policy == SCHED_RR) | ||
| 4754 | time_slice = DEF_TIMESLICE; | ||
| 4755 | else { | ||
| 4756 | struct sched_entity *se = &p->se; | ||
| 4757 | unsigned long flags; | ||
| 4758 | struct rq *rq; | ||
| 4759 | |||
| 4760 | rq = task_rq_lock(p, &flags); | ||
| 4761 | time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); | ||
| 4762 | task_rq_unlock(rq, &flags); | ||
| 4763 | } | ||
| 4771 | read_unlock(&tasklist_lock); | 4764 | read_unlock(&tasklist_lock); |
| 4765 | jiffies_to_timespec(time_slice, &t); | ||
| 4772 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4766 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
| 4773 | out_nounlock: | ||
| 4774 | return retval; | 4767 | return retval; |
| 4768 | |||
| 4775 | out_unlock: | 4769 | out_unlock: |
| 4776 | read_unlock(&tasklist_lock); | 4770 | read_unlock(&tasklist_lock); |
| 4777 | return retval; | 4771 | return retval; |
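After this hunk, sys_sched_rr_get_interval() reports 0 for SCHED_FIFO, the fixed DEF_TIMESLICE for SCHED_RR, and a CFS slice converted from nanoseconds for everything else. Userspace still reads the value through the ordinary syscall; a minimal caller querying the current process might look like:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        /* pid 0 means "the calling process", as in the kernel helper */
        if (sched_rr_get_interval(0, &ts) != 0) {
                perror("sched_rr_get_interval");
                return 1;
        }

        printf("timeslice: %ld.%09ld s\n",
               (long)ts.tv_sec, (long)ts.tv_nsec);
        return 0;
}

For a plain SCHED_OTHER task the printed value is now a per-task CFS estimate rather than the old static_prio_timeslice() table entry.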
| @@ -4900,32 +4894,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 4900 | */ | 4894 | */ |
| 4901 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 4895 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
| 4902 | 4896 | ||
| 4903 | /* | ||
| 4904 | * Increase the granularity value when there are more CPUs, | ||
| 4905 | * because with more CPUs the 'effective latency' as visible | ||
| 4906 | * to users decreases. But the relationship is not linear, | ||
| 4907 | * so pick a second-best guess by going with the log2 of the | ||
| 4908 | * number of CPUs. | ||
| 4909 | * | ||
| 4910 | * This idea comes from the SD scheduler of Con Kolivas: | ||
| 4911 | */ | ||
| 4912 | static inline void sched_init_granularity(void) | ||
| 4913 | { | ||
| 4914 | unsigned int factor = 1 + ilog2(num_online_cpus()); | ||
| 4915 | const unsigned long limit = 100000000; | ||
| 4916 | |||
| 4917 | sysctl_sched_min_granularity *= factor; | ||
| 4918 | if (sysctl_sched_min_granularity > limit) | ||
| 4919 | sysctl_sched_min_granularity = limit; | ||
| 4920 | |||
| 4921 | sysctl_sched_latency *= factor; | ||
| 4922 | if (sysctl_sched_latency > limit) | ||
| 4923 | sysctl_sched_latency = limit; | ||
| 4924 | |||
| 4925 | sysctl_sched_runtime_limit = sysctl_sched_latency; | ||
| 4926 | sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; | ||
| 4927 | } | ||
| 4928 | |||
| 4929 | #ifdef CONFIG_SMP | 4897 | #ifdef CONFIG_SMP |
| 4930 | /* | 4898 | /* |
| 4931 | * This is how migration works: | 4899 | * This is how migration works: |
| @@ -5103,35 +5071,34 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
| 5103 | struct rq *rq; | 5071 | struct rq *rq; |
| 5104 | int dest_cpu; | 5072 | int dest_cpu; |
| 5105 | 5073 | ||
| 5106 | restart: | 5074 | do { |
| 5107 | /* On same node? */ | 5075 | /* On same node? */ |
| 5108 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 5076 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
| 5109 | cpus_and(mask, mask, p->cpus_allowed); | 5077 | cpus_and(mask, mask, p->cpus_allowed); |
| 5110 | dest_cpu = any_online_cpu(mask); | 5078 | dest_cpu = any_online_cpu(mask); |
| 5111 | 5079 | ||
| 5112 | /* On any allowed CPU? */ | 5080 | /* On any allowed CPU? */ |
| 5113 | if (dest_cpu == NR_CPUS) | 5081 | if (dest_cpu == NR_CPUS) |
| 5114 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5082 | dest_cpu = any_online_cpu(p->cpus_allowed); |
| 5115 | 5083 | ||
| 5116 | /* No more Mr. Nice Guy. */ | 5084 | /* No more Mr. Nice Guy. */ |
| 5117 | if (dest_cpu == NR_CPUS) { | 5085 | if (dest_cpu == NR_CPUS) { |
| 5118 | rq = task_rq_lock(p, &flags); | 5086 | rq = task_rq_lock(p, &flags); |
| 5119 | cpus_setall(p->cpus_allowed); | 5087 | cpus_setall(p->cpus_allowed); |
| 5120 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5088 | dest_cpu = any_online_cpu(p->cpus_allowed); |
| 5121 | task_rq_unlock(rq, &flags); | 5089 | task_rq_unlock(rq, &flags); |
| 5122 | 5090 | ||
| 5123 | /* | 5091 | /* |
| 5124 | * Don't tell them about moving exiting tasks or | 5092 | * Don't tell them about moving exiting tasks or |
| 5125 | * kernel threads (both mm NULL), since they never | 5093 | * kernel threads (both mm NULL), since they never |
| 5126 | * leave kernel. | 5094 | * leave kernel. |
| 5127 | */ | 5095 | */ |
| 5128 | if (p->mm && printk_ratelimit()) | 5096 | if (p->mm && printk_ratelimit()) |
| 5129 | printk(KERN_INFO "process %d (%s) no " | 5097 | printk(KERN_INFO "process %d (%s) no " |
| 5130 | "longer affine to cpu%d\n", | 5098 | "longer affine to cpu%d\n", |
| 5131 | p->pid, p->comm, dead_cpu); | 5099 | p->pid, p->comm, dead_cpu); |
| 5132 | } | 5100 | } |
| 5133 | if (!__migrate_task(p, dead_cpu, dest_cpu)) | 5101 | } while (!__migrate_task(p, dead_cpu, dest_cpu)); |
| 5134 | goto restart; | ||
| 5135 | } | 5102 | } |
| 5136 | 5103 | ||
| 5137 | /* | 5104 | /* |
| @@ -5173,6 +5140,20 @@ static void migrate_live_tasks(int src_cpu) | |||
| 5173 | } | 5140 | } |
| 5174 | 5141 | ||
| 5175 | /* | 5142 | /* |
| 5143 | * activate_idle_task - move idle task to the _front_ of runqueue. | ||
| 5144 | */ | ||
| 5145 | static void activate_idle_task(struct task_struct *p, struct rq *rq) | ||
| 5146 | { | ||
| 5147 | update_rq_clock(rq); | ||
| 5148 | |||
| 5149 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
| 5150 | rq->nr_uninterruptible--; | ||
| 5151 | |||
| 5152 | enqueue_task(rq, p, 0); | ||
| 5153 | inc_nr_running(p, rq); | ||
| 5154 | } | ||
| 5155 | |||
| 5156 | /* | ||
| 5176 | * Schedules idle task to be the next runnable task on current CPU. | 5157 | * Schedules idle task to be the next runnable task on current CPU. |
| 5177 | * It does so by boosting its priority to highest possible and adding it to | 5158 | * It does so by boosting its priority to highest possible and adding it to |
| 5178 | * the _front_ of the runqueue. Used by CPU offline code. | 5159 | * the _front_ of the runqueue. Used by CPU offline code. |
| @@ -5284,14 +5265,23 @@ static struct ctl_table sd_ctl_root[] = { | |||
| 5284 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 5265 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
| 5285 | { | 5266 | { |
| 5286 | struct ctl_table *entry = | 5267 | struct ctl_table *entry = |
| 5287 | kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); | 5268 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); |
| 5288 | |||
| 5289 | BUG_ON(!entry); | ||
| 5290 | memset(entry, 0, n * sizeof(struct ctl_table)); | ||
| 5291 | 5269 | ||
| 5292 | return entry; | 5270 | return entry; |
| 5293 | } | 5271 | } |
| 5294 | 5272 | ||
| 5273 | static void sd_free_ctl_entry(struct ctl_table **tablep) | ||
| 5274 | { | ||
| 5275 | struct ctl_table *entry = *tablep; | ||
| 5276 | |||
| 5277 | for (entry = *tablep; entry->procname; entry++) | ||
| 5278 | if (entry->child) | ||
| 5279 | sd_free_ctl_entry(&entry->child); | ||
| 5280 | |||
| 5281 | kfree(*tablep); | ||
| 5282 | *tablep = NULL; | ||
| 5283 | } | ||
| 5284 | |||
| 5295 | static void | 5285 | static void |
| 5296 | set_table_entry(struct ctl_table *entry, | 5286 | set_table_entry(struct ctl_table *entry, |
| 5297 | const char *procname, void *data, int maxlen, | 5287 | const char *procname, void *data, int maxlen, |
| @@ -5307,7 +5297,10 @@ set_table_entry(struct ctl_table *entry, | |||
| 5307 | static struct ctl_table * | 5297 | static struct ctl_table * |
| 5308 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 5298 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
| 5309 | { | 5299 | { |
| 5310 | struct ctl_table *table = sd_alloc_ctl_entry(14); | 5300 | struct ctl_table *table = sd_alloc_ctl_entry(12); |
| 5301 | |||
| 5302 | if (table == NULL) | ||
| 5303 | return NULL; | ||
| 5311 | 5304 | ||
| 5312 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 5305 | set_table_entry(&table[0], "min_interval", &sd->min_interval, |
| 5313 | sizeof(long), 0644, proc_doulongvec_minmax); | 5306 | sizeof(long), 0644, proc_doulongvec_minmax); |
| @@ -5327,11 +5320,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
| 5327 | sizeof(int), 0644, proc_dointvec_minmax); | 5320 | sizeof(int), 0644, proc_dointvec_minmax); |
| 5328 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | 5321 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, |
| 5329 | sizeof(int), 0644, proc_dointvec_minmax); | 5322 | sizeof(int), 0644, proc_dointvec_minmax); |
| 5330 | set_table_entry(&table[10], "cache_nice_tries", | 5323 | set_table_entry(&table[9], "cache_nice_tries", |
| 5331 | &sd->cache_nice_tries, | 5324 | &sd->cache_nice_tries, |
| 5332 | sizeof(int), 0644, proc_dointvec_minmax); | 5325 | sizeof(int), 0644, proc_dointvec_minmax); |
| 5333 | set_table_entry(&table[12], "flags", &sd->flags, | 5326 | set_table_entry(&table[10], "flags", &sd->flags, |
| 5334 | sizeof(int), 0644, proc_dointvec_minmax); | 5327 | sizeof(int), 0644, proc_dointvec_minmax); |
| 5328 | /* &table[11] is terminator */ | ||
| 5335 | 5329 | ||
| 5336 | return table; | 5330 | return table; |
| 5337 | } | 5331 | } |
| @@ -5346,6 +5340,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
| 5346 | for_each_domain(cpu, sd) | 5340 | for_each_domain(cpu, sd) |
| 5347 | domain_num++; | 5341 | domain_num++; |
| 5348 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | 5342 | entry = table = sd_alloc_ctl_entry(domain_num + 1); |
| 5343 | if (table == NULL) | ||
| 5344 | return NULL; | ||
| 5349 | 5345 | ||
| 5350 | i = 0; | 5346 | i = 0; |
| 5351 | for_each_domain(cpu, sd) { | 5347 | for_each_domain(cpu, sd) { |
| @@ -5360,24 +5356,38 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
| 5360 | } | 5356 | } |
| 5361 | 5357 | ||
| 5362 | static struct ctl_table_header *sd_sysctl_header; | 5358 | static struct ctl_table_header *sd_sysctl_header; |
| 5363 | static void init_sched_domain_sysctl(void) | 5359 | static void register_sched_domain_sysctl(void) |
| 5364 | { | 5360 | { |
| 5365 | int i, cpu_num = num_online_cpus(); | 5361 | int i, cpu_num = num_online_cpus(); |
| 5366 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | 5362 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); |
| 5367 | char buf[32]; | 5363 | char buf[32]; |
| 5368 | 5364 | ||
| 5365 | if (entry == NULL) | ||
| 5366 | return; | ||
| 5367 | |||
| 5369 | sd_ctl_dir[0].child = entry; | 5368 | sd_ctl_dir[0].child = entry; |
| 5370 | 5369 | ||
| 5371 | for (i = 0; i < cpu_num; i++, entry++) { | 5370 | for_each_online_cpu(i) { |
| 5372 | snprintf(buf, 32, "cpu%d", i); | 5371 | snprintf(buf, 32, "cpu%d", i); |
| 5373 | entry->procname = kstrdup(buf, GFP_KERNEL); | 5372 | entry->procname = kstrdup(buf, GFP_KERNEL); |
| 5374 | entry->mode = 0555; | 5373 | entry->mode = 0555; |
| 5375 | entry->child = sd_alloc_ctl_cpu_table(i); | 5374 | entry->child = sd_alloc_ctl_cpu_table(i); |
| 5375 | entry++; | ||
| 5376 | } | 5376 | } |
| 5377 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | 5377 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); |
| 5378 | } | 5378 | } |
| 5379 | |||
| 5380 | static void unregister_sched_domain_sysctl(void) | ||
| 5381 | { | ||
| 5382 | unregister_sysctl_table(sd_sysctl_header); | ||
| 5383 | sd_sysctl_header = NULL; | ||
| 5384 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | ||
| 5385 | } | ||
| 5379 | #else | 5386 | #else |
| 5380 | static void init_sched_domain_sysctl(void) | 5387 | static void register_sched_domain_sysctl(void) |
| 5388 | { | ||
| 5389 | } | ||
| 5390 | static void unregister_sched_domain_sysctl(void) | ||
| 5381 | { | 5391 | { |
| 5382 | } | 5392 | } |
| 5383 | #endif | 5393 | #endif |
| @@ -5499,8 +5509,7 @@ int __init migration_init(void) | |||
| 5499 | int nr_cpu_ids __read_mostly = NR_CPUS; | 5509 | int nr_cpu_ids __read_mostly = NR_CPUS; |
| 5500 | EXPORT_SYMBOL(nr_cpu_ids); | 5510 | EXPORT_SYMBOL(nr_cpu_ids); |
| 5501 | 5511 | ||
| 5502 | #undef SCHED_DOMAIN_DEBUG | 5512 | #ifdef CONFIG_SCHED_DEBUG |
| 5503 | #ifdef SCHED_DOMAIN_DEBUG | ||
| 5504 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 5513 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
| 5505 | { | 5514 | { |
| 5506 | int level = 0; | 5515 | int level = 0; |
| @@ -5558,16 +5567,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5558 | printk("\n"); | 5567 | printk("\n"); |
| 5559 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5568 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
| 5560 | "set\n"); | 5569 | "set\n"); |
| 5570 | break; | ||
| 5561 | } | 5571 | } |
| 5562 | 5572 | ||
| 5563 | if (!cpus_weight(group->cpumask)) { | 5573 | if (!cpus_weight(group->cpumask)) { |
| 5564 | printk("\n"); | 5574 | printk("\n"); |
| 5565 | printk(KERN_ERR "ERROR: empty group\n"); | 5575 | printk(KERN_ERR "ERROR: empty group\n"); |
| 5576 | break; | ||
| 5566 | } | 5577 | } |
| 5567 | 5578 | ||
| 5568 | if (cpus_intersects(groupmask, group->cpumask)) { | 5579 | if (cpus_intersects(groupmask, group->cpumask)) { |
| 5569 | printk("\n"); | 5580 | printk("\n"); |
| 5570 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5581 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
| 5582 | break; | ||
| 5571 | } | 5583 | } |
| 5572 | 5584 | ||
| 5573 | cpus_or(groupmask, groupmask, group->cpumask); | 5585 | cpus_or(groupmask, groupmask, group->cpumask); |
| @@ -5701,7 +5713,7 @@ static int __init isolated_cpu_setup(char *str) | |||
| 5701 | return 1; | 5713 | return 1; |
| 5702 | } | 5714 | } |
| 5703 | 5715 | ||
| 5704 | __setup ("isolcpus=", isolated_cpu_setup); | 5716 | __setup("isolcpus=", isolated_cpu_setup); |
| 5705 | 5717 | ||
| 5706 | /* | 5718 | /* |
| 5707 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | 5719 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
| @@ -5930,24 +5942,23 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
| 5930 | 5942 | ||
| 5931 | if (!sg) | 5943 | if (!sg) |
| 5932 | return; | 5944 | return; |
| 5933 | next_sg: | 5945 | do { |
| 5934 | for_each_cpu_mask(j, sg->cpumask) { | 5946 | for_each_cpu_mask(j, sg->cpumask) { |
| 5935 | struct sched_domain *sd; | 5947 | struct sched_domain *sd; |
| 5936 | 5948 | ||
| 5937 | sd = &per_cpu(phys_domains, j); | 5949 | sd = &per_cpu(phys_domains, j); |
| 5938 | if (j != first_cpu(sd->groups->cpumask)) { | 5950 | if (j != first_cpu(sd->groups->cpumask)) { |
| 5939 | /* | 5951 | /* |
| 5940 | * Only add "power" once for each | 5952 | * Only add "power" once for each |
| 5941 | * physical package. | 5953 | * physical package. |
| 5942 | */ | 5954 | */ |
| 5943 | continue; | 5955 | continue; |
| 5944 | } | 5956 | } |
| 5945 | 5957 | ||
| 5946 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 5958 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); |
| 5947 | } | 5959 | } |
| 5948 | sg = sg->next; | 5960 | sg = sg->next; |
| 5949 | if (sg != group_head) | 5961 | } while (sg != group_head); |
| 5950 | goto next_sg; | ||
| 5951 | } | 5962 | } |
| 5952 | #endif | 5963 | #endif |
| 5953 | 5964 | ||
| @@ -6058,7 +6069,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6058 | /* | 6069 | /* |
| 6059 | * Allocate the per-node list of sched groups | 6070 | * Allocate the per-node list of sched groups |
| 6060 | */ | 6071 | */ |
| 6061 | sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, | 6072 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), |
| 6062 | GFP_KERNEL); | 6073 | GFP_KERNEL); |
| 6063 | if (!sched_group_nodes) { | 6074 | if (!sched_group_nodes) { |
| 6064 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6075 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
| @@ -6311,6 +6322,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) | |||
| 6311 | 6322 | ||
| 6312 | err = build_sched_domains(&cpu_default_map); | 6323 | err = build_sched_domains(&cpu_default_map); |
| 6313 | 6324 | ||
| 6325 | register_sched_domain_sysctl(); | ||
| 6326 | |||
| 6314 | return err; | 6327 | return err; |
| 6315 | } | 6328 | } |
| 6316 | 6329 | ||
| @@ -6327,6 +6340,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
| 6327 | { | 6340 | { |
| 6328 | int i; | 6341 | int i; |
| 6329 | 6342 | ||
| 6343 | unregister_sched_domain_sysctl(); | ||
| 6344 | |||
| 6330 | for_each_cpu_mask(i, *cpu_map) | 6345 | for_each_cpu_mask(i, *cpu_map) |
| 6331 | cpu_attach_domain(NULL, i); | 6346 | cpu_attach_domain(NULL, i); |
| 6332 | synchronize_sched(); | 6347 | synchronize_sched(); |
| @@ -6357,6 +6372,8 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
| 6357 | if (!err && !cpus_empty(*partition2)) | 6372 | if (!err && !cpus_empty(*partition2)) |
| 6358 | err = build_sched_domains(partition2); | 6373 | err = build_sched_domains(partition2); |
| 6359 | 6374 | ||
| 6375 | register_sched_domain_sysctl(); | ||
| 6376 | |||
| 6360 | return err; | 6377 | return err; |
| 6361 | } | 6378 | } |
| 6362 | 6379 | ||
| @@ -6488,17 +6505,13 @@ void __init sched_init_smp(void) | |||
| 6488 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 6505 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
| 6489 | hotcpu_notifier(update_sched_domains, 0); | 6506 | hotcpu_notifier(update_sched_domains, 0); |
| 6490 | 6507 | ||
| 6491 | init_sched_domain_sysctl(); | ||
| 6492 | |||
| 6493 | /* Move init over to a non-isolated CPU */ | 6508 | /* Move init over to a non-isolated CPU */ |
| 6494 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6509 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
| 6495 | BUG(); | 6510 | BUG(); |
| 6496 | sched_init_granularity(); | ||
| 6497 | } | 6511 | } |
| 6498 | #else | 6512 | #else |
| 6499 | void __init sched_init_smp(void) | 6513 | void __init sched_init_smp(void) |
| 6500 | { | 6514 | { |
| 6501 | sched_init_granularity(); | ||
| 6502 | } | 6515 | } |
| 6503 | #endif /* CONFIG_SMP */ | 6516 | #endif /* CONFIG_SMP */ |
| 6504 | 6517 | ||
| @@ -6512,28 +6525,20 @@ int in_sched_functions(unsigned long addr) | |||
| 6512 | && addr < (unsigned long)__sched_text_end); | 6525 | && addr < (unsigned long)__sched_text_end); |
| 6513 | } | 6526 | } |
| 6514 | 6527 | ||
| 6515 | static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 6528 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
| 6516 | { | 6529 | { |
| 6517 | cfs_rq->tasks_timeline = RB_ROOT; | 6530 | cfs_rq->tasks_timeline = RB_ROOT; |
| 6518 | cfs_rq->fair_clock = 1; | ||
| 6519 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6531 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 6520 | cfs_rq->rq = rq; | 6532 | cfs_rq->rq = rq; |
| 6521 | #endif | 6533 | #endif |
| 6534 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
| 6522 | } | 6535 | } |
| 6523 | 6536 | ||
| 6524 | void __init sched_init(void) | 6537 | void __init sched_init(void) |
| 6525 | { | 6538 | { |
| 6526 | u64 now = sched_clock(); | ||
| 6527 | int highest_cpu = 0; | 6539 | int highest_cpu = 0; |
| 6528 | int i, j; | 6540 | int i, j; |
| 6529 | 6541 | ||
| 6530 | /* | ||
| 6531 | * Link up the scheduling class hierarchy: | ||
| 6532 | */ | ||
| 6533 | rt_sched_class.next = &fair_sched_class; | ||
| 6534 | fair_sched_class.next = &idle_sched_class; | ||
| 6535 | idle_sched_class.next = NULL; | ||
| 6536 | |||
| 6537 | for_each_possible_cpu(i) { | 6542 | for_each_possible_cpu(i) { |
| 6538 | struct rt_prio_array *array; | 6543 | struct rt_prio_array *array; |
| 6539 | struct rq *rq; | 6544 | struct rq *rq; |
| @@ -6546,10 +6551,28 @@ void __init sched_init(void) | |||
| 6546 | init_cfs_rq(&rq->cfs, rq); | 6551 | init_cfs_rq(&rq->cfs, rq); |
| 6547 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6552 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 6548 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6553 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| 6549 | list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 6554 | { |
| 6555 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
| 6556 | struct sched_entity *se = | ||
| 6557 | &per_cpu(init_sched_entity, i); | ||
| 6558 | |||
| 6559 | init_cfs_rq_p[i] = cfs_rq; | ||
| 6560 | init_cfs_rq(cfs_rq, rq); | ||
| 6561 | cfs_rq->tg = &init_task_group; | ||
| 6562 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
| 6563 | &rq->leaf_cfs_rq_list); | ||
| 6564 | |||
| 6565 | init_sched_entity_p[i] = se; | ||
| 6566 | se->cfs_rq = &rq->cfs; | ||
| 6567 | se->my_q = cfs_rq; | ||
| 6568 | se->load.weight = init_task_group_load; | ||
| 6569 | se->load.inv_weight = | ||
| 6570 | div64_64(1ULL<<32, init_task_group_load); | ||
| 6571 | se->parent = NULL; | ||
| 6572 | } | ||
| 6573 | init_task_group.shares = init_task_group_load; | ||
| 6574 | spin_lock_init(&init_task_group.lock); | ||
| 6550 | #endif | 6575 | #endif |
| 6551 | rq->ls.load_update_last = now; | ||
| 6552 | rq->ls.load_update_start = now; | ||
| 6553 | 6576 | ||
| 6554 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 6577 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
| 6555 | rq->cpu_load[j] = 0; | 6578 | rq->cpu_load[j] = 0; |
| @@ -6634,26 +6657,40 @@ EXPORT_SYMBOL(__might_sleep); | |||
| 6634 | #endif | 6657 | #endif |
| 6635 | 6658 | ||
| 6636 | #ifdef CONFIG_MAGIC_SYSRQ | 6659 | #ifdef CONFIG_MAGIC_SYSRQ |
| 6660 | static void normalize_task(struct rq *rq, struct task_struct *p) | ||
| 6661 | { | ||
| 6662 | int on_rq; | ||
| 6663 | update_rq_clock(rq); | ||
| 6664 | on_rq = p->se.on_rq; | ||
| 6665 | if (on_rq) | ||
| 6666 | deactivate_task(rq, p, 0); | ||
| 6667 | __setscheduler(rq, p, SCHED_NORMAL, 0); | ||
| 6668 | if (on_rq) { | ||
| 6669 | activate_task(rq, p, 0); | ||
| 6670 | resched_task(rq->curr); | ||
| 6671 | } | ||
| 6672 | } | ||
| 6673 | |||
| 6637 | void normalize_rt_tasks(void) | 6674 | void normalize_rt_tasks(void) |
| 6638 | { | 6675 | { |
| 6639 | struct task_struct *g, *p; | 6676 | struct task_struct *g, *p; |
| 6640 | unsigned long flags; | 6677 | unsigned long flags; |
| 6641 | struct rq *rq; | 6678 | struct rq *rq; |
| 6642 | int on_rq; | ||
| 6643 | 6679 | ||
| 6644 | read_lock_irq(&tasklist_lock); | 6680 | read_lock_irq(&tasklist_lock); |
| 6645 | do_each_thread(g, p) { | 6681 | do_each_thread(g, p) { |
| 6646 | p->se.fair_key = 0; | 6682 | /* |
| 6647 | p->se.wait_runtime = 0; | 6683 | * Only normalize user tasks: |
| 6684 | */ | ||
| 6685 | if (!p->mm) | ||
| 6686 | continue; | ||
| 6687 | |||
| 6648 | p->se.exec_start = 0; | 6688 | p->se.exec_start = 0; |
| 6649 | p->se.wait_start_fair = 0; | ||
| 6650 | p->se.sleep_start_fair = 0; | ||
| 6651 | #ifdef CONFIG_SCHEDSTATS | 6689 | #ifdef CONFIG_SCHEDSTATS |
| 6652 | p->se.wait_start = 0; | 6690 | p->se.wait_start = 0; |
| 6653 | p->se.sleep_start = 0; | 6691 | p->se.sleep_start = 0; |
| 6654 | p->se.block_start = 0; | 6692 | p->se.block_start = 0; |
| 6655 | #endif | 6693 | #endif |
| 6656 | task_rq(p)->cfs.fair_clock = 0; | ||
| 6657 | task_rq(p)->clock = 0; | 6694 | task_rq(p)->clock = 0; |
| 6658 | 6695 | ||
| 6659 | if (!rt_task(p)) { | 6696 | if (!rt_task(p)) { |
| @@ -6668,26 +6705,9 @@ void normalize_rt_tasks(void) | |||
| 6668 | 6705 | ||
| 6669 | spin_lock_irqsave(&p->pi_lock, flags); | 6706 | spin_lock_irqsave(&p->pi_lock, flags); |
| 6670 | rq = __task_rq_lock(p); | 6707 | rq = __task_rq_lock(p); |
| 6671 | #ifdef CONFIG_SMP | ||
| 6672 | /* | ||
| 6673 | * Do not touch the migration thread: | ||
| 6674 | */ | ||
| 6675 | if (p == rq->migration_thread) | ||
| 6676 | goto out_unlock; | ||
| 6677 | #endif | ||
| 6678 | 6708 | ||
| 6679 | update_rq_clock(rq); | 6709 | normalize_task(rq, p); |
| 6680 | on_rq = p->se.on_rq; | 6710 | |
| 6681 | if (on_rq) | ||
| 6682 | deactivate_task(rq, p, 0); | ||
| 6683 | __setscheduler(rq, p, SCHED_NORMAL, 0); | ||
| 6684 | if (on_rq) { | ||
| 6685 | activate_task(rq, p, 0); | ||
| 6686 | resched_task(rq->curr); | ||
| 6687 | } | ||
| 6688 | #ifdef CONFIG_SMP | ||
| 6689 | out_unlock: | ||
| 6690 | #endif | ||
| 6691 | __task_rq_unlock(rq); | 6711 | __task_rq_unlock(rq); |
| 6692 | spin_unlock_irqrestore(&p->pi_lock, flags); | 6712 | spin_unlock_irqrestore(&p->pi_lock, flags); |
| 6693 | } while_each_thread(g, p); | 6713 | } while_each_thread(g, p); |
| @@ -6740,3 +6760,201 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
| 6740 | } | 6760 | } |
| 6741 | 6761 | ||
| 6742 | #endif | 6762 | #endif |
| 6763 | |||
| 6764 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 6765 | |||
| 6766 | /* allocate runqueue etc for a new task group */ | ||
| 6767 | struct task_group *sched_create_group(void) | ||
| 6768 | { | ||
| 6769 | struct task_group *tg; | ||
| 6770 | struct cfs_rq *cfs_rq; | ||
| 6771 | struct sched_entity *se; | ||
| 6772 | struct rq *rq; | ||
| 6773 | int i; | ||
| 6774 | |||
| 6775 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | ||
| 6776 | if (!tg) | ||
| 6777 | return ERR_PTR(-ENOMEM); | ||
| 6778 | |||
| 6779 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); | ||
| 6780 | if (!tg->cfs_rq) | ||
| 6781 | goto err; | ||
| 6782 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | ||
| 6783 | if (!tg->se) | ||
| 6784 | goto err; | ||
| 6785 | |||
| 6786 | for_each_possible_cpu(i) { | ||
| 6787 | rq = cpu_rq(i); | ||
| 6788 | |||
| 6789 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | ||
| 6790 | cpu_to_node(i)); | ||
| 6791 | if (!cfs_rq) | ||
| 6792 | goto err; | ||
| 6793 | |||
| 6794 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | ||
| 6795 | cpu_to_node(i)); | ||
| 6796 | if (!se) | ||
| 6797 | goto err; | ||
| 6798 | |||
| 6799 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | ||
| 6800 | memset(se, 0, sizeof(struct sched_entity)); | ||
| 6801 | |||
| 6802 | tg->cfs_rq[i] = cfs_rq; | ||
| 6803 | init_cfs_rq(cfs_rq, rq); | ||
| 6804 | cfs_rq->tg = tg; | ||
| 6805 | |||
| 6806 | tg->se[i] = se; | ||
| 6807 | se->cfs_rq = &rq->cfs; | ||
| 6808 | se->my_q = cfs_rq; | ||
| 6809 | se->load.weight = NICE_0_LOAD; | ||
| 6810 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
| 6811 | se->parent = NULL; | ||
| 6812 | } | ||
| 6813 | |||
| 6814 | for_each_possible_cpu(i) { | ||
| 6815 | rq = cpu_rq(i); | ||
| 6816 | cfs_rq = tg->cfs_rq[i]; | ||
| 6817 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
| 6818 | } | ||
| 6819 | |||
| 6820 | tg->shares = NICE_0_LOAD; | ||
| 6821 | spin_lock_init(&tg->lock); | ||
| 6822 | |||
| 6823 | return tg; | ||
| 6824 | |||
| 6825 | err: | ||
| 6826 | for_each_possible_cpu(i) { | ||
| 6827 | if (tg->cfs_rq) | ||
| 6828 | kfree(tg->cfs_rq[i]); | ||
| 6829 | if (tg->se) | ||
| 6830 | kfree(tg->se[i]); | ||
| 6831 | } | ||
| 6832 | kfree(tg->cfs_rq); | ||
| 6833 | kfree(tg->se); | ||
| 6834 | kfree(tg); | ||
| 6835 | |||
| 6836 | return ERR_PTR(-ENOMEM); | ||
| 6837 | } | ||
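sched_create_group() above allocates per-CPU cfs_rq and sched_entity objects and, on any failure, jumps to err: where it frees whatever subset was already allocated before returning ERR_PTR(-ENOMEM). The same "free only what you managed to allocate" shape in plain C, with NR hypothetical slots standing in for CPUs:

#include <stdio.h>
#include <stdlib.h>

#define NR 4

struct group {
        int **slot;
};

static struct group *create_group(void)
{
        struct group *g;
        int i;

        g = calloc(1, sizeof(*g));
        if (!g)
                return NULL;

        g->slot = calloc(NR, sizeof(*g->slot));
        if (!g->slot)
                goto err;

        for (i = 0; i < NR; i++) {
                g->slot[i] = malloc(sizeof(int));
                if (!g->slot[i])
                        goto err;
                *g->slot[i] = i;
        }
        return g;

err:
        /* free only what was actually allocated, as in the err: path above */
        if (g->slot)
                for (i = 0; i < NR; i++)
                        free(g->slot[i]);
        free(g->slot);
        free(g);
        return NULL;
}

int main(void)
{
        struct group *g = create_group();

        printf("group %s\n", g ? "created" : "failed");
        return 0;
}

Because the outer arrays are zero-initialised, freeing every slot is safe even when the loop bailed out early, which is the same property the kernel err: path relies on.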
| 6838 | |||
| 6839 | /* rcu callback to free various structures associated with a task group */ | ||
| 6840 | static void free_sched_group(struct rcu_head *rhp) | ||
| 6841 | { | ||
| 6842 | struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); | ||
| 6843 | struct task_group *tg = cfs_rq->tg; | ||
| 6844 | struct sched_entity *se; | ||
| 6845 | int i; | ||
| 6846 | |||
| 6847 | /* now it should be safe to free those cfs_rqs */ | ||
| 6848 | for_each_possible_cpu(i) { | ||
| 6849 | cfs_rq = tg->cfs_rq[i]; | ||
| 6850 | kfree(cfs_rq); | ||
| 6851 | |||
| 6852 | se = tg->se[i]; | ||
| 6853 | kfree(se); | ||
| 6854 | } | ||
| 6855 | |||
| 6856 | kfree(tg->cfs_rq); | ||
| 6857 | kfree(tg->se); | ||
| 6858 | kfree(tg); | ||
| 6859 | } | ||
| 6860 | |||
| 6861 | /* Destroy runqueue etc associated with a task group */ | ||
| 6862 | void sched_destroy_group(struct task_group *tg) | ||
| 6863 | { | ||
| 6864 | struct cfs_rq *cfs_rq; | ||
| 6865 | int i; | ||
| 6866 | |||
| 6867 | for_each_possible_cpu(i) { | ||
| 6868 | cfs_rq = tg->cfs_rq[i]; | ||
| 6869 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
| 6870 | } | ||
| 6871 | |||
| 6872 | cfs_rq = tg->cfs_rq[0]; | ||
| 6873 | |||
| 6874 | /* wait for possible concurrent references to cfs_rqs complete */ | ||
| 6875 | call_rcu(&cfs_rq->rcu, free_sched_group); | ||
| 6876 | } | ||
| 6877 | |||
| 6878 | /* change task's runqueue when it moves between groups. | ||
| 6879 | * The caller of this function should have put the task in its new group | ||
| 6880 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to | ||
| 6881 | * reflect its new group. | ||
| 6882 | */ | ||
| 6883 | void sched_move_task(struct task_struct *tsk) | ||
| 6884 | { | ||
| 6885 | int on_rq, running; | ||
| 6886 | unsigned long flags; | ||
| 6887 | struct rq *rq; | ||
| 6888 | |||
| 6889 | rq = task_rq_lock(tsk, &flags); | ||
| 6890 | |||
| 6891 | if (tsk->sched_class != &fair_sched_class) | ||
| 6892 | goto done; | ||
| 6893 | |||
| 6894 | update_rq_clock(rq); | ||
| 6895 | |||
| 6896 | running = task_running(rq, tsk); | ||
| 6897 | on_rq = tsk->se.on_rq; | ||
| 6898 | |||
| 6899 | if (on_rq) { | ||
| 6900 | dequeue_task(rq, tsk, 0); | ||
| 6901 | if (unlikely(running)) | ||
| 6902 | tsk->sched_class->put_prev_task(rq, tsk); | ||
| 6903 | } | ||
| 6904 | |||
| 6905 | set_task_cfs_rq(tsk); | ||
| 6906 | |||
| 6907 | if (on_rq) { | ||
| 6908 | if (unlikely(running)) | ||
| 6909 | tsk->sched_class->set_curr_task(rq); | ||
| 6910 | enqueue_task(rq, tsk, 0); | ||
| 6911 | } | ||
| 6912 | |||
| 6913 | done: | ||
| 6914 | task_rq_unlock(rq, &flags); | ||
| 6915 | } | ||
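sched_move_task() here, like the rt_mutex_setprio() and sched_setscheduler() hunks earlier, follows one protocol: if the task is queued, dequeue it and, if it is also the one currently running, tell its class via put_prev_task(); apply the change; then set_curr_task() and enqueue again. A schematic userspace sketch of that bracket, with stub functions standing in for the scheduler-class hooks (none of these are the kernel symbols):

#include <stdbool.h>
#include <stdio.h>

struct task {
        bool on_rq;
        bool running;
        int group;
};

static void dequeue(struct task *t)  { printf("dequeue\n"); }
static void enqueue(struct task *t)  { printf("enqueue\n"); }
static void put_prev(struct task *t) { printf("put_prev_task\n"); }
static void set_curr(struct task *t) { printf("set_curr_task\n"); }

/* move 't' to a new group using the same bracket as sched_move_task() */
static void move_task(struct task *t, int new_group)
{
        bool on_rq = t->on_rq, running = t->running;

        if (on_rq) {
                dequeue(t);
                if (running)
                        put_prev(t);    /* stop accounting it as current */
        }

        t->group = new_group;            /* the actual change */

        if (on_rq) {
                if (running)
                        set_curr(t);     /* re-establish it as current */
                enqueue(t);
        }
}

int main(void)
{
        struct task t = { .on_rq = true, .running = true, .group = 0 };

        move_task(&t, 1);
        printf("now in group %d\n", t.group);
        return 0;
}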
| 6916 | |||
| 6917 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
| 6918 | { | ||
| 6919 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
| 6920 | struct rq *rq = cfs_rq->rq; | ||
| 6921 | int on_rq; | ||
| 6922 | |||
| 6923 | spin_lock_irq(&rq->lock); | ||
| 6924 | |||
| 6925 | on_rq = se->on_rq; | ||
| 6926 | if (on_rq) | ||
| 6927 | dequeue_entity(cfs_rq, se, 0); | ||
| 6928 | |||
| 6929 | se->load.weight = shares; | ||
| 6930 | se->load.inv_weight = div64_64((1ULL<<32), shares); | ||
| 6931 | |||
| 6932 | if (on_rq) | ||
| 6933 | enqueue_entity(cfs_rq, se, 0); | ||
| 6934 | |||
| 6935 | spin_unlock_irq(&rq->lock); | ||
| 6936 | } | ||
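set_se_shares() stores both the new weight and a precomputed reciprocal, inv_weight = 2^32 / weight, so later divisions by the weight can be done as a multiply followed by a 32-bit shift. A small worked example of that fixed-point trick; plain 64-bit division stands in here for the kernel's div64_64() helper:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t weight = 1024;                  /* NICE_0_LOAD-like value */
        uint64_t inv_weight = (1ULL << 32) / weight;
        uint64_t delta = 3000000;                /* some quantity to scale */

        /* delta / weight computed as (delta * inv_weight) >> 32 */
        uint64_t by_division = delta / weight;
        uint64_t by_reciprocal = (delta * inv_weight) >> 32;

        printf("division:   %" PRIu64 "\n", by_division);
        printf("reciprocal: %" PRIu64 "\n", by_reciprocal);
        return 0;
}

Both lines print 2929 for these inputs; the reciprocal form avoids a 64-bit divide on every weighting operation at the cost of one divide when the shares change.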
| 6937 | |||
| 6938 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
| 6939 | { | ||
| 6940 | int i; | ||
| 6941 | |||
| 6942 | spin_lock(&tg->lock); | ||
| 6943 | if (tg->shares == shares) | ||
| 6944 | goto done; | ||
| 6945 | |||
| 6946 | tg->shares = shares; | ||
| 6947 | for_each_possible_cpu(i) | ||
| 6948 | set_se_shares(tg->se[i], shares); | ||
| 6949 | |||
| 6950 | done: | ||
| 6951 | spin_unlock(&tg->lock); | ||
| 6952 | return 0; | ||
| 6953 | } | ||
| 6954 | |||
| 6955 | unsigned long sched_group_shares(struct task_group *tg) | ||
| 6956 | { | ||
| 6957 | return tg->shares; | ||
| 6958 | } | ||
| 6959 | |||
| 6960 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
