Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--	kernel/sched/fair.c	392
1 file changed, 175 insertions, 217 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd974faf467d..94340c7544a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-	cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se)) {
-		add_cfs_task_weight(cfs_rq, se->load.weight);
-		list_add(&se->group_node, &cfs_rq->tasks);
-	}
+#ifdef CONFIG_SMP
+	if (entity_is_task(se))
+		list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+#endif
 	cfs_rq->nr_running++;
 }
 
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se)) {
-		add_cfs_task_weight(cfs_rq, -se->load.weight);
+	if (entity_is_task(se))
 		list_del_init(&se->group_node);
-	}
 	cfs_rq->nr_running--;
 }
 
@@ -2672,8 +2657,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	/*
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
-	rcu_read_lock();
-
 	sd = rcu_dereference(per_cpu(sd_llc, target));
 	for_each_lower_domain(sd) {
 		sg = sd->groups;
@@ -2695,8 +2678,6 @@ next:
 	} while (sg != sd->groups);
 }
 done:
-	rcu_read_unlock();
-
 	return target;
 }
 
@@ -2922,7 +2903,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/*
-	 * This is possible from callers such as pull_task(), in which we
+	 * This is possible from callers such as move_task(), in which we
 	 * unconditionally check_prempt_curr() after an enqueue (which may have
 	 * lead to a throttle). This both saves work and prevents false
 	 * next-buddy nomination below.
@@ -3086,17 +3067,39 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  * Fair scheduling class load-balancing methods:
  */
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+#define LBF_ALL_PINNED	0x01
+#define LBF_NEED_BREAK	0x02
+
+struct lb_env {
+	struct sched_domain	*sd;
+
+	int			src_cpu;
+	struct rq		*src_rq;
+
+	int			dst_cpu;
+	struct rq		*dst_rq;
+
+	enum cpu_idle_type	idle;
+	long			load_move;
+	unsigned int		flags;
+
+	unsigned int		loop;
+	unsigned int		loop_break;
+	unsigned int		loop_max;
+};
+
 /*
- * pull_task - move a task from a remote runqueue to the local runqueue.
+ * move_task - move a task from one runqueue to another runqueue.
  * Both runqueues must be locked.
  */
-static void pull_task(struct rq *src_rq, struct task_struct *p,
-		      struct rq *this_rq, int this_cpu)
+static void move_task(struct task_struct *p, struct lb_env *env)
 {
-	deactivate_task(src_rq, p, 0);
-	set_task_cpu(p, this_cpu);
-	activate_task(this_rq, p, 0);
-	check_preempt_curr(this_rq, p, 0);
+	deactivate_task(env->src_rq, p, 0);
+	set_task_cpu(p, env->dst_cpu);
+	activate_task(env->dst_rq, p, 0);
+	check_preempt_curr(env->dst_rq, p, 0);
 }
 
 /*
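The hunk above replaces pull_task()'s four separate arguments with a single struct lb_env that carries the balancing state (source and destination runqueues, idle type, loop counters, flags). A minimal stand-alone sketch of that refactoring pattern, with toy types and invented names rather than the kernel's own:

/*
 * Toy illustration (not kernel code) of the refactor in the hunk above:
 * the source/destination runqueues and related state move into a single
 * "env" struct that every balancing helper receives.  All names below
 * are invented for the example.
 */
#include <stdio.h>

struct toy_rq {
	int cpu;
	int nr_running;
};

struct toy_lb_env {
	struct toy_rq	*src_rq;
	struct toy_rq	*dst_rq;
	int		dst_cpu;
	unsigned int	flags;
};

/* Before: move(src_rq, p, dst_rq, dst_cpu);  after: move(p, &env). */
static void toy_move_task(int task_id, struct toy_lb_env *env)
{
	env->src_rq->nr_running--;
	env->dst_rq->nr_running++;
	printf("task %d: cpu%d -> cpu%d\n",
	       task_id, env->src_rq->cpu, env->dst_cpu);
}

int main(void)
{
	struct toy_rq src = { .cpu = 0, .nr_running = 3 };
	struct toy_rq dst = { .cpu = 1, .nr_running = 1 };
	struct toy_lb_env env = {
		.src_rq		= &src,
		.dst_rq		= &dst,
		.dst_cpu	= dst.cpu,
	};

	toy_move_task(42, &env);
	return 0;
}

Bundling the state this way lets later hunks add fields (load_move, loop, loop_break, loop_max) without touching every caller's argument list.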
@@ -3131,19 +3134,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	return delta < (s64)sysctl_sched_migration_cost;
 }
 
-#define LBF_ALL_PINNED	0x01
-#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */
-#define LBF_HAD_BREAK	0x04
-#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
-#define LBF_ABORT	0x10
-
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
 static
-int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
-		     struct sched_domain *sd, enum cpu_idle_type idle,
-		     int *lb_flags)
+int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot = 0;
 	/*
@@ -3152,13 +3147,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
+	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 		return 0;
 	}
-	*lb_flags &= ~LBF_ALL_PINNED;
+	env->flags &= ~LBF_ALL_PINNED;
 
-	if (task_running(rq, p)) {
+	if (task_running(env->src_rq, p)) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
 		return 0;
 	}
@@ -3169,12 +3164,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	tsk_cache_hot = task_hot(p, rq->clock_task, sd);
+	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
 	if (!tsk_cache_hot ||
-		sd->nr_balance_failed > sd->cache_nice_tries) {
+		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
 		if (tsk_cache_hot) {
-			schedstat_inc(sd, lb_hot_gained[idle]);
+			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
 			schedstat_inc(p, se.statistics.nr_forced_migrations);
 		}
 #endif
@@ -3195,65 +3190,80 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
  *
  * Called with both runqueues locked.
  */
-static int
-move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-	      struct sched_domain *sd, enum cpu_idle_type idle)
+static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
-	struct cfs_rq *cfs_rq;
-	int pinned = 0;
 
-	for_each_leaf_cfs_rq(busiest, cfs_rq) {
-		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
-			if (throttled_lb_pair(task_group(p),
-					      busiest->cpu, this_cpu))
-				break;
+	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
+			continue;
 
-			if (!can_migrate_task(p, busiest, this_cpu,
-					      sd, idle, &pinned))
-				continue;
+		if (!can_migrate_task(p, env))
+			continue;
 
-			pull_task(busiest, p, this_rq, this_cpu);
-			/*
-			 * Right now, this is only the second place pull_task()
-			 * is called, so we can safely collect pull_task()
-			 * stats here rather than inside pull_task().
-			 */
-			schedstat_inc(sd, lb_gained[idle]);
-			return 1;
-		}
+		move_task(p, env);
+		/*
+		 * Right now, this is only the second place move_task()
+		 * is called, so we can safely collect move_task()
+		 * stats here rather than inside move_task().
+		 */
+		schedstat_inc(env->sd, lb_gained[env->idle]);
+		return 1;
 	}
-
 	return 0;
 }
 
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-	      unsigned long max_load_move, struct sched_domain *sd,
-	      enum cpu_idle_type idle, int *lb_flags,
-	      struct cfs_rq *busiest_cfs_rq)
+static unsigned long task_h_load(struct task_struct *p);
+
+/*
+ * move_tasks tries to move up to load_move weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct lb_env *env)
 {
-	int loops = 0, pulled = 0;
-	long rem_load_move = max_load_move;
-	struct task_struct *p, *n;
+	struct list_head *tasks = &env->src_rq->cfs_tasks;
+	struct task_struct *p;
+	unsigned long load;
+	int pulled = 0;
+
+	if (env->load_move <= 0)
+		return 0;
 
-	if (max_load_move == 0)
-		goto out;
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
 
-	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
-		if (loops++ > sysctl_sched_nr_migrate) {
-			*lb_flags |= LBF_NEED_BREAK;
+		env->loop++;
+		/* We've more or less seen every task there is, call it quits */
+		if (env->loop > env->loop_max)
+			break;
+
+		/* take a breather every nr_migrate tasks */
+		if (env->loop > env->loop_break) {
+			env->loop_break += sysctl_sched_nr_migrate;
+			env->flags |= LBF_NEED_BREAK;
 			break;
 		}
 
-		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, busiest, this_cpu, sd, idle,
-				      lb_flags))
-			continue;
+		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+			goto next;
+
+		load = task_h_load(p);
+
+		if (load < 16 && !env->sd->nr_balance_failed)
+			goto next;
+
+		if ((load / 2) > env->load_move)
+			goto next;
 
-		pull_task(busiest, p, this_rq, this_cpu);
+		if (!can_migrate_task(p, env))
+			goto next;
+
+		move_task(p, env);
 		pulled++;
-		rem_load_move -= p->se.load.weight;
+		env->load_move -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -3261,28 +3271,30 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
-		if (idle == CPU_NEWLY_IDLE) {
-			*lb_flags |= LBF_ABORT;
+		if (env->idle == CPU_NEWLY_IDLE)
 			break;
-		}
 #endif
 
 		/*
 		 * We only want to steal up to the prescribed amount of
 		 * weighted load.
 		 */
-		if (rem_load_move <= 0)
+		if (env->load_move <= 0)
 			break;
+
+		continue;
next:
+		list_move_tail(&p->se.group_node, tasks);
 	}
-out:
+
 	/*
-	 * Right now, this is one of only two places pull_task() is called,
-	 * so we can safely collect pull_task() stats here rather than
-	 * inside pull_task().
+	 * Right now, this is one of only two places move_task() is called,
+	 * so we can safely collect move_task() stats here rather than
+	 * inside move_task().
 	 */
-	schedstat_add(sd, lb_gained[idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], pulled);
 
-	return max_load_move - rem_load_move;
+	return pulled;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
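The rewritten move_tasks() above scans one per-rq cfs_tasks list, prices each task with task_h_load(), and bounds the scan with loop/loop_break/loop_max instead of the old LBF_HAD_BREAK/LBF_ABORT accounting; tasks it skips are rotated to the tail of the list at the next: label so a later pass resumes with tasks it has not examined yet. A compilable toy model of that bounded scan, with all names invented for illustration:

/*
 * Toy model of the bounded scan in the new move_tasks() (names invented):
 * walk a list of task loads, give up after loop_max entries, pause and
 * set a NEED_BREAK flag after loop_break entries, and skip tasks whose
 * half-load already exceeds the remaining imbalance.
 */
#include <stdio.h>

#define TOY_NEED_BREAK	0x02
#define NTASKS		8

struct toy_env {
	unsigned int	loop, loop_break, loop_max, flags;
	long		load_move;	/* remaining imbalance to pull */
};

static long toy_pull(struct toy_env *env, const long *load)
{
	long pulled = 0;
	unsigned int i = 0;

	while (i < NTASKS && env->load_move > 0) {
		env->loop++;
		if (env->loop > env->loop_max)		/* seen enough, quit */
			break;
		if (env->loop > env->loop_break) {	/* take a breather */
			env->loop_break += 4;
			env->flags |= TOY_NEED_BREAK;
			break;
		}
		if (load[i] / 2 > env->load_move) {	/* too heavy, skip */
			i++;
			continue;
		}
		env->load_move -= load[i++];
		pulled++;
	}
	return pulled;
}

int main(void)
{
	long load[NTASKS] = { 10, 40, 5, 30, 20, 8, 60, 2 };
	struct toy_env env = {
		.loop_break	= 4,
		.loop_max	= NTASKS,
		.load_move	= 50,
	};

	printf("pulled %ld tasks, NEED_BREAK=%d\n",
	       toy_pull(&env, load), !!(env.flags & TOY_NEED_BREAK));
	return 0;
}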
@@ -3362,113 +3374,35 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_h_load(long cpu)
 {
+	rcu_read_lock();
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+	rcu_read_unlock();
 }
 
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_load_move,
-		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *lb_flags)
+static unsigned long task_h_load(struct task_struct *p)
 {
-	long rem_load_move = max_load_move;
-	struct cfs_rq *busiest_cfs_rq;
-
-	rcu_read_lock();
-	update_h_load(cpu_of(busiest));
-
-	for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
-		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
-		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
-		u64 rem_load, moved_load;
-
-		if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
-			break;
-
-		/*
-		 * empty group or part of a throttled hierarchy
-		 */
-		if (!busiest_cfs_rq->task_weight ||
-		    throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
-			continue;
-
-		rem_load = (u64)rem_load_move * busiest_weight;
-		rem_load = div_u64(rem_load, busiest_h_load + 1);
-
-		moved_load = balance_tasks(this_rq, this_cpu, busiest,
-				rem_load, sd, idle, lb_flags,
-				busiest_cfs_rq);
-
-		if (!moved_load)
-			continue;
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	unsigned long load;
 
-		moved_load *= busiest_h_load;
-		moved_load = div_u64(moved_load, busiest_weight + 1);
+	load = p->se.load.weight;
+	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
 
-		rem_load_move -= moved_load;
-		if (rem_load_move < 0)
-			break;
-	}
-	rcu_read_unlock();
-
-	return max_load_move - rem_load_move;
+	return load;
 }
 #else
 static inline void update_shares(int cpu)
 {
 }
 
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_load_move,
-		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *lb_flags)
+static inline void update_h_load(long cpu)
 {
-	return balance_tasks(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, lb_flags,
-			&busiest->cfs);
 }
-#endif
 
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		      unsigned long max_load_move,
-		      struct sched_domain *sd, enum cpu_idle_type idle,
-		      int *lb_flags)
+static unsigned long task_h_load(struct task_struct *p)
 {
-	unsigned long total_load_moved = 0, load_moved;
-
-	do {
-		load_moved = load_balance_fair(this_rq, this_cpu, busiest,
-				max_load_move - total_load_moved,
-				sd, idle, lb_flags);
-
-		total_load_moved += load_moved;
-
-		if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
-			break;
-
-#ifdef CONFIG_PREEMPT
-		/*
-		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
-		 * the critical section.
-		 */
-		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
-			*lb_flags |= LBF_ABORT;
-			break;
-		}
-#endif
-	} while (load_moved && max_load_move > total_load_moved);
-
-	return total_load_moved > 0;
+	return p->se.load.weight;
 }
+#endif
 
 /********** Helpers for find_busiest_group ************************/
 /*
@@ -3778,6 +3712,11 @@ void update_group_power(struct sched_domain *sd, int cpu)
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
 	unsigned long power;
+	unsigned long interval;
+
+	interval = msecs_to_jiffies(sd->balance_interval);
+	interval = clamp(interval, 1UL, max_load_balance_interval);
+	sdg->sgp->next_update = jiffies + interval;
 
 	if (!child) {
 		update_cpu_power(sd, cpu);
@@ -3885,12 +3824,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 * domains. In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
-	if (idle != CPU_NEWLY_IDLE && local_group) {
-		if (balance_cpu != this_cpu) {
-			*balance = 0;
-			return;
-		}
-		update_group_power(sd, this_cpu);
+	if (local_group) {
+		if (idle != CPU_NEWLY_IDLE) {
+			if (balance_cpu != this_cpu) {
+				*balance = 0;
+				return;
+			}
+			update_group_power(sd, this_cpu);
+		} else if (time_after_eq(jiffies, group->sgp->next_update))
+			update_group_power(sd, this_cpu);
 	}
 
 	/* Adjust by relative CPU power of the group */
@@ -4453,13 +4395,21 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, lb_flags = 0, active_balance = 0;
+	int ld_moved, active_balance = 0;
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
+	struct lb_env env = {
+		.sd		= sd,
+		.dst_cpu	= this_cpu,
+		.dst_rq		= this_rq,
+		.idle		= idle,
+		.loop_break	= sysctl_sched_nr_migrate,
+	};
+
 	cpumask_copy(cpus, cpu_active_mask);
 
 	schedstat_inc(sd, lb_count[idle]);
@@ -4494,32 +4444,34 @@ redo:
 	 * still unbalanced. ld_moved simply stays zero, so it is
 	 * correctly treated as an imbalance.
 	 */
-	lb_flags |= LBF_ALL_PINNED;
+	env.flags |= LBF_ALL_PINNED;
+	env.load_move	= imbalance;
+	env.src_cpu	= busiest->cpu;
+	env.src_rq	= busiest;
+	env.loop_max	= busiest->nr_running;
+
+more_balance:
 	local_irq_save(flags);
 	double_rq_lock(this_rq, busiest);
-	ld_moved = move_tasks(this_rq, this_cpu, busiest,
-			      imbalance, sd, idle, &lb_flags);
+	if (!env.loop)
+		update_h_load(env.src_cpu);
+	ld_moved += move_tasks(&env);
 	double_rq_unlock(this_rq, busiest);
 	local_irq_restore(flags);
 
+	if (env.flags & LBF_NEED_BREAK) {
+		env.flags &= ~LBF_NEED_BREAK;
+		goto more_balance;
+	}
+
 	/*
 	 * some other cpu did the load balance for us.
 	 */
 	if (ld_moved && this_cpu != smp_processor_id())
 		resched_cpu(this_cpu);
 
-	if (lb_flags & LBF_ABORT)
-		goto out_balanced;
-
-	if (lb_flags & LBF_NEED_BREAK) {
-		lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
-		if (lb_flags & LBF_ABORT)
-			goto out_balanced;
-		goto redo;
-	}
-
 	/* All tasks on this runqueue were pinned by CPU affinity */
-	if (unlikely(lb_flags & LBF_ALL_PINNED)) {
+	if (unlikely(env.flags & LBF_ALL_PINNED)) {
 		cpumask_clear_cpu(cpu_of(busiest), cpus);
 		if (!cpumask_empty(cpus))
 			goto redo;
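In the load_balance() hunk above, the old redo/LBF_HAD_BREAK bookkeeping becomes a plain retry: move_tasks() raises LBF_NEED_BREAK when its loop budget runs out, and the caller clears the flag and jumps back to more_balance, accumulating ld_moved across passes. A simplified sketch of that control flow (invented names, not the kernel code):

/*
 * Shape of the retry loop added to load_balance() (simplified, invented
 * names): the mover raises NEED_BREAK when its per-pass budget is spent,
 * and the caller clears the flag and jumps back, accumulating the total.
 */
#include <stdio.h>

#define TOY_NEED_BREAK 0x02

struct toy_env {
	unsigned int flags;
	int passes_left;	/* stand-in for the loop/loop_break budget */
};

/* Pretends to move one task per pass until the budget is exhausted. */
static int toy_move_tasks(struct toy_env *env)
{
	if (--env->passes_left > 0) {
		env->flags |= TOY_NEED_BREAK;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct toy_env env = { .passes_left = 3 };
	int ld_moved = 0;

more_balance:
	ld_moved += toy_move_tasks(&env);
	if (env.flags & TOY_NEED_BREAK) {
		env.flags &= ~TOY_NEED_BREAK;
		goto more_balance;
	}

	printf("moved %d tasks over several passes\n", ld_moved);
	return 0;
}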
@@ -4549,7 +4501,7 @@ redo:
 					tsk_cpus_allowed(busiest->curr))) {
 			raw_spin_unlock_irqrestore(&busiest->lock,
 						    flags);
-			lb_flags |= LBF_ALL_PINNED;
+			env.flags |= LBF_ALL_PINNED;
 			goto out_one_pinned;
 		}
 
@@ -4602,7 +4554,7 @@ out_balanced:
 
 out_one_pinned:
 	/* tune up the balancing interval */
-	if (((lb_flags & LBF_ALL_PINNED) &&
+	if (((env.flags & LBF_ALL_PINNED) &&
 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
@@ -4712,10 +4664,18 @@ static int active_load_balance_cpu_stop(void *data)
 	}
 
 	if (likely(sd)) {
+		struct lb_env env = {
+			.sd		= sd,
+			.dst_cpu	= target_cpu,
+			.dst_rq		= target_rq,
+			.src_cpu	= busiest_rq->cpu,
+			.src_rq		= busiest_rq,
+			.idle		= CPU_IDLE,
+		};
+
 		schedstat_inc(sd, alb_count);
 
-		if (move_one_task(target_rq, target_cpu, busiest_rq,
-				  sd, CPU_IDLE))
+		if (move_one_task(&env))
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
@@ -4947,8 +4907,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
 
 static DEFINE_SPINLOCK(balancing);
 
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
 /*
  * Scale the max load_balance interval with the number of CPUs in the system.
  * This trades load-balance latency on larger machines for less cross talk.
@@ -5342,7 +5300,6 @@ static void set_curr_task_fair(struct rq *rq)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
-	INIT_LIST_HEAD(&cfs_rq->tasks);
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -5614,6 +5571,7 @@ __init void init_sched_fair_class(void)
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
 #ifdef CONFIG_NO_HZ
+	nohz.next_balance = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 	cpu_notifier(sched_ilb_notifier, 0);
 #endif