aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched_fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--kernel/sched_fair.c329
1 files changed, 198 insertions, 131 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 22b8b4f2b616..6ee2e0af665b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3091,13 +3091,40 @@ out_unlock:
3091} 3091}
3092 3092
3093#ifdef CONFIG_NO_HZ 3093#ifdef CONFIG_NO_HZ
3094
3095static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3096
3097static void trigger_sched_softirq(void *data)
3098{
3099 raise_softirq_irqoff(SCHED_SOFTIRQ);
3100}
3101
3102static inline void init_sched_softirq_csd(struct call_single_data *csd)
3103{
3104 csd->func = trigger_sched_softirq;
3105 csd->info = NULL;
3106 csd->flags = 0;
3107 csd->priv = 0;
3108}
3109
3110/*
3111 * idle load balancing details
3112 * - One of the idle CPUs nominates itself as idle load_balancer, while
3113 * entering idle.
3114 * - This idle load balancer CPU will also go into tickless mode when
3115 * it is idle, just like all other idle CPUs
3116 * - When one of the busy CPUs notices that idle rebalancing may be
3117 * needed, it will kick the idle load balancer, which then does idle
3118 * load balancing for all the idle CPUs.
3119 */
3094static struct { 3120static struct {
3095 atomic_t load_balancer; 3121 atomic_t load_balancer;
3096 cpumask_var_t cpu_mask; 3122 atomic_t first_pick_cpu;
3097 cpumask_var_t ilb_grp_nohz_mask; 3123 atomic_t second_pick_cpu;
3098} nohz ____cacheline_aligned = { 3124 cpumask_var_t idle_cpus_mask;
3099 .load_balancer = ATOMIC_INIT(-1), 3125 cpumask_var_t grp_idle_mask;
3100}; 3126 unsigned long next_balance; /* in jiffy units */
3127} nohz ____cacheline_aligned;
3101 3128
3102int get_nohz_load_balancer(void) 3129int get_nohz_load_balancer(void)
3103{ 3130{
@@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3151 */ 3178 */
3152static inline int is_semi_idle_group(struct sched_group *ilb_group) 3179static inline int is_semi_idle_group(struct sched_group *ilb_group)
3153{ 3180{
3154 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 3181 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3155 sched_group_cpus(ilb_group)); 3182 sched_group_cpus(ilb_group));
3156 3183
3157 /* 3184 /*
3158 * A sched_group is semi-idle when it has atleast one busy cpu 3185 * A sched_group is semi-idle when it has atleast one busy cpu
3159 * and atleast one idle cpu. 3186 * and atleast one idle cpu.
3160 */ 3187 */
3161 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 3188 if (cpumask_empty(nohz.grp_idle_mask))
3162 return 0; 3189 return 0;
3163 3190
3164 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 3191 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3165 return 0; 3192 return 0;
3166 3193
3167 return 1; 3194 return 1;
@@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu)
3194 * Optimize for the case when we have no idle CPUs or only one 3221 * Optimize for the case when we have no idle CPUs or only one
3195 * idle CPU. Don't walk the sched_domain hierarchy in such cases 3222 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3196 */ 3223 */
3197 if (cpumask_weight(nohz.cpu_mask) < 2) 3224 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3198 goto out_done; 3225 goto out_done;
3199 3226
3200 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3227 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu)
3202 3229
3203 do { 3230 do {
3204 if (is_semi_idle_group(ilb_group)) 3231 if (is_semi_idle_group(ilb_group))
3205 return cpumask_first(nohz.ilb_grp_nohz_mask); 3232 return cpumask_first(nohz.grp_idle_mask);
3206 3233
3207 ilb_group = ilb_group->next; 3234 ilb_group = ilb_group->next;
3208 3235
@@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu)
3210 } 3237 }
3211 3238
3212out_done: 3239out_done:
3213 return cpumask_first(nohz.cpu_mask); 3240 return nr_cpu_ids;
3214} 3241}
3215#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3242#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3216static inline int find_new_ilb(int call_cpu) 3243static inline int find_new_ilb(int call_cpu)
3217{ 3244{
3218 return cpumask_first(nohz.cpu_mask); 3245 return nr_cpu_ids;
3219} 3246}
3220#endif 3247#endif
3221 3248
3222/* 3249/*
3250 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
3251 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
3252 * CPU (if there is one).
3253 */
3254static void nohz_balancer_kick(int cpu)
3255{
3256 int ilb_cpu;
3257
3258 nohz.next_balance++;
3259
3260 ilb_cpu = get_nohz_load_balancer();
3261
3262 if (ilb_cpu >= nr_cpu_ids) {
3263 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3264 if (ilb_cpu >= nr_cpu_ids)
3265 return;
3266 }
3267
3268 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3269 struct call_single_data *cp;
3270
3271 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3272 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3273 __smp_call_function_single(ilb_cpu, cp, 0);
3274 }
3275 return;
3276}
3277
3278/*
3223 * This routine will try to nominate the ilb (idle load balancing) 3279 * This routine will try to nominate the ilb (idle load balancing)
3224 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 3280 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3225 * load balancing on behalf of all those cpus. If all the cpus in the system 3281 * load balancing on behalf of all those cpus.
3226 * go into this tickless mode, then there will be no ilb owner (as there is
3227 * no need for one) and all the cpus will sleep till the next wakeup event
3228 * arrives...
3229 *
3230 * For the ilb owner, tick is not stopped. And this tick will be used
3231 * for idle load balancing. ilb owner will still be part of
3232 * nohz.cpu_mask..
3233 * 3282 *
3234 * While stopping the tick, this cpu will become the ilb owner if there 3283 * When the ilb owner becomes busy, we will not have new ilb owner until some
3235 * is no other owner. And will be the owner till that cpu becomes busy 3284 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3236 * or if all cpus in the system stop their ticks at which point 3285 * idle load balancing by kicking one of the idle CPUs.
3237 * there is no need for ilb owner.
3238 * 3286 *
3239 * When the ilb owner becomes busy, it nominates another owner, during the 3287 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
3240 * next busy scheduler_tick() 3288 * ilb owner CPU in future (when there is a need for idle load balancing on
3289 * behalf of all idle CPUs).
3241 */ 3290 */
3242int select_nohz_load_balancer(int stop_tick) 3291void select_nohz_load_balancer(int stop_tick)
3243{ 3292{
3244 int cpu = smp_processor_id(); 3293 int cpu = smp_processor_id();
3245 3294
3246 if (stop_tick) { 3295 if (stop_tick) {
3247 cpu_rq(cpu)->in_nohz_recently = 1;
3248
3249 if (!cpu_active(cpu)) { 3296 if (!cpu_active(cpu)) {
3250 if (atomic_read(&nohz.load_balancer) != cpu) 3297 if (atomic_read(&nohz.load_balancer) != cpu)
3251 return 0; 3298 return;
3252 3299
3253 /* 3300 /*
3254 * If we are going offline and still the leader, 3301 * If we are going offline and still the leader,
3255 * give up! 3302 * give up!
3256 */ 3303 */
3257 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3304 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3305 nr_cpu_ids) != cpu)
3258 BUG(); 3306 BUG();
3259 3307
3260 return 0; 3308 return;
3261 } 3309 }
3262 3310
3263 cpumask_set_cpu(cpu, nohz.cpu_mask); 3311 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3264 3312
3265 /* time for ilb owner also to sleep */ 3313 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3266 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { 3314 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3267 if (atomic_read(&nohz.load_balancer) == cpu) 3315 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3268 atomic_set(&nohz.load_balancer, -1); 3316 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3269 return 0;
3270 }
3271 3317
3272 if (atomic_read(&nohz.load_balancer) == -1) { 3318 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3273 /* make me the ilb owner */
3274 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3275 return 1;
3276 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3277 int new_ilb; 3319 int new_ilb;
3278 3320
3279 if (!(sched_smt_power_savings || 3321 /* make me the ilb owner */
3280 sched_mc_power_savings)) 3322 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3281 return 1; 3323 cpu) != nr_cpu_ids)
3324 return;
3325
3282 /* 3326 /*
3283 * Check to see if there is a more power-efficient 3327 * Check to see if there is a more power-efficient
3284 * ilb. 3328 * ilb.
3285 */ 3329 */
3286 new_ilb = find_new_ilb(cpu); 3330 new_ilb = find_new_ilb(cpu);
3287 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3288 atomic_set(&nohz.load_balancer, -1); 3332 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3289 resched_cpu(new_ilb); 3333 resched_cpu(new_ilb);
3290 return 0; 3334 return;
3291 } 3335 }
3292 return 1; 3336 return;
3293 } 3337 }
3294 } else { 3338 } else {
3295 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 3339 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3296 return 0; 3340 return;
3297 3341
3298 cpumask_clear_cpu(cpu, nohz.cpu_mask); 3342 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3299 3343
3300 if (atomic_read(&nohz.load_balancer) == cpu) 3344 if (atomic_read(&nohz.load_balancer) == cpu)
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3346 nr_cpu_ids) != cpu)
3302 BUG(); 3347 BUG();
3303 } 3348 }
3304 return 0; 3349 return;
3305} 3350}
3306#endif 3351#endif
3307 3352
@@ -3383,11 +3428,101 @@ out:
3383 rq->next_balance = next_balance; 3428 rq->next_balance = next_balance;
3384} 3429}
3385 3430
3431#ifdef CONFIG_NO_HZ
3386/* 3432/*
3387 * run_rebalance_domains is triggered when needed from the scheduler tick. 3433 * In CONFIG_NO_HZ case, the idle balance kickee will do the
3388 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3389 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3434 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3390 */ 3435 */
3436static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3437{
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 struct rq *rq;
3440 int balance_cpu;
3441
3442 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3443 return;
3444
3445 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3446 if (balance_cpu == this_cpu)
3447 continue;
3448
3449 /*
3450 * If this cpu gets work to do, stop the load balancing
3451 * work being done for other cpus. Next load
3452 * balancing owner will pick it up.
3453 */
3454 if (need_resched()) {
3455 this_rq->nohz_balance_kick = 0;
3456 break;
3457 }
3458
3459 raw_spin_lock_irq(&this_rq->lock);
3460 update_cpu_load(this_rq);
3461 raw_spin_unlock_irq(&this_rq->lock);
3462
3463 rebalance_domains(balance_cpu, CPU_IDLE);
3464
3465 rq = cpu_rq(balance_cpu);
3466 if (time_after(this_rq->next_balance, rq->next_balance))
3467 this_rq->next_balance = rq->next_balance;
3468 }
3469 nohz.next_balance = this_rq->next_balance;
3470 this_rq->nohz_balance_kick = 0;
3471}
3472
3473/*
3474 * Current heuristic for kicking the idle load balancer
3475 * - first_pick_cpu is the one of the busy CPUs. It will kick
3476 * idle load balancer when it has more than one process active. This
3477 * eliminates the need for idle load balancing altogether when we have
3478 * only one running process in the system (common case).
3479 * - If there are more than one busy CPU, idle load balancer may have
3480 * to run for active_load_balance to happen (i.e., two busy CPUs are
3481 * SMT or core siblings and can run better if they move to different
3482 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
3483 * which will kick idle load balancer as soon as it has any load.
3484 */
3485static inline int nohz_kick_needed(struct rq *rq, int cpu)
3486{
3487 unsigned long now = jiffies;
3488 int ret;
3489 int first_pick_cpu, second_pick_cpu;
3490
3491 if (time_before(now, nohz.next_balance))
3492 return 0;
3493
3494 if (!rq->nr_running)
3495 return 0;
3496
3497 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3498 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3499
3500 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3501 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3502 return 0;
3503
3504 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3505 if (ret == nr_cpu_ids || ret == cpu) {
3506 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3507 if (rq->nr_running > 1)
3508 return 1;
3509 } else {
3510 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3511 if (ret == nr_cpu_ids || ret == cpu) {
3512 if (rq->nr_running)
3513 return 1;
3514 }
3515 }
3516 return 0;
3517}
3518#else
3519static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3520#endif
3521
3522/*
3523 * run_rebalance_domains is triggered when needed from the scheduler tick.
3524 * Also triggered for nohz idle balancing (with nohz_balance_kick set).
3525 */
3391static void run_rebalance_domains(struct softirq_action *h) 3526static void run_rebalance_domains(struct softirq_action *h)
3392{ 3527{
3393 int this_cpu = smp_processor_id(); 3528 int this_cpu = smp_processor_id();
@@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h)
3397 3532
3398 rebalance_domains(this_cpu, idle); 3533 rebalance_domains(this_cpu, idle);
3399 3534
3400#ifdef CONFIG_NO_HZ
3401 /* 3535 /*
3402 * If this cpu is the owner for idle load balancing, then do the 3536 * If this cpu has a pending nohz_balance_kick, then do the
3403 * balancing on behalf of the other idle cpus whose ticks are 3537 * balancing on behalf of the other idle cpus whose ticks are
3404 * stopped. 3538 * stopped.
3405 */ 3539 */
3406 if (this_rq->idle_at_tick && 3540 nohz_idle_balance(this_cpu, idle);
3407 atomic_read(&nohz.load_balancer) == this_cpu) {
3408 struct rq *rq;
3409 int balance_cpu;
3410
3411 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3412 if (balance_cpu == this_cpu)
3413 continue;
3414
3415 /*
3416 * If this cpu gets work to do, stop the load balancing
3417 * work being done for other cpus. Next load
3418 * balancing owner will pick it up.
3419 */
3420 if (need_resched())
3421 break;
3422
3423 rq = cpu_rq(balance_cpu);
3424 raw_spin_lock_irq(&rq->lock);
3425 update_cpu_load(rq);
3426 raw_spin_unlock_irq(&rq->lock);
3427 rebalance_domains(balance_cpu, CPU_IDLE);
3428
3429 if (time_after(this_rq->next_balance, rq->next_balance))
3430 this_rq->next_balance = rq->next_balance;
3431 }
3432 }
3433#endif
3434} 3541}
3435 3542
3436static inline int on_null_domain(int cpu) 3543static inline int on_null_domain(int cpu)
@@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu)
3440 3547
3441/* 3548/*
3442 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3549 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3443 *
3444 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3445 * idle load balancing owner or decide to stop the periodic load balancing,
3446 * if the whole system is idle.
3447 */ 3550 */
3448static inline void trigger_load_balance(struct rq *rq, int cpu) 3551static inline void trigger_load_balance(struct rq *rq, int cpu)
3449{ 3552{
3450#ifdef CONFIG_NO_HZ
3451 /*
3452 * If we were in the nohz mode recently and busy at the current
3453 * scheduler tick, then check if we need to nominate new idle
3454 * load balancer.
3455 */
3456 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3457 rq->in_nohz_recently = 0;
3458
3459 if (atomic_read(&nohz.load_balancer) == cpu) {
3460 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3461 atomic_set(&nohz.load_balancer, -1);
3462 }
3463
3464 if (atomic_read(&nohz.load_balancer) == -1) {
3465 int ilb = find_new_ilb(cpu);
3466
3467 if (ilb < nr_cpu_ids)
3468 resched_cpu(ilb);
3469 }
3470 }
3471
3472 /*
3473 * If this cpu is idle and doing idle load balancing for all the
3474 * cpus with ticks stopped, is it time for that to stop?
3475 */
3476 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3477 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3478 resched_cpu(cpu);
3479 return;
3480 }
3481
3482 /*
3483 * If this cpu is idle and the idle load balancing is done by
3484 * someone else, then no need raise the SCHED_SOFTIRQ
3485 */
3486 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3487 cpumask_test_cpu(cpu, nohz.cpu_mask))
3488 return;
3489#endif
3490 /* Don't need to rebalance while attached to NULL domain */ 3553 /* Don't need to rebalance while attached to NULL domain */
3491 if (time_after_eq(jiffies, rq->next_balance) && 3554 if (time_after_eq(jiffies, rq->next_balance) &&
3492 likely(!on_null_domain(cpu))) 3555 likely(!on_null_domain(cpu)))
3493 raise_softirq(SCHED_SOFTIRQ); 3556 raise_softirq(SCHED_SOFTIRQ);
3557#ifdef CONFIG_NO_HZ
3558 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
3559 nohz_balancer_kick(cpu);
3560#endif
3494} 3561}
3495 3562
3496static void rq_online_fair(struct rq *rq) 3563static void rq_online_fair(struct rq *rq)