path: root/kernel/sched/fair.c
Diffstat (limited to 'kernel/sched/fair.c')
 kernel/sched/fair.c | 533 +++++++++++++++---------------------------------------------------
 1 file changed, 136 insertions(+), 397 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9553640c1c3..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 	int want_sd = 1;
 	int sync = wake_flags & WF_SYNC;
 
-	if (p->rt.nr_cpus_allowed == 1)
+	if (p->nr_cpus_allowed == 1)
 		return prev_cpu;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 			 * If power savings logic is enabled for a domain, see if we
 			 * are not overloaded, if so, don't balance wider.
 			 */
-			if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+			if (tmp->flags & (SD_PREFER_LOCAL)) {
 				unsigned long power = 0;
 				unsigned long nr_running = 0;
 				unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 
 				capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
 
-				if (tmp->flags & SD_POWERSAVINGS_BALANCE)
-					nr_running /= 2;
-
 				if (nr_running < capacity)
 					want_sd = 0;
 			}
@@ -3082,7 +3079,7 @@ struct lb_env {
 	struct rq		*dst_rq;
 
 	enum cpu_idle_type	idle;
-	long			load_move;
+	long			imbalance;
 	unsigned int		flags;
 
 	unsigned int		loop;
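
The rename above is part of the wider refactor this patch performs: the ad-hoc argument lists of the balancing helpers (sd, this_cpu, idle, plus an imbalance out-parameter) are folded into the struct lb_env that each helper now receives. A minimal userspace sketch of that pattern, with invented stand-in types and a placeholder computation, not the kernel's full definition:

#include <stdio.h>

/*
 * Sketch of the lb_env refactoring: bundle the parameters every
 * load-balancing helper needs into one environment struct, so adding a
 * field (here 'imbalance', formerly the separate 'load_move' argument)
 * no longer means touching a dozen function signatures.
 */
enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

struct lb_env {
	int			src_cpu;
	int			dst_cpu;
	enum cpu_idle_type	idle;
	long			imbalance;	/* load left to move */
};

/* Every helper takes the env instead of its own parameter list. */
static void calculate_imbalance(struct lb_env *env)
{
	env->imbalance = 512;	/* placeholder computation */
}

int main(void)
{
	struct lb_env env = { .src_cpu = 1, .dst_cpu = 0, .idle = CPU_NEWLY_IDLE };

	calculate_imbalance(&env);
	printf("move up to %ld units of load\n", env.imbalance);
	return 0;
}
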
@@ -3218,7 +3215,7 @@ static unsigned long task_h_load(struct task_struct *p);
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
- * move_tasks tries to move up to load_move weighted load from busiest to
+ * move_tasks tries to move up to imbalance weighted load from busiest to
  * this_rq, as part of a balancing operation within domain "sd".
  * Returns 1 if successful and 0 otherwise.
  *
@@ -3231,7 +3228,7 @@ static int move_tasks(struct lb_env *env)
 	unsigned long load;
 	int pulled = 0;
 
-	if (env->load_move <= 0)
+	if (env->imbalance <= 0)
 		return 0;
 
 	while (!list_empty(tasks)) {
@@ -3257,7 +3254,7 @@ static int move_tasks(struct lb_env *env)
 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
 			goto next;
 
-		if ((load / 2) > env->load_move)
+		if ((load / 2) > env->imbalance)
 			goto next;
 
 		if (!can_migrate_task(p, env))
@@ -3265,7 +3262,7 @@ static int move_tasks(struct lb_env *env)
 
 		move_task(p, env);
 		pulled++;
-		env->load_move -= load;
+		env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -3281,7 +3278,7 @@ static int move_tasks(struct lb_env *env)
 		 * We only want to steal up to the prescribed amount of
 		 * weighted load.
 		 */
-		if (env->load_move <= 0)
+		if (env->imbalance <= 0)
 			break;
 
 		continue;
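
Both checks above treat env->imbalance as a budget of weighted load: move_tasks() keeps pulling tasks until the budget is spent, and skips any task whose weight would overshoot the remaining budget by more than half. A toy userspace sketch of that loop, with made-up task weights in place of the kernel's runqueue lists:

#include <stdio.h>

int main(void)
{
	long imbalance = 1024;			/* budget of weighted load */
	long loads[] = { 300, 900, 200, 600 };	/* per-task weighted load */
	int pulled = 0;

	for (int i = 0; i < 4 && imbalance > 0; i++) {
		if (loads[i] / 2 > imbalance)	/* too big for what's left */
			continue;
		imbalance -= loads[i];		/* pull task, spend budget */
		pulled++;
	}
	printf("pulled %d tasks, %ld budget left\n", pulled, imbalance);
	return 0;
}
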
@@ -3435,14 +3432,6 @@ struct sd_lb_stats {
 	unsigned int  busiest_group_weight;
 
 	int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	int power_savings_balance; /* Is powersave balance needed for this sd */
-	struct sched_group *group_min; /* Least loaded group in sd */
-	struct sched_group *group_leader; /* Group which relieves group_min */
-	unsigned long min_load_per_task; /* load_per_task in group_min */
-	unsigned long leader_nr_running; /* Nr running of group_leader */
-	unsigned long min_nr_running; /* Nr running of group_min */
-#endif
 };
 
 /*
@@ -3486,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 	return load_idx;
 }
 
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-	struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-	/*
-	 * Busy processors will not participate in power savings
-	 * balance.
-	 */
-	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-		sds->power_savings_balance = 0;
-	else {
-		sds->power_savings_balance = 1;
-		sds->min_nr_running = ULONG_MAX;
-		sds->leader_nr_running = 0;
-	}
-}
-
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- *		load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-
-	if (!sds->power_savings_balance)
-		return;
-
-	/*
-	 * If the local group is idle or completely loaded
-	 * no need to do power savings balance at this domain
-	 */
-	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
-				!sds->this_nr_running))
-		sds->power_savings_balance = 0;
-
-	/*
-	 * If a group is already running at full capacity or idle,
-	 * don't include that group in power savings calculations
-	 */
-	if (!sds->power_savings_balance ||
-		sgs->sum_nr_running >= sgs->group_capacity ||
-		!sgs->sum_nr_running)
-		return;
-
-	/*
-	 * Calculate the group which has the least non-idle load.
-	 * This is the group from where we need to pick up the load
-	 * for saving power
-	 */
-	if ((sgs->sum_nr_running < sds->min_nr_running) ||
-	    (sgs->sum_nr_running == sds->min_nr_running &&
-	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
-		sds->group_min = group;
-		sds->min_nr_running = sgs->sum_nr_running;
-		sds->min_load_per_task = sgs->sum_weighted_load /
-					   sgs->sum_nr_running;
-	}
-
-	/*
-	 * Calculate the group which is almost near its
-	 * capacity but still has some space to pick up some load
-	 * from other group and save more power
-	 */
-	if (sgs->sum_nr_running + 1 > sgs->group_capacity)
-		return;
-
-	if (sgs->sum_nr_running > sds->leader_nr_running ||
-	    (sgs->sum_nr_running == sds->leader_nr_running &&
-	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
-		sds->group_leader = group;
-		sds->leader_nr_running = sgs->sum_nr_running;
-	}
-}
-
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @sds: Variable containing the statistics of the sched_domain
- *	under consideration.
- * @this_cpu: Cpu at which we're currently performing load-balancing.
- * @imbalance: Variable to store the imbalance.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that it's CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
-					int this_cpu, unsigned long *imbalance)
-{
-	if (!sds->power_savings_balance)
-		return 0;
-
-	if (sds->this != sds->group_leader ||
-			sds->group_leader == sds->group_min)
-		return 0;
-
-	*imbalance = sds->min_load_per_task;
-	sds->busiest = sds->group_min;
-
-	return 1;
-
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-	struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-	return;
-}
-
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-	return;
-}
-
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
-					int this_cpu, unsigned long *imbalance)
-{
-	return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
-
 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
 {
 	return SCHED_POWER_SCALE;
@@ -3656,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
 unsigned long scale_rt_power(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	u64 total, available;
+	u64 total, available, age_stamp, avg;
+
+	/*
+	 * Since we're reading these variables without serialization make sure
+	 * we read them once before doing sanity checks on them.
+	 */
+	age_stamp = ACCESS_ONCE(rq->age_stamp);
+	avg = ACCESS_ONCE(rq->rt_avg);
 
-	total = sched_avg_period() + (rq->clock - rq->age_stamp);
+	total = sched_avg_period() + (rq->clock - age_stamp);
 
-	if (unlikely(total < rq->rt_avg)) {
+	if (unlikely(total < avg)) {
 		/* Ensures that power won't end up being negative */
 		available = 0;
 	} else {
-		available = total - rq->rt_avg;
+		available = total - avg;
 	}
 
 	if (unlikely((s64)total < SCHED_POWER_SCALE))
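
The new ACCESS_ONCE() reads guard against rq->age_stamp and rq->rt_avg changing between the `total < avg` sanity check and the subtraction, since scale_rt_power() reads them without holding the runqueue lock. A userspace sketch of the snapshot-then-check pattern, with volatile reads standing in for the kernel's ACCESS_ONCE():

#include <stdint.h>
#include <stdio.h>

volatile uint64_t rt_avg;	/* updated concurrently elsewhere */
volatile uint64_t age_stamp;	/* updated concurrently elsewhere */

uint64_t available_time(uint64_t clock, uint64_t period)
{
	uint64_t avg = rt_avg;		/* one read each ... */
	uint64_t stamp = age_stamp;	/* ... before any checks */
	uint64_t total = period + (clock - stamp);

	/* 'avg' cannot change between this check and the subtraction. */
	return total < avg ? 0 : total - avg;
}

int main(void)
{
	rt_avg = 100;
	age_stamp = 50;
	printf("%llu\n", (unsigned long long)available_time(200, 1000));
	return 0;
}
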
@@ -3727,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu)
 
 	power = 0;
 
-	group = child->groups;
-	do {
-		power += group->sgp->power;
-		group = group->next;
-	} while (group != child->groups);
-
-	sdg->sgp->power = power;
+	if (child->flags & SD_OVERLAP) {
+		/*
+		 * SD_OVERLAP domains cannot assume that child groups
+		 * span the current group.
+		 */
+
+		for_each_cpu(cpu, sched_group_cpus(sdg))
+			power += power_of(cpu);
+	} else {
+		/*
+		 * !SD_OVERLAP domains can assume that child groups
+		 * span the current group.
+		 */
+
+		group = child->groups;
+		do {
+			power += group->sgp->power;
+			group = group->next;
+		} while (group != child->groups);
+	}
+
+	sdg->sgp->power_orig = sdg->sgp->power = power;
 }
 
 /*
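
For ordinary domains the child groups partition the parent, so summing the child-group totals is correct and cheap; SD_OVERLAP domains (overlapping NUMA groups) offer no such guarantee, hence the per-CPU walk. A standalone sketch of the two strategies over an invented four-CPU topology:

#include <stdio.h>

#define NR_CPUS 4

static unsigned long cpu_power[NR_CPUS] = { 1024, 1024, 900, 1024 };

/* Non-overlapping case: child group totals partition the parent. */
static unsigned long sum_child_groups(const unsigned long *group_power, int groups)
{
	unsigned long power = 0;

	for (int g = 0; g < groups; g++)
		power += group_power[g];
	return power;
}

/* Overlapping case: fall back to summing each CPU exactly once. */
static unsigned long sum_cpus(void)
{
	unsigned long power = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		power += cpu_power[cpu];
	return power;
}

int main(void)
{
	unsigned long child_groups[2] = { 2048, 1924 };	/* {0,1} and {2,3} */

	printf("children: %lu, per-cpu: %lu\n",
	       sum_child_groups(child_groups, 2), sum_cpus());
	return 0;
}
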
@@ -3763,41 +3632,43 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
- * @sd: The sched_domain whose statistics are to be updated.
+ * @env: The load balancing environment.
  * @group: sched_group whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sgs: variable to hold the statistics for this group.
  */
-static inline void update_sg_lb_stats(struct sched_domain *sd,
-			struct sched_group *group, int this_cpu,
-			enum cpu_idle_type idle, int load_idx,
+static inline void update_sg_lb_stats(struct lb_env *env,
+			struct sched_group *group, int load_idx,
 			int local_group, const struct cpumask *cpus,
 			int *balance, struct sg_lb_stats *sgs)
 {
-	unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
-	int i;
+	unsigned long nr_running, max_nr_running, min_nr_running;
+	unsigned long load, max_cpu_load, min_cpu_load;
 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
 	unsigned long avg_load_per_task = 0;
+	int i;
 
 	if (local_group)
-		balance_cpu = group_first_cpu(group);
+		balance_cpu = group_balance_cpu(group);
 
 	/* Tally up the load of all CPUs in the group */
 	max_cpu_load = 0;
 	min_cpu_load = ~0UL;
 	max_nr_running = 0;
+	min_nr_running = ~0UL;
 
 	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 		struct rq *rq = cpu_rq(i);
 
+		nr_running = rq->nr_running;
+
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
-			if (idle_cpu(i) && !first_idle_cpu) {
+			if (idle_cpu(i) && !first_idle_cpu &&
+			    cpumask_test_cpu(i, sched_group_mask(group))) {
 				first_idle_cpu = 1;
 				balance_cpu = i;
 			}
@@ -3805,16 +3676,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-			if (load > max_cpu_load) {
+			if (load > max_cpu_load)
 				max_cpu_load = load;
-				max_nr_running = rq->nr_running;
-			}
 			if (min_cpu_load > load)
 				min_cpu_load = load;
+
+			if (nr_running > max_nr_running)
+				max_nr_running = nr_running;
+			if (min_nr_running > nr_running)
+				min_nr_running = nr_running;
 		}
 
 		sgs->group_load += load;
-		sgs->sum_nr_running += rq->nr_running;
+		sgs->sum_nr_running += nr_running;
 		sgs->sum_weighted_load += weighted_cpuload(i);
 		if (idle_cpu(i))
 			sgs->idle_cpus++;
@@ -3827,14 +3701,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 * to do the newly idle load balance.
 	 */
 	if (local_group) {
-		if (idle != CPU_NEWLY_IDLE) {
-			if (balance_cpu != this_cpu) {
+		if (env->idle != CPU_NEWLY_IDLE) {
+			if (balance_cpu != env->dst_cpu) {
 				*balance = 0;
 				return;
 			}
-			update_group_power(sd, this_cpu);
+			update_group_power(env->sd, env->dst_cpu);
 		} else if (time_after_eq(jiffies, group->sgp->next_update))
-			update_group_power(sd, this_cpu);
+			update_group_power(env->sd, env->dst_cpu);
 	}
 
 	/* Adjust by relative CPU power of the group */
@@ -3852,13 +3726,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if (sgs->sum_nr_running)
 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
+	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
+	    (max_nr_running - min_nr_running) > 1)
 		sgs->group_imb = 1;
 
 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
 						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
-		sgs->group_capacity = fix_small_capacity(sd, group);
+		sgs->group_capacity = fix_small_capacity(env->sd, group);
 	sgs->group_weight = group->group_weight;
 
 	if (sgs->group_capacity > sgs->sum_nr_running)
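
The reworked group_imb test no longer fires just because the most loaded CPU happens to run more than one task: it now requires the nr_running spread across the group, max_nr_running - min_nr_running, to exceed 1. A toy sketch with invented numbers, showing a group the old check would have flagged but the new one leaves alone:

#include <stdio.h>

int main(void)
{
	unsigned long max_cpu_load = 2048, min_cpu_load = 512;
	unsigned long avg_load_per_task = 1024;
	unsigned long max_nr_running = 3, min_nr_running = 2;
	int group_imb = 0;

	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
	    (max_nr_running - min_nr_running) > 1)
		group_imb = 1;

	/* spread of only 1 task: balanced despite the load gap */
	printf("group_imb = %d\n", group_imb);
	return 0;
}
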
@@ -3867,20 +3742,18 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 
 /**
  * update_sd_pick_busiest - return 1 on busiest group
- * @sd: sched_domain whose statistics are to be checked
+ * @env: The load balancing environment.
  * @sds: sched_domain statistics
  * @sg: sched_group candidate to be checked for being the busiest
  * @sgs: sched_group statistics
- * @this_cpu: the current cpu
  *
  * Determine if @sg is a busier group than the previously selected
  * busiest group.
  */
-static bool update_sd_pick_busiest(struct sched_domain *sd,
+static bool update_sd_pick_busiest(struct lb_env *env,
 				   struct sd_lb_stats *sds,
 				   struct sched_group *sg,
-				   struct sg_lb_stats *sgs,
-				   int this_cpu)
+				   struct sg_lb_stats *sgs)
 {
 	if (sgs->avg_load <= sds->max_load)
 		return false;
@@ -3896,8 +3769,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
 	 * numbered CPUs in the group, therefore mark all groups
 	 * higher than ourself as busy.
 	 */
-	if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
-	    this_cpu < group_first_cpu(sg)) {
+	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+	    env->dst_cpu < group_first_cpu(sg)) {
 		if (!sds->busiest)
 			return true;
 
@@ -3910,35 +3783,32 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
 
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
- * @sd: sched_domain whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
+ * @env: The load balancing environment.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
-static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
-			enum cpu_idle_type idle, const struct cpumask *cpus,
+static inline void update_sd_lb_stats(struct lb_env *env,
+			const struct cpumask *cpus,
 			int *balance, struct sd_lb_stats *sds)
 {
-	struct sched_domain *child = sd->child;
-	struct sched_group *sg = sd->groups;
+	struct sched_domain *child = env->sd->child;
+	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats sgs;
 	int load_idx, prefer_sibling = 0;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
 
-	init_sd_power_savings_stats(sd, sds, idle);
-	load_idx = get_sd_load_idx(sd, idle);
+	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 	do {
 		int local_group;
 
-		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
+		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
-				local_group, cpus, balance, &sgs);
+		update_sg_lb_stats(env, sg, load_idx, local_group,
+				cpus, balance, &sgs);
 
 		if (local_group && !(*balance))
 			return;
@@ -3966,7 +3836,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		sds->this_load_per_task = sgs.sum_weighted_load;
 		sds->this_has_capacity = sgs.group_has_capacity;
 		sds->this_idle_cpus = sgs.idle_cpus;
-	} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
+	} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
 		sds->max_load = sgs.avg_load;
 		sds->busiest = sg;
 		sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3978,9 +3848,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			sds->group_imb = sgs.group_imb;
 		}
 
-		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
 		sg = sg->next;
-	} while (sg != sd->groups);
+	} while (sg != env->sd->groups);
 }
 
 /**
@@ -4003,29 +3872,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
  * Returns 1 when packing is required and a task should be moved to
  * this CPU. The amount of the imbalance is returned in *imbalance.
  *
- * @sd: The sched_domain whose packing is to be checked.
+ * @env: The load balancing environment.
  * @sds: Statistics of the sched_domain which is to be packed
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: returns amount of imbalanced due to packing.
  */
-static int check_asym_packing(struct sched_domain *sd,
-			      struct sd_lb_stats *sds,
-			      int this_cpu, unsigned long *imbalance)
+static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	int busiest_cpu;
 
-	if (!(sd->flags & SD_ASYM_PACKING))
+	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return 0;
 
 	if (!sds->busiest)
 		return 0;
 
 	busiest_cpu = group_first_cpu(sds->busiest);
-	if (this_cpu > busiest_cpu)
+	if (env->dst_cpu > busiest_cpu)
 		return 0;
 
-	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
-				       SCHED_POWER_SCALE);
+	env->imbalance = DIV_ROUND_CLOSEST(
+		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
+
 	return 1;
 }
 
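
check_asym_packing() now stores its result in env->imbalance rather than through an out-parameter. The value itself converts the busiest group's power-scaled average load (where SCHED_POWER_SCALE, 1024, means one full CPU) back into plain weighted load to move. A standalone sketch with invented numbers:

#include <stdio.h>

#define SCHED_POWER_SCALE	1024UL
#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))

int main(void)
{
	unsigned long max_load = 512;		/* power-scaled group load */
	unsigned long group_power = 2048;	/* two full-power CPUs */
	unsigned long imbalance =
		DIV_ROUND_CLOSEST(max_load * group_power, SCHED_POWER_SCALE);

	printf("imbalance = %lu\n", imbalance);	/* prints 1024 */
	return 0;
}
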
@@ -4033,12 +3899,11 @@ static int check_asym_packing(struct sched_domain *sd,
  * fix_small_imbalance - Calculate the minor imbalance that exists
  *			amongst the groups of a sched_domain, during
  *			load balancing.
+ * @env: The load balancing environment.
  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: Variable to store the imbalance.
  */
-static inline void fix_small_imbalance(struct sd_lb_stats *sds,
-				int this_cpu, unsigned long *imbalance)
+static inline
+void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
 	unsigned int imbn = 2;
@@ -4049,9 +3914,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 		if (sds->busiest_load_per_task >
 				sds->this_load_per_task)
 			imbn = 1;
-	} else
+	} else {
 		sds->this_load_per_task =
-			cpu_avg_load_per_task(this_cpu);
+			cpu_avg_load_per_task(env->dst_cpu);
+	}
 
 	scaled_busy_load_per_task = sds->busiest_load_per_task
 					 * SCHED_POWER_SCALE;
@@ -4059,7 +3925,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 
 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
 			(scaled_busy_load_per_task * imbn)) {
-		*imbalance = sds->busiest_load_per_task;
+		env->imbalance = sds->busiest_load_per_task;
 		return;
 	}
 
@@ -4096,18 +3962,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 
 	/* Move if we gain throughput */
 	if (pwr_move > pwr_now)
-		*imbalance = sds->busiest_load_per_task;
+		env->imbalance = sds->busiest_load_per_task;
 }
 
 /**
  * calculate_imbalance - Calculate the amount of imbalance present within the
  *			 groups of a given sched_domain during load balance.
+ * @env: load balance environment
  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: Cpu for which currently load balance is being performed.
- * @imbalance: The variable to store the imbalance.
  */
-static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
-		unsigned long *imbalance)
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	unsigned long max_pull, load_above_capacity = ~0UL;
 
@@ -4123,8 +3987,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 	 * its cpu_power, while calculating max_load..)
 	 */
 	if (sds->max_load < sds->avg_load) {
-		*imbalance = 0;
-		return fix_small_imbalance(sds, this_cpu, imbalance);
+		env->imbalance = 0;
+		return fix_small_imbalance(env, sds);
 	}
 
 	if (!sds->group_imb) {
@@ -4152,7 +4016,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * sds->busiest->sgp->power,
+	env->imbalance = min(max_pull * sds->busiest->sgp->power,
 		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
 			/ SCHED_POWER_SCALE;
 
@@ -4162,8 +4026,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (*imbalance < sds->busiest_load_per_task)
-		return fix_small_imbalance(sds, this_cpu, imbalance);
+	if (env->imbalance < sds->busiest_load_per_task)
+		return fix_small_imbalance(env, sds);
 
 }
 
@@ -4179,11 +4043,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
  * Also calculates the amount of weighted load which should be moved
  * to restore balance.
  *
- * @sd: The sched_domain whose busiest group is to be returned.
- * @this_cpu: The cpu for which load balancing is currently being performed.
- * @imbalance: Variable which stores amount of weighted load which should
- *		be moved to restore balance/put a group to idle.
- * @idle: The idle status of this_cpu.
+ * @env: The load balancing environment.
  * @cpus: The set of CPUs under consideration for load-balancing.
  * @balance: Pointer to a variable indicating if this_cpu
  *	is the appropriate cpu to perform load balancing at this_level.
@@ -4194,9 +4054,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
  *		put to idle by rebalancing its tasks onto our group.
  */
 static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   const struct cpumask *cpus, int *balance)
+find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
 {
 	struct sd_lb_stats sds;
 
@@ -4206,7 +4064,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+	update_sd_lb_stats(env, cpus, balance, &sds);
 
 	/*
 	 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4215,8 +4073,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!(*balance))
 		goto ret;
 
-	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
-	    check_asym_packing(sd, &sds, this_cpu, imbalance))
+	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
+	    check_asym_packing(env, &sds))
 		return sds.busiest;
 
 	/* There is no busy sibling group to pull tasks from */
@@ -4234,7 +4092,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		goto force_balance;
 
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
 	    !sds.busiest_has_capacity)
 		goto force_balance;
 
@@ -4252,7 +4110,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 
-	if (idle == CPU_IDLE) {
+	if (env->idle == CPU_IDLE) {
 		/*
 		 * This cpu is idle. If the busiest group load doesn't
 		 * have more tasks than the number of available cpu's and
@@ -4267,34 +4125,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
 		 * imbalance_pct to be conservative.
 		 */
-		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
 			goto out_balanced;
 	}
 
 force_balance:
 	/* Looks like there is an imbalance. Compute it */
-	calculate_imbalance(&sds, this_cpu, imbalance);
+	calculate_imbalance(env, &sds);
 	return sds.busiest;
 
 out_balanced:
-	/*
-	 * There is no obvious imbalance. But check if we can do some balancing
-	 * to save power.
-	 */
-	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
-		return sds.busiest;
 ret:
-	*imbalance = 0;
+	env->imbalance = 0;
 	return NULL;
 }
 
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
-static struct rq *
-find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
-		   enum cpu_idle_type idle, unsigned long imbalance,
-		   const struct cpumask *cpus)
+static struct rq *find_busiest_queue(struct lb_env *env,
+				     struct sched_group *group,
+				     const struct cpumask *cpus)
 {
 	struct rq *busiest = NULL, *rq;
 	unsigned long max_load = 0;
@@ -4307,7 +4158,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 		unsigned long wl;
 
 		if (!capacity)
-			capacity = fix_small_capacity(sd, group);
+			capacity = fix_small_capacity(env->sd, group);
 
 		if (!cpumask_test_cpu(i, cpus))
 			continue;
@@ -4319,7 +4170,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 		 * When comparing with imbalance, use weighted_cpuload()
 		 * which is not scaled with the cpu power.
 		 */
-		if (capacity && rq->nr_running == 1 && wl > imbalance)
+		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
 			continue;
 
 		/*
@@ -4348,40 +4199,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 /* Working cpumask for load_balance and load_balance_newidle. */
 DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int idle,
-			       int busiest_cpu, int this_cpu)
+static int need_active_balance(struct lb_env *env)
 {
-	if (idle == CPU_NEWLY_IDLE) {
+	struct sched_domain *sd = env->sd;
+
+	if (env->idle == CPU_NEWLY_IDLE) {
 
 		/*
 		 * ASYM_PACKING needs to force migrate tasks from busy but
 		 * higher numbered CPUs in order to pack all tasks in the
 		 * lowest numbered CPUs.
 		 */
-		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
 			return 1;
-
-		/*
-		 * The only task running in a non-idle cpu can be moved to this
-		 * cpu in an attempt to completely freeup the other CPU
-		 * package.
-		 *
-		 * The package power saving logic comes from
-		 * find_busiest_group(). If there are no imbalance, then
-		 * f_b_g() will return NULL. However when sched_mc={1,2} then
-		 * f_b_g() will select a group from which a running task may be
-		 * pulled to this cpu in order to make the other package idle.
-		 * If there is no opportunity to make a package idle and if
-		 * there are no imbalance, then f_b_g() will return NULL and no
-		 * action will be taken in load_balance_newidle().
-		 *
-		 * Under normal task pull operation due to imbalance, there
-		 * will be more than one task in the source run queue and
-		 * move_tasks() will succeed. ld_moved will be true and this
-		 * active balance code will not be triggered.
-		 */
-		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
-			return 0;
 	}
 
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4399,7 +4229,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 {
 	int ld_moved, active_balance = 0;
 	struct sched_group *group;
-	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4417,8 +4246,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
-				   cpus, balance);
+	group = find_busiest_group(&env, cpus, balance);
 
 	if (*balance == 0)
 		goto out_balanced;
@@ -4428,7 +4256,7 @@ redo:
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
+	busiest = find_busiest_queue(&env, group, cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -4436,7 +4264,7 @@ redo:
 
 	BUG_ON(busiest == this_rq);
 
-	schedstat_add(sd, lb_imbalance[idle], imbalance);
+	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
 	ld_moved = 0;
 	if (busiest->nr_running > 1) {
@@ -4447,10 +4275,9 @@ redo:
 		 * correctly treated as an imbalance.
 		 */
 		env.flags |= LBF_ALL_PINNED;
-		env.load_move = imbalance;
-		env.src_cpu = busiest->cpu;
-		env.src_rq = busiest;
-		env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
+		env.src_cpu = busiest->cpu;
+		env.src_rq = busiest;
+		env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
 		local_irq_save(flags);
@@ -4492,7 +4319,7 @@ more_balance:
 		if (idle != CPU_NEWLY_IDLE)
 			sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
+		if (need_active_balance(&env)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
@@ -4519,10 +4346,11 @@ more_balance:
 		}
 		raw_spin_unlock_irqrestore(&busiest->lock, flags);
 
-		if (active_balance)
+		if (active_balance) {
 			stop_one_cpu_nowait(cpu_of(busiest),
 				active_load_balance_cpu_stop, busiest,
 				&busiest->active_balance_work);
+		}
 
 		/*
 		 * We've kicked active balancing, reset the failure
@@ -4703,104 +4531,15 @@ static struct {
 	unsigned long next_balance;	/* in jiffy units */
 } nohz ____cacheline_aligned;
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu:	The cpu whose lowest level of sched domain is to
- *		be returned.
- * @flag:	The flag to check for the lowest sched_domain
- *		for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
-	struct sched_domain *sd;
-
-	for_each_domain(cpu, sd)
-		if (sd->flags & flag)
-			break;
-
-	return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu:	The cpu whose domains we're iterating over.
- * @sd:		variable holding the value of the power_savings_sd
- *		for cpu.
- * @flag:	The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
-	for (sd = lowest_flag_domain(cpu, flag); \
-		(sd && (sd->flags & flag)); sd = sd->parent)
-
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu:	The cpu which is nominating a new idle_load_balancer.
- *
- * Returns:	Returns the id of the idle load balancer if it exists,
- *		Else, returns >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try and avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpu's which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
+static inline int find_new_ilb(int call_cpu)
 {
 	int ilb = cpumask_first(nohz.idle_cpus_mask);
-	struct sched_group *ilbg;
-	struct sched_domain *sd;
-
-	/*
-	 * Have idle load balancer selection from semi-idle packages only
-	 * when power-aware load balancing is enabled
-	 */
-	if (!(sched_smt_power_savings || sched_mc_power_savings))
-		goto out_done;
-
-	/*
-	 * Optimize for the case when we have no idle CPUs or only one
-	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
-	 */
-	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
-		goto out_done;
 
-	rcu_read_lock();
-	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
-		ilbg = sd->groups;
-
-		do {
-			if (ilbg->group_weight !=
-				atomic_read(&ilbg->sgp->nr_busy_cpus)) {
-				ilb = cpumask_first_and(nohz.idle_cpus_mask,
-						sched_group_cpus(ilbg));
-				goto unlock;
-			}
-
-			ilbg = ilbg->next;
-
-		} while (ilbg != sd->groups);
-	}
-unlock:
-	rcu_read_unlock();
-
-out_done:
 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
 		return ilb;
 
 	return nr_cpu_ids;
 }
-#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
-	return nr_cpu_ids;
-}
-#endif
 
 /*
  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
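
With the power-savings domain walk deleted, nominating an idle load balancer reduces to picking the first idle CPU in the nohz mask. A userspace sketch of that selection, using a plain bitmask in place of the kernel's cpumask and NR_CPU_IDS to mark "no balancer found":

#include <stdio.h>

#define NR_CPU_IDS 8

static int cpumask_first(unsigned long mask)
{
	for (int cpu = 0; cpu < NR_CPU_IDS; cpu++)
		if (mask & (1UL << cpu))
			return cpu;
	return NR_CPU_IDS;	/* mask empty: no idle CPU */
}

int main(void)
{
	unsigned long nohz_idle_cpus_mask = 0x2c;	/* CPUs 2, 3, 5 idle */
	int ilb = cpumask_first(nohz_idle_cpus_mask);

	printf("idle load balancer: %d\n", ilb < NR_CPU_IDS ? ilb : -1);
	return 0;
}
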
@@ -5023,7 +4762,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
 
 		raw_spin_lock_irq(&this_rq->lock);
 		update_rq_clock(this_rq);
-		update_cpu_load(this_rq);
+		update_idle_cpu_load(this_rq);
 		raw_spin_unlock_irq(&this_rq->lock);
 
 		rebalance_domains(balance_cpu, CPU_IDLE);