Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c  |  94
-rw-r--r--  kernel/sched/fair.c  | 275
2 files changed, 2 insertions(+), 367 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bd314d7cd9f8..24ca677b5457 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5929,8 +5929,6 @@ static const struct cpumask *cpu_cpu_mask(int cpu)
         return cpumask_of_node(cpu_to_node(cpu));
 }
 
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
 struct sd_data {
         struct sched_domain **__percpu sd;
         struct sched_group **__percpu sg;
@@ -6322,7 +6320,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                 | 0*SD_WAKE_AFFINE
                                 | 0*SD_PREFER_LOCAL
                                 | 0*SD_SHARE_CPUPOWER
-                                | 0*SD_POWERSAVINGS_BALANCE
                                 | 0*SD_SHARE_PKG_RESOURCES
                                 | 1*SD_SERIALIZE
                                 | 0*SD_PREFER_SIBLING
@@ -6819,97 +6816,6 @@ match2:
         mutex_unlock(&sched_domains_mutex);
 }
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
-        get_online_cpus();
-
-        /* Destroy domains first to force the rebuild */
-        partition_sched_domains(0, NULL, NULL);
-
-        rebuild_sched_domains();
-        put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
-        unsigned int level = 0;
-
-        if (sscanf(buf, "%u", &level) != 1)
-                return -EINVAL;
-
-        /*
-         * level is always be positive so don't check for
-         * level < POWERSAVINGS_BALANCE_NONE which is 0
-         * What happens on 0 or 1 byte write,
-         * need to check for count as well?
-         */
-
-        if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
-                return -EINVAL;
-
-        if (smt)
-                sched_smt_power_savings = level;
-        else
-                sched_mc_power_savings = level;
-
-        reinit_sched_domains();
-
-        return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
-                                           struct device_attribute *attr,
-                                           char *buf)
-{
-        return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
-                                            struct device_attribute *attr,
-                                            const char *buf, size_t count)
-{
-        return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
-                   sched_mc_power_savings_show,
-                   sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
-                                            struct device_attribute *attr,
-                                            char *buf)
-{
-        return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
-                                             struct device_attribute *attr,
-                                             const char *buf, size_t count)
-{
-        return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
-                   sched_smt_power_savings_show,
-                   sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
-        int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
-        if (smt_capable())
-                err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
-        if (!err && mc_capable())
-                err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
-        return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
 /*
  * Update cpusets according to cpu_active mask. If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b42f4487329..940e6d17cf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
                 * If power savings logic is enabled for a domain, see if we
                 * are not overloaded, if so, don't balance wider.
                 */
-                if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+                if (tmp->flags & (SD_PREFER_LOCAL)) {
                         unsigned long power = 0;
                         unsigned long nr_running = 0;
                         unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 
                         capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
 
-                        if (tmp->flags & SD_POWERSAVINGS_BALANCE)
-                                nr_running /= 2;
-
                         if (nr_running < capacity)
                                 want_sd = 0;
                 }
@@ -3435,14 +3432,6 @@ struct sd_lb_stats {
         unsigned int  busiest_group_weight;
 
         int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-        int power_savings_balance; /* Is powersave balance needed for this sd */
-        struct sched_group *group_min; /* Least loaded group in sd */
-        struct sched_group *group_leader; /* Group which relieves group_min */
-        unsigned long min_load_per_task; /* load_per_task in group_min */
-        unsigned long leader_nr_running; /* Nr running of group_leader */
-        unsigned long min_nr_running; /* Nr running of group_min */
-#endif
 };
 
 /*
@@ -3486,147 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
         return load_idx;
 }
 
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-        struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-        /*
-         * Busy processors will not participate in power savings
-         * balance.
-         */
-        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-                sds->power_savings_balance = 0;
-        else {
-                sds->power_savings_balance = 1;
-                sds->min_nr_running = ULONG_MAX;
-                sds->leader_nr_running = 0;
-        }
-}
-
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- *               load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-
-        if (!sds->power_savings_balance)
-                return;
-
-        /*
-         * If the local group is idle or completely loaded
-         * no need to do power savings balance at this domain
-         */
-        if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
-                                !sds->this_nr_running))
-                sds->power_savings_balance = 0;
-
-        /*
-         * If a group is already running at full capacity or idle,
-         * don't include that group in power savings calculations
-         */
-        if (!sds->power_savings_balance ||
-                sgs->sum_nr_running >= sgs->group_capacity ||
-                !sgs->sum_nr_running)
-                return;
-
-        /*
-         * Calculate the group which has the least non-idle load.
-         * This is the group from where we need to pick up the load
-         * for saving power
-         */
-        if ((sgs->sum_nr_running < sds->min_nr_running) ||
-            (sgs->sum_nr_running == sds->min_nr_running &&
-             group_first_cpu(group) > group_first_cpu(sds->group_min))) {
-                sds->group_min = group;
-                sds->min_nr_running = sgs->sum_nr_running;
-                sds->min_load_per_task = sgs->sum_weighted_load /
-                                                sgs->sum_nr_running;
-        }
-
-        /*
-         * Calculate the group which is almost near its
-         * capacity but still has some space to pick up some load
-         * from other group and save more power
-         */
-        if (sgs->sum_nr_running + 1 > sgs->group_capacity)
-                return;
-
-        if (sgs->sum_nr_running > sds->leader_nr_running ||
-            (sgs->sum_nr_running == sds->leader_nr_running &&
-             group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
-                sds->group_leader = group;
-                sds->leader_nr_running = sgs->sum_nr_running;
-        }
-}
-
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @env: load balance environment
- * @sds: Variable containing the statistics of the sched_domain
- *       under consideration.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that it's CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline
-int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
-{
-        if (!sds->power_savings_balance)
-                return 0;
-
-        if (sds->this != sds->group_leader ||
-            sds->group_leader == sds->group_min)
-                return 0;
-
-        env->imbalance = sds->min_load_per_task;
-        sds->busiest = sds->group_min;
-
-        return 1;
-
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-        struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-        return;
-}
-
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-        return;
-}
-
-static inline
-int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
-{
-        return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
-
 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
 {
         return SCHED_POWER_SCALE;
@@ -3932,7 +3780,6 @@ static inline void update_sd_lb_stats(struct lb_env *env,
         if (child && child->flags & SD_PREFER_SIBLING)
                 prefer_sibling = 1;
 
-        init_sd_power_savings_stats(env->sd, sds, env->idle);
         load_idx = get_sd_load_idx(env->sd, env->idle);
 
         do {
@@ -3981,7 +3828,6 @@ static inline void update_sd_lb_stats(struct lb_env *env,
                         sds->group_imb = sgs.group_imb;
                 }
 
-                update_sd_power_savings_stats(sg, sds, local_group, &sgs);
                 sg = sg->next;
         } while (sg != env->sd->groups);
 }
@@ -4276,12 +4122,6 @@ force_balance:
         return sds.busiest;
 
 out_balanced:
-        /*
-         * There is no obvious imbalance. But check if we can do some balancing
-         * to save power.
-         */
-        if (check_power_save_busiest_group(env, &sds))
-                return sds.busiest;
 ret:
         env->imbalance = 0;
         return NULL;
@@ -4359,28 +4199,6 @@ static int need_active_balance(struct lb_env *env)
                  */
                 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
                         return 1;
-
-                /*
-                 * The only task running in a non-idle cpu can be moved to this
-                 * cpu in an attempt to completely freeup the other CPU
-                 * package.
-                 *
-                 * The package power saving logic comes from
-                 * find_busiest_group(). If there are no imbalance, then
-                 * f_b_g() will return NULL. However when sched_mc={1,2} then
-                 * f_b_g() will select a group from which a running task may be
-                 * pulled to this cpu in order to make the other package idle.
-                 * If there is no opportunity to make a package idle and if
-                 * there are no imbalance, then f_b_g() will return NULL and no
-                 * action will be taken in load_balance_newidle().
-                 *
-                 * Under normal task pull operation due to imbalance, there
-                 * will be more than one task in the source run queue and
-                 * move_tasks() will succeed.  ld_moved will be true and this
-                 * active balance code will not be triggered.
-                 */
-                if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
-                        return 0;
         }
 
         return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4700,104 +4518,15 @@ static struct {
         unsigned long next_balance;     /* in jiffy units */
 } nohz ____cacheline_aligned;
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu:  The cpu whose lowest level of sched domain is to
- *        be returned.
- * @flag: The flag to check for the lowest sched_domain
- *        for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
-        struct sched_domain *sd;
-
-        for_each_domain(cpu, sd)
-                if (sd->flags & flag)
-                        break;
-
-        return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu:  The cpu whose domains we're iterating over.
- * @sd:   variable holding the value of the power_savings_sd
- *        for cpu.
- * @flag: The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
-        for (sd = lowest_flag_domain(cpu, flag); \
-                (sd && (sd->flags & flag)); sd = sd->parent)
-
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu: The cpu which is nominating a new idle_load_balancer.
- *
- * Returns: Returns the id of the idle load balancer if it exists,
- *          Else, returns >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try and avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpu's which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
+static inline int find_new_ilb(int call_cpu)
 {
         int ilb = cpumask_first(nohz.idle_cpus_mask);
-        struct sched_group *ilbg;
-        struct sched_domain *sd;
 
-        /*
-         * Have idle load balancer selection from semi-idle packages only
-         * when power-aware load balancing is enabled
-         */
-        if (!(sched_smt_power_savings || sched_mc_power_savings))
-                goto out_done;
-
-        /*
-         * Optimize for the case when we have no idle CPUs or only one
-         * idle CPU. Don't walk the sched_domain hierarchy in such cases
-         */
-        if (cpumask_weight(nohz.idle_cpus_mask) < 2)
-                goto out_done;
-
-        rcu_read_lock();
-        for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
-                ilbg = sd->groups;
-
-                do {
-                        if (ilbg->group_weight !=
-                                atomic_read(&ilbg->sgp->nr_busy_cpus)) {
-                                ilb = cpumask_first_and(nohz.idle_cpus_mask,
-                                                        sched_group_cpus(ilbg));
-                                goto unlock;
-                        }
-
-                        ilbg = ilbg->next;
-
-                } while (ilbg != sd->groups);
-        }
-unlock:
-        rcu_read_unlock();
-
-out_done:
         if (ilb < nr_cpu_ids && idle_cpu(ilb))
                 return ilb;
 
         return nr_cpu_ids;
 }
-#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
-        return nr_cpu_ids;
-}
-#endif
 
 /*
  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the