author	Peter Zijlstra <peterz@infradead.org>	2012-01-09 05:28:35 -0500
committer	Ingo Molnar <mingo@kernel.org>	2012-05-17 07:48:56 -0400
commit	8e7fbcbc22c12414bcc9dfdd683637f58fb32759 (patch)
tree	a438021ddeadddd8f0745293aeb8c80dbe3c999c /kernel/sched
parent	fac536f7e4927f34d480dc066f3a578c743b8f0e (diff)
sched: Remove stale power aware scheduling remnants and dysfunctional knobs
It's been broken forever (i.e. it's not scheduling in a power aware fashion), as reported by Suresh and others sending patches, and nobody cares enough to fix it properly ... so remove it to make space free for something better.

There are various problems with the code as it stands today, first and foremost the user interface, which is bound to topology levels and has multiple values per level. This results in a state explosion which the administrator or distro needs to master and almost nobody does.

Furthermore, large configuration state spaces aren't good: the thing doesn't just work right, because it's either under so many impossible-to-meet constraints, or, even if there is an achievable state, workloads have to be aware of it precisely and can never meet it for dynamic workloads. So pushing this kind of decision to user-space was a bad idea even with a single knob - it's exponentially worse with knobs on every node of the topology.

There is a proposal to replace the user interface with a single 3-state knob:

  sched_balance_policy := { performance, power, auto }

where 'auto' would be the preferred default, looking at things like Battery/AC mode and possible cpufreq state or whatever the hw exposes to show us power use expectations - but there's been no progress on it in the past many months.

Aside from that, the actual implementation of the various knobs is known to be broken. There have been sporadic attempts at fixing things, but these always stop short of reaching a mergeable state.

Hence this wholesale removal, in the hope of spurring the people who care to come forward once again and work on a coherent replacement.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1326104915.2442.53.camel@twins
Signed-off-by: Ingo Molnar <mingo@kernel.org>
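[Editor's note] For context, a minimal user-space sketch of how the interface removed below was driven. The 0..2 value range follows from the MAX_POWERSAVINGS_BALANCE_LEVELS check in the deleted sched_power_savings_store(); the exact sysfs path is an assumption based on the attributes being created on the cpu subsystem's root device, and is shown for illustration only:

	/*
	 * Illustrative only -- not part of the patch. Pokes the
	 * sched_mc_power_savings knob this commit removes. The path under
	 * /sys/devices/system/cpu is an assumption; valid levels were 0..2,
	 * anything else returned -EINVAL from the (now removed) store handler.
	 */
	#include <stdio.h>

	int main(void)
	{
		const char *knob = "/sys/devices/system/cpu/sched_mc_power_savings";
		unsigned int level;
		FILE *f;

		f = fopen(knob, "r");
		if (!f) {
			perror(knob);	/* expected on kernels with this patch applied */
			return 1;
		}
		if (fscanf(f, "%u", &level) == 1)
			printf("current level: %u\n", level);
		fclose(f);

		f = fopen(knob, "w");
		if (f) {
			fprintf(f, "2\n");	/* POWERSAVINGS_BALANCE_WAKEUP in the old code */
			fclose(f);		/* old kernels rebuilt sched domains on write */
		}
		return 0;
	}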
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/core.c	94
-rw-r--r--	kernel/sched/fair.c	275
2 files changed, 2 insertions(+), 367 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bd314d7cd9f8..24ca677b5457 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5929,8 +5929,6 @@ static const struct cpumask *cpu_cpu_mask(int cpu)
 	return cpumask_of_node(cpu_to_node(cpu));
 }
 
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
@@ -6322,7 +6320,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
 					| 0*SD_WAKE_AFFINE
 					| 0*SD_PREFER_LOCAL
 					| 0*SD_SHARE_CPUPOWER
-					| 0*SD_POWERSAVINGS_BALANCE
 					| 0*SD_SHARE_PKG_RESOURCES
 					| 1*SD_SERIALIZE
 					| 0*SD_PREFER_SIBLING
@@ -6819,97 +6816,6 @@ match2:
 	mutex_unlock(&sched_domains_mutex);
 }
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
-	get_online_cpus();
-
-	/* Destroy domains first to force the rebuild */
-	partition_sched_domains(0, NULL, NULL);
-
-	rebuild_sched_domains();
-	put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
-	unsigned int level = 0;
-
-	if (sscanf(buf, "%u", &level) != 1)
-		return -EINVAL;
-
-	/*
-	 * level is always be positive so don't check for
-	 * level < POWERSAVINGS_BALANCE_NONE which is 0
-	 * What happens on 0 or 1 byte write,
-	 * need to check for count as well?
-	 */
-
-	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
-		return -EINVAL;
-
-	if (smt)
-		sched_smt_power_savings = level;
-	else
-		sched_mc_power_savings = level;
-
-	reinit_sched_domains();
-
-	return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
-					   struct device_attribute *attr,
-					   char *buf)
-{
-	return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
-					    struct device_attribute *attr,
-					    const char *buf, size_t count)
-{
-	return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
-		   sched_mc_power_savings_show,
-		   sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
-					    struct device_attribute *attr,
-					    char *buf)
-{
-	return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
-					    struct device_attribute *attr,
-					    const char *buf, size_t count)
-{
-	return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
-		   sched_smt_power_savings_show,
-		   sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
-	int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
-	if (smt_capable())
-		err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
-	if (!err && mc_capable())
-		err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
-	return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
 /*
  * Update cpusets according to cpu_active mask. If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b42f4487329..940e6d17cf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 		 * If power savings logic is enabled for a domain, see if we
 		 * are not overloaded, if so, don't balance wider.
 		 */
-		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+		if (tmp->flags & (SD_PREFER_LOCAL)) {
 			unsigned long power = 0;
 			unsigned long nr_running = 0;
 			unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 
 			capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
 
-			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
-				nr_running /= 2;
-
 			if (nr_running < capacity)
 				want_sd = 0;
 		}
@@ -3435,14 +3432,6 @@ struct sd_lb_stats {
 	unsigned int busiest_group_weight;
 
 	int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	int power_savings_balance; /* Is powersave balance needed for this sd */
-	struct sched_group *group_min; /* Least loaded group in sd */
-	struct sched_group *group_leader; /* Group which relieves group_min */
-	unsigned long min_load_per_task; /* load_per_task in group_min */
-	unsigned long leader_nr_running; /* Nr running of group_leader */
-	unsigned long min_nr_running; /* Nr running of group_min */
-#endif
 };
 
 /*
@@ -3486,147 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 	return load_idx;
 }
 
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-	struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-	/*
-	 * Busy processors will not participate in power savings
-	 * balance.
-	 */
-	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-		sds->power_savings_balance = 0;
-	else {
-		sds->power_savings_balance = 1;
-		sds->min_nr_running = ULONG_MAX;
-		sds->leader_nr_running = 0;
-	}
-}
-
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- * load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-
-	if (!sds->power_savings_balance)
-		return;
-
-	/*
-	 * If the local group is idle or completely loaded
-	 * no need to do power savings balance at this domain
-	 */
-	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
-				!sds->this_nr_running))
-		sds->power_savings_balance = 0;
-
-	/*
-	 * If a group is already running at full capacity or idle,
-	 * don't include that group in power savings calculations
-	 */
-	if (!sds->power_savings_balance ||
-		sgs->sum_nr_running >= sgs->group_capacity ||
-		!sgs->sum_nr_running)
-		return;
-
-	/*
-	 * Calculate the group which has the least non-idle load.
-	 * This is the group from where we need to pick up the load
-	 * for saving power
-	 */
-	if ((sgs->sum_nr_running < sds->min_nr_running) ||
-	    (sgs->sum_nr_running == sds->min_nr_running &&
-	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
-		sds->group_min = group;
-		sds->min_nr_running = sgs->sum_nr_running;
-		sds->min_load_per_task = sgs->sum_weighted_load /
-						sgs->sum_nr_running;
-	}
-
-	/*
-	 * Calculate the group which is almost near its
-	 * capacity but still has some space to pick up some load
-	 * from other group and save more power
-	 */
-	if (sgs->sum_nr_running + 1 > sgs->group_capacity)
-		return;
-
-	if (sgs->sum_nr_running > sds->leader_nr_running ||
-	    (sgs->sum_nr_running == sds->leader_nr_running &&
-	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
-		sds->group_leader = group;
-		sds->leader_nr_running = sgs->sum_nr_running;
-	}
-}
-
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @env: load balance environment
- * @sds: Variable containing the statistics of the sched_domain
- *	under consideration.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that it's CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline
-int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
-{
-	if (!sds->power_savings_balance)
-		return 0;
-
-	if (sds->this != sds->group_leader ||
-			sds->group_leader == sds->group_min)
-		return 0;
-
-	env->imbalance = sds->min_load_per_task;
-	sds->busiest = sds->group_min;
-
-	return 1;
-
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-	struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-	return;
-}
-
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-	return;
-}
-
-static inline
-int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
-{
-	return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
-
 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
 {
 	return SCHED_POWER_SCALE;
@@ -3932,7 +3780,6 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
 
-	init_sd_power_savings_stats(env->sd, sds, env->idle);
 	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 	do {
@@ -3981,7 +3828,6 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 			sds->group_imb = sgs.group_imb;
 		}
 
-		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 }
@@ -4276,12 +4122,6 @@ force_balance:
 	return sds.busiest;
 
 out_balanced:
-	/*
-	 * There is no obvious imbalance. But check if we can do some balancing
-	 * to save power.
-	 */
-	if (check_power_save_busiest_group(env, &sds))
-		return sds.busiest;
 ret:
 	env->imbalance = 0;
 	return NULL;
@@ -4359,28 +4199,6 @@ static int need_active_balance(struct lb_env *env)
 		 */
 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
 			return 1;
-
-		/*
-		 * The only task running in a non-idle cpu can be moved to this
-		 * cpu in an attempt to completely freeup the other CPU
-		 * package.
-		 *
-		 * The package power saving logic comes from
-		 * find_busiest_group(). If there are no imbalance, then
-		 * f_b_g() will return NULL. However when sched_mc={1,2} then
-		 * f_b_g() will select a group from which a running task may be
-		 * pulled to this cpu in order to make the other package idle.
-		 * If there is no opportunity to make a package idle and if
-		 * there are no imbalance, then f_b_g() will return NULL and no
-		 * action will be taken in load_balance_newidle().
-		 *
-		 * Under normal task pull operation due to imbalance, there
-		 * will be more than one task in the source run queue and
-		 * move_tasks() will succeed.  ld_moved will be true and this
-		 * active balance code will not be triggered.
-		 */
-		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
-			return 0;
 	}
 
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4700,104 +4518,15 @@ static struct {
 	unsigned long next_balance;	/* in jiffy units */
 } nohz ____cacheline_aligned;
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu:	The cpu whose lowest level of sched domain is to
- *		be returned.
- * @flag:	The flag to check for the lowest sched_domain
- *		for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
-	struct sched_domain *sd;
-
-	for_each_domain(cpu, sd)
-		if (sd->flags & flag)
-			break;
-
-	return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu:	The cpu whose domains we're iterating over.
- * @sd:		variable holding the value of the power_savings_sd
- *		for cpu.
- * @flag:	The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
-	for (sd = lowest_flag_domain(cpu, flag); \
-		(sd && (sd->flags & flag)); sd = sd->parent)
-
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu:	The cpu which is nominating a new idle_load_balancer.
- *
- * Returns:	Returns the id of the idle load balancer if it exists,
- *		Else, returns >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try and avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpu's which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
+static inline int find_new_ilb(int call_cpu)
 {
 	int ilb = cpumask_first(nohz.idle_cpus_mask);
-	struct sched_group *ilbg;
-	struct sched_domain *sd;
 
-	/*
-	 * Have idle load balancer selection from semi-idle packages only
-	 * when power-aware load balancing is enabled
-	 */
-	if (!(sched_smt_power_savings || sched_mc_power_savings))
-		goto out_done;
-
-	/*
-	 * Optimize for the case when we have no idle CPUs or only one
-	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
-	 */
-	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
-		goto out_done;
-
-	rcu_read_lock();
-	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
-		ilbg = sd->groups;
-
-		do {
-			if (ilbg->group_weight !=
-				atomic_read(&ilbg->sgp->nr_busy_cpus)) {
-				ilb = cpumask_first_and(nohz.idle_cpus_mask,
-							sched_group_cpus(ilbg));
-				goto unlock;
-			}
-
-			ilbg = ilbg->next;
-
-		} while (ilbg != sd->groups);
-	}
-unlock:
-	rcu_read_unlock();
-
-out_done:
 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
 		return ilb;
 
 	return nr_cpu_ids;
 }
-#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
-	return nr_cpu_ids;
-}
-#endif
 
 /*
  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the