author		Gautham R Shenoy <ego@in.ibm.com>	2009-04-14 00:55:30 -0400
committer	Ingo Molnar <mingo@elte.hu>		2009-04-14 05:49:19 -0400
commit		f711f6090a81cbd396b63de90f415d33f563af9b (patch)
tree		b2212ddcae5a88a1ca6a521a892003af1d5a35bc /kernel
parent		002f128b473fb82f454654be5081b0919ee01ab2 (diff)
sched: Nominate idle load balancer from a semi-idle package.
Currently the nomination of the idle load balancer is done by choosing the
first idle cpu in the nohz.cpu_mask. This may not be power-efficient, since
such an idle cpu could come from a completely idle core/package, thereby
preventing that whole core/package from entering a low-power state.

For example, consider a quad-core dual-package system. The cpu numbering need
not be sequential and can be something like [0, 2, 4, 6] and [1, 3, 5, 7].
With sched_mc/smt_power_savings and power-aware IRQ balancing, we try to keep
as few packages/cores active as possible. But the current idle load balancer
logic works against this by choosing the first cpu in the nohz.cpu_mask
without taking the system topology into consideration.

Improve the algorithm to nominate the idle load balancer from a semi-idle
core/package, thereby increasing the probability of the completely idle
cores/packages remaining in deeper sleep states for a longer duration.

The algorithm is activated only when sched_mc/smt_power_savings != 0.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090414045530.7645.12175.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
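To make the topology argument concrete, here is a stand-alone user-space
sketch (not part of the patch; the package_of/idle tables and the helpers
first_idle_cpu() and semi_idle_ilb() are invented purely for illustration)
contrasting the old first-idle-cpu nomination with a semi-idle-package
nomination on the [0, 2, 4, 6] / [1, 3, 5, 7] layout described above:

/* ilb_pick.c: hypothetical demo, not kernel code. */
#include <stdio.h>

#define NR_CPUS 8

/* Quad-core dual-package box: package 0 = {0,2,4,6}, package 1 = {1,3,5,7}. */
static const int package_of[NR_CPUS] = { 0, 1, 0, 1, 0, 1, 0, 1 };

/* 1 = tick stopped (idle), mirroring nohz.cpu_mask; cpus 0 and 2 are busy. */
static const int idle[NR_CPUS] = { 0, 1, 0, 1, 1, 1, 1, 1 };

/* Old policy: first idle cpu, blind to topology. */
static int first_idle_cpu(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (idle[cpu])
			return cpu;
	return NR_CPUS;
}

/* Does cpu share a package with at least one busy cpu? */
static int in_semi_idle_package(int cpu)
{
	for (int other = 0; other < NR_CPUS; other++)
		if (!idle[other] && package_of[other] == package_of[cpu])
			return 1;
	return 0;
}

/* New policy: prefer an idle cpu whose package is already partly busy. */
static int semi_idle_ilb(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (idle[cpu] && in_semi_idle_package(cpu))
			return cpu;
	return first_idle_cpu();	/* fall back to the old policy */
}

int main(void)
{
	printf("first-idle policy picks cpu %d\n", first_idle_cpu());	/* 1 */
	printf("semi-idle policy picks cpu %d\n", semi_idle_ilb());	/* 4 */
	return 0;
}

With cpus 0 and 2 busy, the first-idle policy nominates cpu 1 and keeps the
otherwise fully idle package 1 awake, while the semi-idle policy nominates
cpu 4 in the already-active package 0.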
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c	127
1 file changed, 118 insertions(+), 9 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 5724508c3b66..b0fefa300b40 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4240,10 +4240,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:	group to be checked for semi-idleness
+ *
+ * Returns:	1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi-idle if it has at least one idle CPU
+ * and at least one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+					sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has at least one busy cpu
+	 * and at least one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:	The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:	The id of the idle load balancer if it exists,
+ *		else a value >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpus which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return first_cpu(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4468,15 +4584,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 	}
 
 	if (atomic_read(&nohz.load_balancer) == -1) {
-		/*
-		 * simple selection for now: Nominate the
-		 * first cpu in the nohz list to be the next
-		 * ilb owner.
-		 *
-		 * TBD: Traverse the sched domains and nominate
-		 * the nearest cpu in the nohz.cpu_mask.
-		 */
-		int ilb = cpumask_first(nohz.cpu_mask);
+		int ilb = find_new_ilb(cpu);
 
 		if (ilb < nr_cpu_ids)
 			resched_cpu(ilb);
@@ -9051,6 +9159,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
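A side note on the do/while in find_new_ilb() above: a sched_domain's groups
are chained into a circular singly linked list, so the walk must start at
sd->groups and stop once it wraps back to it. The minimal stand-alone sketch
below (hypothetical struct and function names, not the kernel's types) shows
the same traversal pattern:

#include <stdio.h>

/* Hypothetical stand-in for struct sched_group: the kernel links a
 * domain's groups into a ring via the ->next pointer. */
struct group {
	int id;
	struct group *next;
};

static void walk_groups(struct group *head)
{
	struct group *g = head;

	/* Same shape as the loop in find_new_ilb(): visit each group
	 * exactly once, terminating when the walk wraps around. */
	do {
		printf("visiting group %d\n", g->id);
		g = g->next;
	} while (g != head);
}

int main(void)
{
	struct group c = { 2, NULL }, b = { 1, &c }, a = { 0, &b };

	c.next = &a;	/* close the ring */
	walk_groups(&a);
	return 0;
}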