diff options
author | Gautham R Shenoy <ego@in.ibm.com> | 2009-04-14 00:55:30 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-04-14 05:49:19 -0400 |
commit | f711f6090a81cbd396b63de90f415d33f563af9b (patch) | |
tree | b2212ddcae5a88a1ca6a521a892003af1d5a35bc | |
parent | 002f128b473fb82f454654be5081b0919ee01ab2 (diff) |
sched: Nominate idle load balancer from a semi-idle package.
Currently the nomination of idle-load balancer is done by choosing the first
idle cpu in the nohz.cpu_mask. This may not be power-efficient, since
such an idle cpu could come from a completely idle core/package thereby
preventing the whole core/package from being in a low-power state.
For example, consider a quad-core dual-package system. The cpu numbering need
not be sequential and can be something like [0, 2, 4, 6] and [1, 3, 5, 7].
With sched_mc/smt_power_savings and the power-aware IRQ balance, we try to keep
as few packages/cores active as possible. But the current idle load balancer logic
goes against this by choosing the first_cpu in the nohz.cpu_mask and not
taking the system topology into consideration.
Improve the algorithm to nominate the idle load balancer from semi-idle
cores/packages, thereby increasing the probability of the cores/packages being
in deeper sleep states for a longer duration.
The algorithm is activated only when sched_mc/smt_power_savings != 0.
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090414045530.7645.12175.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | kernel/sched.c | 127 |
1 files changed, 118 insertions, 9 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 5724508c3b66..b0fefa300b40 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -4240,10 +4240,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
4240 | static struct { | 4240 | static struct { |
4241 | atomic_t load_balancer; | 4241 | atomic_t load_balancer; |
4242 | cpumask_var_t cpu_mask; | 4242 | cpumask_var_t cpu_mask; |
4243 | cpumask_var_t ilb_grp_nohz_mask; | ||
4243 | } nohz ____cacheline_aligned = { | 4244 | } nohz ____cacheline_aligned = { |
4244 | .load_balancer = ATOMIC_INIT(-1), | 4245 | .load_balancer = ATOMIC_INIT(-1), |
4245 | }; | 4246 | }; |
4246 | 4247 | ||
4248 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
4249 | /** | ||
4250 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4251 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4252 | * be returned. | ||
4253 | * @flag: The flag to check for the lowest sched_domain | ||
4254 | * for the given cpu. | ||
4255 | * | ||
4256 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4257 | */ | ||
4258 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4259 | { | ||
4260 | struct sched_domain *sd; | ||
4261 | |||
4262 | for_each_domain(cpu, sd) | ||
4263 | if (sd && (sd->flags & flag)) | ||
4264 | break; | ||
4265 | |||
4266 | return sd; | ||
4267 | } | ||
4268 | |||
4269 | /** | ||
4270 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4271 | * @cpu: The cpu whose domains we're iterating over. | ||
4272 | * @sd: variable holding the value of the power_savings_sd | ||
4273 | * for cpu. | ||
4274 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4275 | * | ||
4276 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
4277 | * set, starting from the lowest sched_domain to the highest. | ||
4278 | */ | ||
4279 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4280 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4281 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4282 | |||
4283 | /** | ||
4284 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
4285 | * @ilb_group: group to be checked for semi-idleness | ||
4286 | * | ||
4287 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
4288 | * | ||
4289 | * We define a sched_group to be semi idle if it has atleast one idle-CPU | ||
4290 | * and atleast one non-idle CPU. This helper function checks if the given | ||
4291 | * sched_group is semi-idle or not. | ||
4292 | */ | ||
4293 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
4294 | { | ||
4295 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | ||
4296 | sched_group_cpus(ilb_group)); | ||
4297 | |||
4298 | /* | ||
4299 | * A sched_group is semi-idle when it has atleast one busy cpu | ||
4300 | * and atleast one idle cpu. | ||
4301 | */ | ||
4302 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | ||
4303 | return 0; | ||
4304 | |||
4305 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | ||
4306 | return 0; | ||
4307 | |||
4308 | return 1; | ||
4309 | } | ||
4310 | /** | ||
4311 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4312 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4313 | * | ||
4314 | * Returns: Returns the id of the idle load balancer if it exists, | ||
4315 | * Else, returns >= nr_cpu_ids. | ||
4316 | * | ||
4317 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4318 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
4319 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4320 | * when there are other idle cpu's which are better suited for that job. | ||
4321 | */ | ||
4322 | static int find_new_ilb(int cpu) | ||
4323 | { | ||
4324 | struct sched_domain *sd; | ||
4325 | struct sched_group *ilb_group; | ||
4326 | |||
4327 | /* | ||
4328 | * Have idle load balancer selection from semi-idle packages only | ||
4329 | * when power-aware load balancing is enabled | ||
4330 | */ | ||
4331 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4332 | goto out_done; | ||
4333 | |||
4334 | /* | ||
4335 | * Optimize for the case when we have no idle CPUs or only one | ||
4336 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4337 | */ | ||
4338 | if (cpumask_weight(nohz.cpu_mask) < 2) | ||
4339 | goto out_done; | ||
4340 | |||
4341 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4342 | ilb_group = sd->groups; | ||
4343 | |||
4344 | do { | ||
4345 | if (is_semi_idle_group(ilb_group)) | ||
4346 | return cpumask_first(nohz.ilb_grp_nohz_mask); | ||
4347 | |||
4348 | ilb_group = ilb_group->next; | ||
4349 | |||
4350 | } while (ilb_group != sd->groups); | ||
4351 | } | ||
4352 | |||
4353 | out_done: | ||
4354 | return cpumask_first(nohz.cpu_mask); | ||
4355 | } | ||
4356 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4357 | static inline int find_new_ilb(int call_cpu) | ||
4358 | { | ||
4359 | return first_cpu(nohz.cpu_mask); | ||
4360 | } | ||
4361 | #endif | ||
4362 | |||
4247 | /* | 4363 | /* |
4248 | * This routine will try to nominate the ilb (idle load balancing) | 4364 | * This routine will try to nominate the ilb (idle load balancing) |
4249 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | 4365 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle |
@@ -4468,15 +4584,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) | |||
4468 | } | 4584 | } |
4469 | 4585 | ||
4470 | if (atomic_read(&nohz.load_balancer) == -1) { | 4586 | if (atomic_read(&nohz.load_balancer) == -1) { |
4471 | /* | 4587 | int ilb = find_new_ilb(cpu); |
4472 | * simple selection for now: Nominate the | ||
4473 | * first cpu in the nohz list to be the next | ||
4474 | * ilb owner. | ||
4475 | * | ||
4476 | * TBD: Traverse the sched domains and nominate | ||
4477 | * the nearest cpu in the nohz.cpu_mask. | ||
4478 | */ | ||
4479 | int ilb = cpumask_first(nohz.cpu_mask); | ||
4480 | 4588 | ||
4481 | if (ilb < nr_cpu_ids) | 4589 | if (ilb < nr_cpu_ids) |
4482 | resched_cpu(ilb); | 4590 | resched_cpu(ilb); |
@@ -9051,6 +9159,7 @@ void __init sched_init(void) | |||
9051 | #ifdef CONFIG_SMP | 9159 | #ifdef CONFIG_SMP |
9052 | #ifdef CONFIG_NO_HZ | 9160 | #ifdef CONFIG_NO_HZ |
9053 | alloc_bootmem_cpumask_var(&nohz.cpu_mask); | 9161 | alloc_bootmem_cpumask_var(&nohz.cpu_mask); |
9162 | alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); | ||
9054 | #endif | 9163 | #endif |
9055 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | 9164 | alloc_bootmem_cpumask_var(&cpu_isolated_map); |
9056 | #endif /* SMP */ | 9165 | #endif /* SMP */ |