| author | Venkatesh Pallipadi <venki@google.com> | 2010-05-21 20:09:41 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2010-06-09 04:34:52 -0400 |
| commit | 83cd4fe27ad8446619b2e030b171b858501de87d | |
| tree | 81c7d26f4f00139ae355017239371d91cc4b2aef /kernel/sched.c | |
| parent | fdf3e95d3916f18bf8703fb065499fdbc4dfe34c | |
sched: Change nohz idle load balancing logic to push model
In the new push model, all idle CPUs indeed go into nohz mode. There is
still the concept of an idle load balancer (performing the load balancing
on behalf of all the idle CPUs in the system). A busy CPU kicks the nohz
balancer when any of the nohz CPUs need idle load balancing.
The kicked CPU then does the idle load balancing on behalf of all idle
CPUs, instead of the normal idle balance.
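The kick machinery itself lives in kernel/sched_fair.c and is not part of the kernel/sched.c hunks shown below. The following is a condensed sketch of the idea: the nohz_balance_kick flag, the remote_sched_softirq_cb csd and nohz.idle_cpus_mask are introduced by this patch (see the diff below), but treat the exact control flow here as illustrative rather than a verbatim copy of the patch:

```c
/*
 * Condensed sketch of the kick path (the full version is in
 * kernel/sched_fair.c in this patch). A busy CPU calls this from
 * its tick when some nohz CPU needs idle load balancing.
 */
static void nohz_balancer_kick(int cpu)
{
        int ilb_cpu;

        nohz.next_balance++;

        /* Prefer the designated idle load balancer ... */
        ilb_cpu = atomic_read(&nohz.load_balancer);
        if (ilb_cpu >= nr_cpu_ids) {
                /* ... else fall back to any nohz-idle CPU. */
                ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
                if (ilb_cpu >= nr_cpu_ids)
                        return;         /* nobody to kick */
        }

        if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
                cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
                /*
                 * IPI the target out of nohz; its callback raises
                 * SCHED_SOFTIRQ, which then runs the idle load
                 * balance on behalf of all idle CPUs.
                 */
                __smp_call_function_single(ilb_cpu,
                        &per_cpu(remote_sched_softirq_cb, ilb_cpu), 0);
        }
}
```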
The push model addresses the following two problems with the current nohz ilb logic:
* The idle load balancer continued to take periodic ticks during idle and
woke up frequently, even when it had no rebalancing to do on behalf of
any of the idle CPUs.
* On x86, and on CPUs whose APIC timer stops in idle, this periodic
wakeup results in an additional periodic interrupt on the CPU doing the
timer broadcast.
Also, we currently migrate unpinned timers from an idle CPU to the CPU
doing idle load balancing. (When all the CPUs in the system are idle,
there is no idle-load-balancing CPU and timers get added to the same idle
CPU where the request was made, so the existing optimization only works
on a semi-idle system.)
Moreover, in a semi-idle system we no longer have periodic ticks on the
idle load balancer CPU. Using that CPU will add more delay to the timers
than intended (as that CPU's timer base may not be up to date w.r.t.
jiffies etc.); this was causing mysterious slowdowns during boot etc.
For now, in the semi-idle case, use the nearest busy CPU for migrating
timers from an idle CPU. This is good for power savings anyway.
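The consumer of the new get_nohz_timer_target() helper is the timer code, which falls outside this kernel/sched.c-only view. A sketch of what the caller side plausibly looks like in __mod_timer() (kernel/timer.c), assuming the pre-existing sysctl_timer_migration gate:

```c
        /*
         * Sketch of the caller side in __mod_timer() (kernel/timer.c,
         * not shown in this diffstat-limited view): an unpinned timer
         * armed from an idle CPU is queued on the nearest busy CPU
         * instead of the (now tickless) idle load balancer.
         */
        cpu = smp_processor_id();
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
        if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
                cpu = get_nohz_timer_target();
#endif
        new_base = per_cpu(tvec_bases, cpu);
```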
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')

| -rw-r--r-- | kernel/sched.c | 34 |

1 file changed, 31 insertions(+), 3 deletions(-)
```diff
diff --git a/kernel/sched.c b/kernel/sched.c
index a757f6b11cbd..132950b33dde 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -460,7 +460,7 @@ struct rq {
         unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ
         u64 nohz_stamp;
-        unsigned char in_nohz_recently;
+        unsigned char nohz_balance_kick;
 #endif
         unsigned int skip_clock_update;
 
@@ -1195,6 +1195,27 @@ static void resched_cpu(int cpu)
 
 #ifdef CONFIG_NO_HZ
 /*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+        int cpu = smp_processor_id();
+        int i;
+        struct sched_domain *sd;
+
+        for_each_domain(cpu, sd) {
+                for_each_cpu(i, sched_domain_span(sd))
+                        if (!idle_cpu(i))
+                                return i;
+        }
+        return cpu;
+}
+/*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
  * which is scheduled to wake up that CPU. In case of a completely
@@ -7791,6 +7812,10 @@ void __init sched_init(void)
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
                 rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+                rq->nohz_balance_kick = 0;
+                init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
 #endif
                 init_rq_hrtick(rq);
                 atomic_set(&rq->nr_iowait, 0);
@@ -7835,8 +7860,11 @@ void __init sched_init(void)
         zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-        zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-        alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+        zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+        alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+        atomic_set(&nohz.load_balancer, nr_cpu_ids);
+        atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+        atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
         /* May be allocated at isolcpus cmdline parse time */
         if (cpu_isolated_map == NULL)
```
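A note on the three atomic_set() calls above: nr_cpu_ids is one past the largest valid CPU number, so it doubles as a "no CPU" sentinel. That lets busy CPUs elect themselves as the first/second pick with a lock-free cmpxchg rather than a lock. A condensed, illustrative sketch of how the tick-time decision can use these fields (the full nohz_kick_needed() logic in kernel/sched_fair.c handles more cases):

```c
/*
 * Condensed, illustrative sketch: a busy CPU decides at tick time
 * whether to kick the nohz balancer. nr_cpu_ids doubles as the
 * "no CPU has claimed this slot yet" sentinel.
 */
static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
        if (time_before(jiffies, nohz.next_balance) || !rq->nr_running)
                return 0;

        /* The first busy CPU in the system claims first_pick_cpu; it
         * only needs a kick when it has more than one runnable task. */
        if (atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu) ==
                                                        nr_cpu_ids)
                return rq->nr_running > 1;

        /* A second, different busy CPU claims second_pick_cpu: we are
         * past the "one busy CPU" case, so idle balancing is needed. */
        if (atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu) ==
                                                        nr_cpu_ids)
                return 1;

        return 0;
}
```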