path: root/kernel/sched.c
author     Venkatesh Pallipadi <venki@google.com>   2010-05-21 20:09:41 -0400
committer  Ingo Molnar <mingo@elte.hu>              2010-06-09 04:34:52 -0400
commit     83cd4fe27ad8446619b2e030b171b858501de87d (patch)
tree       81c7d26f4f00139ae355017239371d91cc4b2aef /kernel/sched.c
parent     fdf3e95d3916f18bf8703fb065499fdbc4dfe34c (diff)
sched: Change nohz idle load balancing logic to push model
In the new push model, all idle CPUs indeed go into nohz mode. There is still the concept of an idle load balancer (performing the load balancing on behalf of all the idle CPUs in the system). A busy CPU kicks the nohz balancer when any of the nohz CPUs need idle load balancing. The kicked CPU then does the idle load balancing on behalf of all idle CPUs instead of the normal idle balance.

This addresses the following two problems with the current nohz ilb logic:

* The idle load balancer continued to have periodic ticks during idle and woke up frequently, even though it did not have any rebalancing to do on behalf of any of the idle CPUs.

* On x86, with CPUs whose APIC timer stops during idle, this periodic wakeup can result in an additional periodic interrupt on the CPU doing the timer broadcast.

Also, we currently migrate unpinned timers from an idle CPU to the CPU doing idle load balancing (when all the CPUs in the system are idle, there is no idle load balancing CPU and timers get added to the same idle CPU where the request was made; so the existing optimization works only on a semi-idle system). In a semi-idle system we no longer have periodic ticks on the idle load balancer CPU, so using that CPU will add more delay to the timers than intended (as that CPU's timer base may not be up to date wrt jiffies etc). This was causing mysterious slowdowns during boot etc.

For now, in the semi-idle case, use the nearest busy CPU for migrating timers from an idle CPU. This is good for power savings anyway.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
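To make the kick path above concrete, the sketch below shows how a busy CPU's periodic tick could select a nohz-idle CPU and kick it to balance on behalf of all idle CPUs. This is an illustrative sketch only, not code from the patch: the helper name trigger_nohz_idle_balance() and the exact trigger conditions are assumptions, while nohz.idle_cpus_mask and rq->nohz_balance_kick are fields that this patch does introduce (see the diff below).

/*
 * Illustrative sketch, not the patch's actual code: a busy CPU's
 * periodic tick asks one nohz-idle CPU to run the idle load balance
 * for every CPU in nohz.idle_cpus_mask.
 */
static void trigger_nohz_idle_balance(int this_cpu)
{
	int ilb_cpu;

	/* No CPU is in nohz idle mode: nothing to balance remotely. */
	if (cpumask_empty(nohz.idle_cpus_mask))
		return;

	/* Only a busy CPU (more than one runnable task) issues a kick. */
	if (cpu_rq(this_cpu)->nr_running <= 1)
		return;

	/* Pick one nohz-idle CPU to act as the idle load balancer. */
	ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
	if (ilb_cpu >= nr_cpu_ids)
		return;

	/*
	 * Flag the kick and wake the target; when it runs, it performs
	 * idle load balancing on behalf of all nohz-idle CPUs instead
	 * of only its own normal idle balance.
	 */
	cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
	resched_cpu(ilb_cpu);
}

In the actual patch the kick is delivered through a smp_call_function-style softirq trigger (see the init_sched_softirq_csd()/remote_sched_softirq_cb setup added to sched_init() below), and the nohz.first_pick_cpu/second_pick_cpu atomics limit which busy CPUs may issue kicks.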
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  34
1 file changed, 31 insertions(+), 3 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index a757f6b11cbd..132950b33dde 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -460,7 +460,7 @@ struct rq {
 	unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ
 	u64 nohz_stamp;
-	unsigned char in_nohz_recently;
+	unsigned char nohz_balance_kick;
 #endif
 	unsigned int skip_clock_update;
 
@@ -1195,6 +1195,27 @@ static void resched_cpu(int cpu)
 
 #ifdef CONFIG_NO_HZ
 /*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int cpu = smp_processor_id();
+	int i;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd))
+			if (!idle_cpu(i))
+				return i;
+	}
+	return cpu;
+}
+/*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
  * which is scheduled to wake up that CPU. In case of a completely
@@ -7791,6 +7812,10 @@ void __init sched_init(void)
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+		rq->nohz_balance_kick = 0;
+		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
@@ -7835,8 +7860,11 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+	atomic_set(&nohz.load_balancer, nr_cpu_ids);
+	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)
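The new get_nohz_timer_target() helper is consumed by the timer migration path outside kernel/sched.c, and hence outside this diffstat. The snippet below is a paraphrased sketch (from memory, not part of this diff) of how a caller such as __mod_timer() in kernel/timer.c would use it; the surrounding variable names follow the timer code of that era.

	/*
	 * Sketch of a caller in kernel/timer.c (illustrative, not in this
	 * diff): when timer migration is enabled and the requesting CPU is
	 * idle, place the unpinned timer on the nearest busy CPU instead of
	 * on an idle load balancer CPU.
	 */
	cpu = smp_processor_id();
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
		cpu = get_nohz_timer_target();
#endif
	new_base = per_cpu(tvec_bases, cpu);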