path: root/kernel
author    Venkatesh Pallipadi <venki@google.com>    2010-05-21 20:09:41 -0400
committer Ingo Molnar <mingo@elte.hu>               2010-06-09 04:34:52 -0400
commit    83cd4fe27ad8446619b2e030b171b858501de87d (patch)
tree      81c7d26f4f00139ae355017239371d91cc4b2aef /kernel
parent    fdf3e95d3916f18bf8703fb065499fdbc4dfe34c (diff)
sched: Change nohz idle load balancing logic to push model
In the new push model, all idle CPUs indeed go into nohz mode. There is
still the concept of an idle load balancer (performing the load balancing
on behalf of all the idle CPUs in the system). A busy CPU kicks the nohz
balancer when any of the nohz CPUs need idle load balancing. The kickee
CPU does the idle load balancing on behalf of all idle CPUs instead of
the normal idle balance.

This addresses the below two problems with the current nohz ilb logic:

* The idle load balancer continued to have periodic ticks during idle and
  woke up frequently, even though it did not have any rebalancing to do on
  behalf of any of the idle CPUs.
* On x86 and CPUs that have APIC timer stoppage on idle CPUs, this
  periodic wakeup can result in a periodic additional interrupt on a CPU
  doing the timer broadcast.

Also, currently we are migrating the unpinned timers from an idle CPU to
the CPU doing idle load balancing (when all the CPUs in the system are
idle, there is no idle load balancing CPU and timers get added to the same
idle CPU where the request was made, so the existing optimization works
only on a semi-idle system).

In a semi-idle system, we no longer have periodic ticks on the idle load
balancer CPU. Using that CPU will add more delays to the timers than
intended (as that CPU's timer base may not be up to date wrt jiffies etc).
This was causing mysterious slowdowns during boot etc.

For now, in the semi-idle case, use the nearest busy CPU for migrating
timers from an idle CPU. This is good for power savings anyway.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
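[Editor's note] A minimal user-space sketch of the two-pick kick heuristic described above, for illustration only. Names such as NR_CPU_IDS and kick_needed() are hypothetical stand-ins; the real code is nohz_kick_needed()/nohz_balancer_kick() in the kernel/sched_fair.c hunks below, and the sketch omits the nohz.next_balance throttling and the second_pick reset.

/*
 * User-space model of the two-pick kick heuristic (not kernel code).
 * NR_CPU_IDS doubles as the "no CPU" sentinel, as nr_cpu_ids does in the patch.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPU_IDS 8			/* hypothetical CPU count */

static atomic_int first_pick_cpu  = NR_CPU_IDS;
static atomic_int second_pick_cpu = NR_CPU_IDS;

/* Should busy CPU 'cpu', currently running 'nr_running' tasks, kick the ilb? */
static int kick_needed(int cpu, int nr_running)
{
	int expected;

	if (!nr_running)
		return 0;

	/* First busy CPU claims first_pick; it kicks only when overloaded. */
	expected = NR_CPU_IDS;
	if (atomic_compare_exchange_strong(&first_pick_cpu, &expected, cpu) ||
	    expected == cpu)
		return nr_running > 1;

	/* A different, second busy CPU claims second_pick; any load kicks. */
	expected = NR_CPU_IDS;
	if (atomic_compare_exchange_strong(&second_pick_cpu, &expected, cpu) ||
	    expected == cpu)
		return 1;

	return 0;
}

int main(void)
{
	printf("cpu0, 1 task : kick=%d\n", kick_needed(0, 1));	/* 0: one task, no ilb work */
	printf("cpu0, 2 tasks: kick=%d\n", kick_needed(0, 2));	/* 1: first pick overloaded */
	printf("cpu1, 1 task : kick=%d\n", kick_needed(1, 1));	/* 1: second busy CPU */
	return 0;
}

The point of the two picks is that with a single running task in the whole system no kick ever happens, while a second busy CPU may kick as soon as it has any load so that SMT/core siblings can be spread out by active balancing.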
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/hrtimer.c          |   8
-rw-r--r--  kernel/sched.c            |  34
-rw-r--r--  kernel/sched_fair.c       | 329
-rw-r--r--  kernel/time/tick-sched.c  |   8
-rw-r--r--  kernel/timer.c            |   8
5 files changed, 234 insertions(+), 153 deletions(-)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..e934339fbbef 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 static int hrtimer_get_target(int this_cpu, int pinned)
 {
 #ifdef CONFIG_NO_HZ
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
-		int preferred_cpu = get_nohz_load_balancer();
-
-		if (preferred_cpu >= 0)
-			return preferred_cpu;
-	}
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+		return get_nohz_timer_target();
 #endif
 	return this_cpu;
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index a757f6b11cbd..132950b33dde 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -460,7 +460,7 @@ struct rq {
 	unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ
 	u64 nohz_stamp;
-	unsigned char in_nohz_recently;
+	unsigned char nohz_balance_kick;
 #endif
 	unsigned int skip_clock_update;
 
@@ -1195,6 +1195,27 @@ static void resched_cpu(int cpu)
 
 #ifdef CONFIG_NO_HZ
 /*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int cpu = smp_processor_id();
+	int i;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd))
+			if (!idle_cpu(i))
+				return i;
+	}
+	return cpu;
+}
+/*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
  * which is scheduled to wake up that CPU. In case of a completely
@@ -7791,6 +7812,10 @@ void __init sched_init(void)
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+		rq->nohz_balance_kick = 0;
+		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
@@ -7835,8 +7860,11 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+	atomic_set(&nohz.load_balancer, nr_cpu_ids);
+	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 22b8b4f2b616..6ee2e0af665b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3091,13 +3091,40 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+	raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+	csd->func = trigger_sched_softirq;
+	csd->info = NULL;
+	csd->flags = 0;
+	csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ *   entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ *   it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ *   needed, they will kick the idle load balancer, which then does idle
+ *   load balancing for all the idle CPUs.
+ */
 static struct {
 	atomic_t load_balancer;
-	cpumask_var_t cpu_mask;
-	cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
-	.load_balancer = ATOMIC_INIT(-1),
-};
+	atomic_t first_pick_cpu;
+	atomic_t second_pick_cpu;
+	cpumask_var_t idle_cpus_mask;
+	cpumask_var_t grp_idle_mask;
+	unsigned long next_balance;	/* in jiffy units */
+} nohz ____cacheline_aligned;
 
 int get_nohz_load_balancer(void)
 {
@@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  */
 static inline int is_semi_idle_group(struct sched_group *ilb_group)
 {
-	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+	cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
 					sched_group_cpus(ilb_group));
 
 	/*
 	 * A sched_group is semi-idle when it has atleast one busy cpu
 	 * and atleast one idle cpu.
 	 */
-	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+	if (cpumask_empty(nohz.grp_idle_mask))
 		return 0;
 
-	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+	if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
 		return 0;
 
 	return 1;
@@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu)
 	 * Optimize for the case when we have no idle CPUs or only one
 	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
 	 */
-	if (cpumask_weight(nohz.cpu_mask) < 2)
+	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
 		goto out_done;
 
 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu)
 
 	do {
 		if (is_semi_idle_group(ilb_group))
-			return cpumask_first(nohz.ilb_grp_nohz_mask);
+			return cpumask_first(nohz.grp_idle_mask);
 
 		ilb_group = ilb_group->next;
 
@@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu)
 	}
 
 out_done:
-	return cpumask_first(nohz.cpu_mask);
+	return nr_cpu_ids;
 }
 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
 {
-	return cpumask_first(nohz.cpu_mask);
+	return nr_cpu_ids;
 }
 #endif
 
 /*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+	int ilb_cpu;
+
+	nohz.next_balance++;
+
+	ilb_cpu = get_nohz_load_balancer();
+
+	if (ilb_cpu >= nr_cpu_ids) {
+		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+		if (ilb_cpu >= nr_cpu_ids)
+			return;
+	}
+
+	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+		struct call_single_data *cp;
+
+		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+		cp = &per_cpu(remote_sched_softirq_cb, cpu);
+		__smp_call_function_single(ilb_cpu, cp, 0);
+	}
+	return;
+}
+
+/*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
  *
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
  *
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
  */
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
 {
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		cpu_rq(cpu)->in_nohz_recently = 1;
-
 		if (!cpu_active(cpu)) {
 			if (atomic_read(&nohz.load_balancer) != cpu)
-				return 0;
+				return;
 
 			/*
 			 * If we are going offline and still the leader,
 			 * give up!
 			 */
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+					   nr_cpu_ids) != cpu)
 				BUG();
 
-			return 0;
+			return;
 		}
 
-		cpumask_set_cpu(cpu, nohz.cpu_mask);
+		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 
-		/* time for ilb owner also to sleep */
-		if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
-			if (atomic_read(&nohz.load_balancer) == cpu)
-				atomic_set(&nohz.load_balancer, -1);
-			return 0;
-		}
+		if (atomic_read(&nohz.first_pick_cpu) == cpu)
+			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+		if (atomic_read(&nohz.second_pick_cpu) == cpu)
+			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
 
-		if (atomic_read(&nohz.load_balancer) == -1) {
-			/* make me the ilb owner */
-			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
-				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
 			int new_ilb;
 
-			if (!(sched_smt_power_savings ||
-						sched_mc_power_savings))
-				return 1;
+			/* make me the ilb owner */
+			if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+					   cpu) != nr_cpu_ids)
+				return;
+
 			/*
 			 * Check to see if there is a more power-efficient
 			 * ilb.
 			 */
 			new_ilb = find_new_ilb(cpu);
 			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-				atomic_set(&nohz.load_balancer, -1);
+				atomic_set(&nohz.load_balancer, nr_cpu_ids);
 				resched_cpu(new_ilb);
-				return 0;
+				return;
 			}
-			return 1;
+			return;
 		}
 	} else {
-		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
-			return 0;
+		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+			return;
 
-		cpumask_clear_cpu(cpu, nohz.cpu_mask);
+		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 
 		if (atomic_read(&nohz.load_balancer) == cpu)
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+					   nr_cpu_ids) != cpu)
 				BUG();
 	}
-	return 0;
+	return;
 }
 #endif
3307 3352
@@ -3383,11 +3428,101 @@ out:
 	rq->next_balance = next_balance;
 }
 
+#ifdef CONFIG_NO_HZ
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
  */
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+	struct rq *this_rq = cpu_rq(this_cpu);
+	struct rq *rq;
+	int balance_cpu;
+
+	if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+		return;
+
+	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+		if (balance_cpu == this_cpu)
+			continue;
+
+		/*
+		 * If this cpu gets work to do, stop the load balancing
+		 * work being done for other cpus. Next load
+		 * balancing owner will pick it up.
+		 */
+		if (need_resched()) {
+			this_rq->nohz_balance_kick = 0;
+			break;
+		}
+
+		raw_spin_lock_irq(&this_rq->lock);
+		update_cpu_load(this_rq);
+		raw_spin_unlock_irq(&this_rq->lock);
+
+		rebalance_domains(balance_cpu, CPU_IDLE);
+
+		rq = cpu_rq(balance_cpu);
+		if (time_after(this_rq->next_balance, rq->next_balance))
+			this_rq->next_balance = rq->next_balance;
+	}
+	nohz.next_balance = this_rq->next_balance;
+	this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ *   idle load balancer when it has more than one process active. This
+ *   eliminates the need for idle load balancing altogether when we have
+ *   only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ *   to run for active_load_balance to happen (i.e., two busy CPUs are
+ *   SMT or core siblings and can run better if they move to different
+ *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ *   which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+	unsigned long now = jiffies;
+	int ret;
+	int first_pick_cpu, second_pick_cpu;
+
+	if (time_before(now, nohz.next_balance))
+		return 0;
+
+	if (!rq->nr_running)
+		return 0;
+
+	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+	second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+	if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+	    second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+		return 0;
+
+	ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+	if (ret == nr_cpu_ids || ret == cpu) {
+		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+		if (rq->nr_running > 1)
+			return 1;
+	} else {
+		ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+		if (ret == nr_cpu_ids || ret == cpu) {
+			if (rq->nr_running)
+				return 1;
+		}
+	}
+	return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
 static void run_rebalance_domains(struct softirq_action *h)
 {
 	int this_cpu = smp_processor_id();
@@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h)
 
 	rebalance_domains(this_cpu, idle);
 
-#ifdef CONFIG_NO_HZ
 	/*
-	 * If this cpu is the owner for idle load balancing, then do the
+	 * If this cpu has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle cpus whose ticks are
 	 * stopped.
 	 */
-	if (this_rq->idle_at_tick &&
-	    atomic_read(&nohz.load_balancer) == this_cpu) {
-		struct rq *rq;
-		int balance_cpu;
-
-		for_each_cpu(balance_cpu, nohz.cpu_mask) {
-			if (balance_cpu == this_cpu)
-				continue;
-
-			/*
-			 * If this cpu gets work to do, stop the load balancing
-			 * work being done for other cpus. Next load
-			 * balancing owner will pick it up.
-			 */
-			if (need_resched())
-				break;
-
-			rq = cpu_rq(balance_cpu);
-			raw_spin_lock_irq(&rq->lock);
-			update_cpu_load(rq);
-			raw_spin_unlock_irq(&rq->lock);
-			rebalance_domains(balance_cpu, CPU_IDLE);
-
-			if (time_after(this_rq->next_balance, rq->next_balance))
-				this_rq->next_balance = rq->next_balance;
-		}
-	}
-#endif
+	nohz_idle_balance(this_cpu, idle);
 }
 
 static inline int on_null_domain(int cpu)
@@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu)
 
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
  */
 static inline void trigger_load_balance(struct rq *rq, int cpu)
 {
-#ifdef CONFIG_NO_HZ
-	/*
-	 * If we were in the nohz mode recently and busy at the current
-	 * scheduler tick, then check if we need to nominate new idle
-	 * load balancer.
-	 */
-	if (rq->in_nohz_recently && !rq->idle_at_tick) {
-		rq->in_nohz_recently = 0;
-
-		if (atomic_read(&nohz.load_balancer) == cpu) {
-			cpumask_clear_cpu(cpu, nohz.cpu_mask);
-			atomic_set(&nohz.load_balancer, -1);
-		}
-
-		if (atomic_read(&nohz.load_balancer) == -1) {
-			int ilb = find_new_ilb(cpu);
-
-			if (ilb < nr_cpu_ids)
-				resched_cpu(ilb);
-		}
-	}
-
-	/*
-	 * If this cpu is idle and doing idle load balancing for all the
-	 * cpus with ticks stopped, is it time for that to stop?
-	 */
-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-		resched_cpu(cpu);
-		return;
-	}
-
-	/*
-	 * If this cpu is idle and the idle load balancing is done by
-	 * someone else, then no need raise the SCHED_SOFTIRQ
-	 */
-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-	    cpumask_test_cpu(cpu, nohz.cpu_mask))
-		return;
-#endif
 	/* Don't need to rebalance while attached to NULL domain */
 	if (time_after_eq(jiffies, rq->next_balance) &&
 	    likely(!on_null_domain(cpu)))
 		raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+	else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+		nohz_balancer_kick(cpu);
+#endif
 }
 
 static void rq_online_fair(struct rq *rq)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1d7b9bc1c034..5f171f04ab00 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	 * the scheduler tick in nohz_restart_sched_tick.
 	 */
 	if (!ts->tick_stopped) {
-		if (select_nohz_load_balancer(1)) {
-			/*
-			 * sched tick not stopped!
-			 */
-			cpumask_clear_cpu(cpu, nohz_cpu_mask);
-			goto out;
-		}
+		select_nohz_load_balancer(1);
 
 		ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
 		ts->tick_stopped = 1;
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8d4e18..48d6aec0789c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 	cpu = smp_processor_id();
 
 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
-		int preferred_cpu = get_nohz_load_balancer();
-
-		if (preferred_cpu >= 0)
-			cpu = preferred_cpu;
-	}
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
+		cpu = get_nohz_timer_target();
 #endif
 	new_base = per_cpu(tvec_bases, cpu);
 
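[Editor's note] For reference, a minimal user-space sketch of the timer-target policy that both __mod_timer() and hrtimer_get_target() now delegate to get_nohz_timer_target(). The flat scan below stands in for the sched-domain walk, and the CPU layout and names (NCPUS, pick_timer_cpu()) are made up for illustration.

/* User-space model of the new timer-target choice (not kernel code). */
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

static bool cpu_idle[NCPUS] = { true, false, true, true };	/* hypothetical state */

/* Mirrors the idea of get_nohz_timer_target(): prefer the nearest busy CPU. */
static int pick_timer_cpu(int this_cpu)
{
	/* Stand-in for walking this_cpu's sched domains outward. */
	for (int i = 0; i < NCPUS; i++)
		if (!cpu_idle[i])
			return i;
	return this_cpu;	/* fully idle system: keep the timer local */
}

int main(void)
{
	printf("timer armed on idle CPU 0 -> runs on CPU %d\n", pick_timer_cpu(0));
	return 0;
}

In the fully idle case the timer stays on the requesting CPU, which matches the commit message: the nearest-busy-CPU optimization only applies to semi-idle systems.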