diff options
author | Venkatesh Pallipadi <venki@google.com> | 2010-05-21 20:09:41 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2010-06-09 04:34:52 -0400 |
commit | 83cd4fe27ad8446619b2e030b171b858501de87d (patch) | |
tree | 81c7d26f4f00139ae355017239371d91cc4b2aef /kernel | |
parent | fdf3e95d3916f18bf8703fb065499fdbc4dfe34c (diff) |
sched: Change nohz idle load balancing logic to push model
In the new push model, all idle CPUs indeed go into nohz mode. There is
still the concept of idle load balancer (performing the load balancing
on behalf of all the idle cpu's in the system). Busy CPU kicks the nohz
balancer when any of the nohz CPUs need idle load balancing.
The kickee CPU does the idle load balancing on behalf of all idle CPUs
instead of the normal idle balance.
This addresses the below two problems with the current nohz ilb logic:
* the idle load balancer continued to have periodic ticks during idle and
woke up frequently, even though it did not have any rebalancing to do on
behalf of any of the idle CPUs.
* On x86 and CPUs that have APIC timer stoppage on idle CPUs, this
periodic wakeup can result in a periodic additional interrupt on a CPU
doing the timer broadcast.
Also, currently we are migrating the unpinned timers from an idle cpu to the
cpu doing idle load balancing (when all the cpus in the system are idle,
there is no idle load balancing cpu and timers get added to the same idle cpu
where the request was made; so the existing optimization works only on a
semi-idle system).
And in a semi-idle system, we no longer have periodic ticks on the idle load
balancer CPU. Using that cpu will add more delays to the timers than intended
(as that cpu's timer base may not be up-to-date wrt jiffies etc). This was
causing mysterious slowdowns during boot etc.
For now, in the semi idle case, use the nearest busy cpu for migrating timers
from an idle cpu. This is good for power-savings anyway.
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/hrtimer.c | 8 | ||||
-rw-r--r-- | kernel/sched.c | 34 | ||||
-rw-r--r-- | kernel/sched_fair.c | 329 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 8 | ||||
-rw-r--r-- | kernel/timer.c | 8 |
5 files changed, 234 insertions, 153 deletions
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0f..e934339fbbef 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, | |||
144 | static int hrtimer_get_target(int this_cpu, int pinned) | 144 | static int hrtimer_get_target(int this_cpu, int pinned) |
145 | { | 145 | { |
146 | #ifdef CONFIG_NO_HZ | 146 | #ifdef CONFIG_NO_HZ |
147 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { | 147 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) |
148 | int preferred_cpu = get_nohz_load_balancer(); | 148 | return get_nohz_timer_target(); |
149 | |||
150 | if (preferred_cpu >= 0) | ||
151 | return preferred_cpu; | ||
152 | } | ||
153 | #endif | 149 | #endif |
154 | return this_cpu; | 150 | return this_cpu; |
155 | } | 151 | } |
diff --git a/kernel/sched.c b/kernel/sched.c index a757f6b11cbd..132950b33dde 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -460,7 +460,7 @@ struct rq { | |||
460 | unsigned long last_load_update_tick; | 460 | unsigned long last_load_update_tick; |
461 | #ifdef CONFIG_NO_HZ | 461 | #ifdef CONFIG_NO_HZ |
462 | u64 nohz_stamp; | 462 | u64 nohz_stamp; |
463 | unsigned char in_nohz_recently; | 463 | unsigned char nohz_balance_kick; |
464 | #endif | 464 | #endif |
465 | unsigned int skip_clock_update; | 465 | unsigned int skip_clock_update; |
466 | 466 | ||
@@ -1195,6 +1195,27 @@ static void resched_cpu(int cpu) | |||
1195 | 1195 | ||
1196 | #ifdef CONFIG_NO_HZ | 1196 | #ifdef CONFIG_NO_HZ |
1197 | /* | 1197 | /* |
1198 | * In the semi idle case, use the nearest busy cpu for migrating timers | ||
1199 | * from an idle cpu. This is good for power-savings. | ||
1200 | * | ||
1201 | * We don't do similar optimization for completely idle system, as | ||
1202 | * selecting an idle cpu will add more delays to the timers than intended | ||
1203 | * (as that cpu's timer base may not be uptodate wrt jiffies etc). | ||
1204 | */ | ||
1205 | int get_nohz_timer_target(void) | ||
1206 | { | ||
1207 | int cpu = smp_processor_id(); | ||
1208 | int i; | ||
1209 | struct sched_domain *sd; | ||
1210 | |||
1211 | for_each_domain(cpu, sd) { | ||
1212 | for_each_cpu(i, sched_domain_span(sd)) | ||
1213 | if (!idle_cpu(i)) | ||
1214 | return i; | ||
1215 | } | ||
1216 | return cpu; | ||
1217 | } | ||
1218 | /* | ||
1198 | * When add_timer_on() enqueues a timer into the timer wheel of an | 1219 | * When add_timer_on() enqueues a timer into the timer wheel of an |
1199 | * idle CPU then this timer might expire before the next timer event | 1220 | * idle CPU then this timer might expire before the next timer event |
1200 | * which is scheduled to wake up that CPU. In case of a completely | 1221 | * which is scheduled to wake up that CPU. In case of a completely |
@@ -7791,6 +7812,10 @@ void __init sched_init(void) | |||
7791 | rq->idle_stamp = 0; | 7812 | rq->idle_stamp = 0; |
7792 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 7813 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
7793 | rq_attach_root(rq, &def_root_domain); | 7814 | rq_attach_root(rq, &def_root_domain); |
7815 | #ifdef CONFIG_NO_HZ | ||
7816 | rq->nohz_balance_kick = 0; | ||
7817 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
7818 | #endif | ||
7794 | #endif | 7819 | #endif |
7795 | init_rq_hrtick(rq); | 7820 | init_rq_hrtick(rq); |
7796 | atomic_set(&rq->nr_iowait, 0); | 7821 | atomic_set(&rq->nr_iowait, 0); |
@@ -7835,8 +7860,11 @@ void __init sched_init(void) | |||
7835 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 7860 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
7836 | #ifdef CONFIG_SMP | 7861 | #ifdef CONFIG_SMP |
7837 | #ifdef CONFIG_NO_HZ | 7862 | #ifdef CONFIG_NO_HZ |
7838 | zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); | 7863 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
7839 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); | 7864 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
7865 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
7866 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | ||
7867 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | ||
7840 | #endif | 7868 | #endif |
7841 | /* May be allocated at isolcpus cmdline parse time */ | 7869 | /* May be allocated at isolcpus cmdline parse time */ |
7842 | if (cpu_isolated_map == NULL) | 7870 | if (cpu_isolated_map == NULL) |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 22b8b4f2b616..6ee2e0af665b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -3091,13 +3091,40 @@ out_unlock: | |||
3091 | } | 3091 | } |
3092 | 3092 | ||
3093 | #ifdef CONFIG_NO_HZ | 3093 | #ifdef CONFIG_NO_HZ |
3094 | |||
3095 | static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); | ||
3096 | |||
3097 | static void trigger_sched_softirq(void *data) | ||
3098 | { | ||
3099 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
3100 | } | ||
3101 | |||
3102 | static inline void init_sched_softirq_csd(struct call_single_data *csd) | ||
3103 | { | ||
3104 | csd->func = trigger_sched_softirq; | ||
3105 | csd->info = NULL; | ||
3106 | csd->flags = 0; | ||
3107 | csd->priv = 0; | ||
3108 | } | ||
3109 | |||
3110 | /* | ||
3111 | * idle load balancing details | ||
3112 | * - One of the idle CPUs nominates itself as idle load_balancer, while | ||
3113 | * entering idle. | ||
3114 | * - This idle load balancer CPU will also go into tickless mode when | ||
3115 | * it is idle, just like all other idle CPUs | ||
3116 | * - When one of the busy CPUs notice that there may be an idle rebalancing | ||
3117 | * needed, they will kick the idle load balancer, which then does idle | ||
3118 | * load balancing for all the idle CPUs. | ||
3119 | */ | ||
3094 | static struct { | 3120 | static struct { |
3095 | atomic_t load_balancer; | 3121 | atomic_t load_balancer; |
3096 | cpumask_var_t cpu_mask; | 3122 | atomic_t first_pick_cpu; |
3097 | cpumask_var_t ilb_grp_nohz_mask; | 3123 | atomic_t second_pick_cpu; |
3098 | } nohz ____cacheline_aligned = { | 3124 | cpumask_var_t idle_cpus_mask; |
3099 | .load_balancer = ATOMIC_INIT(-1), | 3125 | cpumask_var_t grp_idle_mask; |
3100 | }; | 3126 | unsigned long next_balance; /* in jiffy units */ |
3127 | } nohz ____cacheline_aligned; | ||
3101 | 3128 | ||
3102 | int get_nohz_load_balancer(void) | 3129 | int get_nohz_load_balancer(void) |
3103 | { | 3130 | { |
@@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
3151 | */ | 3178 | */ |
3152 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | 3179 | static inline int is_semi_idle_group(struct sched_group *ilb_group) |
3153 | { | 3180 | { |
3154 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | 3181 | cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, |
3155 | sched_group_cpus(ilb_group)); | 3182 | sched_group_cpus(ilb_group)); |
3156 | 3183 | ||
3157 | /* | 3184 | /* |
3158 | * A sched_group is semi-idle when it has atleast one busy cpu | 3185 | * A sched_group is semi-idle when it has atleast one busy cpu |
3159 | * and atleast one idle cpu. | 3186 | * and atleast one idle cpu. |
3160 | */ | 3187 | */ |
3161 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | 3188 | if (cpumask_empty(nohz.grp_idle_mask)) |
3162 | return 0; | 3189 | return 0; |
3163 | 3190 | ||
3164 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | 3191 | if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) |
3165 | return 0; | 3192 | return 0; |
3166 | 3193 | ||
3167 | return 1; | 3194 | return 1; |
@@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu) | |||
3194 | * Optimize for the case when we have no idle CPUs or only one | 3221 | * Optimize for the case when we have no idle CPUs or only one |
3195 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | 3222 | * idle CPU. Don't walk the sched_domain hierarchy in such cases |
3196 | */ | 3223 | */ |
3197 | if (cpumask_weight(nohz.cpu_mask) < 2) | 3224 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
3198 | goto out_done; | 3225 | goto out_done; |
3199 | 3226 | ||
3200 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3227 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
@@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu) | |||
3202 | 3229 | ||
3203 | do { | 3230 | do { |
3204 | if (is_semi_idle_group(ilb_group)) | 3231 | if (is_semi_idle_group(ilb_group)) |
3205 | return cpumask_first(nohz.ilb_grp_nohz_mask); | 3232 | return cpumask_first(nohz.grp_idle_mask); |
3206 | 3233 | ||
3207 | ilb_group = ilb_group->next; | 3234 | ilb_group = ilb_group->next; |
3208 | 3235 | ||
@@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu) | |||
3210 | } | 3237 | } |
3211 | 3238 | ||
3212 | out_done: | 3239 | out_done: |
3213 | return cpumask_first(nohz.cpu_mask); | 3240 | return nr_cpu_ids; |
3214 | } | 3241 | } |
3215 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3242 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
3216 | static inline int find_new_ilb(int call_cpu) | 3243 | static inline int find_new_ilb(int call_cpu) |
3217 | { | 3244 | { |
3218 | return cpumask_first(nohz.cpu_mask); | 3245 | return nr_cpu_ids; |
3219 | } | 3246 | } |
3220 | #endif | 3247 | #endif |
3221 | 3248 | ||
3222 | /* | 3249 | /* |
3250 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
3251 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
3252 | * CPU (if there is one). | ||
3253 | */ | ||
3254 | static void nohz_balancer_kick(int cpu) | ||
3255 | { | ||
3256 | int ilb_cpu; | ||
3257 | |||
3258 | nohz.next_balance++; | ||
3259 | |||
3260 | ilb_cpu = get_nohz_load_balancer(); | ||
3261 | |||
3262 | if (ilb_cpu >= nr_cpu_ids) { | ||
3263 | ilb_cpu = cpumask_first(nohz.idle_cpus_mask); | ||
3264 | if (ilb_cpu >= nr_cpu_ids) | ||
3265 | return; | ||
3266 | } | ||
3267 | |||
3268 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | ||
3269 | struct call_single_data *cp; | ||
3270 | |||
3271 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | ||
3272 | cp = &per_cpu(remote_sched_softirq_cb, cpu); | ||
3273 | __smp_call_function_single(ilb_cpu, cp, 0); | ||
3274 | } | ||
3275 | return; | ||
3276 | } | ||
3277 | |||
3278 | /* | ||
3223 | * This routine will try to nominate the ilb (idle load balancing) | 3279 | * This routine will try to nominate the ilb (idle load balancing) |
3224 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | 3280 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle |
3225 | * load balancing on behalf of all those cpus. If all the cpus in the system | 3281 | * load balancing on behalf of all those cpus. |
3226 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
3227 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
3228 | * arrives... | ||
3229 | * | ||
3230 | * For the ilb owner, tick is not stopped. And this tick will be used | ||
3231 | * for idle load balancing. ilb owner will still be part of | ||
3232 | * nohz.cpu_mask.. | ||
3233 | * | 3282 | * |
3234 | * While stopping the tick, this cpu will become the ilb owner if there | 3283 | * When the ilb owner becomes busy, we will not have new ilb owner until some |
3235 | * is no other owner. And will be the owner till that cpu becomes busy | 3284 | * idle CPU wakes up and goes back to idle or some busy CPU tries to kick |
3236 | * or if all cpus in the system stop their ticks at which point | 3285 | * idle load balancing by kicking one of the idle CPUs. |
3237 | * there is no need for ilb owner. | ||
3238 | * | 3286 | * |
3239 | * When the ilb owner becomes busy, it nominates another owner, during the | 3287 | * Ticks are stopped for the ilb owner as well, with busy CPU kicking this |
3240 | * next busy scheduler_tick() | 3288 | * ilb owner CPU in future (when there is a need for idle load balancing on |
3289 | * behalf of all idle CPUs). | ||
3241 | */ | 3290 | */ |
3242 | int select_nohz_load_balancer(int stop_tick) | 3291 | void select_nohz_load_balancer(int stop_tick) |
3243 | { | 3292 | { |
3244 | int cpu = smp_processor_id(); | 3293 | int cpu = smp_processor_id(); |
3245 | 3294 | ||
3246 | if (stop_tick) { | 3295 | if (stop_tick) { |
3247 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
3248 | |||
3249 | if (!cpu_active(cpu)) { | 3296 | if (!cpu_active(cpu)) { |
3250 | if (atomic_read(&nohz.load_balancer) != cpu) | 3297 | if (atomic_read(&nohz.load_balancer) != cpu) |
3251 | return 0; | 3298 | return; |
3252 | 3299 | ||
3253 | /* | 3300 | /* |
3254 | * If we are going offline and still the leader, | 3301 | * If we are going offline and still the leader, |
3255 | * give up! | 3302 | * give up! |
3256 | */ | 3303 | */ |
3257 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3304 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, |
3305 | nr_cpu_ids) != cpu) | ||
3258 | BUG(); | 3306 | BUG(); |
3259 | 3307 | ||
3260 | return 0; | 3308 | return; |
3261 | } | 3309 | } |
3262 | 3310 | ||
3263 | cpumask_set_cpu(cpu, nohz.cpu_mask); | 3311 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
3264 | 3312 | ||
3265 | /* time for ilb owner also to sleep */ | 3313 | if (atomic_read(&nohz.first_pick_cpu) == cpu) |
3266 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | 3314 | atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); |
3267 | if (atomic_read(&nohz.load_balancer) == cpu) | 3315 | if (atomic_read(&nohz.second_pick_cpu) == cpu) |
3268 | atomic_set(&nohz.load_balancer, -1); | 3316 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); |
3269 | return 0; | ||
3270 | } | ||
3271 | 3317 | ||
3272 | if (atomic_read(&nohz.load_balancer) == -1) { | 3318 | if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { |
3273 | /* make me the ilb owner */ | ||
3274 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
3275 | return 1; | ||
3276 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3277 | int new_ilb; | 3319 | int new_ilb; |
3278 | 3320 | ||
3279 | if (!(sched_smt_power_savings || | 3321 | /* make me the ilb owner */ |
3280 | sched_mc_power_savings)) | 3322 | if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, |
3281 | return 1; | 3323 | cpu) != nr_cpu_ids) |
3324 | return; | ||
3325 | |||
3282 | /* | 3326 | /* |
3283 | * Check to see if there is a more power-efficient | 3327 | * Check to see if there is a more power-efficient |
3284 | * ilb. | 3328 | * ilb. |
3285 | */ | 3329 | */ |
3286 | new_ilb = find_new_ilb(cpu); | 3330 | new_ilb = find_new_ilb(cpu); |
3287 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | 3331 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { |
3288 | atomic_set(&nohz.load_balancer, -1); | 3332 | atomic_set(&nohz.load_balancer, nr_cpu_ids); |
3289 | resched_cpu(new_ilb); | 3333 | resched_cpu(new_ilb); |
3290 | return 0; | 3334 | return; |
3291 | } | 3335 | } |
3292 | return 1; | 3336 | return; |
3293 | } | 3337 | } |
3294 | } else { | 3338 | } else { |
3295 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | 3339 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) |
3296 | return 0; | 3340 | return; |
3297 | 3341 | ||
3298 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | 3342 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); |
3299 | 3343 | ||
3300 | if (atomic_read(&nohz.load_balancer) == cpu) | 3344 | if (atomic_read(&nohz.load_balancer) == cpu) |
3301 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3345 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, |
3346 | nr_cpu_ids) != cpu) | ||
3302 | BUG(); | 3347 | BUG(); |
3303 | } | 3348 | } |
3304 | return 0; | 3349 | return; |
3305 | } | 3350 | } |
3306 | #endif | 3351 | #endif |
3307 | 3352 | ||
@@ -3383,11 +3428,101 @@ out: | |||
3383 | rq->next_balance = next_balance; | 3428 | rq->next_balance = next_balance; |
3384 | } | 3429 | } |
3385 | 3430 | ||
3431 | #ifdef CONFIG_NO_HZ | ||
3386 | /* | 3432 | /* |
3387 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 3433 | * In CONFIG_NO_HZ case, the idle balance kickee will do the |
3388 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
3389 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 3434 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
3390 | */ | 3435 | */ |
3436 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | ||
3437 | { | ||
3438 | struct rq *this_rq = cpu_rq(this_cpu); | ||
3439 | struct rq *rq; | ||
3440 | int balance_cpu; | ||
3441 | |||
3442 | if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) | ||
3443 | return; | ||
3444 | |||
3445 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | ||
3446 | if (balance_cpu == this_cpu) | ||
3447 | continue; | ||
3448 | |||
3449 | /* | ||
3450 | * If this cpu gets work to do, stop the load balancing | ||
3451 | * work being done for other cpus. Next load | ||
3452 | * balancing owner will pick it up. | ||
3453 | */ | ||
3454 | if (need_resched()) { | ||
3455 | this_rq->nohz_balance_kick = 0; | ||
3456 | break; | ||
3457 | } | ||
3458 | |||
3459 | raw_spin_lock_irq(&this_rq->lock); | ||
3460 | update_cpu_load(this_rq); | ||
3461 | raw_spin_unlock_irq(&this_rq->lock); | ||
3462 | |||
3463 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
3464 | |||
3465 | rq = cpu_rq(balance_cpu); | ||
3466 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
3467 | this_rq->next_balance = rq->next_balance; | ||
3468 | } | ||
3469 | nohz.next_balance = this_rq->next_balance; | ||
3470 | this_rq->nohz_balance_kick = 0; | ||
3471 | } | ||
3472 | |||
3473 | /* | ||
3474 | * Current heuristic for kicking the idle load balancer | ||
3475 | * - first_pick_cpu is the one of the busy CPUs. It will kick | ||
3476 | * idle load balancer when it has more than one process active. This | ||
3477 | * eliminates the need for idle load balancing altogether when we have | ||
3478 | * only one running process in the system (common case). | ||
3479 | * - If there are more than one busy CPU, idle load balancer may have | ||
3480 | * to run for active_load_balance to happen (i.e., two busy CPUs are | ||
3481 | * SMT or core siblings and can run better if they move to different | ||
3482 | * physical CPUs). So, second_pick_cpu is the second of the busy CPUs | ||
3483 | * which will kick idle load balancer as soon as it has any load. | ||
3484 | */ | ||
3485 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | ||
3486 | { | ||
3487 | unsigned long now = jiffies; | ||
3488 | int ret; | ||
3489 | int first_pick_cpu, second_pick_cpu; | ||
3490 | |||
3491 | if (time_before(now, nohz.next_balance)) | ||
3492 | return 0; | ||
3493 | |||
3494 | if (!rq->nr_running) | ||
3495 | return 0; | ||
3496 | |||
3497 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | ||
3498 | second_pick_cpu = atomic_read(&nohz.second_pick_cpu); | ||
3499 | |||
3500 | if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && | ||
3501 | second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) | ||
3502 | return 0; | ||
3503 | |||
3504 | ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); | ||
3505 | if (ret == nr_cpu_ids || ret == cpu) { | ||
3506 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | ||
3507 | if (rq->nr_running > 1) | ||
3508 | return 1; | ||
3509 | } else { | ||
3510 | ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); | ||
3511 | if (ret == nr_cpu_ids || ret == cpu) { | ||
3512 | if (rq->nr_running) | ||
3513 | return 1; | ||
3514 | } | ||
3515 | } | ||
3516 | return 0; | ||
3517 | } | ||
3518 | #else | ||
3519 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | ||
3520 | #endif | ||
3521 | |||
3522 | /* | ||
3523 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
3524 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). | ||
3525 | */ | ||
3391 | static void run_rebalance_domains(struct softirq_action *h) | 3526 | static void run_rebalance_domains(struct softirq_action *h) |
3392 | { | 3527 | { |
3393 | int this_cpu = smp_processor_id(); | 3528 | int this_cpu = smp_processor_id(); |
@@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3397 | 3532 | ||
3398 | rebalance_domains(this_cpu, idle); | 3533 | rebalance_domains(this_cpu, idle); |
3399 | 3534 | ||
3400 | #ifdef CONFIG_NO_HZ | ||
3401 | /* | 3535 | /* |
3402 | * If this cpu is the owner for idle load balancing, then do the | 3536 | * If this cpu has a pending nohz_balance_kick, then do the |
3403 | * balancing on behalf of the other idle cpus whose ticks are | 3537 | * balancing on behalf of the other idle cpus whose ticks are |
3404 | * stopped. | 3538 | * stopped. |
3405 | */ | 3539 | */ |
3406 | if (this_rq->idle_at_tick && | 3540 | nohz_idle_balance(this_cpu, idle); |
3407 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
3408 | struct rq *rq; | ||
3409 | int balance_cpu; | ||
3410 | |||
3411 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
3412 | if (balance_cpu == this_cpu) | ||
3413 | continue; | ||
3414 | |||
3415 | /* | ||
3416 | * If this cpu gets work to do, stop the load balancing | ||
3417 | * work being done for other cpus. Next load | ||
3418 | * balancing owner will pick it up. | ||
3419 | */ | ||
3420 | if (need_resched()) | ||
3421 | break; | ||
3422 | |||
3423 | rq = cpu_rq(balance_cpu); | ||
3424 | raw_spin_lock_irq(&rq->lock); | ||
3425 | update_cpu_load(rq); | ||
3426 | raw_spin_unlock_irq(&rq->lock); | ||
3427 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
3428 | |||
3429 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
3430 | this_rq->next_balance = rq->next_balance; | ||
3431 | } | ||
3432 | } | ||
3433 | #endif | ||
3434 | } | 3541 | } |
3435 | 3542 | ||
3436 | static inline int on_null_domain(int cpu) | 3543 | static inline int on_null_domain(int cpu) |
@@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu) | |||
3440 | 3547 | ||
3441 | /* | 3548 | /* |
3442 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 3549 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
3443 | * | ||
3444 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
3445 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
3446 | * if the whole system is idle. | ||
3447 | */ | 3550 | */ |
3448 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 3551 | static inline void trigger_load_balance(struct rq *rq, int cpu) |
3449 | { | 3552 | { |
3450 | #ifdef CONFIG_NO_HZ | ||
3451 | /* | ||
3452 | * If we were in the nohz mode recently and busy at the current | ||
3453 | * scheduler tick, then check if we need to nominate new idle | ||
3454 | * load balancer. | ||
3455 | */ | ||
3456 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
3457 | rq->in_nohz_recently = 0; | ||
3458 | |||
3459 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3460 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
3461 | atomic_set(&nohz.load_balancer, -1); | ||
3462 | } | ||
3463 | |||
3464 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
3465 | int ilb = find_new_ilb(cpu); | ||
3466 | |||
3467 | if (ilb < nr_cpu_ids) | ||
3468 | resched_cpu(ilb); | ||
3469 | } | ||
3470 | } | ||
3471 | |||
3472 | /* | ||
3473 | * If this cpu is idle and doing idle load balancing for all the | ||
3474 | * cpus with ticks stopped, is it time for that to stop? | ||
3475 | */ | ||
3476 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
3477 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
3478 | resched_cpu(cpu); | ||
3479 | return; | ||
3480 | } | ||
3481 | |||
3482 | /* | ||
3483 | * If this cpu is idle and the idle load balancing is done by | ||
3484 | * someone else, then no need raise the SCHED_SOFTIRQ | ||
3485 | */ | ||
3486 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
3487 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
3488 | return; | ||
3489 | #endif | ||
3490 | /* Don't need to rebalance while attached to NULL domain */ | 3553 | /* Don't need to rebalance while attached to NULL domain */ |
3491 | if (time_after_eq(jiffies, rq->next_balance) && | 3554 | if (time_after_eq(jiffies, rq->next_balance) && |
3492 | likely(!on_null_domain(cpu))) | 3555 | likely(!on_null_domain(cpu))) |
3493 | raise_softirq(SCHED_SOFTIRQ); | 3556 | raise_softirq(SCHED_SOFTIRQ); |
3557 | #ifdef CONFIG_NO_HZ | ||
3558 | else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | ||
3559 | nohz_balancer_kick(cpu); | ||
3560 | #endif | ||
3494 | } | 3561 | } |
3495 | 3562 | ||
3496 | static void rq_online_fair(struct rq *rq) | 3563 | static void rq_online_fair(struct rq *rq) |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc1c034..5f171f04ab00 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
408 | * the scheduler tick in nohz_restart_sched_tick. | 408 | * the scheduler tick in nohz_restart_sched_tick. |
409 | */ | 409 | */ |
410 | if (!ts->tick_stopped) { | 410 | if (!ts->tick_stopped) { |
411 | if (select_nohz_load_balancer(1)) { | 411 | select_nohz_load_balancer(1); |
412 | /* | ||
413 | * sched tick not stopped! | ||
414 | */ | ||
415 | cpumask_clear_cpu(cpu, nohz_cpu_mask); | ||
416 | goto out; | ||
417 | } | ||
418 | 412 | ||
419 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 413 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
420 | ts->tick_stopped = 1; | 414 | ts->tick_stopped = 1; |
diff --git a/kernel/timer.c b/kernel/timer.c index ee305c8d4e18..48d6aec0789c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
679 | cpu = smp_processor_id(); | 679 | cpu = smp_processor_id(); |
680 | 680 | ||
681 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) | 681 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) |
682 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { | 682 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) |
683 | int preferred_cpu = get_nohz_load_balancer(); | 683 | cpu = get_nohz_timer_target(); |
684 | |||
685 | if (preferred_cpu >= 0) | ||
686 | cpu = preferred_cpu; | ||
687 | } | ||
688 | #endif | 684 | #endif |
689 | new_base = per_cpu(tvec_bases, cpu); | 685 | new_base = per_cpu(tvec_bases, cpu); |
690 | 686 | ||