Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	421
1 file changed, 315 insertions, 106 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..dc85ceb90832 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#define CREATE_TRACE_POINTS 82#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h> 83#include <trace/events/sched.h>
@@ -456,9 +457,10 @@ struct rq {
456 unsigned long nr_running; 457 unsigned long nr_running;
457 #define CPU_LOAD_IDX_MAX 5 458 #define CPU_LOAD_IDX_MAX 5
458 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 459 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
460 unsigned long last_load_update_tick;
459#ifdef CONFIG_NO_HZ 461#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp; 462 u64 nohz_stamp;
461 unsigned char in_nohz_recently; 463 unsigned char nohz_balance_kick;
462#endif 464#endif
463 unsigned int skip_clock_update; 465 unsigned int skip_clock_update;
464 466
@@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu)
1193 1195
1194#ifdef CONFIG_NO_HZ 1196#ifdef CONFIG_NO_HZ
1195/* 1197/*
 1198 * In the semi-idle case, use the nearest busy cpu for migrating timers
 1199 * from an idle cpu. This is good for power-savings.
 1200 *
 1201 * We don't do a similar optimization for a completely idle system, as
 1202 * selecting an idle cpu will add more delay to the timers than intended
 1203 * (as that cpu's timer base may not be up to date wrt jiffies etc).
1204 */
1205int get_nohz_timer_target(void)
1206{
1207 int cpu = smp_processor_id();
1208 int i;
1209 struct sched_domain *sd;
1210
1211 for_each_domain(cpu, sd) {
1212 for_each_cpu(i, sched_domain_span(sd))
1213 if (!idle_cpu(i))
1214 return i;
1215 }
1216 return cpu;
1217}
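
For orientation only (not part of the patch): the kind of caller this helper targets is a timer-enqueue path that notices it is running on an idle CPU and redirects the timer to the nearest busy one. The pick_timer_cpu() wrapper below is a hypothetical name used purely for illustration; only idle_cpu() and get_nohz_timer_target() are real.

/* Illustrative sketch -- pick_timer_cpu() is a made-up helper name. */
static int pick_timer_cpu(void)
{
        int cpu = smp_processor_id();

        /* On a semi-idle system, prefer a busy CPU's timer base. */
        if (idle_cpu(cpu))
                cpu = get_nohz_timer_target();
        return cpu;
}
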
1218/*
1196 * When add_timer_on() enqueues a timer into the timer wheel of an 1219 * When add_timer_on() enqueues a timer into the timer wheel of an
1197 * idle CPU then this timer might expire before the next timer event 1220 * idle CPU then this timer might expire before the next timer event
1198 * which is scheduled to wake up that CPU. In case of a completely 1221 * which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu)
1232 smp_send_reschedule(cpu); 1255 smp_send_reschedule(cpu);
1233} 1256}
1234 1257
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1245#endif /* CONFIG_NO_HZ */ 1258#endif /* CONFIG_NO_HZ */
1246 1259
1247static u64 sched_avg_period(void) 1260static u64 sched_avg_period(void)
@@ -1281,6 +1294,10 @@ static void resched_task(struct task_struct *p)
1281static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1294static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1282{ 1295{
1283} 1296}
1297
1298static void sched_avg_update(struct rq *rq)
1299{
1300}
1284#endif /* CONFIG_SMP */ 1301#endif /* CONFIG_SMP */
1285 1302
1286#if BITS_PER_LONG == 32 1303#if BITS_PER_LONG == 32
@@ -1652,7 +1669,7 @@ static void update_shares(struct sched_domain *sd)
1652 if (root_task_group_empty()) 1669 if (root_task_group_empty())
1653 return; 1670 return;
1654 1671
1655 now = cpu_clock(raw_smp_processor_id()); 1672 now = local_clock();
1656 elapsed = now - sd->last_update; 1673 elapsed = now - sd->last_update;
1657 1674
1658 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1822,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1805static void calc_load_account_idle(struct rq *this_rq); 1822static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1823static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1824static int get_update_sysctl_factor(void);
1825static void update_cpu_load(struct rq *this_rq);
1808 1826
1809static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1827static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1810{ 1828{
@@ -2267,11 +2285,55 @@ static void update_avg(u64 *avg, u64 sample)
2267} 2285}
2268#endif 2286#endif
2269 2287
2270/*** 2288static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2289 bool is_sync, bool is_migrate, bool is_local,
2290 unsigned long en_flags)
2291{
2292 schedstat_inc(p, se.statistics.nr_wakeups);
2293 if (is_sync)
2294 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2295 if (is_migrate)
2296 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2297 if (is_local)
2298 schedstat_inc(p, se.statistics.nr_wakeups_local);
2299 else
2300 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2301
2302 activate_task(rq, p, en_flags);
2303}
2304
2305static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2306 int wake_flags, bool success)
2307{
2308 trace_sched_wakeup(p, success);
2309 check_preempt_curr(rq, p, wake_flags);
2310
2311 p->state = TASK_RUNNING;
2312#ifdef CONFIG_SMP
2313 if (p->sched_class->task_woken)
2314 p->sched_class->task_woken(rq, p);
2315
2316 if (unlikely(rq->idle_stamp)) {
2317 u64 delta = rq->clock - rq->idle_stamp;
2318 u64 max = 2*sysctl_sched_migration_cost;
2319
2320 if (delta > max)
2321 rq->avg_idle = max;
2322 else
2323 update_avg(&rq->avg_idle, delta);
2324 rq->idle_stamp = 0;
2325 }
2326#endif
2327 /* if a worker is waking up, notify workqueue */
2328 if ((p->flags & PF_WQ_WORKER) && success)
2329 wq_worker_waking_up(p, cpu_of(rq));
2330}
2331
2332/**
2271 * try_to_wake_up - wake up a thread 2333 * try_to_wake_up - wake up a thread
2272 * @p: the to-be-woken-up thread 2334 * @p: the thread to be awakened
2273 * @state: the mask of task states that can be woken 2335 * @state: the mask of task states that can be woken
2274 * @sync: do a synchronous wakeup? 2336 * @wake_flags: wake modifier flags (WF_*)
2275 * 2337 *
2276 * Put it on the run-queue if it's not already there. The "current" 2338 * Put it on the run-queue if it's not already there. The "current"
2277 * thread is always on the run-queue (except when the actual 2339 * thread is always on the run-queue (except when the actual
@@ -2279,7 +2341,8 @@ static void update_avg(u64 *avg, u64 sample)
2279 * the simpler "current->state = TASK_RUNNING" to mark yourself 2341 * the simpler "current->state = TASK_RUNNING" to mark yourself
2280 * runnable without the overhead of this. 2342 * runnable without the overhead of this.
2281 * 2343 *
2282 * returns failure only if the task is already active. 2344 * Returns %true if @p was woken up, %false if it was already running
2345 * or @state didn't match @p's state.
2283 */ 2346 */
2284static int try_to_wake_up(struct task_struct *p, unsigned int state, 2347static int try_to_wake_up(struct task_struct *p, unsigned int state,
2285 int wake_flags) 2348 int wake_flags)
@@ -2359,38 +2422,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359 2422
2360out_activate: 2423out_activate:
2361#endif /* CONFIG_SMP */ 2424#endif /* CONFIG_SMP */
2362 schedstat_inc(p, se.statistics.nr_wakeups); 2425 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2363 if (wake_flags & WF_SYNC) 2426 cpu == this_cpu, en_flags);
2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2365 if (orig_cpu != cpu)
2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2367 if (cpu == this_cpu)
2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2369 else
2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2371 activate_task(rq, p, en_flags);
2372 success = 1; 2427 success = 1;
2373
2374out_running: 2428out_running:
2375 trace_sched_wakeup(p, success); 2429 ttwu_post_activation(p, rq, wake_flags, success);
2376 check_preempt_curr(rq, p, wake_flags);
2377
2378 p->state = TASK_RUNNING;
2379#ifdef CONFIG_SMP
2380 if (p->sched_class->task_woken)
2381 p->sched_class->task_woken(rq, p);
2382
2383 if (unlikely(rq->idle_stamp)) {
2384 u64 delta = rq->clock - rq->idle_stamp;
2385 u64 max = 2*sysctl_sched_migration_cost;
2386
2387 if (delta > max)
2388 rq->avg_idle = max;
2389 else
2390 update_avg(&rq->avg_idle, delta);
2391 rq->idle_stamp = 0;
2392 }
2393#endif
2394out: 2430out:
2395 task_rq_unlock(rq, &flags); 2431 task_rq_unlock(rq, &flags);
2396 put_cpu(); 2432 put_cpu();
@@ -2399,6 +2435,37 @@ out:
2399} 2435}
2400 2436
2401/** 2437/**
2438 * try_to_wake_up_local - try to wake up a local task with rq lock held
2439 * @p: the thread to be awakened
2440 *
 2441 * Put @p on the run-queue if it's not already there. The caller must
 2442 * ensure that this_rq() is locked, @p is bound to this_rq() and is not
2443 * the current task. this_rq() stays locked over invocation.
2444 */
2445static void try_to_wake_up_local(struct task_struct *p)
2446{
2447 struct rq *rq = task_rq(p);
2448 bool success = false;
2449
2450 BUG_ON(rq != this_rq());
2451 BUG_ON(p == current);
2452 lockdep_assert_held(&rq->lock);
2453
2454 if (!(p->state & TASK_NORMAL))
2455 return;
2456
2457 if (!p->se.on_rq) {
2458 if (likely(!task_running(rq, p))) {
2459 schedstat_inc(rq, ttwu_count);
2460 schedstat_inc(rq, ttwu_local);
2461 }
2462 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2463 success = true;
2464 }
2465 ttwu_post_activation(p, rq, 0, success);
2466}
2467
2468/**
2402 * wake_up_process - Wake up a specific process 2469 * wake_up_process - Wake up a specific process
2403 * @p: The process to be woken up. 2470 * @p: The process to be woken up.
2404 * 2471 *
@@ -3012,23 +3079,102 @@ static void calc_load_account_active(struct rq *this_rq)
3012} 3079}
3013 3080
3014/* 3081/*
 3082 * The exact cpuload at various idx values, calculated at every tick, would be
 3083 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
 3084 *
 3085 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
 3086 * on the nth tick, when the cpu may be busy, then we have:
 3087 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 3088 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3089 *
3090 * decay_load_missed() below does efficient calculation of
3091 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3092 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3093 *
3094 * The calculation is approximated on a 128 point scale.
3095 * degrade_zero_ticks is the number of ticks after which load at any
3096 * particular idx is approximated to be zero.
3097 * degrade_factor is a precomputed table, a row for each load idx.
 3098 * Each column corresponds to the degradation factor for a power-of-two number
 3099 * of ticks, based on the 128 point scale.
3100 * Example:
3101 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3102 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3103 *
 3104 * With these power-of-2 load factors, we can degrade the load n times
 3105 * by looking at the 1 bits in n and doing that many mult/shifts instead of
 3106 * the n mult/shifts needed by the exact degradation.
3107 */
3108#define DEGRADE_SHIFT 7
3109static const unsigned char
3110 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3111static const unsigned char
3112 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3113 {0, 0, 0, 0, 0, 0, 0, 0},
3114 {64, 32, 8, 0, 0, 0, 0, 0},
3115 {96, 72, 40, 12, 1, 0, 0},
3116 {112, 98, 75, 43, 15, 1, 0},
3117 {120, 112, 98, 76, 45, 16, 2} };
3118
3119/*
3120 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3121 * would be when CPU is idle and so we just decay the old load without
3122 * adding any new load.
3123 */
3124static unsigned long
3125decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3126{
3127 int j = 0;
3128
3129 if (!missed_updates)
3130 return load;
3131
3132 if (missed_updates >= degrade_zero_ticks[idx])
3133 return 0;
3134
3135 if (idx == 1)
3136 return load >> missed_updates;
3137
3138 while (missed_updates) {
3139 if (missed_updates % 2)
3140 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3141
3142 missed_updates >>= 1;
3143 j++;
3144 }
3145 return load;
3146}
3147
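
As a sanity check of the table above, here is a small standalone user-space re-implementation (not part of the patch) showing that 8 missed ticks at load idx 2 degrade the load by 12/128, exactly the example given in the comment:

#include <stdio.h>

/* Standalone copy of the decay_load_missed() logic, for illustration only. */
#define DEGRADE_SHIFT 7
static const unsigned char degrade_zero_ticks[5] = {0, 8, 32, 64, 128};
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
        {0, 0, 0, 0, 0, 0, 0, 0},
        {64, 32, 8, 0, 0, 0, 0, 0},
        {96, 72, 40, 12, 1, 0, 0},
        {112, 98, 75, 43, 15, 1, 0},
        {120, 112, 98, 76, 45, 16, 2} };

static unsigned long decay(unsigned long load, unsigned long missed, int idx)
{
        int j = 0;

        if (!missed)
                return load;
        if (missed >= degrade_zero_ticks[idx])
                return 0;
        if (idx == 1)
                return load >> missed;
        while (missed) {
                if (missed % 2)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                missed >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        /* 8 missed ticks at idx 2: one multiply by row 2, column 3 (= 12). */
        printf("%lu\n", decay(128, 8, 2));      /* prints 12, i.e. 12/128 of the load */
        return 0;
}

Each set bit in the missed-tick count costs one multiply/shift, which is the whole point of the power-of-two table.
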
3148/*
3015 * Update rq->cpu_load[] statistics. This function is usually called every 3149 * Update rq->cpu_load[] statistics. This function is usually called every
3016 * scheduler tick (TICK_NSEC). 3150 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3151 * every tick. We fix it up based on jiffies.
3017 */ 3152 */
3018static void update_cpu_load(struct rq *this_rq) 3153static void update_cpu_load(struct rq *this_rq)
3019{ 3154{
3020 unsigned long this_load = this_rq->load.weight; 3155 unsigned long this_load = this_rq->load.weight;
3156 unsigned long curr_jiffies = jiffies;
3157 unsigned long pending_updates;
3021 int i, scale; 3158 int i, scale;
3022 3159
3023 this_rq->nr_load_updates++; 3160 this_rq->nr_load_updates++;
3024 3161
3162 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3163 if (curr_jiffies == this_rq->last_load_update_tick)
3164 return;
3165
3166 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3167 this_rq->last_load_update_tick = curr_jiffies;
3168
3025 /* Update our load: */ 3169 /* Update our load: */
3026 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3170 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3171 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3027 unsigned long old_load, new_load; 3172 unsigned long old_load, new_load;
3028 3173
3029 /* scale is effectively 1 << i now, and >> i divides by scale */ 3174 /* scale is effectively 1 << i now, and >> i divides by scale */
3030 3175
3031 old_load = this_rq->cpu_load[i]; 3176 old_load = this_rq->cpu_load[i];
3177 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3032 new_load = this_load; 3178 new_load = this_load;
3033 /* 3179 /*
3034 * Round up the averaging division if load is increasing. This 3180 * Round up the averaging division if load is increasing. This
@@ -3036,10 +3182,18 @@ static void update_cpu_load(struct rq *this_rq)
3036 * example. 3182 * example.
3037 */ 3183 */
3038 if (new_load > old_load) 3184 if (new_load > old_load)
3039 new_load += scale-1; 3185 new_load += scale - 1;
3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3186
3187 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3041 } 3188 }
3042 3189
3190 sched_avg_update(this_rq);
3191}
3192
3193static void update_cpu_load_active(struct rq *this_rq)
3194{
3195 update_cpu_load(this_rq);
3196
3043 calc_load_account_active(this_rq); 3197 calc_load_account_active(this_rq);
3044} 3198}
3045 3199
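
A quick numeric illustration of the per-index averaging above (made-up numbers; the decayed old load comes from decay_load_missed()): at idx 2 the scale is 4, so the old load keeps a weight of 3/4, the new load gets 1/4, and the scale-1 rounding term only applies when the load is rising.

/* Illustrative values only; mirrors the formula in update_cpu_load(). */
unsigned long old_load = 100, this_load = 200;
int i = 2, scale = 1 << i;                      /* scale == 4 */
unsigned long new_load = this_load;

if (new_load > old_load)
        new_load += scale - 1;                  /* round up: 200 -> 203 */

/* (100 * 3 + 203) >> 2 == 125: three quarters old, one quarter new */
unsigned long cpu_load_i = (old_load * (scale - 1) + new_load) >> i;
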
@@ -3359,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3359 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3513 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3360 3514
3361 if (total) { 3515 if (total) {
3362 u64 temp; 3516 u64 temp = rtime;
3363 3517
3364 temp = (u64)(rtime * utime); 3518 temp *= utime;
3365 do_div(temp, total); 3519 do_div(temp, total);
3366 utime = (cputime_t)temp; 3520 utime = (cputime_t)temp;
3367 } else 3521 } else
@@ -3392,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3392 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3546 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3393 3547
3394 if (total) { 3548 if (total) {
3395 u64 temp; 3549 u64 temp = rtime;
3396 3550
3397 temp = (u64)(rtime * cputime.utime); 3551 temp *= cputime.utime;
3398 do_div(temp, total); 3552 do_div(temp, total);
3399 utime = (cputime_t)temp; 3553 utime = (cputime_t)temp;
3400 } else 3554 } else
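
Both cputime hunks above make the same change: temp is now seeded with rtime and then multiplied, instead of casting the finished product. Presumably this is about 32-bit kernels, where cputime_t is an unsigned long and the old rtime * utime was computed, and truncated, in 32 bits before the cast ever happened. A standalone sketch of the difference, with uint32_t standing in for a 32-bit cputime_t and made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Stand-ins for cputime_t on a 32-bit kernel (illustrative values). */
        uint32_t rtime = 100000, utime = 90000;

        uint64_t bad = (uint64_t)(rtime * utime);       /* product truncated to 32 bits first */
        uint64_t good = rtime;
        good *= utime;                                  /* widened before multiplying */

        printf("%llu vs %llu\n",
               (unsigned long long)bad, (unsigned long long)good);
        return 0;
}
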
@@ -3426,7 +3580,7 @@ void scheduler_tick(void)
3426 3580
3427 raw_spin_lock(&rq->lock); 3581 raw_spin_lock(&rq->lock);
3428 update_rq_clock(rq); 3582 update_rq_clock(rq);
3429 update_cpu_load(rq); 3583 update_cpu_load_active(rq);
3430 curr->sched_class->task_tick(rq, curr, 0); 3584 curr->sched_class->task_tick(rq, curr, 0);
3431 raw_spin_unlock(&rq->lock); 3585 raw_spin_unlock(&rq->lock);
3432 3586
@@ -3598,7 +3752,6 @@ need_resched:
3598 rq = cpu_rq(cpu); 3752 rq = cpu_rq(cpu);
3599 rcu_note_context_switch(cpu); 3753 rcu_note_context_switch(cpu);
3600 prev = rq->curr; 3754 prev = rq->curr;
3601 switch_count = &prev->nivcsw;
3602 3755
3603 release_kernel_lock(prev); 3756 release_kernel_lock(prev);
3604need_resched_nonpreemptible: 3757need_resched_nonpreemptible:
@@ -3611,11 +3764,26 @@ need_resched_nonpreemptible:
3611 raw_spin_lock_irq(&rq->lock); 3764 raw_spin_lock_irq(&rq->lock);
3612 clear_tsk_need_resched(prev); 3765 clear_tsk_need_resched(prev);
3613 3766
3767 switch_count = &prev->nivcsw;
3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3768 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3615 if (unlikely(signal_pending_state(prev->state, prev))) 3769 if (unlikely(signal_pending_state(prev->state, prev))) {
3616 prev->state = TASK_RUNNING; 3770 prev->state = TASK_RUNNING;
3617 else 3771 } else {
3772 /*
3773 * If a worker is going to sleep, notify and
3774 * ask workqueue whether it wants to wake up a
3775 * task to maintain concurrency. If so, wake
3776 * up the task.
3777 */
3778 if (prev->flags & PF_WQ_WORKER) {
3779 struct task_struct *to_wakeup;
3780
3781 to_wakeup = wq_worker_sleeping(prev, cpu);
3782 if (to_wakeup)
3783 try_to_wake_up_local(to_wakeup);
3784 }
3618 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3785 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3786 }
3619 switch_count = &prev->nvcsw; 3787 switch_count = &prev->nvcsw;
3620 } 3788 }
3621 3789
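
Read together with try_to_wake_up_local() and ttwu_post_activation() earlier in the patch, this hook gives the workqueue code a matched sleep/wake pair around the scheduler. Condensed from the two call sites already shown in this diff (no new code):

/* Sleep side, in schedule(), under rq->lock: */
if (prev->flags & PF_WQ_WORKER) {
        struct task_struct *to_wakeup = wq_worker_sleeping(prev, cpu);

        if (to_wakeup)
                try_to_wake_up_local(to_wakeup);        /* keep the worker pool busy */
}

/* Wake side, in ttwu_post_activation(): */
if ((p->flags & PF_WQ_WORKER) && success)
        wq_worker_waking_up(p, cpu_of(rq));
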
@@ -3637,8 +3805,10 @@ need_resched_nonpreemptible:
3637 3805
3638 context_switch(rq, prev, next); /* unlocks the rq */ 3806 context_switch(rq, prev, next); /* unlocks the rq */
3639 /* 3807 /*
 3640 * the context switch might have flipped the stack from under 3808 * The context switch has flipped the stack from under us
 3641 * us, hence refresh the local variables. 3809 * and restored the local variables that were saved when
 3810 * this task called schedule() in the past. prev == current
 3811 * is still correct, but the task may have moved to another cpu/rq.
3642 */ 3812 */
3643 cpu = smp_processor_id(); 3813 cpu = smp_processor_id();
3644 rq = cpu_rq(cpu); 3814 rq = cpu_rq(cpu);
@@ -3647,11 +3817,8 @@ need_resched_nonpreemptible:
3647 3817
3648 post_schedule(rq); 3818 post_schedule(rq);
3649 3819
3650 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3820 if (unlikely(reacquire_kernel_lock(prev)))
3651 prev = rq->curr;
3652 switch_count = &prev->nivcsw;
3653 goto need_resched_nonpreemptible; 3821 goto need_resched_nonpreemptible;
3654 }
3655 3822
3656 preempt_enable_no_resched(); 3823 preempt_enable_no_resched();
3657 if (need_resched()) 3824 if (need_resched())
@@ -3704,8 +3871,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3704 /* 3871 /*
3705 * Owner changed, break to re-assess state. 3872 * Owner changed, break to re-assess state.
3706 */ 3873 */
3707 if (lock->owner != owner) 3874 if (lock->owner != owner) {
3875 /*
3876 * If the lock has switched to a different owner,
3877 * we likely have heavy contention. Return 0 to quit
3878 * optimistic spinning and not contend further:
3879 */
3880 if (lock->owner)
3881 return 0;
3708 break; 3882 break;
3883 }
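
The effect of the early return above, going by the surrounding spin loop: if ownership moved to another task the mutex is heavily contended, so returning 0 makes the caller stop optimistic spinning and block, while an owner of NULL means the lock was released, so the existing break path re-assesses state and the caller can typically take the lock. A compressed view of that decision (annotation only, mirroring the code above):

if (lock->owner != owner) {     /* ownership changed while we were spinning */
        if (lock->owner)        /* ...to another task: contended, give up spinning */
                return 0;
        break;                  /* ...to NULL: released, break and re-assess */
}
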
3709 3884
3710 /* 3885 /*
3711 * Is that owner really running on that cpu? 3886 * Is that owner really running on that cpu?
@@ -3726,7 +3901,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3726 * off of preempt_enable. Kernel preemptions off return from interrupt 3901 * off of preempt_enable. Kernel preemptions off return from interrupt
3727 * occur there and call schedule directly. 3902 * occur there and call schedule directly.
3728 */ 3903 */
3729asmlinkage void __sched preempt_schedule(void) 3904asmlinkage void __sched notrace preempt_schedule(void)
3730{ 3905{
3731 struct thread_info *ti = current_thread_info(); 3906 struct thread_info *ti = current_thread_info();
3732 3907
@@ -3738,9 +3913,9 @@ asmlinkage void __sched preempt_schedule(void)
3738 return; 3913 return;
3739 3914
3740 do { 3915 do {
3741 add_preempt_count(PREEMPT_ACTIVE); 3916 add_preempt_count_notrace(PREEMPT_ACTIVE);
3742 schedule(); 3917 schedule();
3743 sub_preempt_count(PREEMPT_ACTIVE); 3918 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3744 3919
3745 /* 3920 /*
3746 * Check again in case we missed a preemption opportunity 3921 * Check again in case we missed a preemption opportunity
@@ -4441,12 +4616,8 @@ recheck:
4441 */ 4616 */
4442 if (user && !capable(CAP_SYS_NICE)) { 4617 if (user && !capable(CAP_SYS_NICE)) {
4443 if (rt_policy(policy)) { 4618 if (rt_policy(policy)) {
4444 unsigned long rlim_rtprio; 4619 unsigned long rlim_rtprio =
4445 4620 task_rlimit(p, RLIMIT_RTPRIO);
4446 if (!lock_task_sighand(p, &flags))
4447 return -ESRCH;
4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4449 unlock_task_sighand(p, &flags);
4450 4621
4451 /* can't set/change the rt policy */ 4622 /* can't set/change the rt policy */
4452 if (policy != p->policy && !rlim_rtprio) 4623 if (policy != p->policy && !rlim_rtprio)
@@ -5816,20 +5987,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5816 */ 5987 */
5817static struct notifier_block __cpuinitdata migration_notifier = { 5988static struct notifier_block __cpuinitdata migration_notifier = {
5818 .notifier_call = migration_call, 5989 .notifier_call = migration_call,
5819 .priority = 10 5990 .priority = CPU_PRI_MIGRATION,
5820}; 5991};
5821 5992
5993static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5994 unsigned long action, void *hcpu)
5995{
5996 switch (action & ~CPU_TASKS_FROZEN) {
5997 case CPU_ONLINE:
5998 case CPU_DOWN_FAILED:
5999 set_cpu_active((long)hcpu, true);
6000 return NOTIFY_OK;
6001 default:
6002 return NOTIFY_DONE;
6003 }
6004}
6005
6006static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6007 unsigned long action, void *hcpu)
6008{
6009 switch (action & ~CPU_TASKS_FROZEN) {
6010 case CPU_DOWN_PREPARE:
6011 set_cpu_active((long)hcpu, false);
6012 return NOTIFY_OK;
6013 default:
6014 return NOTIFY_DONE;
6015 }
6016}
6017
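
All four new hotplug notifiers in this patch (the two above and the two cpuset ones further down) mask off CPU_TASKS_FROZEN before switching on the action. To the best of my recollection the *_FROZEN events in <linux/cpu.h> are simply the base events OR'ed with CPU_TASKS_FROZEN, so the mask lets one case label handle both the runtime and the suspend/resume variant of each event; schematically:

/* Schematic only; relies on CPU_ONLINE_FROZEN == (CPU_ONLINE | CPU_TASKS_FROZEN). */
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:                /* matches CPU_ONLINE and CPU_ONLINE_FROZEN */
case CPU_DOWN_FAILED:           /* matches CPU_DOWN_FAILED and its _FROZEN variant */
        set_cpu_active((long)hcpu, true);
        return NOTIFY_OK;
default:
        return NOTIFY_DONE;
}
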
5822static int __init migration_init(void) 6018static int __init migration_init(void)
5823{ 6019{
5824 void *cpu = (void *)(long)smp_processor_id(); 6020 void *cpu = (void *)(long)smp_processor_id();
5825 int err; 6021 int err;
5826 6022
5827 /* Start one for the boot CPU: */ 6023 /* Initialize migration for the boot CPU */
5828 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6024 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5829 BUG_ON(err == NOTIFY_BAD); 6025 BUG_ON(err == NOTIFY_BAD);
5830 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6026 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5831 register_cpu_notifier(&migration_notifier); 6027 register_cpu_notifier(&migration_notifier);
5832 6028
6029 /* Register cpu active notifiers */
6030 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6031 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6032
5833 return 0; 6033 return 0;
5834} 6034}
5835early_initcall(migration_init); 6035early_initcall(migration_init);
@@ -6064,23 +6264,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6064 free_rootdomain(old_rd); 6264 free_rootdomain(old_rd);
6065} 6265}
6066 6266
6067static int init_rootdomain(struct root_domain *rd, bool bootmem) 6267static int init_rootdomain(struct root_domain *rd)
6068{ 6268{
6069 gfp_t gfp = GFP_KERNEL;
6070
6071 memset(rd, 0, sizeof(*rd)); 6269 memset(rd, 0, sizeof(*rd));
6072 6270
6073 if (bootmem) 6271 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6074 gfp = GFP_NOWAIT;
6075
6076 if (!alloc_cpumask_var(&rd->span, gfp))
6077 goto out; 6272 goto out;
6078 if (!alloc_cpumask_var(&rd->online, gfp)) 6273 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6079 goto free_span; 6274 goto free_span;
6080 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6275 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6081 goto free_online; 6276 goto free_online;
6082 6277
6083 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6278 if (cpupri_init(&rd->cpupri) != 0)
6084 goto free_rto_mask; 6279 goto free_rto_mask;
6085 return 0; 6280 return 0;
6086 6281
@@ -6096,7 +6291,7 @@ out:
6096 6291
6097static void init_defrootdomain(void) 6292static void init_defrootdomain(void)
6098{ 6293{
6099 init_rootdomain(&def_root_domain, true); 6294 init_rootdomain(&def_root_domain);
6100 6295
6101 atomic_set(&def_root_domain.refcount, 1); 6296 atomic_set(&def_root_domain.refcount, 1);
6102} 6297}
@@ -6109,7 +6304,7 @@ static struct root_domain *alloc_rootdomain(void)
6109 if (!rd) 6304 if (!rd)
6110 return NULL; 6305 return NULL;
6111 6306
6112 if (init_rootdomain(rd, false) != 0) { 6307 if (init_rootdomain(rd) != 0) {
6113 kfree(rd); 6308 kfree(rd);
6114 return NULL; 6309 return NULL;
6115 } 6310 }
@@ -7288,29 +7483,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7288} 7483}
7289#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7484#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7290 7485
7291#ifndef CONFIG_CPUSETS
7292/* 7486/*
7293 * Add online and remove offline CPUs from the scheduler domains. 7487 * Update cpusets according to cpu_active mask. If cpusets are
7294 * When cpusets are enabled they take over this function. 7488 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7489 * around partition_sched_domains().
7295 */ 7490 */
7296static int update_sched_domains(struct notifier_block *nfb, 7491static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7297 unsigned long action, void *hcpu) 7492 void *hcpu)
7298{ 7493{
7299 switch (action) { 7494 switch (action & ~CPU_TASKS_FROZEN) {
7300 case CPU_ONLINE: 7495 case CPU_ONLINE:
7301 case CPU_ONLINE_FROZEN:
7302 case CPU_DOWN_PREPARE:
7303 case CPU_DOWN_PREPARE_FROZEN:
7304 case CPU_DOWN_FAILED: 7496 case CPU_DOWN_FAILED:
7305 case CPU_DOWN_FAILED_FROZEN: 7497 cpuset_update_active_cpus();
7306 partition_sched_domains(1, NULL, NULL);
7307 return NOTIFY_OK; 7498 return NOTIFY_OK;
7499 default:
7500 return NOTIFY_DONE;
7501 }
7502}
7308 7503
7504static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7505 void *hcpu)
7506{
7507 switch (action & ~CPU_TASKS_FROZEN) {
7508 case CPU_DOWN_PREPARE:
7509 cpuset_update_active_cpus();
7510 return NOTIFY_OK;
7309 default: 7511 default:
7310 return NOTIFY_DONE; 7512 return NOTIFY_DONE;
7311 } 7513 }
7312} 7514}
7313#endif
7314 7515
7315static int update_runtime(struct notifier_block *nfb, 7516static int update_runtime(struct notifier_block *nfb,
7316 unsigned long action, void *hcpu) 7517 unsigned long action, void *hcpu)
@@ -7356,10 +7557,8 @@ void __init sched_init_smp(void)
7356 mutex_unlock(&sched_domains_mutex); 7557 mutex_unlock(&sched_domains_mutex);
7357 put_online_cpus(); 7558 put_online_cpus();
7358 7559
7359#ifndef CONFIG_CPUSETS 7560 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7360 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7561 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7361 hotcpu_notifier(update_sched_domains, 0);
7362#endif
7363 7562
7364 /* RT runtime code needs to handle some hotplug events */ 7563 /* RT runtime code needs to handle some hotplug events */
7365 hotcpu_notifier(update_runtime, 0); 7564 hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +7803,9 @@ void __init sched_init(void)
7604 7803
7605 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7804 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7606 rq->cpu_load[j] = 0; 7805 rq->cpu_load[j] = 0;
7806
7807 rq->last_load_update_tick = jiffies;
7808
7607#ifdef CONFIG_SMP 7809#ifdef CONFIG_SMP
7608 rq->sd = NULL; 7810 rq->sd = NULL;
7609 rq->rd = NULL; 7811 rq->rd = NULL;
@@ -7617,6 +7819,10 @@ void __init sched_init(void)
7617 rq->idle_stamp = 0; 7819 rq->idle_stamp = 0;
7618 rq->avg_idle = 2*sysctl_sched_migration_cost; 7820 rq->avg_idle = 2*sysctl_sched_migration_cost;
7619 rq_attach_root(rq, &def_root_domain); 7821 rq_attach_root(rq, &def_root_domain);
7822#ifdef CONFIG_NO_HZ
7823 rq->nohz_balance_kick = 0;
7824 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7825#endif
7620#endif 7826#endif
7621 init_rq_hrtick(rq); 7827 init_rq_hrtick(rq);
7622 atomic_set(&rq->nr_iowait, 0); 7828 atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +7867,11 @@ void __init sched_init(void)
7661 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7867 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7662#ifdef CONFIG_SMP 7868#ifdef CONFIG_SMP
7663#ifdef CONFIG_NO_HZ 7869#ifdef CONFIG_NO_HZ
7664 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7870 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7665 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7871 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
7872 atomic_set(&nohz.load_balancer, nr_cpu_ids);
7873 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
7874 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7666#endif 7875#endif
7667 /* May be allocated at isolcpus cmdline parse time */ 7876 /* May be allocated at isolcpus cmdline parse time */
7668 if (cpu_isolated_map == NULL) 7877 if (cpu_isolated_map == NULL)