Diffstat (limited to 'kernel/sched.c')
 kernel/sched.c | 381
 1 file changed, 293 insertions(+), 88 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 63b4a14682fa..f6c9bb6ac70b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
+#include "workqueue_sched.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -456,9 +457,10 @@ struct rq {
 	unsigned long nr_running;
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+	unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ
 	u64 nohz_stamp;
-	unsigned char in_nohz_recently;
+	unsigned char nohz_balance_kick;
 #endif
 	unsigned int skip_clock_update;
 
@@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu)
 
 #ifdef CONFIG_NO_HZ
 /*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int cpu = smp_processor_id();
+	int i;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd))
+			if (!idle_cpu(i))
+				return i;
+	}
+	return cpu;
+}
+/*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
  * which is scheduled to wake up that CPU. In case of a completely
@@ -1642,7 +1665,7 @@ static void update_shares(struct sched_domain *sd)
 	if (root_task_group_empty())
 		return;
 
-	now = cpu_clock(raw_smp_processor_id());
+	now = local_clock();
 	elapsed = now - sd->last_update;
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1795,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
+static void update_cpu_load(struct rq *this_rq);
 
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
@@ -2257,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample)
 }
 #endif
 
-/***
+static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+				 bool is_sync, bool is_migrate, bool is_local,
+				 unsigned long en_flags)
+{
+	schedstat_inc(p, se.statistics.nr_wakeups);
+	if (is_sync)
+		schedstat_inc(p, se.statistics.nr_wakeups_sync);
+	if (is_migrate)
+		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+	if (is_local)
+		schedstat_inc(p, se.statistics.nr_wakeups_local);
+	else
+		schedstat_inc(p, se.statistics.nr_wakeups_remote);
+
+	activate_task(rq, p, en_flags);
+}
+
+static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+					int wake_flags, bool success)
+{
+	trace_sched_wakeup(p, success);
+	check_preempt_curr(rq, p, wake_flags);
+
+	p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+	if (p->sched_class->task_woken)
+		p->sched_class->task_woken(rq, p);
+
+	if (unlikely(rq->idle_stamp)) {
+		u64 delta = rq->clock - rq->idle_stamp;
+		u64 max = 2*sysctl_sched_migration_cost;
+
+		if (delta > max)
+			rq->avg_idle = max;
+		else
+			update_avg(&rq->avg_idle, delta);
+		rq->idle_stamp = 0;
+	}
+#endif
+	/* if a worker is waking up, notify workqueue */
+	if ((p->flags & PF_WQ_WORKER) && success)
+		wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/**
  * try_to_wake_up - wake up a thread
- * @p: the to-be-woken-up thread
+ * @p: the thread to be awakened
  * @state: the mask of task states that can be woken
- * @sync: do a synchronous wakeup?
+ * @wake_flags: wake modifier flags (WF_*)
  *
  * Put it on the run-queue if it's not already there. The "current"
  * thread is always on the run-queue (except when the actual
@@ -2269,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample)
  * the simpler "current->state = TASK_RUNNING" to mark yourself
  * runnable without the overhead of this.
  *
- * returns failure only if the task is already active.
+ * Returns %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
  */
 static int try_to_wake_up(struct task_struct *p, unsigned int state,
 			  int wake_flags)
@@ -2349,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 
 out_activate:
 #endif /* CONFIG_SMP */
-	schedstat_inc(p, se.statistics.nr_wakeups);
-	if (wake_flags & WF_SYNC)
-		schedstat_inc(p, se.statistics.nr_wakeups_sync);
-	if (orig_cpu != cpu)
-		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-	if (cpu == this_cpu)
-		schedstat_inc(p, se.statistics.nr_wakeups_local);
-	else
-		schedstat_inc(p, se.statistics.nr_wakeups_remote);
-	activate_task(rq, p, en_flags);
+	ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
+		      cpu == this_cpu, en_flags);
 	success = 1;
-
 out_running:
-	trace_sched_wakeup(p, success);
-	check_preempt_curr(rq, p, wake_flags);
-
-	p->state = TASK_RUNNING;
-#ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
-		p->sched_class->task_woken(rq, p);
-
-	if (unlikely(rq->idle_stamp)) {
-		u64 delta = rq->clock - rq->idle_stamp;
-		u64 max = 2*sysctl_sched_migration_cost;
-
-		if (delta > max)
-			rq->avg_idle = max;
-		else
-			update_avg(&rq->avg_idle, delta);
-		rq->idle_stamp = 0;
-	}
-#endif
+	ttwu_post_activation(p, rq, wake_flags, success);
 out:
 	task_rq_unlock(rq, &flags);
 	put_cpu();
@@ -2389,6 +2431,37 @@ out:
 }
 
 /**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not alredy there. The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task. this_rq() stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	bool success = false;
+
+	BUG_ON(rq != this_rq());
+	BUG_ON(p == current);
+	lockdep_assert_held(&rq->lock);
+
+	if (!(p->state & TASK_NORMAL))
+		return;
+
+	if (!p->se.on_rq) {
+		if (likely(!task_running(rq, p))) {
+			schedstat_inc(rq, ttwu_count);
+			schedstat_inc(rq, ttwu_local);
+		}
+		ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
+		success = true;
+	}
+	ttwu_post_activation(p, rq, 0, success);
+}
+
+/**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
  *
@@ -3002,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq)
 }
 
 /*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT		7
+static const unsigned char
+		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+					{0, 0, 0, 0, 0, 0, 0, 0},
+					{64, 32, 8, 0, 0, 0, 0, 0},
+					{96, 72, 40, 12, 1, 0, 0},
+					{112, 98, 75, 43, 15, 1, 0},
+					{120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+	int j = 0;
+
+	if (!missed_updates)
+		return load;
+
+	if (missed_updates >= degrade_zero_ticks[idx])
+		return 0;
+
+	if (idx == 1)
+		return load >> missed_updates;
+
+	while (missed_updates) {
+		if (missed_updates % 2)
+			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+		missed_updates >>= 1;
+		j++;
+	}
+	return load;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
  */
 static void update_cpu_load(struct rq *this_rq)
 {
 	unsigned long this_load = this_rq->load.weight;
+	unsigned long curr_jiffies = jiffies;
+	unsigned long pending_updates;
 	int i, scale;
 
 	this_rq->nr_load_updates++;
 
+	/* Avoid repeated calls on same jiffy, when moving in and out of idle */
+	if (curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+
 	/* Update our load: */
-	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
 		unsigned long old_load, new_load;
 
 		/* scale is effectively 1 << i now, and >> i divides by scale */
 
 		old_load = this_rq->cpu_load[i];
+		old_load = decay_load_missed(old_load, pending_updates - 1, i);
 		new_load = this_load;
 		/*
 		 * Round up the averaging division if load is increasing. This
@@ -3026,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq)
 		 * example.
 		 */
 		if (new_load > old_load)
-			new_load += scale-1;
-		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+			new_load += scale - 1;
+
+		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
 	}
+}
+
+static void update_cpu_load_active(struct rq *this_rq)
+{
+	update_cpu_load(this_rq);
 
 	calc_load_account_active(this_rq);
 }
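For reference, the table-driven decay introduced above can be checked outside the kernel. The stand-alone user-space sketch below is not part of the patch; the build line, the file name and main() are assumptions made for illustration. It replays the decay_load_missed() logic with the same degrade_factor table and prints the result next to the exact factor ((2^idx - 1) / 2^idx)^missed that the comment in the hunk refers to.

/*
 * Stand-alone user-space sketch -- NOT part of the patch above.
 * Replays the kernel's table-driven decay and compares it with the
 * exact factor ((2^idx - 1) / 2^idx)^missed for idx = 2.
 * Assumed build line:  gcc -O2 -o decay decay.c -lm
 */
#include <stdio.h>
#include <math.h>

#define DEGRADE_SHIFT		7
#define CPU_LOAD_IDX_MAX	5

static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
	{0, 8, 32, 64, 128};
static const unsigned char degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0},
	{112, 98, 75, 43, 15, 1, 0},
	{120, 112, 98, 76, 45, 16, 2} };

/* Same algorithm as decay_load_missed() in the hunk above. */
static unsigned long decay_load_missed(unsigned long load,
				       unsigned long missed, int idx)
{
	int j = 0;

	if (!missed)
		return load;
	if (missed >= degrade_zero_ticks[idx])
		return 0;
	if (idx == 1)
		return load >> missed;

	/* One multiply+shift per set bit of 'missed' instead of one per tick. */
	while (missed) {
		if (missed % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	unsigned long load = 1024, missed;
	int idx = 2;

	for (missed = 1; missed <= 8; missed++)
		printf("missed=%lu  table=%4lu  exact=%6.1f\n", missed,
		       decay_load_missed(load, missed, idx),
		       load * pow(3.0 / 4.0, (double)missed));
	return 0;
}

At idx 2 and 8 missed ticks this prints table=96 against exact=102.5, which is the 12/128 versus 3^8/4^8 approximation mentioned in the comment above.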
@@ -3416,7 +3574,7 @@ void scheduler_tick(void)
 
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
-	update_cpu_load(rq);
+	update_cpu_load_active(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	raw_spin_unlock(&rq->lock);
 
@@ -3588,7 +3746,6 @@ need_resched:
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch(cpu);
 	prev = rq->curr;
-	switch_count = &prev->nivcsw;
 
 	release_kernel_lock(prev);
 need_resched_nonpreemptible:
@@ -3601,11 +3758,26 @@ need_resched_nonpreemptible:
 	raw_spin_lock_irq(&rq->lock);
 	clear_tsk_need_resched(prev);
 
+	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-		if (unlikely(signal_pending_state(prev->state, prev)))
+		if (unlikely(signal_pending_state(prev->state, prev))) {
 			prev->state = TASK_RUNNING;
-		else
+		} else {
+			/*
+			 * If a worker is going to sleep, notify and
+			 * ask workqueue whether it wants to wake up a
+			 * task to maintain concurrency. If so, wake
+			 * up the task.
+			 */
+			if (prev->flags & PF_WQ_WORKER) {
+				struct task_struct *to_wakeup;
+
+				to_wakeup = wq_worker_sleeping(prev, cpu);
+				if (to_wakeup)
+					try_to_wake_up_local(to_wakeup);
+			}
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
+		}
 		switch_count = &prev->nvcsw;
 	}
 
@@ -3627,8 +3799,10 @@ need_resched_nonpreemptible:
 
 	context_switch(rq, prev, next); /* unlocks the rq */
 	/*
-	 * the context switch might have flipped the stack from under
-	 * us, hence refresh the local variables.
+	 * The context switch have flipped the stack from under us
+	 * and restored the local variables which were saved when
+	 * this task called schedule() in the past. prev == current
+	 * is still correct, but it can be moved to another cpu/rq.
 	 */
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
@@ -3637,11 +3811,8 @@ need_resched_nonpreemptible:
 
 	post_schedule(rq);
 
-	if (unlikely(reacquire_kernel_lock(current) < 0)) {
-		prev = rq->curr;
-		switch_count = &prev->nivcsw;
+	if (unlikely(reacquire_kernel_lock(prev)))
 		goto need_resched_nonpreemptible;
-	}
 
 	preempt_enable_no_resched();
 	if (need_resched())
@@ -4431,12 +4602,8 @@ recheck:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
 		if (rt_policy(policy)) {
-			unsigned long rlim_rtprio;
-
-			if (!lock_task_sighand(p, &flags))
-				return -ESRCH;
-			rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
-			unlock_task_sighand(p, &flags);
+			unsigned long rlim_rtprio =
+					task_rlimit(p, RLIMIT_RTPRIO);
 
 			/* can't set/change the rt policy */
 			if (policy != p->policy && !rlim_rtprio)
@@ -5806,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
-	.priority = 10
+	.priority = CPU_PRI_MIGRATION,
 };
 
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+				      unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		set_cpu_active((long)hcpu, true);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active((long)hcpu, false);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 
-	/* Start one for the boot CPU: */
+	/* Initialize migration for the boot CPU */
 	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
 	return 0;
 }
 early_initcall(migration_init);
@@ -6054,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	free_rootdomain(old_rd);
 }
 
-static int init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd)
 {
-	gfp_t gfp = GFP_KERNEL;
-
 	memset(rd, 0, sizeof(*rd));
 
-	if (bootmem)
-		gfp = GFP_NOWAIT;
-
-	if (!alloc_cpumask_var(&rd->span, gfp))
+	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
 		goto out;
-	if (!alloc_cpumask_var(&rd->online, gfp))
+	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
+	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
 		goto free_online;
 
-	if (cpupri_init(&rd->cpupri, bootmem) != 0)
+	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_rto_mask;
 	return 0;
 
@@ -6086,7 +6277,7 @@ out:
 
 static void init_defrootdomain(void)
 {
-	init_rootdomain(&def_root_domain, true);
+	init_rootdomain(&def_root_domain);
 
 	atomic_set(&def_root_domain.refcount, 1);
 }
@@ -6099,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void)
 	if (!rd)
 		return NULL;
 
-	if (init_rootdomain(rd, false) != 0) {
+	if (init_rootdomain(rd) != 0) {
 		kfree(rd);
 		return NULL;
 	}
@@ -7278,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-#ifndef CONFIG_CPUSETS
 /*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
  */
-static int update_sched_domains(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+			     void *hcpu)
 {
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		partition_sched_domains(1, NULL, NULL);
+		cpuset_update_active_cpus();
 		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
 
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+			       void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
 	default:
 		return NOTIFY_DONE;
 	}
 }
-#endif
 
 static int update_runtime(struct notifier_block *nfb,
 			  unsigned long action, void *hcpu)
@@ -7346,10 +7543,8 @@ void __init sched_init_smp(void)
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
 
-#ifndef CONFIG_CPUSETS
-	/* XXX: Theoretical race here - CPU may be hotplugged now */
-	hotcpu_notifier(update_sched_domains, 0);
-#endif
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 
 	/* RT runtime code needs to handle some hotplug events */
 	hotcpu_notifier(update_runtime, 0);
@@ -7594,6 +7789,9 @@ void __init sched_init(void)
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
+
+		rq->last_load_update_tick = jiffies;
+
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
@@ -7607,6 +7805,10 @@ void __init sched_init(void)
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+		rq->nohz_balance_kick = 0;
+		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
@@ -7651,8 +7853,11 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
-	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+	atomic_set(&nohz.load_balancer, nr_cpu_ids);
+	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)