author     Linus Torvalds <torvalds@linux-foundation.org>   2012-03-20 13:31:44 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-03-20 13:31:44 -0400
commit     2ba68940c893c8f0bfc8573c041254251bb6aeab (patch)
tree       fa83ebb01d32abd98123fa28f9f6f0b3eaeee25d /kernel
parent     9c2b957db1772ebf942ae7a9346b14eba6c8ca66 (diff)
parent     600e145882802d6ccbfe2c4aea243d97caeb91a9 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes for v3.4 from Ingo Molnar
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (27 commits)
printk: Make it compile with !CONFIG_PRINTK
sched/x86: Fix overflow in cyc2ns_offset
sched: Fix nohz load accounting -- again!
sched: Update yield() docs
printk/sched: Introduce special printk_sched() for those awkward moments
sched/nohz: Correctly initialize 'next_balance' in 'nohz' idle balancer
sched: Cleanup cpu_active madness
sched: Fix load-balance wreckage
sched: Clean up parameter passing of proc_sched_autogroup_set_nice()
sched: Ditch per cgroup task lists for load-balancing
sched: Rename load-balancing fields
sched: Move load-balancing arguments into helper struct
sched/rt: Do not submit new work when PI-blocked
sched/rt: Prevent idle task boosting
sched/wait: Add __wake_up_all_locked() API
sched/rt: Document scheduler related skip-resched-check sites
sched/rt: Use schedule_preempt_disabled()
sched/rt: Add schedule_preempt_disabled()
sched/rt: Do not throttle when PI boosting
sched/rt: Keep period timer ticking when rt throttling is active
...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/mutex.c            |   4
-rw-r--r--  kernel/printk.c           |  40
-rw-r--r--  kernel/sched/auto_group.c |  12
-rw-r--r--  kernel/sched/core.c       | 129
-rw-r--r--  kernel/sched/debug.c      |   1
-rw-r--r--  kernel/sched/fair.c       | 392
-rw-r--r--  kernel/sched/rt.c         |  43
-rw-r--r--  kernel/sched/sched.h      |  15
-rw-r--r--  kernel/sched/stats.c      |   4
-rw-r--r--  kernel/softirq.c          |   8
10 files changed, 348 insertions(+), 300 deletions(-)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 89096dd8786f..a307cc9c9526 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
240 | 240 | ||
241 | /* didn't get the lock, go to sleep: */ | 241 | /* didn't get the lock, go to sleep: */ |
242 | spin_unlock_mutex(&lock->wait_lock, flags); | 242 | spin_unlock_mutex(&lock->wait_lock, flags); |
243 | preempt_enable_no_resched(); | 243 | schedule_preempt_disabled(); |
244 | schedule(); | ||
245 | preempt_disable(); | ||
246 | spin_lock_mutex(&lock->wait_lock, flags); | 244 | spin_lock_mutex(&lock->wait_lock, flags); |
247 | } | 245 | } |
248 | 246 | ||
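The hunk above is one of several call sites (ksoftirqd in kernel/softirq.c below is another) that used to open-code the drop-preemption-then-schedule dance; the new schedule_preempt_disabled() helper added further down in kernel/sched/core.c folds it into a single call. A minimal sketch of the conversion pattern, with a hypothetical kthread loop standing in for the real callers:

    /* Hypothetical caller, shown only to illustrate the pattern. */
    static int example_thread(void *unused)
    {
    	preempt_disable();
    	while (!kthread_should_stop()) {
    		/*
    		 * Old pattern, open-coded in each caller:
    		 *   preempt_enable_no_resched();
    		 *   schedule();
    		 *   preempt_disable();
    		 */
    		schedule_preempt_disabled();	/* new: same effect, one call */
    	}
    	preempt_enable();
    	return 0;
    }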
diff --git a/kernel/printk.c b/kernel/printk.c
index 0b3ea2cbd5fb..b663c2c95d39 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1216,13 +1216,27 @@ int is_console_locked(void) | |||
1216 | return console_locked; | 1216 | return console_locked; |
1217 | } | 1217 | } |
1218 | 1218 | ||
1219 | /* | ||
1220 | * Delayed printk facility, for scheduler-internal messages: | ||
1221 | */ | ||
1222 | #define PRINTK_BUF_SIZE 512 | ||
1223 | |||
1224 | #define PRINTK_PENDING_WAKEUP 0x01 | ||
1225 | #define PRINTK_PENDING_SCHED 0x02 | ||
1226 | |||
1219 | static DEFINE_PER_CPU(int, printk_pending); | 1227 | static DEFINE_PER_CPU(int, printk_pending); |
1228 | static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); | ||
1220 | 1229 | ||
1221 | void printk_tick(void) | 1230 | void printk_tick(void) |
1222 | { | 1231 | { |
1223 | if (__this_cpu_read(printk_pending)) { | 1232 | if (__this_cpu_read(printk_pending)) { |
1224 | __this_cpu_write(printk_pending, 0); | 1233 | int pending = __this_cpu_xchg(printk_pending, 0); |
1225 | wake_up_interruptible(&log_wait); | 1234 | if (pending & PRINTK_PENDING_SCHED) { |
1235 | char *buf = __get_cpu_var(printk_sched_buf); | ||
1236 | printk(KERN_WARNING "[sched_delayed] %s", buf); | ||
1237 | } | ||
1238 | if (pending & PRINTK_PENDING_WAKEUP) | ||
1239 | wake_up_interruptible(&log_wait); | ||
1226 | } | 1240 | } |
1227 | } | 1241 | } |
1228 | 1242 | ||
@@ -1236,7 +1250,7 @@ int printk_needs_cpu(int cpu) | |||
1236 | void wake_up_klogd(void) | 1250 | void wake_up_klogd(void) |
1237 | { | 1251 | { |
1238 | if (waitqueue_active(&log_wait)) | 1252 | if (waitqueue_active(&log_wait)) |
1239 | this_cpu_write(printk_pending, 1); | 1253 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
1240 | } | 1254 | } |
1241 | 1255 | ||
1242 | /** | 1256 | /** |
@@ -1629,6 +1643,26 @@ late_initcall(printk_late_init); | |||
1629 | 1643 | ||
1630 | #if defined CONFIG_PRINTK | 1644 | #if defined CONFIG_PRINTK |
1631 | 1645 | ||
1646 | int printk_sched(const char *fmt, ...) | ||
1647 | { | ||
1648 | unsigned long flags; | ||
1649 | va_list args; | ||
1650 | char *buf; | ||
1651 | int r; | ||
1652 | |||
1653 | local_irq_save(flags); | ||
1654 | buf = __get_cpu_var(printk_sched_buf); | ||
1655 | |||
1656 | va_start(args, fmt); | ||
1657 | r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); | ||
1658 | va_end(args); | ||
1659 | |||
1660 | __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); | ||
1661 | local_irq_restore(flags); | ||
1662 | |||
1663 | return r; | ||
1664 | } | ||
1665 | |||
1632 | /* | 1666 | /* |
1633 | * printk rate limiting, lifted from the networking subsystem. | 1667 | * printk rate limiting, lifted from the networking subsystem. |
1634 | * | 1668 | * |
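printk_sched() formats into a per-CPU buffer with interrupts disabled and only sets a pending flag; the actual console output happens later from printk_tick(), prefixed with "[sched_delayed]". That lets scheduler code that may hold rq->lock emit a message without re-entering the console and klogd wakeup paths. A sketch of a call site, mirroring the select_fallback_rq() conversion in kernel/sched/core.c below (surrounding context hypothetical):

    /* Somewhere in scheduler code, possibly with rq->lock held: */
    if (printk_ratelimit())
    	printk_sched("process %d (%s) no longer affine to cpu%d\n",
    		     task_pid_nr(p), p->comm, cpu);
    /* Printed from the next printk_tick() as:
     *   [sched_delayed] process <pid> (<comm>) no longer affine to cpu<N>
     */

Note the single per-CPU buffer: a second printk_sched() on the same CPU before the next tick simply overwrites the first message.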
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e8a1f83ee0e7..0984a21076a3 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup); | |||
195 | 195 | ||
196 | #ifdef CONFIG_PROC_FS | 196 | #ifdef CONFIG_PROC_FS |
197 | 197 | ||
198 | int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) | 198 | int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) |
199 | { | 199 | { |
200 | static unsigned long next = INITIAL_JIFFIES; | 200 | static unsigned long next = INITIAL_JIFFIES; |
201 | struct autogroup *ag; | 201 | struct autogroup *ag; |
202 | int err; | 202 | int err; |
203 | 203 | ||
204 | if (*nice < -20 || *nice > 19) | 204 | if (nice < -20 || nice > 19) |
205 | return -EINVAL; | 205 | return -EINVAL; |
206 | 206 | ||
207 | err = security_task_setnice(current, *nice); | 207 | err = security_task_setnice(current, nice); |
208 | if (err) | 208 | if (err) |
209 | return err; | 209 | return err; |
210 | 210 | ||
211 | if (*nice < 0 && !can_nice(current, *nice)) | 211 | if (nice < 0 && !can_nice(current, nice)) |
212 | return -EPERM; | 212 | return -EPERM; |
213 | 213 | ||
214 | /* this is a heavy operation taking global locks.. */ | 214 | /* this is a heavy operation taking global locks.. */ |
@@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) | |||
219 | ag = autogroup_task_get(p); | 219 | ag = autogroup_task_get(p); |
220 | 220 | ||
221 | down_write(&ag->lock); | 221 | down_write(&ag->lock); |
222 | err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); | 222 | err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); |
223 | if (!err) | 223 | if (!err) |
224 | ag->nice = *nice; | 224 | ag->nice = nice; |
225 | up_write(&ag->lock); | 225 | up_write(&ag->lock); |
226 | 226 | ||
227 | autogroup_kref_put(ag); | 227 | autogroup_kref_put(ag); |
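With the pointer indirection gone, the caller (the /proc handler, which lives outside this kernel/-limited diff) now passes the nice value directly. A one-line sketch of the calling-convention change, caller context hypothetical:

    int nice = -5;					/* hypothetical value */
    err = proc_sched_autogroup_set_nice(p, nice);	/* was: (p, &nice) */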
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6c41ba49767a..d2bd4647586c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1284,7 +1284,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1284 | * leave kernel. | 1284 | * leave kernel. |
1285 | */ | 1285 | */ |
1286 | if (p->mm && printk_ratelimit()) { | 1286 | if (p->mm && printk_ratelimit()) { |
1287 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", | 1287 | printk_sched("process %d (%s) no longer affine to cpu%d\n", |
1288 | task_pid_nr(p), p->comm, cpu); | 1288 | task_pid_nr(p), p->comm, cpu); |
1289 | } | 1289 | } |
1290 | 1290 | ||
@@ -1507,7 +1507,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | |||
1507 | } | 1507 | } |
1508 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 1508 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
1509 | 1509 | ||
1510 | static inline int ttwu_share_cache(int this_cpu, int that_cpu) | 1510 | bool cpus_share_cache(int this_cpu, int that_cpu) |
1511 | { | 1511 | { |
1512 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | 1512 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
1513 | } | 1513 | } |
@@ -1518,7 +1518,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) | |||
1518 | struct rq *rq = cpu_rq(cpu); | 1518 | struct rq *rq = cpu_rq(cpu); |
1519 | 1519 | ||
1520 | #if defined(CONFIG_SMP) | 1520 | #if defined(CONFIG_SMP) |
1521 | if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { | 1521 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { |
1522 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | 1522 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ |
1523 | ttwu_queue_remote(p, cpu); | 1523 | ttwu_queue_remote(p, cpu); |
1524 | return; | 1524 | return; |
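cpus_share_cache() is the former ttwu_share_cache() made non-static and given a bool return, so code outside this file can ask whether two CPUs hang off the same last-level-cache domain. A hedged sketch of a hypothetical consumer (function name and policy are made up):

    /* Prefer a CPU that shares a cache with the submitter, else stay put. */
    static int pick_nearby_cpu(int submit_cpu, int candidate_cpu)
    {
    	if (cpus_share_cache(submit_cpu, candidate_cpu))
    		return candidate_cpu;	/* same LLC: cheap cross-wakeup */
    	return submit_cpu;
    }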
@@ -2266,13 +2266,10 @@ calc_load_n(unsigned long load, unsigned long exp, | |||
2266 | * Once we've updated the global active value, we need to apply the exponential | 2266 | * Once we've updated the global active value, we need to apply the exponential |
2267 | * weights adjusted to the number of cycles missed. | 2267 | * weights adjusted to the number of cycles missed. |
2268 | */ | 2268 | */ |
2269 | static void calc_global_nohz(unsigned long ticks) | 2269 | static void calc_global_nohz(void) |
2270 | { | 2270 | { |
2271 | long delta, active, n; | 2271 | long delta, active, n; |
2272 | 2272 | ||
2273 | if (time_before(jiffies, calc_load_update)) | ||
2274 | return; | ||
2275 | |||
2276 | /* | 2273 | /* |
2277 | * If we crossed a calc_load_update boundary, make sure to fold | 2274 | * If we crossed a calc_load_update boundary, make sure to fold |
2278 | * any pending idle changes, the respective CPUs might have | 2275 | * any pending idle changes, the respective CPUs might have |
@@ -2284,31 +2281,25 @@ static void calc_global_nohz(unsigned long ticks) | |||
2284 | atomic_long_add(delta, &calc_load_tasks); | 2281 | atomic_long_add(delta, &calc_load_tasks); |
2285 | 2282 | ||
2286 | /* | 2283 | /* |
2287 | * If we were idle for multiple load cycles, apply them. | 2284 | * It could be the one fold was all it took, we done! |
2288 | */ | 2285 | */ |
2289 | if (ticks >= LOAD_FREQ) { | 2286 | if (time_before(jiffies, calc_load_update + 10)) |
2290 | n = ticks / LOAD_FREQ; | 2287 | return; |
2291 | 2288 | ||
2292 | active = atomic_long_read(&calc_load_tasks); | 2289 | /* |
2293 | active = active > 0 ? active * FIXED_1 : 0; | 2290 | * Catch-up, fold however many we are behind still |
2291 | */ | ||
2292 | delta = jiffies - calc_load_update - 10; | ||
2293 | n = 1 + (delta / LOAD_FREQ); | ||
2294 | 2294 | ||
2295 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | 2295 | active = atomic_long_read(&calc_load_tasks); |
2296 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | 2296 | active = active > 0 ? active * FIXED_1 : 0; |
2297 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2298 | 2297 | ||
2299 | calc_load_update += n * LOAD_FREQ; | 2298 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
2300 | } | 2299 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
2300 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2301 | 2301 | ||
2302 | /* | 2302 | calc_load_update += n * LOAD_FREQ; |
2303 | * Its possible the remainder of the above division also crosses | ||
2304 | * a LOAD_FREQ period, the regular check in calc_global_load() | ||
2305 | * which comes after this will take care of that. | ||
2306 | * | ||
2307 | * Consider us being 11 ticks before a cycle completion, and us | ||
2308 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
2309 | * age us 4 cycles, and the test in calc_global_load() will | ||
2310 | * pick up the final one. | ||
2311 | */ | ||
2312 | } | 2303 | } |
2313 | #else | 2304 | #else |
2314 | void calc_load_account_idle(struct rq *this_rq) | 2305 | void calc_load_account_idle(struct rq *this_rq) |
@@ -2320,7 +2311,7 @@ static inline long calc_load_fold_idle(void) | |||
2320 | return 0; | 2311 | return 0; |
2321 | } | 2312 | } |
2322 | 2313 | ||
2323 | static void calc_global_nohz(unsigned long ticks) | 2314 | static void calc_global_nohz(void) |
2324 | { | 2315 | { |
2325 | } | 2316 | } |
2326 | #endif | 2317 | #endif |
@@ -2348,8 +2339,6 @@ void calc_global_load(unsigned long ticks) | |||
2348 | { | 2339 | { |
2349 | long active; | 2340 | long active; |
2350 | 2341 | ||
2351 | calc_global_nohz(ticks); | ||
2352 | |||
2353 | if (time_before(jiffies, calc_load_update + 10)) | 2342 | if (time_before(jiffies, calc_load_update + 10)) |
2354 | return; | 2343 | return; |
2355 | 2344 | ||
@@ -2361,6 +2350,16 @@ void calc_global_load(unsigned long ticks) | |||
2361 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | 2350 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); |
2362 | 2351 | ||
2363 | calc_load_update += LOAD_FREQ; | 2352 | calc_load_update += LOAD_FREQ; |
2353 | |||
2354 | /* | ||
2355 | * Account one period with whatever state we found before | ||
2356 | * folding in the nohz state and ageing the entire idle period. | ||
2357 | * | ||
2358 | * This avoids loosing a sample when we go idle between | ||
2359 | * calc_load_account_active() (10 ticks ago) and now and thus | ||
2360 | * under-accounting. | ||
2361 | */ | ||
2362 | calc_global_nohz(); | ||
2364 | } | 2363 | } |
2365 | 2364 | ||
2366 | /* | 2365 | /* |
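The catch-up in calc_global_nohz() is plain jiffies arithmetic. A rough worked example, assuming HZ=1000 so LOAD_FREQ (about five seconds worth of ticks) is 5001: if the function runs when jiffies sits roughly 11000 ticks past calc_load_update, then

    delta = jiffies - calc_load_update - 10;	/* ~10990 */
    n = 1 + (delta / LOAD_FREQ);		/* 1 + 2 = 3 */

and calc_load_n() ages avenrun[] by those three missed periods in one step, after calc_global_load() has already folded one period the normal way.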
@@ -3220,14 +3219,14 @@ need_resched: | |||
3220 | 3219 | ||
3221 | post_schedule(rq); | 3220 | post_schedule(rq); |
3222 | 3221 | ||
3223 | preempt_enable_no_resched(); | 3222 | sched_preempt_enable_no_resched(); |
3224 | if (need_resched()) | 3223 | if (need_resched()) |
3225 | goto need_resched; | 3224 | goto need_resched; |
3226 | } | 3225 | } |
3227 | 3226 | ||
3228 | static inline void sched_submit_work(struct task_struct *tsk) | 3227 | static inline void sched_submit_work(struct task_struct *tsk) |
3229 | { | 3228 | { |
3230 | if (!tsk->state) | 3229 | if (!tsk->state || tsk_is_pi_blocked(tsk)) |
3231 | return; | 3230 | return; |
3232 | /* | 3231 | /* |
3233 | * If we are going to sleep and we have plugged IO queued, | 3232 | * If we are going to sleep and we have plugged IO queued, |
@@ -3246,6 +3245,18 @@ asmlinkage void __sched schedule(void) | |||
3246 | } | 3245 | } |
3247 | EXPORT_SYMBOL(schedule); | 3246 | EXPORT_SYMBOL(schedule); |
3248 | 3247 | ||
3248 | /** | ||
3249 | * schedule_preempt_disabled - called with preemption disabled | ||
3250 | * | ||
3251 | * Returns with preemption disabled. Note: preempt_count must be 1 | ||
3252 | */ | ||
3253 | void __sched schedule_preempt_disabled(void) | ||
3254 | { | ||
3255 | sched_preempt_enable_no_resched(); | ||
3256 | schedule(); | ||
3257 | preempt_disable(); | ||
3258 | } | ||
3259 | |||
3249 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 3260 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
3250 | 3261 | ||
3251 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | 3262 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
@@ -3406,9 +3417,9 @@ EXPORT_SYMBOL(__wake_up); | |||
3406 | /* | 3417 | /* |
3407 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | 3418 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. |
3408 | */ | 3419 | */ |
3409 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) | 3420 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) |
3410 | { | 3421 | { |
3411 | __wake_up_common(q, mode, 1, 0, NULL); | 3422 | __wake_up_common(q, mode, nr, 0, NULL); |
3412 | } | 3423 | } |
3413 | EXPORT_SYMBOL_GPL(__wake_up_locked); | 3424 | EXPORT_SYMBOL_GPL(__wake_up_locked); |
3414 | 3425 | ||
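The extra nr argument makes the locked wakeup variant usable for waking more than one exclusive waiter; the shortlog entry "sched/wait: Add __wake_up_all_locked() API" presumably builds on exactly this. How such a wrapper could look is sketched below as an assumption (the real name and location are not shown in this kernel/-limited diff); nr == 0 is the conventional "no limit" value used by the wake-all helpers:

    /* Assumed shape of a wake-all wrapper over the new parameter. */
    static inline void example_wake_up_all_locked(wait_queue_head_t *q)
    {
    	__wake_up_locked(q, TASK_NORMAL, 0);	/* 0: wake every waiter */
    }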
@@ -3767,6 +3778,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3767 | 3778 | ||
3768 | rq = __task_rq_lock(p); | 3779 | rq = __task_rq_lock(p); |
3769 | 3780 | ||
3781 | /* | ||
3782 | * Idle task boosting is a nono in general. There is one | ||
3783 | * exception, when PREEMPT_RT and NOHZ is active: | ||
3784 | * | ||
3785 | * The idle task calls get_next_timer_interrupt() and holds | ||
3786 | * the timer wheel base->lock on the CPU and another CPU wants | ||
3787 | * to access the timer (probably to cancel it). We can safely | ||
3788 | * ignore the boosting request, as the idle CPU runs this code | ||
3789 | * with interrupts disabled and will complete the lock | ||
3790 | * protected section without being interrupted. So there is no | ||
3791 | * real need to boost. | ||
3792 | */ | ||
3793 | if (unlikely(p == rq->idle)) { | ||
3794 | WARN_ON(p != rq->curr); | ||
3795 | WARN_ON(p->pi_blocked_on); | ||
3796 | goto out_unlock; | ||
3797 | } | ||
3798 | |||
3770 | trace_sched_pi_setprio(p, prio); | 3799 | trace_sched_pi_setprio(p, prio); |
3771 | oldprio = p->prio; | 3800 | oldprio = p->prio; |
3772 | prev_class = p->sched_class; | 3801 | prev_class = p->sched_class; |
@@ -3790,11 +3819,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3790 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 3819 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
3791 | 3820 | ||
3792 | check_class_changed(rq, p, prev_class, oldprio); | 3821 | check_class_changed(rq, p, prev_class, oldprio); |
3822 | out_unlock: | ||
3793 | __task_rq_unlock(rq); | 3823 | __task_rq_unlock(rq); |
3794 | } | 3824 | } |
3795 | |||
3796 | #endif | 3825 | #endif |
3797 | |||
3798 | void set_user_nice(struct task_struct *p, long nice) | 3826 | void set_user_nice(struct task_struct *p, long nice) |
3799 | { | 3827 | { |
3800 | int old_prio, delta, on_rq; | 3828 | int old_prio, delta, on_rq; |
@@ -4474,7 +4502,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
4474 | __release(rq->lock); | 4502 | __release(rq->lock); |
4475 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 4503 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
4476 | do_raw_spin_unlock(&rq->lock); | 4504 | do_raw_spin_unlock(&rq->lock); |
4477 | preempt_enable_no_resched(); | 4505 | sched_preempt_enable_no_resched(); |
4478 | 4506 | ||
4479 | schedule(); | 4507 | schedule(); |
4480 | 4508 | ||
@@ -4548,8 +4576,24 @@ EXPORT_SYMBOL(__cond_resched_softirq); | |||
4548 | /** | 4576 | /** |
4549 | * yield - yield the current processor to other threads. | 4577 | * yield - yield the current processor to other threads. |
4550 | * | 4578 | * |
4551 | * This is a shortcut for kernel-space yielding - it marks the | 4579 | * Do not ever use this function, there's a 99% chance you're doing it wrong. |
4552 | * thread runnable and calls sys_sched_yield(). | 4580 | * |
4581 | * The scheduler is at all times free to pick the calling task as the most | ||
4582 | * eligible task to run, if removing the yield() call from your code breaks | ||
4583 | * it, its already broken. | ||
4584 | * | ||
4585 | * Typical broken usage is: | ||
4586 | * | ||
4587 | * while (!event) | ||
4588 | * yield(); | ||
4589 | * | ||
4590 | * where one assumes that yield() will let 'the other' process run that will | ||
4591 | * make event true. If the current task is a SCHED_FIFO task that will never | ||
4592 | * happen. Never use yield() as a progress guarantee!! | ||
4593 | * | ||
4594 | * If you want to use yield() to wait for something, use wait_event(). | ||
4595 | * If you want to use yield() to be 'nice' for others, use cond_resched(). | ||
4596 | * If you still want to use yield(), do not! | ||
4553 | */ | 4597 | */ |
4554 | void __sched yield(void) | 4598 | void __sched yield(void) |
4555 | { | 4599 | { |
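The new kernel-doc points people at event-driven waiting instead. A minimal sketch of the replacement it recommends, with hypothetical waitqueue and flag names:

    static DECLARE_WAIT_QUEUE_HEAD(event_wq);	/* hypothetical */
    static bool event;

    /* Broken: can spin forever if the waiter is SCHED_FIFO. */
    while (!event)
    	yield();

    /* Recommended: sleep until the waker makes the condition true. */
    wait_event(event_wq, event);

    /* Waker side: */
    event = true;
    wake_up(&event_wq);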
@@ -5381,7 +5425,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | |||
5381 | unsigned long action, void *hcpu) | 5425 | unsigned long action, void *hcpu) |
5382 | { | 5426 | { |
5383 | switch (action & ~CPU_TASKS_FROZEN) { | 5427 | switch (action & ~CPU_TASKS_FROZEN) { |
5384 | case CPU_ONLINE: | 5428 | case CPU_STARTING: |
5385 | case CPU_DOWN_FAILED: | 5429 | case CPU_DOWN_FAILED: |
5386 | set_cpu_active((long)hcpu, true); | 5430 | set_cpu_active((long)hcpu, true); |
5387 | return NOTIFY_OK; | 5431 | return NOTIFY_OK; |
@@ -5753,7 +5797,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
5753 | * | 5797 | * |
5754 | * Also keep a unique ID per domain (we use the first cpu number in | 5798 | * Also keep a unique ID per domain (we use the first cpu number in |
5755 | * the cpumask of the domain), this allows us to quickly tell if | 5799 | * the cpumask of the domain), this allows us to quickly tell if |
5756 | * two cpus are in the same cache domain, see ttwu_share_cache(). | 5800 | * two cpus are in the same cache domain, see cpus_share_cache(). |
5757 | */ | 5801 | */ |
5758 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 5802 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
5759 | DEFINE_PER_CPU(int, sd_llc_id); | 5803 | DEFINE_PER_CPU(int, sd_llc_id); |
@@ -6930,6 +6974,9 @@ void __init sched_init(void) | |||
6930 | rq->online = 0; | 6974 | rq->online = 0; |
6931 | rq->idle_stamp = 0; | 6975 | rq->idle_stamp = 0; |
6932 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 6976 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
6977 | |||
6978 | INIT_LIST_HEAD(&rq->cfs_tasks); | ||
6979 | |||
6933 | rq_attach_root(rq, &def_root_domain); | 6980 | rq_attach_root(rq, &def_root_domain); |
6934 | #ifdef CONFIG_NO_HZ | 6981 | #ifdef CONFIG_NO_HZ |
6935 | rq->nohz_flags = 0; | 6982 | rq->nohz_flags = 0; |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a075e10004b..09acaa15161d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
288 | 288 | ||
289 | P(yld_count); | 289 | P(yld_count); |
290 | 290 | ||
291 | P(sched_switch); | ||
292 | P(sched_count); | 291 | P(sched_count); |
293 | P(sched_goidle); | 292 | P(sched_goidle); |
294 | #ifdef CONFIG_SMP | 293 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd974faf467d..94340c7544a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
776 | * Scheduling class queueing methods: | 776 | * Scheduling class queueing methods: |
777 | */ | 777 | */ |
778 | 778 | ||
779 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
780 | static void | ||
781 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
782 | { | ||
783 | cfs_rq->task_weight += weight; | ||
784 | } | ||
785 | #else | ||
786 | static inline void | ||
787 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
788 | { | ||
789 | } | ||
790 | #endif | ||
791 | |||
792 | static void | 779 | static void |
793 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 780 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
794 | { | 781 | { |
795 | update_load_add(&cfs_rq->load, se->load.weight); | 782 | update_load_add(&cfs_rq->load, se->load.weight); |
796 | if (!parent_entity(se)) | 783 | if (!parent_entity(se)) |
797 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 784 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
798 | if (entity_is_task(se)) { | 785 | #ifdef CONFIG_SMP |
799 | add_cfs_task_weight(cfs_rq, se->load.weight); | 786 | if (entity_is_task(se)) |
800 | list_add(&se->group_node, &cfs_rq->tasks); | 787 | list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); |
801 | } | 788 | #endif |
802 | cfs_rq->nr_running++; | 789 | cfs_rq->nr_running++; |
803 | } | 790 | } |
804 | 791 | ||
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
808 | update_load_sub(&cfs_rq->load, se->load.weight); | 795 | update_load_sub(&cfs_rq->load, se->load.weight); |
809 | if (!parent_entity(se)) | 796 | if (!parent_entity(se)) |
810 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | 797 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
811 | if (entity_is_task(se)) { | 798 | if (entity_is_task(se)) |
812 | add_cfs_task_weight(cfs_rq, -se->load.weight); | ||
813 | list_del_init(&se->group_node); | 799 | list_del_init(&se->group_node); |
814 | } | ||
815 | cfs_rq->nr_running--; | 800 | cfs_rq->nr_running--; |
816 | } | 801 | } |
817 | 802 | ||
@@ -2672,8 +2657,6 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2672 | /* | 2657 | /* |
2673 | * Otherwise, iterate the domains and find an elegible idle cpu. | 2658 | * Otherwise, iterate the domains and find an elegible idle cpu. |
2674 | */ | 2659 | */ |
2675 | rcu_read_lock(); | ||
2676 | |||
2677 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 2660 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
2678 | for_each_lower_domain(sd) { | 2661 | for_each_lower_domain(sd) { |
2679 | sg = sd->groups; | 2662 | sg = sd->groups; |
@@ -2695,8 +2678,6 @@ next: | |||
2695 | } while (sg != sd->groups); | 2678 | } while (sg != sd->groups); |
2696 | } | 2679 | } |
2697 | done: | 2680 | done: |
2698 | rcu_read_unlock(); | ||
2699 | |||
2700 | return target; | 2681 | return target; |
2701 | } | 2682 | } |
2702 | 2683 | ||
@@ -2922,7 +2903,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
2922 | return; | 2903 | return; |
2923 | 2904 | ||
2924 | /* | 2905 | /* |
2925 | * This is possible from callers such as pull_task(), in which we | 2906 | * This is possible from callers such as move_task(), in which we |
2926 | * unconditionally check_prempt_curr() after an enqueue (which may have | 2907 | * unconditionally check_prempt_curr() after an enqueue (which may have |
2927 | * lead to a throttle). This both saves work and prevents false | 2908 | * lead to a throttle). This both saves work and prevents false |
2928 | * next-buddy nomination below. | 2909 | * next-buddy nomination below. |
@@ -3086,17 +3067,39 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
3086 | * Fair scheduling class load-balancing methods: | 3067 | * Fair scheduling class load-balancing methods: |
3087 | */ | 3068 | */ |
3088 | 3069 | ||
3070 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | ||
3071 | |||
3072 | #define LBF_ALL_PINNED 0x01 | ||
3073 | #define LBF_NEED_BREAK 0x02 | ||
3074 | |||
3075 | struct lb_env { | ||
3076 | struct sched_domain *sd; | ||
3077 | |||
3078 | int src_cpu; | ||
3079 | struct rq *src_rq; | ||
3080 | |||
3081 | int dst_cpu; | ||
3082 | struct rq *dst_rq; | ||
3083 | |||
3084 | enum cpu_idle_type idle; | ||
3085 | long load_move; | ||
3086 | unsigned int flags; | ||
3087 | |||
3088 | unsigned int loop; | ||
3089 | unsigned int loop_break; | ||
3090 | unsigned int loop_max; | ||
3091 | }; | ||
3092 | |||
3089 | /* | 3093 | /* |
3090 | * pull_task - move a task from a remote runqueue to the local runqueue. | 3094 | * move_task - move a task from one runqueue to another runqueue. |
3091 | * Both runqueues must be locked. | 3095 | * Both runqueues must be locked. |
3092 | */ | 3096 | */ |
3093 | static void pull_task(struct rq *src_rq, struct task_struct *p, | 3097 | static void move_task(struct task_struct *p, struct lb_env *env) |
3094 | struct rq *this_rq, int this_cpu) | ||
3095 | { | 3098 | { |
3096 | deactivate_task(src_rq, p, 0); | 3099 | deactivate_task(env->src_rq, p, 0); |
3097 | set_task_cpu(p, this_cpu); | 3100 | set_task_cpu(p, env->dst_cpu); |
3098 | activate_task(this_rq, p, 0); | 3101 | activate_task(env->dst_rq, p, 0); |
3099 | check_preempt_curr(this_rq, p, 0); | 3102 | check_preempt_curr(env->dst_rq, p, 0); |
3100 | } | 3103 | } |
3101 | 3104 | ||
3102 | /* | 3105 | /* |
@@ -3131,19 +3134,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
3131 | return delta < (s64)sysctl_sched_migration_cost; | 3134 | return delta < (s64)sysctl_sched_migration_cost; |
3132 | } | 3135 | } |
3133 | 3136 | ||
3134 | #define LBF_ALL_PINNED 0x01 | ||
3135 | #define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ | ||
3136 | #define LBF_HAD_BREAK 0x04 | ||
3137 | #define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ | ||
3138 | #define LBF_ABORT 0x10 | ||
3139 | |||
3140 | /* | 3137 | /* |
3141 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 3138 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
3142 | */ | 3139 | */ |
3143 | static | 3140 | static |
3144 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | 3141 | int can_migrate_task(struct task_struct *p, struct lb_env *env) |
3145 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3146 | int *lb_flags) | ||
3147 | { | 3142 | { |
3148 | int tsk_cache_hot = 0; | 3143 | int tsk_cache_hot = 0; |
3149 | /* | 3144 | /* |
@@ -3152,13 +3147,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
3152 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 3147 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
3153 | * 3) are cache-hot on their current CPU. | 3148 | * 3) are cache-hot on their current CPU. |
3154 | */ | 3149 | */ |
3155 | if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { | 3150 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
3156 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 3151 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
3157 | return 0; | 3152 | return 0; |
3158 | } | 3153 | } |
3159 | *lb_flags &= ~LBF_ALL_PINNED; | 3154 | env->flags &= ~LBF_ALL_PINNED; |
3160 | 3155 | ||
3161 | if (task_running(rq, p)) { | 3156 | if (task_running(env->src_rq, p)) { |
3162 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 3157 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); |
3163 | return 0; | 3158 | return 0; |
3164 | } | 3159 | } |
@@ -3169,12 +3164,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
3169 | * 2) too many balance attempts have failed. | 3164 | * 2) too many balance attempts have failed. |
3170 | */ | 3165 | */ |
3171 | 3166 | ||
3172 | tsk_cache_hot = task_hot(p, rq->clock_task, sd); | 3167 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); |
3173 | if (!tsk_cache_hot || | 3168 | if (!tsk_cache_hot || |
3174 | sd->nr_balance_failed > sd->cache_nice_tries) { | 3169 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
3175 | #ifdef CONFIG_SCHEDSTATS | 3170 | #ifdef CONFIG_SCHEDSTATS |
3176 | if (tsk_cache_hot) { | 3171 | if (tsk_cache_hot) { |
3177 | schedstat_inc(sd, lb_hot_gained[idle]); | 3172 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
3178 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 3173 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
3179 | } | 3174 | } |
3180 | #endif | 3175 | #endif |
@@ -3195,65 +3190,80 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
3195 | * | 3190 | * |
3196 | * Called with both runqueues locked. | 3191 | * Called with both runqueues locked. |
3197 | */ | 3192 | */ |
3198 | static int | 3193 | static int move_one_task(struct lb_env *env) |
3199 | move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3200 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
3201 | { | 3194 | { |
3202 | struct task_struct *p, *n; | 3195 | struct task_struct *p, *n; |
3203 | struct cfs_rq *cfs_rq; | ||
3204 | int pinned = 0; | ||
3205 | 3196 | ||
3206 | for_each_leaf_cfs_rq(busiest, cfs_rq) { | 3197 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { |
3207 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { | 3198 | if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) |
3208 | if (throttled_lb_pair(task_group(p), | 3199 | continue; |
3209 | busiest->cpu, this_cpu)) | ||
3210 | break; | ||
3211 | 3200 | ||
3212 | if (!can_migrate_task(p, busiest, this_cpu, | 3201 | if (!can_migrate_task(p, env)) |
3213 | sd, idle, &pinned)) | 3202 | continue; |
3214 | continue; | ||
3215 | 3203 | ||
3216 | pull_task(busiest, p, this_rq, this_cpu); | 3204 | move_task(p, env); |
3217 | /* | 3205 | /* |
3218 | * Right now, this is only the second place pull_task() | 3206 | * Right now, this is only the second place move_task() |
3219 | * is called, so we can safely collect pull_task() | 3207 | * is called, so we can safely collect move_task() |
3220 | * stats here rather than inside pull_task(). | 3208 | * stats here rather than inside move_task(). |
3221 | */ | 3209 | */ |
3222 | schedstat_inc(sd, lb_gained[idle]); | 3210 | schedstat_inc(env->sd, lb_gained[env->idle]); |
3223 | return 1; | 3211 | return 1; |
3224 | } | ||
3225 | } | 3212 | } |
3226 | |||
3227 | return 0; | 3213 | return 0; |
3228 | } | 3214 | } |
3229 | 3215 | ||
3230 | static unsigned long | 3216 | static unsigned long task_h_load(struct task_struct *p); |
3231 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3217 | |
3232 | unsigned long max_load_move, struct sched_domain *sd, | 3218 | /* |
3233 | enum cpu_idle_type idle, int *lb_flags, | 3219 | * move_tasks tries to move up to load_move weighted load from busiest to |
3234 | struct cfs_rq *busiest_cfs_rq) | 3220 | * this_rq, as part of a balancing operation within domain "sd". |
3221 | * Returns 1 if successful and 0 otherwise. | ||
3222 | * | ||
3223 | * Called with both runqueues locked. | ||
3224 | */ | ||
3225 | static int move_tasks(struct lb_env *env) | ||
3235 | { | 3226 | { |
3236 | int loops = 0, pulled = 0; | 3227 | struct list_head *tasks = &env->src_rq->cfs_tasks; |
3237 | long rem_load_move = max_load_move; | 3228 | struct task_struct *p; |
3238 | struct task_struct *p, *n; | 3229 | unsigned long load; |
3230 | int pulled = 0; | ||
3231 | |||
3232 | if (env->load_move <= 0) | ||
3233 | return 0; | ||
3239 | 3234 | ||
3240 | if (max_load_move == 0) | 3235 | while (!list_empty(tasks)) { |
3241 | goto out; | 3236 | p = list_first_entry(tasks, struct task_struct, se.group_node); |
3242 | 3237 | ||
3243 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { | 3238 | env->loop++; |
3244 | if (loops++ > sysctl_sched_nr_migrate) { | 3239 | /* We've more or less seen every task there is, call it quits */ |
3245 | *lb_flags |= LBF_NEED_BREAK; | 3240 | if (env->loop > env->loop_max) |
3241 | break; | ||
3242 | |||
3243 | /* take a breather every nr_migrate tasks */ | ||
3244 | if (env->loop > env->loop_break) { | ||
3245 | env->loop_break += sysctl_sched_nr_migrate; | ||
3246 | env->flags |= LBF_NEED_BREAK; | ||
3246 | break; | 3247 | break; |
3247 | } | 3248 | } |
3248 | 3249 | ||
3249 | if ((p->se.load.weight >> 1) > rem_load_move || | 3250 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) |
3250 | !can_migrate_task(p, busiest, this_cpu, sd, idle, | 3251 | goto next; |
3251 | lb_flags)) | 3252 | |
3252 | continue; | 3253 | load = task_h_load(p); |
3254 | |||
3255 | if (load < 16 && !env->sd->nr_balance_failed) | ||
3256 | goto next; | ||
3257 | |||
3258 | if ((load / 2) > env->load_move) | ||
3259 | goto next; | ||
3253 | 3260 | ||
3254 | pull_task(busiest, p, this_rq, this_cpu); | 3261 | if (!can_migrate_task(p, env)) |
3262 | goto next; | ||
3263 | |||
3264 | move_task(p, env); | ||
3255 | pulled++; | 3265 | pulled++; |
3256 | rem_load_move -= p->se.load.weight; | 3266 | env->load_move -= load; |
3257 | 3267 | ||
3258 | #ifdef CONFIG_PREEMPT | 3268 | #ifdef CONFIG_PREEMPT |
3259 | /* | 3269 | /* |
@@ -3261,28 +3271,30 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3261 | * kernels will stop after the first task is pulled to minimize | 3271 | * kernels will stop after the first task is pulled to minimize |
3262 | * the critical section. | 3272 | * the critical section. |
3263 | */ | 3273 | */ |
3264 | if (idle == CPU_NEWLY_IDLE) { | 3274 | if (env->idle == CPU_NEWLY_IDLE) |
3265 | *lb_flags |= LBF_ABORT; | ||
3266 | break; | 3275 | break; |
3267 | } | ||
3268 | #endif | 3276 | #endif |
3269 | 3277 | ||
3270 | /* | 3278 | /* |
3271 | * We only want to steal up to the prescribed amount of | 3279 | * We only want to steal up to the prescribed amount of |
3272 | * weighted load. | 3280 | * weighted load. |
3273 | */ | 3281 | */ |
3274 | if (rem_load_move <= 0) | 3282 | if (env->load_move <= 0) |
3275 | break; | 3283 | break; |
3284 | |||
3285 | continue; | ||
3286 | next: | ||
3287 | list_move_tail(&p->se.group_node, tasks); | ||
3276 | } | 3288 | } |
3277 | out: | 3289 | |
3278 | /* | 3290 | /* |
3279 | * Right now, this is one of only two places pull_task() is called, | 3291 | * Right now, this is one of only two places move_task() is called, |
3280 | * so we can safely collect pull_task() stats here rather than | 3292 | * so we can safely collect move_task() stats here rather than |
3281 | * inside pull_task(). | 3293 | * inside move_task(). |
3282 | */ | 3294 | */ |
3283 | schedstat_add(sd, lb_gained[idle], pulled); | 3295 | schedstat_add(env->sd, lb_gained[env->idle], pulled); |
3284 | 3296 | ||
3285 | return max_load_move - rem_load_move; | 3297 | return pulled; |
3286 | } | 3298 | } |
3287 | 3299 | ||
3288 | #ifdef CONFIG_FAIR_GROUP_SCHED | 3300 | #ifdef CONFIG_FAIR_GROUP_SCHED |
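The new loop bounds make the lock-hold time easy to reason about: move_tasks() looks at no more than loop_max (busiest->nr_running) tasks per balance attempt and requests a break every sysctl_sched_nr_migrate iterations (default 32). As a rough calculation, for a busiest runqueue of 100 tasks:

    passes = DIV_ROUND_UP(100, 32);	/* at most 4 lock-hold sections */

with load_balance() dropping and re-taking the runqueue locks and resuming via LBF_NEED_BREAK between passes, as shown further down in this file.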
@@ -3362,113 +3374,35 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
3362 | 3374 | ||
3363 | static void update_h_load(long cpu) | 3375 | static void update_h_load(long cpu) |
3364 | { | 3376 | { |
3377 | rcu_read_lock(); | ||
3365 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 3378 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
3379 | rcu_read_unlock(); | ||
3366 | } | 3380 | } |
3367 | 3381 | ||
3368 | static unsigned long | 3382 | static unsigned long task_h_load(struct task_struct *p) |
3369 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3370 | unsigned long max_load_move, | ||
3371 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3372 | int *lb_flags) | ||
3373 | { | 3383 | { |
3374 | long rem_load_move = max_load_move; | 3384 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
3375 | struct cfs_rq *busiest_cfs_rq; | 3385 | unsigned long load; |
3376 | |||
3377 | rcu_read_lock(); | ||
3378 | update_h_load(cpu_of(busiest)); | ||
3379 | |||
3380 | for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { | ||
3381 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; | ||
3382 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | ||
3383 | u64 rem_load, moved_load; | ||
3384 | |||
3385 | if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) | ||
3386 | break; | ||
3387 | |||
3388 | /* | ||
3389 | * empty group or part of a throttled hierarchy | ||
3390 | */ | ||
3391 | if (!busiest_cfs_rq->task_weight || | ||
3392 | throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) | ||
3393 | continue; | ||
3394 | |||
3395 | rem_load = (u64)rem_load_move * busiest_weight; | ||
3396 | rem_load = div_u64(rem_load, busiest_h_load + 1); | ||
3397 | |||
3398 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | ||
3399 | rem_load, sd, idle, lb_flags, | ||
3400 | busiest_cfs_rq); | ||
3401 | |||
3402 | if (!moved_load) | ||
3403 | continue; | ||
3404 | 3386 | ||
3405 | moved_load *= busiest_h_load; | 3387 | load = p->se.load.weight; |
3406 | moved_load = div_u64(moved_load, busiest_weight + 1); | 3388 | load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); |
3407 | 3389 | ||
3408 | rem_load_move -= moved_load; | 3390 | return load; |
3409 | if (rem_load_move < 0) | ||
3410 | break; | ||
3411 | } | ||
3412 | rcu_read_unlock(); | ||
3413 | |||
3414 | return max_load_move - rem_load_move; | ||
3415 | } | 3391 | } |
3416 | #else | 3392 | #else |
3417 | static inline void update_shares(int cpu) | 3393 | static inline void update_shares(int cpu) |
3418 | { | 3394 | { |
3419 | } | 3395 | } |
3420 | 3396 | ||
3421 | static unsigned long | 3397 | static inline void update_h_load(long cpu) |
3422 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3423 | unsigned long max_load_move, | ||
3424 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3425 | int *lb_flags) | ||
3426 | { | 3398 | { |
3427 | return balance_tasks(this_rq, this_cpu, busiest, | ||
3428 | max_load_move, sd, idle, lb_flags, | ||
3429 | &busiest->cfs); | ||
3430 | } | 3399 | } |
3431 | #endif | ||
3432 | 3400 | ||
3433 | /* | 3401 | static unsigned long task_h_load(struct task_struct *p) |
3434 | * move_tasks tries to move up to max_load_move weighted load from busiest to | ||
3435 | * this_rq, as part of a balancing operation within domain "sd". | ||
3436 | * Returns 1 if successful and 0 otherwise. | ||
3437 | * | ||
3438 | * Called with both runqueues locked. | ||
3439 | */ | ||
3440 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3441 | unsigned long max_load_move, | ||
3442 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3443 | int *lb_flags) | ||
3444 | { | 3402 | { |
3445 | unsigned long total_load_moved = 0, load_moved; | 3403 | return p->se.load.weight; |
3446 | |||
3447 | do { | ||
3448 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | ||
3449 | max_load_move - total_load_moved, | ||
3450 | sd, idle, lb_flags); | ||
3451 | |||
3452 | total_load_moved += load_moved; | ||
3453 | |||
3454 | if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) | ||
3455 | break; | ||
3456 | |||
3457 | #ifdef CONFIG_PREEMPT | ||
3458 | /* | ||
3459 | * NEWIDLE balancing is a source of latency, so preemptible | ||
3460 | * kernels will stop after the first task is pulled to minimize | ||
3461 | * the critical section. | ||
3462 | */ | ||
3463 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { | ||
3464 | *lb_flags |= LBF_ABORT; | ||
3465 | break; | ||
3466 | } | ||
3467 | #endif | ||
3468 | } while (load_moved && max_load_move > total_load_moved); | ||
3469 | |||
3470 | return total_load_moved > 0; | ||
3471 | } | 3404 | } |
3405 | #endif | ||
3472 | 3406 | ||
3473 | /********** Helpers for find_busiest_group ************************/ | 3407 | /********** Helpers for find_busiest_group ************************/ |
3474 | /* | 3408 | /* |
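task_h_load() replaces the per-cfs_rq task_weight bookkeeping with a direct per-task estimate: the task's own weight scaled by its group runqueue's hierarchical load fraction. A worked example with made-up numbers:

    load = 1024;			/* p->se.load.weight, nice 0   */
    load = div_u64(load * 512,		/* cfs_rq->h_load              */
    	       2048 + 1);		/* cfs_rq->load.weight + 1     */
    /* => ~255: the task counts for about a quarter of its raw weight
     * toward env->load_move, because its group receives only a quarter
     * of the hierarchy's load at this level. */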
@@ -3778,6 +3712,11 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
3778 | struct sched_domain *child = sd->child; | 3712 | struct sched_domain *child = sd->child; |
3779 | struct sched_group *group, *sdg = sd->groups; | 3713 | struct sched_group *group, *sdg = sd->groups; |
3780 | unsigned long power; | 3714 | unsigned long power; |
3715 | unsigned long interval; | ||
3716 | |||
3717 | interval = msecs_to_jiffies(sd->balance_interval); | ||
3718 | interval = clamp(interval, 1UL, max_load_balance_interval); | ||
3719 | sdg->sgp->next_update = jiffies + interval; | ||
3781 | 3720 | ||
3782 | if (!child) { | 3721 | if (!child) { |
3783 | update_cpu_power(sd, cpu); | 3722 | update_cpu_power(sd, cpu); |
@@ -3885,12 +3824,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3885 | * domains. In the newly idle case, we will allow all the cpu's | 3824 | * domains. In the newly idle case, we will allow all the cpu's |
3886 | * to do the newly idle load balance. | 3825 | * to do the newly idle load balance. |
3887 | */ | 3826 | */ |
3888 | if (idle != CPU_NEWLY_IDLE && local_group) { | 3827 | if (local_group) { |
3889 | if (balance_cpu != this_cpu) { | 3828 | if (idle != CPU_NEWLY_IDLE) { |
3890 | *balance = 0; | 3829 | if (balance_cpu != this_cpu) { |
3891 | return; | 3830 | *balance = 0; |
3892 | } | 3831 | return; |
3893 | update_group_power(sd, this_cpu); | 3832 | } |
3833 | update_group_power(sd, this_cpu); | ||
3834 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | ||
3835 | update_group_power(sd, this_cpu); | ||
3894 | } | 3836 | } |
3895 | 3837 | ||
3896 | /* Adjust by relative CPU power of the group */ | 3838 | /* Adjust by relative CPU power of the group */ |
@@ -4453,13 +4395,21 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4453 | struct sched_domain *sd, enum cpu_idle_type idle, | 4395 | struct sched_domain *sd, enum cpu_idle_type idle, |
4454 | int *balance) | 4396 | int *balance) |
4455 | { | 4397 | { |
4456 | int ld_moved, lb_flags = 0, active_balance = 0; | 4398 | int ld_moved, active_balance = 0; |
4457 | struct sched_group *group; | 4399 | struct sched_group *group; |
4458 | unsigned long imbalance; | 4400 | unsigned long imbalance; |
4459 | struct rq *busiest; | 4401 | struct rq *busiest; |
4460 | unsigned long flags; | 4402 | unsigned long flags; |
4461 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4403 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
4462 | 4404 | ||
4405 | struct lb_env env = { | ||
4406 | .sd = sd, | ||
4407 | .dst_cpu = this_cpu, | ||
4408 | .dst_rq = this_rq, | ||
4409 | .idle = idle, | ||
4410 | .loop_break = sysctl_sched_nr_migrate, | ||
4411 | }; | ||
4412 | |||
4463 | cpumask_copy(cpus, cpu_active_mask); | 4413 | cpumask_copy(cpus, cpu_active_mask); |
4464 | 4414 | ||
4465 | schedstat_inc(sd, lb_count[idle]); | 4415 | schedstat_inc(sd, lb_count[idle]); |
@@ -4494,32 +4444,34 @@ redo: | |||
4494 | * still unbalanced. ld_moved simply stays zero, so it is | 4444 | * still unbalanced. ld_moved simply stays zero, so it is |
4495 | * correctly treated as an imbalance. | 4445 | * correctly treated as an imbalance. |
4496 | */ | 4446 | */ |
4497 | lb_flags |= LBF_ALL_PINNED; | 4447 | env.flags |= LBF_ALL_PINNED; |
4448 | env.load_move = imbalance; | ||
4449 | env.src_cpu = busiest->cpu; | ||
4450 | env.src_rq = busiest; | ||
4451 | env.loop_max = busiest->nr_running; | ||
4452 | |||
4453 | more_balance: | ||
4498 | local_irq_save(flags); | 4454 | local_irq_save(flags); |
4499 | double_rq_lock(this_rq, busiest); | 4455 | double_rq_lock(this_rq, busiest); |
4500 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 4456 | if (!env.loop) |
4501 | imbalance, sd, idle, &lb_flags); | 4457 | update_h_load(env.src_cpu); |
4458 | ld_moved += move_tasks(&env); | ||
4502 | double_rq_unlock(this_rq, busiest); | 4459 | double_rq_unlock(this_rq, busiest); |
4503 | local_irq_restore(flags); | 4460 | local_irq_restore(flags); |
4504 | 4461 | ||
4462 | if (env.flags & LBF_NEED_BREAK) { | ||
4463 | env.flags &= ~LBF_NEED_BREAK; | ||
4464 | goto more_balance; | ||
4465 | } | ||
4466 | |||
4505 | /* | 4467 | /* |
4506 | * some other cpu did the load balance for us. | 4468 | * some other cpu did the load balance for us. |
4507 | */ | 4469 | */ |
4508 | if (ld_moved && this_cpu != smp_processor_id()) | 4470 | if (ld_moved && this_cpu != smp_processor_id()) |
4509 | resched_cpu(this_cpu); | 4471 | resched_cpu(this_cpu); |
4510 | 4472 | ||
4511 | if (lb_flags & LBF_ABORT) | ||
4512 | goto out_balanced; | ||
4513 | |||
4514 | if (lb_flags & LBF_NEED_BREAK) { | ||
4515 | lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; | ||
4516 | if (lb_flags & LBF_ABORT) | ||
4517 | goto out_balanced; | ||
4518 | goto redo; | ||
4519 | } | ||
4520 | |||
4521 | /* All tasks on this runqueue were pinned by CPU affinity */ | 4473 | /* All tasks on this runqueue were pinned by CPU affinity */ |
4522 | if (unlikely(lb_flags & LBF_ALL_PINNED)) { | 4474 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
4523 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 4475 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
4524 | if (!cpumask_empty(cpus)) | 4476 | if (!cpumask_empty(cpus)) |
4525 | goto redo; | 4477 | goto redo; |
@@ -4549,7 +4501,7 @@ redo: | |||
4549 | tsk_cpus_allowed(busiest->curr))) { | 4501 | tsk_cpus_allowed(busiest->curr))) { |
4550 | raw_spin_unlock_irqrestore(&busiest->lock, | 4502 | raw_spin_unlock_irqrestore(&busiest->lock, |
4551 | flags); | 4503 | flags); |
4552 | lb_flags |= LBF_ALL_PINNED; | 4504 | env.flags |= LBF_ALL_PINNED; |
4553 | goto out_one_pinned; | 4505 | goto out_one_pinned; |
4554 | } | 4506 | } |
4555 | 4507 | ||
@@ -4602,7 +4554,7 @@ out_balanced: | |||
4602 | 4554 | ||
4603 | out_one_pinned: | 4555 | out_one_pinned: |
4604 | /* tune up the balancing interval */ | 4556 | /* tune up the balancing interval */ |
4605 | if (((lb_flags & LBF_ALL_PINNED) && | 4557 | if (((env.flags & LBF_ALL_PINNED) && |
4606 | sd->balance_interval < MAX_PINNED_INTERVAL) || | 4558 | sd->balance_interval < MAX_PINNED_INTERVAL) || |
4607 | (sd->balance_interval < sd->max_interval)) | 4559 | (sd->balance_interval < sd->max_interval)) |
4608 | sd->balance_interval *= 2; | 4560 | sd->balance_interval *= 2; |
@@ -4712,10 +4664,18 @@ static int active_load_balance_cpu_stop(void *data) | |||
4712 | } | 4664 | } |
4713 | 4665 | ||
4714 | if (likely(sd)) { | 4666 | if (likely(sd)) { |
4667 | struct lb_env env = { | ||
4668 | .sd = sd, | ||
4669 | .dst_cpu = target_cpu, | ||
4670 | .dst_rq = target_rq, | ||
4671 | .src_cpu = busiest_rq->cpu, | ||
4672 | .src_rq = busiest_rq, | ||
4673 | .idle = CPU_IDLE, | ||
4674 | }; | ||
4675 | |||
4715 | schedstat_inc(sd, alb_count); | 4676 | schedstat_inc(sd, alb_count); |
4716 | 4677 | ||
4717 | if (move_one_task(target_rq, target_cpu, busiest_rq, | 4678 | if (move_one_task(&env)) |
4718 | sd, CPU_IDLE)) | ||
4719 | schedstat_inc(sd, alb_pushed); | 4679 | schedstat_inc(sd, alb_pushed); |
4720 | else | 4680 | else |
4721 | schedstat_inc(sd, alb_failed); | 4681 | schedstat_inc(sd, alb_failed); |
@@ -4947,8 +4907,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | |||
4947 | 4907 | ||
4948 | static DEFINE_SPINLOCK(balancing); | 4908 | static DEFINE_SPINLOCK(balancing); |
4949 | 4909 | ||
4950 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | ||
4951 | |||
4952 | /* | 4910 | /* |
4953 | * Scale the max load_balance interval with the number of CPUs in the system. | 4911 | * Scale the max load_balance interval with the number of CPUs in the system. |
4954 | * This trades load-balance latency on larger machines for less cross talk. | 4912 | * This trades load-balance latency on larger machines for less cross talk. |
@@ -5342,7 +5300,6 @@ static void set_curr_task_fair(struct rq *rq) | |||
5342 | void init_cfs_rq(struct cfs_rq *cfs_rq) | 5300 | void init_cfs_rq(struct cfs_rq *cfs_rq) |
5343 | { | 5301 | { |
5344 | cfs_rq->tasks_timeline = RB_ROOT; | 5302 | cfs_rq->tasks_timeline = RB_ROOT; |
5345 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
5346 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 5303 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
5347 | #ifndef CONFIG_64BIT | 5304 | #ifndef CONFIG_64BIT |
5348 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 5305 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
@@ -5614,6 +5571,7 @@ __init void init_sched_fair_class(void) | |||
5614 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | 5571 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); |
5615 | 5572 | ||
5616 | #ifdef CONFIG_NO_HZ | 5573 | #ifdef CONFIG_NO_HZ |
5574 | nohz.next_balance = jiffies; | ||
5617 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 5575 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
5618 | cpu_notifier(sched_ilb_notifier, 0); | 5576 | cpu_notifier(sched_ilb_notifier, 0); |
5619 | #endif | 5577 | #endif |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f42ae7fb5ec5..b60dad720173 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq) | |||
778 | 778 | ||
779 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | 779 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) |
780 | { | 780 | { |
781 | int i, idle = 1; | 781 | int i, idle = 1, throttled = 0; |
782 | const struct cpumask *span; | 782 | const struct cpumask *span; |
783 | 783 | ||
784 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | ||
785 | return 1; | ||
786 | |||
787 | span = sched_rt_period_mask(); | 784 | span = sched_rt_period_mask(); |
788 | for_each_cpu(i, span) { | 785 | for_each_cpu(i, span) { |
789 | int enqueue = 0; | 786 | int enqueue = 0; |
@@ -818,12 +815,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
818 | if (!rt_rq_throttled(rt_rq)) | 815 | if (!rt_rq_throttled(rt_rq)) |
819 | enqueue = 1; | 816 | enqueue = 1; |
820 | } | 817 | } |
818 | if (rt_rq->rt_throttled) | ||
819 | throttled = 1; | ||
821 | 820 | ||
822 | if (enqueue) | 821 | if (enqueue) |
823 | sched_rt_rq_enqueue(rt_rq); | 822 | sched_rt_rq_enqueue(rt_rq); |
824 | raw_spin_unlock(&rq->lock); | 823 | raw_spin_unlock(&rq->lock); |
825 | } | 824 | } |
826 | 825 | ||
826 | if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) | ||
827 | return 1; | ||
828 | |||
827 | return idle; | 829 | return idle; |
828 | } | 830 | } |
829 | 831 | ||
@@ -855,8 +857,30 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
855 | return 0; | 857 | return 0; |
856 | 858 | ||
857 | if (rt_rq->rt_time > runtime) { | 859 | if (rt_rq->rt_time > runtime) { |
858 | rt_rq->rt_throttled = 1; | 860 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
859 | printk_once(KERN_WARNING "sched: RT throttling activated\n"); | 861 | |
862 | /* | ||
863 | * Don't actually throttle groups that have no runtime assigned | ||
864 | * but accrue some time due to boosting. | ||
865 | */ | ||
866 | if (likely(rt_b->rt_runtime)) { | ||
867 | static bool once = false; | ||
868 | |||
869 | rt_rq->rt_throttled = 1; | ||
870 | |||
871 | if (!once) { | ||
872 | once = true; | ||
873 | printk_sched("sched: RT throttling activated\n"); | ||
874 | } | ||
875 | } else { | ||
876 | /* | ||
877 | * In case we did anyway, make it go away, | ||
878 | * replenishment is a joke, since it will replenish us | ||
879 | * with exactly 0 ns. | ||
880 | */ | ||
881 | rt_rq->rt_time = 0; | ||
882 | } | ||
883 | |||
860 | if (rt_rq_throttled(rt_rq)) { | 884 | if (rt_rq_throttled(rt_rq)) { |
861 | sched_rt_rq_dequeue(rt_rq); | 885 | sched_rt_rq_dequeue(rt_rq); |
862 | return 1; | 886 | return 1; |
@@ -884,7 +908,8 @@ static void update_curr_rt(struct rq *rq) | |||
884 | if (unlikely((s64)delta_exec < 0)) | 908 | if (unlikely((s64)delta_exec < 0)) |
885 | delta_exec = 0; | 909 | delta_exec = 0; |
886 | 910 | ||
887 | schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); | 911 | schedstat_set(curr->se.statistics.exec_max, |
912 | max(curr->se.statistics.exec_max, delta_exec)); | ||
888 | 913 | ||
889 | curr->se.sum_exec_runtime += delta_exec; | 914 | curr->se.sum_exec_runtime += delta_exec; |
890 | account_group_exec_runtime(curr, delta_exec); | 915 | account_group_exec_runtime(curr, delta_exec); |
@@ -1972,7 +1997,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
1972 | if (--p->rt.time_slice) | 1997 | if (--p->rt.time_slice) |
1973 | return; | 1998 | return; |
1974 | 1999 | ||
1975 | p->rt.time_slice = DEF_TIMESLICE; | 2000 | p->rt.time_slice = RR_TIMESLICE; |
1976 | 2001 | ||
1977 | /* | 2002 | /* |
1978 | * Requeue to the end of queue if we are not the only element | 2003 | * Requeue to the end of queue if we are not the only element |
@@ -2000,7 +2025,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | |||
2000 | * Time slice is 0 for SCHED_FIFO tasks | 2025 | * Time slice is 0 for SCHED_FIFO tasks |
2001 | */ | 2026 | */ |
2002 | if (task->policy == SCHED_RR) | 2027 | if (task->policy == SCHED_RR) |
2003 | return DEF_TIMESLICE; | 2028 | return RR_TIMESLICE; |
2004 | else | 2029 | else |
2005 | return 0; | 2030 | return 0; |
2006 | } | 2031 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b4cd6d8ea150..42b1f304b044 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running; | |||
36 | 36 | ||
37 | /* | 37 | /* |
38 | * These are the 'tuning knobs' of the scheduler: | 38 | * These are the 'tuning knobs' of the scheduler: |
39 | * | ||
40 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | ||
41 | * Timeslices get refilled after they expire. | ||
42 | */ | 39 | */ |
43 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
44 | 40 | ||
45 | /* | 41 | /* |
46 | * single value that denotes runtime == period, ie unlimited time. | 42 | * single value that denotes runtime == period, ie unlimited time. |
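DEF_TIMESLICE's two users in kernel/sched/rt.c above now use RR_TIMESLICE instead. Its definition is outside this kernel/-limited diff (it lives in the shared scheduler headers), and it presumably keeps the historical 100 ms SCHED_RR slice; stated here as an assumption:

    /* Assumed replacement definition, not part of this diff: */
    #define RR_TIMESLICE	(100 * HZ / 1000)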
@@ -216,9 +212,6 @@ struct cfs_rq { | |||
216 | struct rb_root tasks_timeline; | 212 | struct rb_root tasks_timeline; |
217 | struct rb_node *rb_leftmost; | 213 | struct rb_node *rb_leftmost; |
218 | 214 | ||
219 | struct list_head tasks; | ||
220 | struct list_head *balance_iterator; | ||
221 | |||
222 | /* | 215 | /* |
223 | * 'curr' points to currently running entity on this cfs_rq. | 216 | * 'curr' points to currently running entity on this cfs_rq. |
224 | * It is set to NULL otherwise (i.e when none are currently running). | 217 | * It is set to NULL otherwise (i.e when none are currently running). |
@@ -246,11 +239,6 @@ struct cfs_rq { | |||
246 | 239 | ||
247 | #ifdef CONFIG_SMP | 240 | #ifdef CONFIG_SMP |
248 | /* | 241 | /* |
249 | * the part of load.weight contributed by tasks | ||
250 | */ | ||
251 | unsigned long task_weight; | ||
252 | |||
253 | /* | ||
254 | * h_load = weight * f(tg) | 242 | * h_load = weight * f(tg) |
255 | * | 243 | * |
256 | * Where f(tg) is the recursive weight fraction assigned to | 244 | * Where f(tg) is the recursive weight fraction assigned to |
@@ -424,6 +412,8 @@ struct rq { | |||
424 | int cpu; | 412 | int cpu; |
425 | int online; | 413 | int online; |
426 | 414 | ||
415 | struct list_head cfs_tasks; | ||
416 | |||
427 | u64 rt_avg; | 417 | u64 rt_avg; |
428 | u64 age_stamp; | 418 | u64 age_stamp; |
429 | u64 idle_stamp; | 419 | u64 idle_stamp; |
@@ -462,7 +452,6 @@ struct rq { | |||
462 | unsigned int yld_count; | 452 | unsigned int yld_count; |
463 | 453 | ||
464 | /* schedule() stats */ | 454 | /* schedule() stats */ |
465 | unsigned int sched_switch; | ||
466 | unsigned int sched_count; | 455 | unsigned int sched_count; |
467 | unsigned int sched_goidle; | 456 | unsigned int sched_goidle; |
468 | 457 | ||
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 2a581ba8e190..903ffa9e8872 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
32 | 32 | ||
33 | /* runqueue-specific stats */ | 33 | /* runqueue-specific stats */ |
34 | seq_printf(seq, | 34 | seq_printf(seq, |
35 | "cpu%d %u %u %u %u %u %u %llu %llu %lu", | 35 | "cpu%d %u 0 %u %u %u %u %llu %llu %lu", |
36 | cpu, rq->yld_count, | 36 | cpu, rq->yld_count, |
37 | rq->sched_switch, rq->sched_count, rq->sched_goidle, | 37 | rq->sched_count, rq->sched_goidle, |
38 | rq->ttwu_count, rq->ttwu_local, | 38 | rq->ttwu_count, rq->ttwu_local, |
39 | rq->rq_cpu_time, | 39 | rq->rq_cpu_time, |
40 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); | 40 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); |
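Dropping rq->sched_switch outright would shift every later field in /proc/schedstat, so the format string keeps the slot and prints a literal 0 there; existing parsers keep indexing the same columns. With made-up values, purely to show the format, a per-CPU line now looks like:

    cpu0 123 0 4567 890 222 111 99887766 55443322 4321

where the second value is the retired sched_switch slot, followed by sched_count, sched_goidle, ttwu_count, ttwu_local, rq_cpu_time, run_delay and pcount as before.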
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8afc6a8d4d7c..15352e0cbd5d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -343,7 +343,7 @@ void irq_exit(void) | |||
343 | tick_nohz_irq_exit(); | 343 | tick_nohz_irq_exit(); |
344 | #endif | 344 | #endif |
345 | rcu_irq_exit(); | 345 | rcu_irq_exit(); |
346 | preempt_enable_no_resched(); | 346 | sched_preempt_enable_no_resched(); |
347 | } | 347 | } |
348 | 348 | ||
349 | /* | 349 | /* |
@@ -740,9 +740,7 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
740 | while (!kthread_should_stop()) { | 740 | while (!kthread_should_stop()) { |
741 | preempt_disable(); | 741 | preempt_disable(); |
742 | if (!local_softirq_pending()) { | 742 | if (!local_softirq_pending()) { |
743 | preempt_enable_no_resched(); | 743 | schedule_preempt_disabled(); |
744 | schedule(); | ||
745 | preempt_disable(); | ||
746 | } | 744 | } |
747 | 745 | ||
748 | __set_current_state(TASK_RUNNING); | 746 | __set_current_state(TASK_RUNNING); |
@@ -757,7 +755,7 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
757 | if (local_softirq_pending()) | 755 | if (local_softirq_pending()) |
758 | __do_softirq(); | 756 | __do_softirq(); |
759 | local_irq_enable(); | 757 | local_irq_enable(); |
760 | preempt_enable_no_resched(); | 758 | sched_preempt_enable_no_resched(); |
761 | cond_resched(); | 759 | cond_resched(); |
762 | preempt_disable(); | 760 | preempt_disable(); |
763 | rcu_note_context_switch((long)__bind_cpu); | 761 | rcu_note_context_switch((long)__bind_cpu); |