| author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-21 13:40:02 -0500 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-21 13:40:02 -0500 |
| commit | e2defd02717ebc54ae2f4862271a3093665b426a (patch) | |
| tree | bb724dc1041b72ac9a241fb9d00aae995fea6236 /kernel | |
| parent | b5aeca54d0212515d820e5555115e2fc7847a68b (diff) | |
| parent | 2636ed5f8d15ff9395731593537b4b3fdf2af24d (diff) | |
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
"Thiscontains misc fixes: preempt_schedule_common() and io_schedule()
recursion fixes, sched/dl fixes, a completion_done() revert, two
sched/rt fixes and a comment update patch"
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/rt: Avoid obvious configuration fail
sched/autogroup: Fix failure to set cpu.rt_runtime_us
sched/dl: Do update_rq_clock() in yield_task_dl()
sched: Prevent recursion in io_schedule()
sched/completion: Serialize completion_done() with complete()
sched: Fix preempt_schedule_common() triggering tracing recursion
sched/dl: Prevent enqueue of a sleeping task in dl_task_timer()
sched: Make dl_task_time() use task_rq_lock()
sched: Clarify ordering between task_rq_lock() and move_queued_task()
Diffstat (limited to 'kernel')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/sched/auto_group.c | 6 |
| -rw-r--r-- | kernel/sched/completion.c | 19 |
| -rw-r--r-- | kernel/sched/core.c | 113 |
| -rw-r--r-- | kernel/sched/deadline.c | 33 |
| -rw-r--r-- | kernel/sched/sched.h | 76 |
5 files changed, 148 insertions, 99 deletions
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 8a2e230fb86a..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
 	 * so we don't have to move tasks around upon policy change,
 	 * or flail around trying to allocate bandwidth on the fly.
 	 * A bandwidth exception in __sched_setscheduler() allows
-	 * the policy change to proceed. Thereafter, task_group()
-	 * returns &root_task_group, so zero bandwidth is required.
+	 * the policy change to proceed.
 	 */
 	free_rt_sched_group(tg);
 	tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 	if (tg != &root_task_group)
 		return false;
 
-	if (p->sched_class != &fair_sched_class)
-		return false;
-
 	/*
 	 * We can only assume the task group can't go away on us if
 	 * autogroup_move_group() can see us on ->thread_group list.
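For context on the autogroup fixes: the kernel/sched/core.c hunk further down guards tg_has_rt_tasks() with task_group_is_autogroup(). That helper is not part of this diff; as far as I recall it is a trivial flag test in kernel/sched/auto_group.h, roughly as sketched here:

```c
/*
 * Paraphrased from kernel/sched/auto_group.h of this era
 * (CONFIG_SCHED_AUTOGROUP=y); shown for context only, not part of this
 * commit. The !CONFIG_SCHED_AUTOGROUP stub simply returns false.
 */
static inline bool task_group_is_autogroup(struct task_group *tg)
{
	return !!tg->autogroup;
}
```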
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 7052d3fd4e7b..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -274,7 +274,7 @@ bool try_wait_for_completion(struct completion *x)
 	 * first without taking the lock so we can
 	 * return early in the blocking case.
 	 */
-	if (!ACCESS_ONCE(x->done))
+	if (!READ_ONCE(x->done))
 		return 0;
 
 	spin_lock_irqsave(&x->wait.lock, flags);
@@ -297,6 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
  */
 bool completion_done(struct completion *x)
 {
-	return !!ACCESS_ONCE(x->done);
+	if (!READ_ONCE(x->done))
+		return false;
+
+	/*
+	 * If ->done, we need to wait for complete() to release ->wait.lock
+	 * otherwise we can end up freeing the completion before complete()
+	 * is done referencing it.
+	 *
+	 * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+	 * the loads of ->done and ->wait.lock such that we cannot observe
+	 * the lock before complete() acquires it while observing the ->done
+	 * after it's acquired the lock.
+	 */
+	smp_rmb();
+	spin_unlock_wait(&x->wait.lock);
+	return true;
 }
 EXPORT_SYMBOL(completion_done);
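The race the new completion_done() code closes typically bites with on-stack completions, where "freeing" the completion just means returning from the function. A hedged, illustrative sketch of the usage pattern being protected (start_async_work() is a hypothetical callback source, not a real API):

```c
/*
 * Illustrative only. Once complete() has set ->done, the waiter below
 * may skip wait_for_completion() and return, at which point 'done'
 * (including its ->wait.lock) ceases to exist. completion_done() must
 * therefore not report true until complete() has also dropped
 * ->wait.lock -- which is what the smp_rmb() + spin_unlock_wait() pair
 * above arranges.
 */
static int submit_and_wait(void)
{
	DECLARE_COMPLETION_ONSTACK(done);

	start_async_work(&done);	/* hypothetical; eventually calls complete(&done) */

	if (!completion_done(&done))
		wait_for_completion(&done);

	return 0;			/* 'done' goes out of scope here */
}
```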
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13049aac05a6..f0f831e8a345 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -307,66 +307,6 @@ __read_mostly int scheduler_running;
 int sysctl_sched_rt_runtime = 950000;
 
 /*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
-	__acquires(rq->lock)
-{
-	struct rq *rq;
-
-	lockdep_assert_held(&p->pi_lock);
-
-	for (;;) {
-		rq = task_rq(p);
-		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-			return rq;
-		raw_spin_unlock(&rq->lock);
-
-		while (unlikely(task_on_rq_migrating(p)))
-			cpu_relax();
-	}
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
-	__acquires(p->pi_lock)
-	__acquires(rq->lock)
-{
-	struct rq *rq;
-
-	for (;;) {
-		raw_spin_lock_irqsave(&p->pi_lock, *flags);
-		rq = task_rq(p);
-		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-			return rq;
-		raw_spin_unlock(&rq->lock);
-		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
-		while (unlikely(task_on_rq_migrating(p)))
-			cpu_relax();
-	}
-}
-
-static void __task_rq_unlock(struct rq *rq)
-	__releases(rq->lock)
-{
-	raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
-	__releases(rq->lock)
-	__releases(p->pi_lock)
-{
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
-/*
  * this_rq_lock - lock this runqueue and disable interrupts.
  */
 static struct rq *this_rq_lock(void)
@@ -2899,7 +2839,7 @@ void __sched schedule_preempt_disabled(void)
 	preempt_disable();
 }
 
-static void preempt_schedule_common(void)
+static void __sched notrace preempt_schedule_common(void)
 {
 	do {
 		__preempt_count_add(PREEMPT_ACTIVE);
@@ -4418,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  */
-void __sched io_schedule(void)
-{
-	struct rq *rq = raw_rq();
-
-	delayacct_blkio_start();
-	atomic_inc(&rq->nr_iowait);
-	blk_flush_plug(current);
-	current->in_iowait = 1;
-	schedule();
-	current->in_iowait = 0;
-	atomic_dec(&rq->nr_iowait);
-	delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
 long __sched io_schedule_timeout(long timeout)
 {
-	struct rq *rq = raw_rq();
+	int old_iowait = current->in_iowait;
+	struct rq *rq;
 	long ret;
 
+	current->in_iowait = 1;
+	if (old_iowait)
+		blk_schedule_flush_plug(current);
+	else
+		blk_flush_plug(current);
+
 	delayacct_blkio_start();
+	rq = raw_rq();
 	atomic_inc(&rq->nr_iowait);
-	blk_flush_plug(current);
-	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
-	current->in_iowait = 0;
+	current->in_iowait = old_iowait;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
+
 	return ret;
 }
+EXPORT_SYMBOL(io_schedule_timeout);
 
 /**
  * sys_sched_get_priority_max - return maximum RT priority.
@@ -7642,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 {
 	struct task_struct *g, *p;
 
+	/*
+	 * Autogroups do not have RT tasks; see autogroup_create().
+	 */
+	if (task_group_is_autogroup(tg))
+		return 0;
+
 	for_each_process_thread(g, p) {
 		if (rt_task(p) && task_group(p) == tg)
 			return 1;
@@ -7734,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 {
 	int i, err = 0;
 
+	/*
+	 * Disallowing the root group RT runtime is BAD, it would disallow the
+	 * kernel creating (and or operating) RT threads.
+	 */
+	if (tg == &root_task_group && rt_runtime == 0)
+		return -EINVAL;
+
+	/* No period doesn't make any sense. */
+	if (rt_period == 0)
+		return -EINVAL;
+
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7790,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
-	if (rt_period == 0)
-		return -EINVAL;
-
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
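Note that io_schedule() is removed from core.c outright; because this diffstat is limited to kernel/, its replacement is not visible here. Presumably callers now reach io_schedule_timeout() through a thin wrapper along these lines (a sketch of the assumed companion change under include/, not shown in this diff):

```c
/*
 * Assumed shape of the wrapper that replaces the removed io_schedule();
 * the real definition lives outside kernel/ and is not part of this
 * diffstat. MAX_SCHEDULE_TIMEOUT makes schedule_timeout() behave like a
 * plain schedule().
 */
static inline void io_schedule(void)
{
	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
}
```

Folding io_schedule() into io_schedule_timeout() is what lets the old_iowait save/restore above stop the nested io_schedule() recursion mentioned in the pull message.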
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a027799ae130..3fa8fa6d9403 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -511,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 						     struct sched_dl_entity,
 						     dl_timer);
 	struct task_struct *p = dl_task_of(dl_se);
+	unsigned long flags;
 	struct rq *rq;
-again:
-	rq = task_rq(p);
-	raw_spin_lock(&rq->lock);
 
-	if (rq != task_rq(p)) {
-		/* Task was moved, retrying. */
-		raw_spin_unlock(&rq->lock);
-		goto again;
-	}
+	rq = task_rq_lock(current, &flags);
 
 	/*
 	 * We need to take care of several possible races here:
@@ -541,6 +535,26 @@ again:
 
 	sched_clock_tick();
 	update_rq_clock(rq);
+
+	/*
+	 * If the throttle happened during sched-out; like:
+	 *
+	 *   schedule()
+	 *     deactivate_task()
+	 *       dequeue_task_dl()
+	 *         update_curr_dl()
+	 *           start_dl_timer()
+	 *         __dequeue_task_dl()
+	 *     prev->on_rq = 0;
+	 *
+	 * We can be both throttled and !queued. Replenish the counter
+	 * but do not enqueue -- wait for our wakeup to do that.
+	 */
+	if (!task_on_rq_queued(p)) {
+		replenish_dl_entity(dl_se, dl_se);
+		goto unlock;
+	}
+
 	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
 	if (dl_task(rq->curr))
 		check_preempt_curr_dl(rq, p, 0);
@@ -555,7 +569,7 @@ again:
 		push_dl_task(rq);
 #endif
 unlock:
-	raw_spin_unlock(&rq->lock);
+	task_rq_unlock(rq, current, &flags);
 
 	return HRTIMER_NORESTART;
 }
@@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
 		rq->curr->dl.dl_yielded = 1;
 		p->dl.runtime = 0;
 	}
+	update_rq_clock(rq);
 	update_curr_dl(rq);
 }
 
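On the yield_task_dl() one-liner: update_curr_dl() charges runtime against the per-rq clock, so yielding without refreshing that clock would account a stale delta. A paraphrased sketch of the accounting shape (not the real function, which does considerably more; details from memory):

```c
/*
 * Paraphrased shape of update_curr_dl()'s accounting; illustrative
 * only -- the real function also handles throttling and bandwidth.
 * Without the update_rq_clock(rq) added above, rq_clock_task(rq)
 * would be stale on the sched_yield() path.
 */
static void update_curr_dl_sketch(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	u64 delta_exec = rq_clock_task(rq) - curr->se.exec_start;

	/* ... charge delta_exec against curr->dl.runtime ... */
	curr->se.exec_start = rq_clock_task(rq);
}
```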
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0870db23d79c..dc0f435a2779 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1380,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
 
 extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
 
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	for (;;) {
+		rq = task_rq(p);
+		raw_spin_lock(&rq->lock);
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+			return rq;
+		raw_spin_unlock(&rq->lock);
+
+		while (unlikely(task_on_rq_migrating(p)))
+			cpu_relax();
+	}
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(p->pi_lock)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	for (;;) {
+		raw_spin_lock_irqsave(&p->pi_lock, *flags);
+		rq = task_rq(p);
+		raw_spin_lock(&rq->lock);
+		/*
+		 *	move_queued_task()		task_rq_lock()
+		 *
+		 *	ACQUIRE (rq->lock)
+		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
+		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
+		 *	[S] ->cpu = new_cpu		[L] task_rq()
+		 *					[L] ->on_rq
+		 *	RELEASE (rq->lock)
+		 *
+		 * If we observe the old cpu in task_rq_lock, the acquire of
+		 * the old rq->lock will fully serialize against the stores.
+		 *
+		 * If we observe the new cpu in task_rq_lock, the acquire will
+		 * pair with the WMB to ensure we must then also see migrating.
+		 */
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+			return rq;
+		raw_spin_unlock(&rq->lock);
+		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+		while (unlikely(task_on_rq_migrating(p)))
+			cpu_relax();
+	}
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+	__releases(rq->lock)
+{
+	raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+	__releases(rq->lock)
+	__releases(p->pi_lock)
+{
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
 
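For reference, the writer side that the new ordering comment in task_rq_lock() describes is move_queued_task() in kernel/sched/core.c. From memory it looks roughly like the sketch below (paraphrased, not part of this diff; the exact code at this commit may differ in detail):

```c
/*
 * Paraphrased sketch of kernel/sched/core.c's move_queued_task(), shown
 * only to pair the [S] stores in the comment above with real code.
 */
static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
{
	struct rq *rq = task_rq(p);

	lockdep_assert_held(&rq->lock);

	dequeue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_MIGRATING;	/* [S] ->on_rq = MIGRATING */
	set_task_cpu(p, new_cpu);		/* implies the WMB; [S] ->cpu = new_cpu */
	raw_spin_unlock(&rq->lock);		/* RELEASE (rq->lock) */

	rq = cpu_rq(new_cpu);

	raw_spin_lock(&rq->lock);
	BUG_ON(task_cpu(p) != new_cpu);
	p->on_rq = TASK_ON_RQ_QUEUED;
	enqueue_task(rq, p, 0);
	check_preempt_curr(rq, p, 0);

	return rq;
}
```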