author     Linus Torvalds <torvalds@linux-foundation.org>   2015-02-21 13:40:02 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-02-21 13:40:02 -0500
commit     e2defd02717ebc54ae2f4862271a3093665b426a (patch)
tree       bb724dc1041b72ac9a241fb9d00aae995fea6236 /kernel
parent     b5aeca54d0212515d820e5555115e2fc7847a68b (diff)
parent     2636ed5f8d15ff9395731593537b4b3fdf2af24d (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
 "This contains misc fixes: preempt_schedule_common() and io_schedule()
  recursion fixes, sched/dl fixes, a completion_done() revert, two
  sched/rt fixes and a comment update patch"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/rt: Avoid obvious configuration fail
  sched/autogroup: Fix failure to set cpu.rt_runtime_us
  sched/dl: Do update_rq_clock() in yield_task_dl()
  sched: Prevent recursion in io_schedule()
  sched/completion: Serialize completion_done() with complete()
  sched: Fix preempt_schedule_common() triggering tracing recursion
  sched/dl: Prevent enqueue of a sleeping task in dl_task_timer()
  sched: Make dl_task_time() use task_rq_lock()
  sched: Clarify ordering between task_rq_lock() and move_queued_task()
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/auto_group.c     6
-rw-r--r--  kernel/sched/completion.c   19
-rw-r--r--  kernel/sched/core.c         113
-rw-r--r--  kernel/sched/deadline.c      33
-rw-r--r--  kernel/sched/sched.h         76
5 files changed, 148 insertions, 99 deletions
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 8a2e230fb86a..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
          * so we don't have to move tasks around upon policy change,
          * or flail around trying to allocate bandwidth on the fly.
          * A bandwidth exception in __sched_setscheduler() allows
-         * the policy change to proceed. Thereafter, task_group()
-         * returns &root_task_group, so zero bandwidth is required.
+         * the policy change to proceed.
          */
         free_rt_sched_group(tg);
         tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
         if (tg != &root_task_group)
                 return false;
 
-        if (p->sched_class != &fair_sched_class)
-                return false;
-
         /*
          * We can only assume the task group can't go away on us if
          * autogroup_move_group() can see us on ->thread_group list.
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 7052d3fd4e7b..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -274,7 +274,7 @@ bool try_wait_for_completion(struct completion *x)
          * first without taking the lock so we can
          * return early in the blocking case.
          */
-        if (!ACCESS_ONCE(x->done))
+        if (!READ_ONCE(x->done))
                 return 0;
 
         spin_lock_irqsave(&x->wait.lock, flags);
@@ -297,6 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
  */
 bool completion_done(struct completion *x)
 {
-        return !!ACCESS_ONCE(x->done);
+        if (!READ_ONCE(x->done))
+                return false;
+
+        /*
+         * If ->done, we need to wait for complete() to release ->wait.lock
+         * otherwise we can end up freeing the completion before complete()
+         * is done referencing it.
+         *
+         * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+         * the loads of ->done and ->wait.lock such that we cannot observe
+         * the lock before complete() acquires it while observing the ->done
+         * after it's acquired the lock.
+         */
+        smp_rmb();
+        spin_unlock_wait(&x->wait.lock);
+        return true;
 }
 EXPORT_SYMBOL(completion_done);
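
The completion_done() hunk above is easier to follow with the race spelled out: the old "return !!ACCESS_ONCE(x->done);" could report "done" while complete() was still inside the wait-queue lock, so a caller that frees the completion on a true return could free memory the waker was still touching. Below is a minimal user-space sketch of the fixed pattern, assuming a pthread mutex and C11 atomics as stand-ins for ->wait.lock, READ_ONCE() and smp_rmb()/spin_unlock_wait(); it is an analogue for illustration, not the kernel implementation.

/* User-space sketch of the completion_done() fix: check the "done" flag,
 * then wait for the waker to drop its lock before reporting completion,
 * so the caller can safely free the structure.  Hypothetical analogue,
 * not kernel code. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct completion {
        pthread_mutex_t lock;   /* stands in for x->wait.lock */
        atomic_int done;        /* stands in for x->done */
};

static void complete(struct completion *x)
{
        pthread_mutex_lock(&x->lock);
        atomic_store_explicit(&x->done, 1, memory_order_release);
        /* ... wake any waiters while still holding the lock ... */
        pthread_mutex_unlock(&x->lock);
}

static bool completion_done(struct completion *x)
{
        if (!atomic_load_explicit(&x->done, memory_order_acquire))
                return false;

        /*
         * ->done is set; also wait for complete() to release the lock
         * before reporting "done", otherwise the caller might free *x
         * while the waker is still inside complete().
         */
        pthread_mutex_lock(&x->lock);
        pthread_mutex_unlock(&x->lock);
        return true;
}

static void *waker(void *arg)
{
        complete(arg);
        return NULL;
}

int main(void)
{
        struct completion *x = malloc(sizeof(*x));
        pthread_t t;

        pthread_mutex_init(&x->lock, NULL);
        atomic_init(&x->done, 0);

        pthread_create(&t, NULL, waker, x);
        while (!completion_done(x))
                ;               /* spin until it is safe to tear down */
        pthread_join(t, NULL);

        pthread_mutex_destroy(&x->lock);
        free(x);
        printf("completion observed and freed safely\n");
        return 0;
}

In this sketch the lock/unlock pair plays the role of spin_unlock_wait(): completion_done() only returns true once any in-flight complete() has dropped the lock.
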
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13049aac05a6..f0f831e8a345 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -307,66 +307,6 @@ __read_mostly int scheduler_running;
 int sysctl_sched_rt_runtime = 950000;
 
 /*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
-        __acquires(rq->lock)
-{
-        struct rq *rq;
-
-        lockdep_assert_held(&p->pi_lock);
-
-        for (;;) {
-                rq = task_rq(p);
-                raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-                        return rq;
-                raw_spin_unlock(&rq->lock);
-
-                while (unlikely(task_on_rq_migrating(p)))
-                        cpu_relax();
-        }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
-        __acquires(p->pi_lock)
-        __acquires(rq->lock)
-{
-        struct rq *rq;
-
-        for (;;) {
-                raw_spin_lock_irqsave(&p->pi_lock, *flags);
-                rq = task_rq(p);
-                raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-                        return rq;
-                raw_spin_unlock(&rq->lock);
-                raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
-                while (unlikely(task_on_rq_migrating(p)))
-                        cpu_relax();
-        }
-}
-
-static void __task_rq_unlock(struct rq *rq)
-        __releases(rq->lock)
-{
-        raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
-        __releases(rq->lock)
-        __releases(p->pi_lock)
-{
-        raw_spin_unlock(&rq->lock);
-        raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
-/*
  * this_rq_lock - lock this runqueue and disable interrupts.
  */
 static struct rq *this_rq_lock(void)
@@ -2899,7 +2839,7 @@ void __sched schedule_preempt_disabled(void)
         preempt_disable();
 }
 
-static void preempt_schedule_common(void)
+static void __sched notrace preempt_schedule_common(void)
 {
         do {
                 __preempt_count_add(PREEMPT_ACTIVE);
@@ -4418,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  */
-void __sched io_schedule(void)
-{
-        struct rq *rq = raw_rq();
-
-        delayacct_blkio_start();
-        atomic_inc(&rq->nr_iowait);
-        blk_flush_plug(current);
-        current->in_iowait = 1;
-        schedule();
-        current->in_iowait = 0;
-        atomic_dec(&rq->nr_iowait);
-        delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
 long __sched io_schedule_timeout(long timeout)
 {
-        struct rq *rq = raw_rq();
+        int old_iowait = current->in_iowait;
+        struct rq *rq;
         long ret;
 
+        current->in_iowait = 1;
+        if (old_iowait)
+                blk_schedule_flush_plug(current);
+        else
+                blk_flush_plug(current);
+
         delayacct_blkio_start();
+        rq = raw_rq();
         atomic_inc(&rq->nr_iowait);
-        blk_flush_plug(current);
-        current->in_iowait = 1;
         ret = schedule_timeout(timeout);
-        current->in_iowait = 0;
+        current->in_iowait = old_iowait;
         atomic_dec(&rq->nr_iowait);
         delayacct_blkio_end();
+
         return ret;
 }
+EXPORT_SYMBOL(io_schedule_timeout);
 
 /**
  * sys_sched_get_priority_max - return maximum RT priority.
@@ -7642,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 {
         struct task_struct *g, *p;
 
+        /*
+         * Autogroups do not have RT tasks; see autogroup_create().
+         */
+        if (task_group_is_autogroup(tg))
+                return 0;
+
         for_each_process_thread(g, p) {
                 if (rt_task(p) && task_group(p) == tg)
                         return 1;
@@ -7734,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 {
         int i, err = 0;
 
+        /*
+         * Disallowing the root group RT runtime is BAD, it would disallow the
+         * kernel creating (and or operating) RT threads.
+         */
+        if (tg == &root_task_group && rt_runtime == 0)
+                return -EINVAL;
+
+        /* No period doesn't make any sense. */
+        if (rt_period == 0)
+                return -EINVAL;
+
         mutex_lock(&rt_constraints_mutex);
         read_lock(&tasklist_lock);
         err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7790,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
         rt_period = (u64)rt_period_us * NSEC_PER_USEC;
         rt_runtime = tg->rt_bandwidth.rt_runtime;
 
-        if (rt_period == 0)
-                return -EINVAL;
-
         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
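
The io_schedule_timeout() rework in the core.c hunk above is, at heart, a re-entrancy guard: remember whether the task was already in iowait, take a flush path that cannot recurse when it was, and restore the saved state instead of unconditionally clearing it. A small user-space sketch of that save/branch/restore shape follows; every name in it (do_io_wait, flush_sync, flush_async, in_iowait) is a hypothetical analogue for illustration, not a kernel API.

/* Sketch of the save/branch/restore re-entrancy pattern behind the
 * io_schedule_timeout() fix.  All names are hypothetical analogues. */
#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool in_iowait;

static void flush_sync(void);           /* may re-enter do_io_wait() */
static void flush_async(void) { puts("  deferred flush (no recursion)"); }

static void do_io_wait(const char *who)
{
        bool old_iowait = in_iowait;    /* remember the caller's state */

        in_iowait = true;
        if (old_iowait)
                flush_async();          /* nested call: must not recurse */
        else
                flush_sync();           /* outermost call: full flush */

        printf("%s: sleeping for IO\n", who);

        in_iowait = old_iowait;         /* restore, don't clobber, the state */
}

static void flush_sync(void)
{
        puts("  synchronous flush");
        /* The flush itself may need to wait for IO and re-enter do_io_wait().
         * Because in_iowait is already set, the nested call takes the
         * non-recursing branch above instead of flushing again. */
        do_io_wait("nested");
}

int main(void)
{
        do_io_wait("outer");
        return 0;
}

Restoring old_iowait rather than writing 0 is what keeps a nested caller from silently clearing the outer caller's iowait accounting.
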
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a027799ae130..3fa8fa6d9403 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -511,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                                                      struct sched_dl_entity,
                                                      dl_timer);
         struct task_struct *p = dl_task_of(dl_se);
+        unsigned long flags;
         struct rq *rq;
-again:
-        rq = task_rq(p);
-        raw_spin_lock(&rq->lock);
 
-        if (rq != task_rq(p)) {
-                /* Task was moved, retrying. */
-                raw_spin_unlock(&rq->lock);
-                goto again;
-        }
+        rq = task_rq_lock(current, &flags);
 
         /*
          * We need to take care of several possible races here:
@@ -541,6 +535,26 @@ again:
 
         sched_clock_tick();
         update_rq_clock(rq);
+
+        /*
+         * If the throttle happened during sched-out; like:
+         *
+         *   schedule()
+         *     deactivate_task()
+         *       dequeue_task_dl()
+         *         update_curr_dl()
+         *           start_dl_timer()
+         *         __dequeue_task_dl()
+         *     prev->on_rq = 0;
+         *
+         * We can be both throttled and !queued. Replenish the counter
+         * but do not enqueue -- wait for our wakeup to do that.
+         */
+        if (!task_on_rq_queued(p)) {
+                replenish_dl_entity(dl_se, dl_se);
+                goto unlock;
+        }
+
         enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
         if (dl_task(rq->curr))
                 check_preempt_curr_dl(rq, p, 0);
@@ -555,7 +569,7 @@ again:
                 push_dl_task(rq);
 #endif
 unlock:
-        raw_spin_unlock(&rq->lock);
+        task_rq_unlock(rq, current, &flags);
 
         return HRTIMER_NORESTART;
 }
@@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
                 rq->curr->dl.dl_yielded = 1;
                 p->dl.runtime = 0;
         }
+        update_rq_clock(rq);
         update_curr_dl(rq);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0870db23d79c..dc0f435a2779 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1380,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
 
 extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
 
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+        __acquires(rq->lock)
+{
+        struct rq *rq;
+
+        lockdep_assert_held(&p->pi_lock);
+
+        for (;;) {
+                rq = task_rq(p);
+                raw_spin_lock(&rq->lock);
+                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+                        return rq;
+                raw_spin_unlock(&rq->lock);
+
+                while (unlikely(task_on_rq_migrating(p)))
+                        cpu_relax();
+        }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+        __acquires(p->pi_lock)
+        __acquires(rq->lock)
+{
+        struct rq *rq;
+
+        for (;;) {
+                raw_spin_lock_irqsave(&p->pi_lock, *flags);
+                rq = task_rq(p);
+                raw_spin_lock(&rq->lock);
+                /*
+                 *      move_queued_task()              task_rq_lock()
+                 *
+                 *      ACQUIRE (rq->lock)
+                 *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
+                 *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
+                 *      [S] ->cpu = new_cpu             [L] task_rq()
+                 *                                      [L] ->on_rq
+                 *      RELEASE (rq->lock)
+                 *
+                 * If we observe the old cpu in task_rq_lock, the acquire of
+                 * the old rq->lock will fully serialize against the stores.
+                 *
+                 * If we observe the new cpu in task_rq_lock, the acquire will
+                 * pair with the WMB to ensure we must then also see migrating.
+                 */
+                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+                        return rq;
+                raw_spin_unlock(&rq->lock);
+                raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+                while (unlikely(task_on_rq_migrating(p)))
+                        cpu_relax();
+        }
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+        __releases(rq->lock)
+{
+        raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+        __releases(rq->lock)
+        __releases(p->pi_lock)
+{
+        raw_spin_unlock(&rq->lock);
+        raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
 
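
The ordering comment added to task_rq_lock() above describes a publish/observe guarantee: the migration side publishes ->on_rq = MIGRATING before it publishes the new CPU, so a locker that observes the new CPU must also observe the migrating state and retry. The sketch below replays that litmus test with C11 atomics in user space; it is an illustration under stated assumptions (explicit release/acquire in place of the rq->lock ACQUIRE/RELEASE and the WMB in __set_task_cpu()), not the kernel's code.

/* Litmus-style analogue of the move_queued_task()/task_rq_lock() ordering:
 * the writer publishes MIGRATING before the new cpu, so a reader that sees
 * the new cpu also sees the migration in progress and retries.
 * User-space sketch only. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define QUEUED    1
#define MIGRATING 2

static atomic_int task_cpu = 0;
static atomic_int on_rq   = QUEUED;

static void *mover(void *arg)
{
        (void)arg;
        atomic_store_explicit(&on_rq, MIGRATING, memory_order_relaxed);
        /* release: the MIGRATING store above is visible to anyone who
         * observes the new cpu below */
        atomic_store_explicit(&task_cpu, 1, memory_order_release);
        /* ... requeue on the new cpu ... */
        atomic_store_explicit(&on_rq, QUEUED, memory_order_release);
        return NULL;
}

static void *locker(void *arg)
{
        (void)arg;
        for (;;) {
                int cpu = atomic_load_explicit(&task_cpu, memory_order_acquire);
                /* If the load above returned the new cpu, the acquire pairs
                 * with the mover's release, so the load below sees MIGRATING
                 * (or the later QUEUED), never a stale pre-migration picture. */
                if (atomic_load_explicit(&on_rq, memory_order_acquire) != MIGRATING) {
                        printf("stable: task on cpu %d\n", cpu);
                        return NULL;
                }
                /* migration in flight: retry, as task_rq_lock() does */
        }
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, mover, NULL);
        pthread_create(&b, NULL, locker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}
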