path: root/kernel/sched_rt.c
author      Gregory Haskins <ghaskins@novell.com>   2008-05-12 15:20:41 -0400
committer   Ingo Molnar <mingo@elte.hu>             2008-06-06 09:19:25 -0400
commit      45c01e824991b2dd0a332e19efc4901acb31209f (patch)
tree        20d889b97636410c6ae425ff5dec90479d8cbb1a /kernel/sched_rt.c
parent      39b945a37bac2b692773a470890c8ba301485b15 (diff)
sched: prioritize non-migratable tasks over migratable ones
Dmitry Adamushko pointed out a known flaw in the rt-balancing algorithm
that could allow suboptimal balancing if a non-migratable task gets
queued behind a running migratable one. It is discussed in this thread:

  http://lkml.org/lkml/2008/4/22/296

This issue has been further exacerbated by a recent checkin to
sched-devel (git-id 5eee63a5ebc19a870ac40055c0be49457f3a89a3).

From a pure priority standpoint, the run-queue is doing the "right"
thing. Using Dmitry's nomenclature, if T0 is on cpu1 first, and T1 wakes
up at equal or lower priority (affined only to cpu1) later, it *should*
wait for T0 to finish. However, in reality that is likely suboptimal
from a system perspective if there are other cores that could allow T0
and T1 to run concurrently. Since T1 cannot migrate, the only choice for
higher concurrency is to try to move T0. This is not something we
addressed in the recent rt-balancing re-work.

This patch tries to enhance the balancing algorithm by accommodating
this scenario. It accomplishes this by incorporating the migratability
of a task into its priority calculation. Within a numerical tsk->prio, a
non-migratable task is logically higher than a migratable one. We
maintain this by introducing a new per-priority queue (xqueue, or
exclusive-queue) for holding non-migratable tasks. The scheduler will
draw from the xqueue over the standard shared-queue (squeue) when
available.

There are several details for utilizing this properly:

1) During task wake-up, we not only need to check if the priority
   preempts the current task, but we also need to check for this
   non-migratable condition. Therefore, if a non-migratable task wakes
   up and sees an equal-priority migratable task already running, it
   will attempt to preempt it *if* there is a likelihood that the
   current task will find an immediate home.

2) Tasks only get this non-migratable "priority boost" on wake-up. Any
   requeuing will result in the non-migratable task being queued to the
   end of the shared queue. This is an attempt to prevent the system
   from being completely unfair to migratable tasks during things like
   SCHED_RR timeslicing.

I am sure this patch introduces potentially "odd" behavior if you
concoct a scenario where a bunch of non-migratable threads could starve
migratable ones given the right pattern. I am not yet convinced that
this is a problem, since we are talking about tasks of equal RT priority
anyway, and there never is much in the way of guarantees against
starvation under that scenario anyway. (E.g. you could come up with a
similar scenario with a specific timing environment versus an affinity
environment.) I can be convinced otherwise, but for now I think this is
"ok".

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
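[Editor's note] For readers skimming the commit message without the patch below, here is a minimal user-space sketch of the two-queue idea. It is not kernel code: struct task, struct prio_array, push_tail() and pick_next() are invented stand-ins for the kernel's sched_rt_entity, rt_prio_array, list_add_tail() and pick_next_rt_entity(); only the routing rule taken from the patch is modeled (pinned tasks go to the xqueue, everything else to the squeue, and the xqueue is drained first at each priority).

/*
 * Toy user-space sketch of the xqueue/squeue idea from this patch.
 * NOT kernel code: plain singly-linked lists stand in for list_head,
 * and a tiny priority range stands in for MAX_RT_PRIO.
 */
#include <stdio.h>
#include <string.h>

#define NPRIO 4

struct task {
	char name[16];
	int prio;		/* lower value = higher priority */
	int nr_cpus_allowed;
	struct task *next;
};

struct prio_array {
	struct task *xqueue[NPRIO];	/* non-migratable (exclusive) tasks */
	struct task *squeue[NPRIO];	/* migratable (shared) tasks */
};

/* Append a task to the tail of a singly-linked queue. */
static void push_tail(struct task **head, struct task *t)
{
	t->next = NULL;
	while (*head)
		head = &(*head)->next;
	*head = t;
}

/* Route by migratability, as enqueue_rt_entity() does in the patch. */
static void enqueue(struct prio_array *a, struct task *t)
{
	if (t->nr_cpus_allowed == 1)
		push_tail(&a->xqueue[t->prio], t);
	else
		push_tail(&a->squeue[t->prio], t);
}

/* Prefer the exclusive queue at the highest populated priority. */
static struct task *pick_next(struct prio_array *a)
{
	int idx;

	for (idx = 0; idx < NPRIO; idx++) {
		if (a->xqueue[idx])
			return a->xqueue[idx];
		if (a->squeue[idx])
			return a->squeue[idx];
	}
	return NULL;
}

int main(void)
{
	struct prio_array array;
	struct task t0 = { "T0-migratable", 1, 4, NULL };
	struct task t1 = { "T1-pinned",     1, 1, NULL };

	memset(&array, 0, sizeof(array));
	enqueue(&array, &t0);	/* arrives first, same priority */
	enqueue(&array, &t1);	/* pinned to one cpu, arrives later */

	/* The pinned task wins the priority tie despite arriving later. */
	printf("next: %s\n", pick_next(&array)->name);
	return 0;
}

Running this prints "next: T1-pinned": the pinned task wins the tie even though it arrived second, which is the enqueue/pick half of the change. The wake-up preemption half described in detail (1), including the cpupri check that decides whether current can actually be pushed elsewhere, is intentionally left out of this sketch.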
Diffstat (limited to 'kernel/sched_rt.c')
-rw-r--r--   kernel/sched_rt.c   75
1 files changed, 68 insertions, 7 deletions
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3432d573205d..fefed39fafd8 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -458,7 +458,13 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 	if (group_rq && rt_rq_throttled(group_rq))
 		return;
 
-	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	if (rt_se->nr_cpus_allowed == 1)
+		list_add_tail(&rt_se->run_list,
+			      array->xqueue + rt_se_prio(rt_se));
+	else
+		list_add_tail(&rt_se->run_list,
+			      array->squeue + rt_se_prio(rt_se));
+
 	__set_bit(rt_se_prio(rt_se), array->bitmap);
 
 	inc_rt_tasks(rt_se, rt_rq);
@@ -470,7 +476,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 	struct rt_prio_array *array = &rt_rq->active;
 
 	list_del_init(&rt_se->run_list);
-	if (list_empty(array->queue + rt_se_prio(rt_se)))
+	if (list_empty(array->squeue + rt_se_prio(rt_se))
+	    && list_empty(array->xqueue + rt_se_prio(rt_se)))
 		__clear_bit(rt_se_prio(rt_se), array->bitmap);
 
 	dec_rt_tasks(rt_se, rt_rq);
@@ -537,13 +544,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
+ *
+ * Note: We always enqueue the task to the shared-queue, regardless of its
+ * previous position w.r.t. exclusive vs shared.  This is so that exclusive RR
+ * tasks fairly round-robin with all tasks on the runqueue, not just other
+ * exclusive tasks.
  */
 static
 void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 {
 	struct rt_prio_array *array = &rt_rq->active;
 
-	list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	list_del_init(&rt_se->run_list);
+	list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se));
 }
 
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -601,13 +614,46 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 }
 #endif /* CONFIG_SMP */
 
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+						   struct rt_rq *rt_rq);
+
 /*
  * Preempt the current task with a newly woken task if needed:
  */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 {
-	if (p->prio < rq->curr->prio)
+	if (p->prio < rq->curr->prio) {
 		resched_task(rq->curr);
+		return;
+	}
+
+#ifdef CONFIG_SMP
+	/*
+	 * If:
+	 *
+	 * - the newly woken task is of equal priority to the current task
+	 * - the newly woken task is non-migratable while current is migratable
+	 * - current will be preempted on the next reschedule
+	 *
+	 * we should check to see if current can readily move to a different
+	 * cpu.  If so, we will reschedule to allow the push logic to try
+	 * to move current somewhere else, making room for our non-migratable
+	 * task.
+	 */
+	if((p->prio == rq->curr->prio)
+	   && p->rt.nr_cpus_allowed == 1
+	   && rq->curr->rt.nr_cpus_allowed != 1
+	   && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) {
+		cpumask_t mask;
+
+		if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+			/*
+			 * There appears to be other cpus that can accept
+			 * current, so lets reschedule to try and push it away
+			 */
+			resched_task(rq->curr);
+	}
+#endif
 }
 
 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -621,8 +667,15 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	idx = sched_find_first_bit(array->bitmap);
 	BUG_ON(idx >= MAX_RT_PRIO);
 
-	queue = array->queue + idx;
-	next = list_entry(queue->next, struct sched_rt_entity, run_list);
+	queue = array->xqueue + idx;
+	if (!list_empty(queue))
+		next = list_entry(queue->next, struct sched_rt_entity,
+				  run_list);
+	else {
+		queue = array->squeue + idx;
+		next = list_entry(queue->next, struct sched_rt_entity,
+				  run_list);
+	}
 
 	return next;
 }
@@ -692,7 +745,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 			continue;
 		if (next && next->prio < idx)
 			continue;
-		list_for_each_entry(rt_se, array->queue + idx, run_list) {
+		list_for_each_entry(rt_se, array->squeue + idx, run_list) {
 			struct task_struct *p = rt_task_of(rt_se);
 			if (pick_rt_task(rq, p, cpu)) {
 				next = p;
@@ -1146,6 +1199,14 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 		}
 
 		update_rt_migration(rq);
+
+		if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1))
+			/*
+			 * If either the new or old weight is a "1", we need
+			 * to requeue to properly move between shared and
+			 * exclusive queues.
+			 */
+			requeue_task_rt(rq, p);
 	}
 
 	p->cpus_allowed = *new_mask;