author		Ingo Molnar <mingo@elte.hu>	2006-06-27 05:54:51 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-06-27 20:32:46 -0400
commit		b29739f902ee76a05493fb7d2303490fc75364f4 (patch)
tree		1bf48dfb74752a7ef24a2a4a74c45da0aaec754b /kernel
parent		77ba89c5cf28d5d98a3cae17f67a3e42b102cc25 (diff)
[PATCH] pi-futex: scheduler support for pi
Add framework to boost/unboost the priority of RT tasks. This consists of:

 - caching the 'normal' priority in ->normal_prio
 - providing functions to set/get the priority of the task
 - making sched_setscheduler() aware of boosting

The effective_prio() cleanups also fix a priority-calculation bug in
set_user_nice(), pointed out by Andrey Gelman.

has_rt_policy() fix: Peter Williams <pwil3058@bigpond.net.au>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andrey Gelman <agelman@012.net.il>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c	189
1 file changed, 160 insertions(+), 29 deletions(-)
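This is the scheduler half of the pi-futex series; what user space eventually sees from it is priority-inheritance futexes. As a minimal sketch of the behaviour being enabled (this program is not part of the patch and assumes a glibc/kernel combination with PI-futex support), a POSIX mutex can request priority inheritance through the standard attribute API; with the boost/unboost framework below in place, a SCHED_FIFO thread blocking on such a lock boosts the lock owner instead of being starved behind medium-priority work:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t lock;

	pthread_mutexattr_init(&attr);
	/* Ask for priority inheritance on this mutex. */
	if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT) != 0) {
		fprintf(stderr, "PTHREAD_PRIO_INHERIT not supported here\n");
		return 1;
	}
	pthread_mutex_init(&lock, &attr);

	pthread_mutex_lock(&lock);
	/*
	 * Critical section: if a SCHED_FIFO thread blocks on 'lock' now,
	 * the kernel boosts this owner to the waiter's priority and
	 * unboosts it again at unlock time.
	 */
	pthread_mutex_unlock(&lock);

	pthread_mutex_destroy(&lock);
	pthread_mutexattr_destroy(&attr);
	return 0;
}

Build with -pthread; note that pthread_mutexattr_setprotocol() reports failure through its return value rather than errno.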
diff --git a/kernel/sched.c b/kernel/sched.c
index 15abf0833245..08431f07a999 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -355,6 +355,25 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+	__acquires(rq->lock)
+{
+	struct runqueue *rq;
+
+repeat_lock_task:
+	rq = task_rq(p);
+	spin_lock(&rq->lock);
+	if (unlikely(rq != task_rq(p))) {
+		spin_unlock(&rq->lock);
+		goto repeat_lock_task;
+	}
+	return rq;
+}
+
+/*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
@@ -375,6 +394,12 @@ repeat_lock_task:
 	return rq;
 }
 
+static inline void __task_rq_unlock(runqueue_t *rq)
+	__releases(rq->lock)
+{
+	spin_unlock(&rq->lock);
+}
+
 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 	__releases(rq->lock)
 {
@@ -638,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -651,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
 	int bonus, prio;
 
-	if (rt_task(p))
-		return p->prio;
-
 	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
 	prio = p->static_prio - bonus;
@@ -692,7 +715,7 @@ static int effective_prio(task_t *p)
 
 static void set_load_weight(task_t *p)
 {
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 #ifdef CONFIG_SMP
 		if (p == task_rq(p)->migration_thread)
 			/*
@@ -731,6 +754,44 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq)
 }
 
 /*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+	int prio;
+
+	if (has_rt_policy(p))
+		prio = MAX_RT_PRIO-1 - p->rt_priority;
+	else
+		prio = __normal_prio(p);
+	return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are RT tasks or we were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update priority
+	 * to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
+/*
  * __activate_task - move a task to the runqueue.
  */
 static void __activate_task(task_t *p, runqueue_t *rq)
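As a worked example of the mapping normal_prio() introduces, assuming the usual constants MAX_RT_PRIO = 100 and MAX_PRIO = 140 (they are not restated in this hunk): an RT task with rt_priority 1..99 ends up with kernel prio 98..0 (lower value means higher priority), while SCHED_NORMAL/SCHED_BATCH tasks land in 100..139 via __normal_prio(). A stand-alone restatement of that arithmetic, purely for illustration:

#include <stdio.h>

/* Conventional kernel values, assumed here for the illustration. */
#define MAX_RT_PRIO	100
#define MAX_PRIO	140

/* Mirror of the RT branch of normal_prio(): rt_priority 1..99 -> prio 98..0 */
static int rt_to_prio(int rt_priority)
{
	return MAX_RT_PRIO - 1 - rt_priority;
}

int main(void)
{
	printf("rt_priority  1 -> prio %d\n", rt_to_prio(1));	/* 98 */
	printf("rt_priority 50 -> prio %d\n", rt_to_prio(50));	/* 49 */
	printf("rt_priority 99 -> prio %d\n", rt_to_prio(99));	/*  0 */
	return 0;
}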
@@ -752,6 +813,10 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 	inc_nr_running(p, rq);
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
@@ -1448,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
 	p->state = TASK_RUNNING;
+
+	/*
+	 * Make sure we do not leak PI boosting priority to the child:
+	 */
+	p->prio = current->normal_prio;
+
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
 #ifdef CONFIG_SCHEDSTATS
@@ -1527,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 			__activate_task(p, rq);
 		else {
 			p->prio = current->prio;
+			p->normal_prio = current->normal_prio;
 			list_add_tail(&p->run_list, &current->run_list);
 			p->array = current->array;
 			p->array->nr_active++;
@@ -3668,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int oldprio;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+
+	oldprio = p->prio;
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+	p->prio = prio;
+
+	if (array) {
+		/*
+		 * If changing to an RT priority then queue it
+		 * in the active array!
+		 */
+		if (rt_task(p))
+			array = rq->active;
+		enqueue_task(p, array);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+#endif
+
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
 	prio_array_t *array;
 	runqueue_t *rq;
-	int old_prio, new_prio, delta;
+	int old_prio, delta;
 
 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
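rt_mutex_setprio() deliberately touches only p->prio and leaves ->normal_prio alone, so that rt_mutex_getprio() (used by __setscheduler() later in this diff) can fall back to the unboosted value. A purely hypothetical sketch of how a PI caller might use the pair, written kernel-style and not compilable on its own; the real logic lives in the rt_mutex code elsewhere in this series, and the example_* names are invented here for illustration:

/* Hypothetical illustration only, not part of this patch. */
static void example_boost_owner(task_t *owner, task_t *waiter)
{
	/* Lower numeric prio means higher priority. */
	if (waiter->prio < owner->prio)
		rt_mutex_setprio(owner, waiter->prio);
}

static void example_unboost_owner(task_t *owner)
{
	/* With no boosting waiters left, getprio falls back to ->normal_prio. */
	rt_mutex_setprio(owner, rt_mutex_getprio(owner));
}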
@@ -3688,7 +3813,7 @@ void set_user_nice(task_t *p, long nice)
 	 * it wont have any effect on scheduling until the task is
 	 * not SCHED_NORMAL/SCHED_BATCH:
 	 */
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
@@ -3698,12 +3823,11 @@ void set_user_nice(task_t *p, long nice)
 		dec_raw_weighted_load(rq, p);
 	}
 
-	old_prio = p->prio;
-	new_prio = NICE_TO_PRIO(nice);
-	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
-	p->prio += delta;
+	old_prio = p->prio;
+	p->prio = effective_prio(p);
+	delta = p->prio - old_prio;
 
 	if (array) {
 		enqueue_task(p, array);
@@ -3718,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
 out_unlock:
 	task_rq_unlock(rq, &flags);
 }
-
 EXPORT_SYMBOL(set_user_nice);
 
 /*
@@ -3833,16 +3956,14 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	} else {
-		p->prio = p->static_prio;
-		/*
-		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-		 */
-		if (policy == SCHED_BATCH)
-			p->sleep_avg = 0;
-	}
+	p->normal_prio = normal_prio(p);
+	/* we are holding p->pi_lock already */
+	p->prio = rt_mutex_getprio(p);
+	/*
+	 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+	 */
+	if (policy == SCHED_BATCH)
+		p->sleep_avg = 0;
 	set_load_weight(p);
 }
 
@@ -3912,14 +4033,20 @@ recheck:
 	if (retval)
 		return retval;
 	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 */
+	spin_lock_irqsave(&p->pi_lock, flags);
+	/*
 	 * To be able to change p->policy safely, the apropriate
 	 * runqueue lock must be held.
 	 */
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
 	array = p->array;
@@ -3940,7 +4067,9 @@ recheck:
 		} else if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
 	}
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
+	spin_unlock_irqrestore(&p->pi_lock, flags);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -4575,7 +4704,7 @@ void __devinit init_idle(task_t *idle, int cpu)
 	idle->timestamp = sched_clock();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
-	idle->prio = MAX_PRIO;
+	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
@@ -6582,7 +6711,8 @@ void normalize_rt_tasks(void)
 		if (!rt_task(p))
 			continue;
 
-		rq = task_rq_lock(p, &flags);
+		spin_lock_irqsave(&p->pi_lock, flags);
+		rq = __task_rq_lock(p);
 
 		array = p->array;
 		if (array)
@@ -6593,7 +6723,8 @@ void normalize_rt_tasks(void)
 			resched_task(rq->curr);
 		}
 
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 	}
 	read_unlock_irq(&tasklist_lock);
 }