author     Ingo Molnar <mingo@elte.hu>            2006-06-27 05:54:51 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>  2006-06-27 20:32:46 -0400
commit     b29739f902ee76a05493fb7d2303490fc75364f4 (patch)
tree       1bf48dfb74752a7ef24a2a4a74c45da0aaec754b /kernel
parent     77ba89c5cf28d5d98a3cae17f67a3e42b102cc25 (diff)
[PATCH] pi-futex: scheduler support for pi
Add framework to boost/unboost the priority of RT tasks.
This consists of:
- caching the 'normal' priority in ->normal_prio
- providing functions to set/get the priority of the task
- making sched_setscheduler() aware of boosting
The effective_prio() cleanups also fix a priority-calculation bug in
set_user_nice() that was pointed out by Andrey Gelman.
has_rt_policy() fix: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andrey Gelman <agelman@012.net.il>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
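
To make the intent of the new scheduler hook concrete, the sketch below shows how a priority-inheritance lock implementation would be expected to drive rt_mutex_setprio(): boost the lock owner to the top waiter's priority when that waiter blocks, and drop the owner back to ->normal_prio once the boost is no longer needed. This is an illustrative sketch only; the helper names (pi_boost_owner(), pi_unboost_owner()) and the owner/waiter bookkeeping are assumptions, not part of this patch -- the actual rt_mutex code arrives later in the pi-futex series.

/*
 * Illustrative sketch (not part of this patch): expected callers of
 * rt_mutex_setprio().  Lower numeric prio means higher priority.
 */
static void pi_boost_owner(task_t *owner, task_t *waiter)
{
        /* Boost only if the waiter has higher priority than the owner. */
        if (waiter->prio < owner->prio)
                rt_mutex_setprio(owner, waiter->prio);
}

static void pi_unboost_owner(task_t *owner)
{
        /*
         * When the boost goes away, return to the priority the task would
         * have without inheritance; ->normal_prio is kept up to date by
         * effective_prio() and __setscheduler() in the patch below.
         */
        if (owner->prio != owner->normal_prio)
                rt_mutex_setprio(owner, owner->normal_prio);
}
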
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/sched.c | 189
1 file changed, 160 insertions, 29 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 15abf0833245..08431f07a999 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -355,6 +355,25 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+	__acquires(rq->lock)
+{
+	struct runqueue *rq;
+
+repeat_lock_task:
+	rq = task_rq(p);
+	spin_lock(&rq->lock);
+	if (unlikely(rq != task_rq(p))) {
+		spin_unlock(&rq->lock);
+		goto repeat_lock_task;
+	}
+	return rq;
+}
+
+/*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
@@ -375,6 +394,12 @@ repeat_lock_task:
 	return rq;
 }
 
+static inline void __task_rq_unlock(runqueue_t *rq)
+	__releases(rq->lock)
+{
+	spin_unlock(&rq->lock);
+}
+
 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 	__releases(rq->lock)
 {
@@ -638,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -651,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
 	int bonus, prio;
 
-	if (rt_task(p))
-		return p->prio;
-
 	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
 	prio = p->static_prio - bonus;
@@ -692,7 +715,7 @@ static int effective_prio(task_t *p)
 
 static void set_load_weight(task_t *p)
 {
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 #ifdef CONFIG_SMP
 		if (p == task_rq(p)->migration_thread)
 			/*
@@ -731,6 +754,44 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq)
 }
 
 /*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+	int prio;
+
+	if (has_rt_policy(p))
+		prio = MAX_RT_PRIO-1 - p->rt_priority;
+	else
+		prio = __normal_prio(p);
+	return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are RT tasks or we were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update priority
+	 * to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
+/*
  * __activate_task - move a task to the runqueue.
  */
 static void __activate_task(task_t *p, runqueue_t *rq)
@@ -752,6 +813,10 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 	inc_nr_running(p, rq);
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
@@ -1448,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
 	p->state = TASK_RUNNING;
+
+	/*
+	 * Make sure we do not leak PI boosting priority to the child:
+	 */
+	p->prio = current->normal_prio;
+
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
 #ifdef CONFIG_SCHEDSTATS
@@ -1527,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 		__activate_task(p, rq);
 	else {
 		p->prio = current->prio;
+		p->normal_prio = current->normal_prio;
 		list_add_tail(&p->run_list, &current->run_list);
 		p->array = current->array;
 		p->array->nr_active++;
@@ -3668,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int oldprio;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+
+	oldprio = p->prio;
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+	p->prio = prio;
+
+	if (array) {
+		/*
+		 * If changing to an RT priority then queue it
+		 * in the active array!
+		 */
+		if (rt_task(p))
+			array = rq->active;
+		enqueue_task(p, array);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+#endif
+
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
 	prio_array_t *array;
 	runqueue_t *rq;
-	int old_prio, new_prio, delta;
+	int old_prio, delta;
 
 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
@@ -3688,7 +3813,7 @@ void set_user_nice(task_t *p, long nice)
 	 * it wont have any effect on scheduling until the task is
 	 * not SCHED_NORMAL/SCHED_BATCH:
 	 */
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
@@ -3698,12 +3823,11 @@ void set_user_nice(task_t *p, long nice)
 		dec_raw_weighted_load(rq, p);
 	}
 
-	old_prio = p->prio;
-	new_prio = NICE_TO_PRIO(nice);
-	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
-	p->prio += delta;
+	old_prio = p->prio;
+	p->prio = effective_prio(p);
+	delta = p->prio - old_prio;
 
 	if (array) {
 		enqueue_task(p, array);
@@ -3718,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
 out_unlock:
 	task_rq_unlock(rq, &flags);
 }
-
 EXPORT_SYMBOL(set_user_nice);
 
 /*
@@ -3833,16 +3956,14 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	} else {
-		p->prio = p->static_prio;
-		/*
-		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-		 */
-		if (policy == SCHED_BATCH)
-			p->sleep_avg = 0;
-	}
+	p->normal_prio = normal_prio(p);
+	/* we are holding p->pi_lock already */
+	p->prio = rt_mutex_getprio(p);
+	/*
+	 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+	 */
+	if (policy == SCHED_BATCH)
+		p->sleep_avg = 0;
 	set_load_weight(p);
 }
 
@@ -3912,14 +4033,20 @@ recheck:
 	if (retval)
 		return retval;
 	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 */
+	spin_lock_irqsave(&p->pi_lock, flags);
+	/*
 	 * To be able to change p->policy safely, the apropriate
 	 * runqueue lock must be held.
 	 */
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
 	array = p->array;
@@ -3940,7 +4067,9 @@ recheck:
 		} else if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
 	}
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
+	spin_unlock_irqrestore(&p->pi_lock, flags);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -4575,7 +4704,7 @@ void __devinit init_idle(task_t *idle, int cpu)
 	idle->timestamp = sched_clock();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
-	idle->prio = MAX_PRIO;
+	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
@@ -6582,7 +6711,8 @@ void normalize_rt_tasks(void)
 		if (!rt_task(p))
 			continue;
 
-		rq = task_rq_lock(p, &flags);
+		spin_lock_irqsave(&p->pi_lock, flags);
+		rq = __task_rq_lock(p);
 
 		array = p->array;
 		if (array)
@@ -6593,7 +6723,8 @@ void normalize_rt_tasks(void)
 			resched_task(rq->curr);
 		}
 
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 	}
 	read_unlock_irq(&tasklist_lock);
 }
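
A note on the locking pattern this patch establishes for priority changes: sites that modify p->prio outside of rt_mutex_setprio() (sched_setscheduler() and normalize_rt_tasks() above) now take p->pi_lock with interrupts disabled before acquiring the runqueue lock via __task_rq_lock(), so PI waiters cannot arrive or leave mid-update. Below is a minimal sketch of that ordering; change_prio_locked() is a hypothetical helper standing in for the real call sites, not kernel code.

/*
 * Sketch only: the pi_lock -> rq->lock ordering introduced by the patch.
 */
static void change_prio_locked(task_t *p, int new_prio)
{
        unsigned long flags;
        runqueue_t *rq;

        spin_lock_irqsave(&p->pi_lock, flags);  /* keep PI waiters stable */
        rq = __task_rq_lock(p);                 /* interrupts already off */

        /* ... dequeue, update p->prio / p->normal_prio, requeue, resched ... */

        __task_rq_unlock(rq);
        spin_unlock_irqrestore(&p->pi_lock, flags);
}
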