-rw-r--r--  include/linux/init_task.h |   2
-rw-r--r--  include/linux/sched.h     |  21
-rw-r--r--  kernel/sched.c            | 189
3 files changed, 181 insertions, 31 deletions
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e127ef7e8da8..678c1a90380d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -87,6 +87,7 @@ extern struct group_info init_groups;
 	.lock_depth	= -1,					\
 	.prio		= MAX_PRIO-20,				\
 	.static_prio	= MAX_PRIO-20,				\
+	.normal_prio	= MAX_PRIO-20,				\
 	.policy		= SCHED_NORMAL,				\
 	.cpus_allowed	= CPU_MASK_ALL,				\
 	.mm		= NULL,					\
@@ -122,6 +123,7 @@ extern struct group_info init_groups;
 	.journal_info	= NULL,					\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),	\
 	.fs_excl	= ATOMIC_INIT(0),			\
+	.pi_lock	= SPIN_LOCK_UNLOCKED,			\
 }
 
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0bc81a151e50..6f167645e7e2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -495,8 +495,11 @@ struct signal_struct {
 
 #define MAX_PRIO	(MAX_RT_PRIO + 40)
 
-#define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
+#define has_rt_policy(p) \
+	unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -725,7 +728,7 @@ struct task_struct {
 #endif
 #endif
 	int load_weight;	/* for niceness load balancing purposes */
-	int prio, static_prio;
+	int prio, static_prio, normal_prio;
 	struct list_head run_list;
 	prio_array_t *array;
 
@@ -852,6 +855,9 @@ struct task_struct {
 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
 	spinlock_t alloc_lock;
 
+	/* Protection of the PI data structures: */
+	spinlock_t pi_lock;
+
 #ifdef CONFIG_DEBUG_MUTEXES
 	/* mutex deadlock detection */
 	struct mutex_waiter *blocked_on;
@@ -1018,6 +1024,17 @@ static inline void idle_task_exit(void) {}
 #endif
 
 extern void sched_idle_next(void);
+
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(task_t *p);
+extern void rt_mutex_setprio(task_t *p, int prio);
+#else
+static inline int rt_mutex_getprio(task_t *p)
+{
+	return p->normal_prio;
+}
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(const task_t *p);
 extern int task_nice(const task_t *p);
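
The header changes above split the old rt_task() test in two: rt_prio() checks a priority number (which, after this series, can be boosted into the RT range by priority inheritance), while has_rt_policy() checks the scheduling policy and is therefore unaffected by boosting. A minimal user-space sketch of that distinction, assuming the usual value MAX_RT_PRIO == 100 and the standard SCHED_* policy ordering (both assumptions, not shown in this hunk):

/* Editorial sketch, not kernel code: models the rt_prio()/rt_task()/
 * has_rt_policy() distinction for a PI-boosted SCHED_NORMAL task. */
#include <stdio.h>

#define MAX_RT_PRIO	100			/* assumed value */
#define MAX_PRIO	(MAX_RT_PRIO + 40)	/* 140, as in sched.h */

enum { SCHED_NORMAL, SCHED_FIFO, SCHED_RR, SCHED_BATCH };	/* assumed order */

struct task {
	int policy;
	int prio;		/* effective priority, may be PI-boosted */
	int static_prio;	/* nice-derived priority */
	int normal_prio;	/* priority without PI boosting */
	int rt_priority;	/* user-visible RT priority */
};

static int rt_prio(int prio)			{ return prio < MAX_RT_PRIO; }
static int rt_task(const struct task *p)	{ return rt_prio(p->prio); }
static int has_rt_policy(const struct task *p)
{
	return p->policy != SCHED_NORMAL && p->policy != SCHED_BATCH;
}

int main(void)
{
	/* A SCHED_NORMAL task temporarily boosted into the RT range: */
	struct task p = { .policy = SCHED_NORMAL, .prio = 60,
			  .static_prio = 120, .normal_prio = 120 };

	printf("rt_task=%d has_rt_policy=%d\n", rt_task(&p), has_rt_policy(&p));
	/* prints: rt_task=1 has_rt_policy=0 */
	return 0;
}

This is why set_load_weight() and set_user_nice() in kernel/sched.c below switch from rt_task() to has_rt_policy(): a PI-boosted SCHED_NORMAL task must still be weighted and re-niced as a normal task.
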
diff --git a/kernel/sched.c b/kernel/sched.c
index 15abf0833245..08431f07a999 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -355,6 +355,25 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+	__acquires(rq->lock)
+{
+	struct runqueue *rq;
+
+repeat_lock_task:
+	rq = task_rq(p);
+	spin_lock(&rq->lock);
+	if (unlikely(rq != task_rq(p))) {
+		spin_unlock(&rq->lock);
+		goto repeat_lock_task;
+	}
+	return rq;
+}
+
+/*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
@@ -375,6 +394,12 @@ repeat_lock_task:
 	return rq;
 }
 
+static inline void __task_rq_unlock(runqueue_t *rq)
+	__releases(rq->lock)
+{
+	spin_unlock(&rq->lock);
+}
+
 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 	__releases(rq->lock)
 {
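
__task_rq_lock() is the interrupts-already-disabled counterpart of the existing task_rq_lock(): it looks up the task's runqueue without holding any lock, takes that runqueue's lock, and then re-checks that the task was not migrated in the meantime, retrying if it was. A rough pthreads model of that lock-and-revalidate loop (illustrative only; user space has no analogue of the interrupts-disabled requirement):

/* Editorial sketch of the lock-and-revalidate idiom in __task_rq_lock(). */
#include <pthread.h>
#include <stdatomic.h>

struct runqueue {
	pthread_mutex_t lock;
	/* ... per-CPU queue state ... */
};

struct task {
	_Atomic(struct runqueue *) rq;	/* queue the task currently resides on */
};

static struct runqueue *model_task_rq_lock(struct task *p)
{
	struct runqueue *rq;

	for (;;) {
		rq = atomic_load(&p->rq);
		pthread_mutex_lock(&rq->lock);
		/* The task may have migrated while we waited for the lock;
		 * if so, drop the now-wrong lock and try again. */
		if (rq == atomic_load(&p->rq))
			return rq;
		pthread_mutex_unlock(&rq->lock);
	}
}
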
@@ -638,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -651,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
 	int bonus, prio;
 
-	if (rt_task(p))
-		return p->prio;
-
 	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
 	prio = p->static_prio - bonus;
@@ -692,7 +715,7 @@ static int effective_prio(task_t *p)
 
 static void set_load_weight(task_t *p)
 {
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 #ifdef CONFIG_SMP
 		if (p == task_rq(p)->migration_thread)
 			/*
@@ -731,6 +754,44 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq)
 }
 
 /*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+	int prio;
+
+	if (has_rt_policy(p))
+		prio = MAX_RT_PRIO-1 - p->rt_priority;
+	else
+		prio = __normal_prio(p);
+	return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are RT tasks or we were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update priority
+	 * to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
+/*
  * __activate_task - move a task to the runqueue.
  */
 static void __activate_task(task_t *p, runqueue_t *rq)
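
The patch splits priority calculation into three layers: __normal_prio() does the interactivity-bonus math, normal_prio() maps RT policies into the 0..MAX_RT_PRIO-1 band and everything else to __normal_prio(), and effective_prio() refreshes p->normal_prio but leaves an RT-boosted p->prio untouched. A condensed user-space model of how the three relate (the interactivity bonus is reduced to a pass-through here, and MAX_RT_PRIO is again assumed to be 100):

/* Editorial sketch, not kernel code: the normal_prio()/effective_prio() split. */
#include <stdio.h>

#define MAX_RT_PRIO	100	/* assumed value */

struct task {
	int has_rt_policy;	/* stand-in for the policy check */
	int rt_priority;	/* user-visible RT priority */
	int static_prio;	/* nice-derived priority */
	int prio;		/* effective priority, possibly PI-boosted */
	int normal_prio;	/* priority without PI boosting */
};

static int __normal_prio(const struct task *p)
{
	return p->static_prio;	/* interactivity bonus omitted in this model */
}

static int normal_prio(const struct task *p)
{
	if (p->has_rt_policy)
		return MAX_RT_PRIO - 1 - p->rt_priority;
	return __normal_prio(p);
}

static int effective_prio(struct task *p)
{
	p->normal_prio = normal_prio(p);
	/* A task boosted into the RT range keeps its boosted prio;
	 * everyone else follows normal_prio. */
	if (p->prio >= MAX_RT_PRIO)
		return p->normal_prio;
	return p->prio;
}

int main(void)
{
	struct task p = { .static_prio = 120, .prio = 60 };	/* PI-boosted */

	printf("effective=%d normal=%d\n", effective_prio(&p), p.normal_prio);
	/* prints: effective=60 normal=120 */
	return 0;
}

The write to p->normal_prio is deliberate: every path that recomputes the effective priority also refreshes the un-boosted value that rt_mutex_getprio() falls back to.
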
@@ -752,6 +813,10 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 	inc_nr_running(p, rq);
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
@@ -1448,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
 	p->state = TASK_RUNNING;
+
+	/*
+	 * Make sure we do not leak PI boosting priority to the child:
+	 */
+	p->prio = current->normal_prio;
+
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
 #ifdef CONFIG_SCHEDSTATS
@@ -1527,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 		__activate_task(p, rq);
 	else {
 		p->prio = current->prio;
+		p->normal_prio = current->normal_prio;
 		list_add_tail(&p->run_list, &current->run_list);
 		p->array = current->array;
 		p->array->nr_active++;
@@ -3668,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int oldprio;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+
+	oldprio = p->prio;
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+	p->prio = prio;
+
+	if (array) {
+		/*
+		 * If changing to an RT priority then queue it
+		 * in the active array!
+		 */
+		if (rt_task(p))
+			array = rq->active;
+		enqueue_task(p, array);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+#endif
+
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
 	prio_array_t *array;
 	runqueue_t *rq;
-	int old_prio, new_prio, delta;
+	int old_prio, delta;
 
 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
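
rt_mutex_setprio() only rewrites p->prio (and requeues the task); p->normal_prio is left alone so the boost can later be undone. The rt-mutex code that drives this hook is not part of this patch, but the intended use is roughly "the lock owner runs at the highest priority among itself and its waiters". A self-contained model of that boost/restore cycle (the pi_lock_model structure and helper names below are illustrative only, not kernel API):

/* Editorial sketch: conceptual PI boost/restore driving a setprio hook. */
#include <stdio.h>

struct task {
	int prio;		/* effective (possibly boosted) priority */
	int normal_prio;	/* priority without boosting */
};

struct pi_lock_model {
	struct task *owner;
	struct task *waiters[8];
	int nr_waiters;
};

/* Lower number == higher priority, as in the kernel. */
static void setprio_model(struct task *p, int prio)
{
	p->prio = prio;	/* the real hook also requeues and reschedules */
}

static void adjust_owner_prio(struct pi_lock_model *l)
{
	int best = l->owner->normal_prio;

	for (int i = 0; i < l->nr_waiters; i++)
		if (l->waiters[i]->prio < best)
			best = l->waiters[i]->prio;
	setprio_model(l->owner, best);	/* boost, or restore to normal_prio */
}

int main(void)
{
	struct task owner  = { .prio = 120, .normal_prio = 120 };
	struct task waiter = { .prio = 60,  .normal_prio = 60 };
	struct pi_lock_model l = { .owner = &owner,
				   .waiters = { &waiter }, .nr_waiters = 1 };

	adjust_owner_prio(&l);		/* owner boosted to 60 */
	printf("boosted owner prio: %d\n", owner.prio);

	l.nr_waiters = 0;		/* waiter got the lock or gave up */
	adjust_owner_prio(&l);		/* owner back to normal_prio */
	printf("restored owner prio: %d\n", owner.prio);
	return 0;
}

When the last waiter goes away, the same adjustment naturally restores the owner to its normal_prio, which is why the scheduler now tracks that value per task.
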
@@ -3688,7 +3813,7 @@ void set_user_nice(task_t *p, long nice)
 	 * it wont have any effect on scheduling until the task is
 	 * not SCHED_NORMAL/SCHED_BATCH:
 	 */
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
@@ -3698,12 +3823,11 @@ void set_user_nice(task_t *p, long nice)
 		dec_raw_weighted_load(rq, p);
 	}
 
-	old_prio = p->prio;
-	new_prio = NICE_TO_PRIO(nice);
-	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
-	p->prio += delta;
+	old_prio = p->prio;
+	p->prio = effective_prio(p);
+	delta = p->prio - old_prio;
 
 	if (array) {
 		enqueue_task(p, array);
@@ -3718,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
 out_unlock:
 	task_rq_unlock(rq, &flags);
 }
-
 EXPORT_SYMBOL(set_user_nice);
 
 /*
@@ -3833,16 +3956,14 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	} else {
-		p->prio = p->static_prio;
-		/*
-		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-		 */
-		if (policy == SCHED_BATCH)
-			p->sleep_avg = 0;
-	}
+	p->normal_prio = normal_prio(p);
+	/* we are holding p->pi_lock already */
+	p->prio = rt_mutex_getprio(p);
+	/*
+	 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+	 */
+	if (policy == SCHED_BATCH)
+		p->sleep_avg = 0;
 	set_load_weight(p);
 }
 
@@ -3912,14 +4033,20 @@ recheck:
 	if (retval)
 		return retval;
 	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 */
+	spin_lock_irqsave(&p->pi_lock, flags);
+	/*
 	 * To be able to change p->policy safely, the apropriate
 	 * runqueue lock must be held.
 	 */
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
 	array = p->array;
@@ -3940,7 +4067,9 @@ recheck:
 	} else if (TASK_PREEMPTS_CURR(p, rq))
 		resched_task(rq->curr);
 	}
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
+	spin_unlock_irqrestore(&p->pi_lock, flags);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
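
sched_setscheduler() now nests two locks in a fixed order: p->pi_lock is taken first (with interrupts disabled) so no PI waiters can arrive or leave, and only then is the runqueue pinned with __task_rq_lock(); they are released in the reverse order. normalize_rt_tasks() below follows the same ordering. A bare-bones pthreads sketch of that discipline (illustrative only, not kernel code):

/* Editorial sketch of the pi_lock -> rq->lock ordering used above. */
#include <pthread.h>

struct runqueue { pthread_mutex_t lock; };

struct task {
	pthread_mutex_t pi_lock;	/* protects the PI data structures */
	struct runqueue *rq;		/* runqueue the task resides on */
	int prio;
};

static void model_setscheduler(struct task *p, int prio)
{
	pthread_mutex_lock(&p->pi_lock);	/* 1: block PI waiter changes */
	pthread_mutex_lock(&p->rq->lock);	/* 2: then pin the runqueue */

	p->prio = prio;				/* dequeue/requeue omitted */

	pthread_mutex_unlock(&p->rq->lock);	/* release in reverse order */
	pthread_mutex_unlock(&p->pi_lock);
}
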
@@ -4575,7 +4704,7 @@ void __devinit init_idle(task_t *idle, int cpu)
 	idle->timestamp = sched_clock();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
-	idle->prio = MAX_PRIO;
+	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
@@ -6582,7 +6711,8 @@ void normalize_rt_tasks(void)
 		if (!rt_task(p))
 			continue;
 
-		rq = task_rq_lock(p, &flags);
+		spin_lock_irqsave(&p->pi_lock, flags);
+		rq = __task_rq_lock(p);
 
 		array = p->array;
 		if (array)
@@ -6593,7 +6723,8 @@ void normalize_rt_tasks(void)
 			resched_task(rq->curr);
 		}
 
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 	}
 	read_unlock_irq(&tasklist_lock);
 }