author     Peter Zijlstra <a.p.zijlstra@chello.nl>  2008-01-25 15:08:29 -0500
committer  Ingo Molnar <mingo@elte.hu>              2008-01-25 15:08:29 -0500
commit     8f4d37ec073c17e2d4aa8851df5837d798606d6f
tree       a9ac9063eca53e4d0110e8086f55241ea70ba993 /kernel
parent     02b67cc3ba36bdba351d6c3a00593f4ec550d9d3
sched: high-res preemption tick
Use HR-timers (when available) to deliver an accurate preemption tick.

The regular scheduler tick that runs at 1/HZ can be too coarse when nice levels are used. The fairness system will still keep CPU utilisation 'fair' by then delaying the task that got an excessive amount of CPU time, but it tries to minimize this by delivering preemption points spot-on.

The average frequency of this extra interrupt is sched_latency / nr_latency, which need not be higher than 1/HZ; it is just that the distribution within the sched_latency period is important.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
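A back-of-the-envelope check of that frequency claim, as a minimal standalone sketch (the 20 ms sched_latency and the 5 runnable tasks below are assumed example values, not taken from this patch):

	#include <stdio.h>

	int main(void)
	{
		/* assumed example values, not defaults from this patch */
		const unsigned long sched_latency_ns = 20 * 1000 * 1000; /* 20 ms latency period */
		const unsigned long nr_latency = 5;                      /* runnable tasks sharing it */

		/* average spacing of the extra preemption interrupt per CPU */
		const unsigned long hrtick_period_ns = sched_latency_ns / nr_latency;

		printf("average hrtick period: %lu ns (%lu ms)\n",
		       hrtick_period_ns, hrtick_period_ns / (1000 * 1000));
		return 0;
	}

With these assumed numbers the extra interrupt fires on average every 4 ms, i.e. no more often than a 1000 Hz tick; the point is only that it lands exactly at the end of each task's slice rather than on the coarse 1/HZ grid.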
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz          2
-rw-r--r--  kernel/sched.c           210
-rw-r--r--  kernel/sched_fair.c       69
-rw-r--r--  kernel/sched_idletask.c    2
-rw-r--r--  kernel/sched_rt.c          2
5 files changed, 268 insertions, 17 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 4af15802ccd4..526128a2e622 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -54,3 +54,5 @@ config HZ
 	default 300 if HZ_300
 	default 1000 if HZ_1000
 
+config SCHED_HRTICK
+	def_bool HIGH_RES_TIMERS && X86
diff --git a/kernel/sched.c b/kernel/sched.c
index 6ee37602a6d8..17f93d3eda91 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -65,6 +65,7 @@
 #include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
+#include <linux/hrtimer.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -451,6 +452,12 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+#ifdef CONFIG_SCHED_HRTICK
+	unsigned long hrtick_flags;
+	ktime_t hrtick_expire;
+	struct hrtimer hrtick_timer;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
@@ -572,6 +579,8 @@ enum {
 	SCHED_FEAT_START_DEBIT = 4,
 	SCHED_FEAT_TREE_AVG = 8,
 	SCHED_FEAT_APPROX_AVG = 16,
+	SCHED_FEAT_HRTICK = 32,
+	SCHED_FEAT_DOUBLE_TICK = 64,
 };
 
 const_debug unsigned int sysctl_sched_features =
@@ -579,7 +588,9 @@ const_debug unsigned int sysctl_sched_features =
 	SCHED_FEAT_WAKEUP_PREEMPT * 1 |
 	SCHED_FEAT_START_DEBIT * 1 |
 	SCHED_FEAT_TREE_AVG * 0 |
-	SCHED_FEAT_APPROX_AVG * 0;
+	SCHED_FEAT_APPROX_AVG * 0 |
+	SCHED_FEAT_HRTICK * 1 |
+	SCHED_FEAT_DOUBLE_TICK * 0;
 
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
@@ -796,6 +807,173 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+static void __resched_task(struct task_struct *p, int tif_bit);
+
+static inline void resched_task(struct task_struct *p)
+{
+	__resched_task(p, TIF_NEED_RESCHED);
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ *
+ * Its all a bit involved since we cannot program an hrt while holding the
+ * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
+ * reschedule event.
+ *
+ * When we get rescheduled we reprogram the hrtick_timer outside of the
+ * rq->lock.
+ */
+static inline void resched_hrt(struct task_struct *p)
+{
+	__resched_task(p, TIF_HRTICK_RESCHED);
+}
+
+static inline void resched_rq(struct rq *rq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	resched_task(rq->curr);
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+enum {
+	HRTICK_SET,		/* re-programm hrtick_timer */
+	HRTICK_RESET,		/* not a new slice */
+};
+
+/*
+ * Use hrtick when:
+ *  - enabled by features
+ *  - hrtimer is actually high res
+ */
+static inline int hrtick_enabled(struct rq *rq)
+{
+	if (!sched_feat(HRTICK))
+		return 0;
+	return hrtimer_is_hres_active(&rq->hrtick_timer);
+}
+
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay, int reset)
+{
+	assert_spin_locked(&rq->lock);
+
+	/*
+	 * preempt at: now + delay
+	 */
+	rq->hrtick_expire =
+		ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
+	/*
+	 * indicate we need to program the timer
+	 */
+	__set_bit(HRTICK_SET, &rq->hrtick_flags);
+	if (reset)
+		__set_bit(HRTICK_RESET, &rq->hrtick_flags);
+
+	/*
+	 * New slices are called from the schedule path and don't need a
+	 * forced reschedule.
+	 */
+	if (reset)
+		resched_hrt(rq->curr);
+}
+
+static void hrtick_clear(struct rq *rq)
+{
+	if (hrtimer_active(&rq->hrtick_timer))
+		hrtimer_cancel(&rq->hrtick_timer);
+}
+
+/*
+ * Update the timer from the possible pending state.
+ */
+static void hrtick_set(struct rq *rq)
+{
+	ktime_t time;
+	int set, reset;
+	unsigned long flags;
+
+	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+	spin_lock_irqsave(&rq->lock, flags);
+	set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
+	reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
+	time = rq->hrtick_expire;
+	clear_thread_flag(TIF_HRTICK_RESCHED);
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	if (set) {
+		hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
+		if (reset && !hrtimer_active(&rq->hrtick_timer))
+			resched_rq(rq);
+	} else
+		hrtick_clear(rq);
+}
+
+/*
+ * High-resolution timer tick.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrtick(struct hrtimer *timer)
+{
+	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+
+	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+	spin_lock(&rq->lock);
+	__update_rq_clock(rq);
+	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+	spin_unlock(&rq->lock);
+
+	return HRTIMER_NORESTART;
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+	rq->hrtick_flags = 0;
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rq->hrtick_timer.function = hrtick;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+}
+
+void hrtick_resched(void)
+{
+	struct rq *rq;
+	unsigned long flags;
+
+	if (!test_thread_flag(TIF_HRTICK_RESCHED))
+		return;
+
+	local_irq_save(flags);
+	rq = cpu_rq(smp_processor_id());
+	hrtick_set(rq);
+	local_irq_restore(flags);
+}
+#else
+static inline void hrtick_clear(struct rq *rq)
+{
+}
+
+static inline void hrtick_set(struct rq *rq)
+{
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+}
+
+void hrtick_resched(void)
+{
+}
+#endif
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -809,16 +987,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
 
-static void resched_task(struct task_struct *p)
+static void __resched_task(struct task_struct *p, int tif_bit)
 {
 	int cpu;
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+	if (unlikely(test_tsk_thread_flag(p, tif_bit)))
 		return;
 
-	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+	set_tsk_thread_flag(p, tif_bit);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -841,10 +1019,10 @@ static void resched_cpu(int cpu)
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 #else
-static inline void resched_task(struct task_struct *p)
+static void __resched_task(struct task_struct *p, int tif_bit)
 {
 	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_need_resched(p);
+	set_tsk_thread_flag(p, tif_bit);
 }
 #endif
 
@@ -3497,7 +3675,7 @@ void scheduler_tick(void)
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
 	if (curr != rq->idle) /* FIXME: needed? */
-		curr->sched_class->task_tick(rq, curr);
+		curr->sched_class->task_tick(rq, curr, 0);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -3643,6 +3821,8 @@ need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
+	hrtick_clear(rq);
+
 	/*
 	 * Do the rq-clock update outside the rq lock:
 	 */
@@ -3680,14 +3860,20 @@ need_resched_nonpreemptible:
 		++*switch_count;
 
 		context_switch(rq, prev, next); /* unlocks the rq */
+		/*
+		 * the context switch might have flipped the stack from under
+		 * us, hence refresh the local variables.
+		 */
+		cpu = smp_processor_id();
+		rq = cpu_rq(cpu);
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	if (unlikely(reacquire_kernel_lock(current) < 0)) {
-		cpu = smp_processor_id();
-		rq = cpu_rq(cpu);
+	hrtick_set(rq);
+
+	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-	}
+
 	preempt_enable_no_resched();
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
 		goto need_resched;
@@ -6913,6 +7099,8 @@ void __init sched_init(void)
 		rq->rt.overloaded = 0;
 		rq_attach_root(rq, &def_root_domain);
 #endif
+		init_rq_hrtick(rq);
+
 		atomic_set(&rq->nr_iowait, 0);
 
 		array = &rq->rt.active;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index dfa18d55561d..3dab1ff83c4f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -642,13 +642,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	cfs_rq->curr = NULL;
 }
 
-static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+static void
+entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 {
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
 
+#ifdef CONFIG_SCHED_HRTICK
+	/*
+	 * queued ticks are scheduled to match the slice, so don't bother
+	 * validating it and just reschedule.
+	 */
+	if (queued)
+		return resched_task(rq_of(cfs_rq)->curr);
+	/*
+	 * don't let the period tick interfere with the hrtick preemption
+	 */
+	if (!sched_feat(DOUBLE_TICK) &&
+			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
+		return;
+#endif
+
 	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
 		check_preempt_tick(cfs_rq, curr);
 }
@@ -754,6 +770,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_SCHED_HRTICK
+static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+	int requeue = rq->curr == p;
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	WARN_ON(task_rq(p) != rq);
+
+	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
+		u64 slice = sched_slice(cfs_rq, se);
+		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+		s64 delta = slice - ran;
+
+		if (delta < 0) {
+			if (rq->curr == p)
+				resched_task(p);
+			return;
+		}
+
+		/*
+		 * Don't schedule slices shorter than 10000ns, that just
+		 * doesn't make sense. Rely on vruntime for fairness.
+		 */
+		if (!requeue)
+			delta = max(10000LL, delta);
+
+		hrtick_start(rq, delta, requeue);
+	}
+}
+#else
+static inline void
+hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -782,6 +835,8 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 	 */
 	if (incload)
 		inc_cpu_load(rq, topse->load.weight);
+
+	hrtick_start_fair(rq, rq->curr);
 }
 
 /*
@@ -814,6 +869,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 	 */
 	if (decload)
 		dec_cpu_load(rq, topse->load.weight);
+
+	hrtick_start_fair(rq, rq->curr);
 }
 
 /*
@@ -1049,6 +1106,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
 {
+	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
@@ -1060,7 +1118,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
-	return task_of(se);
+	p = task_of(se);
+	hrtick_start_fair(rq, p);
+
+	return p;
 }
 
 /*
@@ -1235,14 +1296,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
-static void task_tick_fair(struct rq *rq, struct task_struct *curr)
+static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
-		entity_tick(cfs_rq, se);
+		entity_tick(cfs_rq, se, queued);
 	}
 }
 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index ef7a2661fa10..2bcafa375633 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -61,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 #endif
 
-static void task_tick_idle(struct rq *rq, struct task_struct *curr)
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
 {
 }
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f350f7b15158..83fbbcb8019e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -863,7 +863,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 	}
 }
 
-static void task_tick_rt(struct rq *rq, struct task_struct *p)
+static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 {
 	update_curr_rt(rq);
 