aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorPeter Zijlstra <a.p.zijlstra@chello.nl>2008-05-03 12:29:28 -0400
committerIngo Molnar <mingo@elte.hu>2008-05-05 17:56:18 -0400
commit3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd (patch)
tree3752f9ea8e014ec40e95a1b197b0a3d18e1056a8 /kernel
parenta5574cf65b5f03ce9ade3918764fe22e5e2371e3 (diff)
sched: add optional support for CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
This replaces the rq->clock stuff (and possibly cpu_clock()). - architectures that have an 'imperfect' hardware clock can set CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - the 'jiffy' window might be superfluous when we update tick_gtod before the __update_sched_clock() call in sched_clock_tick() - cpu_clock() might be implemented as: sched_clock_cpu(smp_processor_id()) if the accuracy proves good enough - how far can TSC drift in a single jiffy when considering the filtering and idle hooks? [ mingo@elte.hu: various fixes and cleanups ] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/sched.c165
-rw-r--r--kernel/sched_clock.c236
-rw-r--r--kernel/sched_debug.c7
-rw-r--r--kernel/sched_fair.c2
5 files changed, 251 insertions, 161 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 188c43223f52..1c9938addb9d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o
13 13
14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 14obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
15obj-$(CONFIG_STACKTRACE) += stacktrace.o 15obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/sched.c b/kernel/sched.c
index 9457106b18af..58fb8af15776 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,16 +75,6 @@
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77/* 77/*
78 * Scheduler clock - returns current time in nanosec units.
79 * This is default implementation.
80 * Architectures and sub-architectures can override this.
81 */
82unsigned long long __attribute__((weak)) sched_clock(void)
83{
84 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
85}
86
87/*
88 * Convert user-nice values [ -20 ... 0 ... 19 ] 78 * Convert user-nice values [ -20 ... 0 ... 19 ]
89 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
90 * and back. 80 * and back.
@@ -557,13 +547,7 @@ struct rq {
557 unsigned long next_balance; 547 unsigned long next_balance;
558 struct mm_struct *prev_mm; 548 struct mm_struct *prev_mm;
559 549
560 u64 clock, prev_clock_raw; 550 u64 clock;
561 s64 clock_max_delta;
562
563 unsigned int clock_warps, clock_overflows, clock_underflows;
564 u64 idle_clock;
565 unsigned int clock_deep_idle_events;
566 u64 tick_timestamp;
567 551
568 atomic_t nr_iowait; 552 atomic_t nr_iowait;
569 553
@@ -628,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
628#endif 612#endif
629} 613}
630 614
631#ifdef CONFIG_NO_HZ
632static inline bool nohz_on(int cpu)
633{
634 return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
635}
636
637static inline u64 max_skipped_ticks(struct rq *rq)
638{
639 return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
640}
641
642static inline void update_last_tick_seen(struct rq *rq)
643{
644 rq->last_tick_seen = jiffies;
645}
646#else
647static inline u64 max_skipped_ticks(struct rq *rq)
648{
649 return 1;
650}
651
652static inline void update_last_tick_seen(struct rq *rq)
653{
654}
655#endif
656
657/*
658 * Update the per-runqueue clock, as finegrained as the platform can give
659 * us, but without assuming monotonicity, etc.:
660 */
661static void __update_rq_clock(struct rq *rq)
662{
663 u64 prev_raw = rq->prev_clock_raw;
664 u64 now = sched_clock();
665 s64 delta = now - prev_raw;
666 u64 clock = rq->clock;
667
668#ifdef CONFIG_SCHED_DEBUG
669 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
670#endif
671 /*
672 * Protect against sched_clock() occasionally going backwards:
673 */
674 if (unlikely(delta < 0)) {
675 clock++;
676 rq->clock_warps++;
677 } else {
678 /*
679 * Catch too large forward jumps too:
680 */
681 u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
682 u64 max_time = rq->tick_timestamp + max_jump;
683
684 if (unlikely(clock + delta > max_time)) {
685 if (clock < max_time)
686 clock = max_time;
687 else
688 clock++;
689 rq->clock_overflows++;
690 } else {
691 if (unlikely(delta > rq->clock_max_delta))
692 rq->clock_max_delta = delta;
693 clock += delta;
694 }
695 }
696
697 rq->prev_clock_raw = now;
698 rq->clock = clock;
699}
700
701static void update_rq_clock(struct rq *rq)
702{
703 if (likely(smp_processor_id() == cpu_of(rq)))
704 __update_rq_clock(rq);
705}
706
707/* 615/*
708 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 616 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
709 * See detach_destroy_domains: synchronize_sched for details. 617 * See detach_destroy_domains: synchronize_sched for details.
@@ -719,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
719#define task_rq(p) cpu_rq(task_cpu(p)) 627#define task_rq(p) cpu_rq(task_cpu(p))
720#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 628#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
721 629
630static inline void update_rq_clock(struct rq *rq)
631{
632 rq->clock = sched_clock_cpu(cpu_of(rq));
633}
634
722/* 635/*
723 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 636 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
724 */ 637 */
@@ -935,7 +848,6 @@ static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
935static unsigned long long __cpu_clock(int cpu) 848static unsigned long long __cpu_clock(int cpu)
936{ 849{
937 unsigned long long now; 850 unsigned long long now;
938 struct rq *rq;
939 851
940 /* 852 /*
941 * Only call sched_clock() if the scheduler has already been 853 * Only call sched_clock() if the scheduler has already been
@@ -944,9 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
944 if (unlikely(!scheduler_running)) 856 if (unlikely(!scheduler_running))
945 return 0; 857 return 0;
946 858
947 rq = cpu_rq(cpu); 859 now = sched_clock_cpu(cpu);
948 update_rq_clock(rq);
949 now = rq->clock;
950 860
951 return now; 861 return now;
952} 862}
@@ -1120,45 +1030,6 @@ static struct rq *this_rq_lock(void)
1120 return rq; 1030 return rq;
1121} 1031}
1122 1032
1123/*
1124 * We are going deep-idle (irqs are disabled):
1125 */
1126void sched_clock_idle_sleep_event(void)
1127{
1128 struct rq *rq = cpu_rq(smp_processor_id());
1129
1130 WARN_ON(!irqs_disabled());
1131 spin_lock(&rq->lock);
1132 __update_rq_clock(rq);
1133 spin_unlock(&rq->lock);
1134 rq->clock_deep_idle_events++;
1135}
1136EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
1137
1138/*
1139 * We just idled delta nanoseconds (called with irqs disabled):
1140 */
1141void sched_clock_idle_wakeup_event(u64 delta_ns)
1142{
1143 struct rq *rq = cpu_rq(smp_processor_id());
1144 u64 now = sched_clock();
1145
1146 WARN_ON(!irqs_disabled());
1147 rq->idle_clock += delta_ns;
1148 /*
1149 * Override the previous timestamp and ignore all
1150 * sched_clock() deltas that occured while we idled,
1151 * and use the PM-provided delta_ns to advance the
1152 * rq clock:
1153 */
1154 spin_lock(&rq->lock);
1155 rq->prev_clock_raw = now;
1156 rq->clock += delta_ns;
1157 spin_unlock(&rq->lock);
1158 touch_softlockup_watchdog();
1159}
1160EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
1161
1162static void __resched_task(struct task_struct *p, int tif_bit); 1033static void __resched_task(struct task_struct *p, int tif_bit);
1163 1034
1164static inline void resched_task(struct task_struct *p) 1035static inline void resched_task(struct task_struct *p)
@@ -1283,7 +1154,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1283 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1154 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1284 1155
1285 spin_lock(&rq->lock); 1156 spin_lock(&rq->lock);
1286 __update_rq_clock(rq); 1157 update_rq_clock(rq);
1287 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1158 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1288 spin_unlock(&rq->lock); 1159 spin_unlock(&rq->lock);
1289 1160
@@ -4476,19 +4347,11 @@ void scheduler_tick(void)
4476 int cpu = smp_processor_id(); 4347 int cpu = smp_processor_id();
4477 struct rq *rq = cpu_rq(cpu); 4348 struct rq *rq = cpu_rq(cpu);
4478 struct task_struct *curr = rq->curr; 4349 struct task_struct *curr = rq->curr;
4479 u64 next_tick = rq->tick_timestamp + TICK_NSEC; 4350
4351 sched_clock_tick();
4480 4352
4481 spin_lock(&rq->lock); 4353 spin_lock(&rq->lock);
4482 __update_rq_clock(rq); 4354 update_rq_clock(rq);
4483 /*
4484 * Let rq->clock advance by at least TICK_NSEC:
4485 */
4486 if (unlikely(rq->clock < next_tick)) {
4487 rq->clock = next_tick;
4488 rq->clock_underflows++;
4489 }
4490 rq->tick_timestamp = rq->clock;
4491 update_last_tick_seen(rq);
4492 update_cpu_load(rq); 4355 update_cpu_load(rq);
4493 curr->sched_class->task_tick(rq, curr, 0); 4356 curr->sched_class->task_tick(rq, curr, 0);
4494 spin_unlock(&rq->lock); 4357 spin_unlock(&rq->lock);
@@ -4642,7 +4505,7 @@ need_resched_nonpreemptible:
4642 * Do the rq-clock update outside the rq lock: 4505 * Do the rq-clock update outside the rq lock:
4643 */ 4506 */
4644 local_irq_disable(); 4507 local_irq_disable();
4645 __update_rq_clock(rq); 4508 update_rq_clock(rq);
4646 spin_lock(&rq->lock); 4509 spin_lock(&rq->lock);
4647 clear_tsk_need_resched(prev); 4510 clear_tsk_need_resched(prev);
4648 4511
@@ -8226,8 +8089,6 @@ void __init sched_init(void)
8226 spin_lock_init(&rq->lock); 8089 spin_lock_init(&rq->lock);
8227 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 8090 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8228 rq->nr_running = 0; 8091 rq->nr_running = 0;
8229 rq->clock = 1;
8230 update_last_tick_seen(rq);
8231 init_cfs_rq(&rq->cfs, rq); 8092 init_cfs_rq(&rq->cfs, rq);
8232 init_rt_rq(&rq->rt, rq); 8093 init_rt_rq(&rq->rt, rq);
8233#ifdef CONFIG_FAIR_GROUP_SCHED 8094#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8371,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
8371static void normalize_task(struct rq *rq, struct task_struct *p) 8232static void normalize_task(struct rq *rq, struct task_struct *p)
8372{ 8233{
8373 int on_rq; 8234 int on_rq;
8235
8374 update_rq_clock(rq); 8236 update_rq_clock(rq);
8375 on_rq = p->se.on_rq; 8237 on_rq = p->se.on_rq;
8376 if (on_rq) 8238 if (on_rq)
@@ -8402,7 +8264,6 @@ void normalize_rt_tasks(void)
8402 p->se.sleep_start = 0; 8264 p->se.sleep_start = 0;
8403 p->se.block_start = 0; 8265 p->se.block_start = 0;
8404#endif 8266#endif
8405 task_rq(p)->clock = 0;
8406 8267
8407 if (!rt_task(p)) { 8268 if (!rt_task(p)) {
8408 /* 8269 /*
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
new file mode 100644
index 000000000000..9c597e37f7de
--- /dev/null
+++ b/kernel/sched_clock.c
@@ -0,0 +1,236 @@
1/*
2 * sched_clock for unstable cpu clocks
3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 * Based on code by:
7 * Ingo Molnar <mingo@redhat.com>
8 * Guillaume Chazarain <guichaz@gmail.com>
9 *
10 * Create a semi stable clock from a mixture of other events, including:
11 * - gtod
12 * - jiffies
13 * - sched_clock()
14 * - explicit idle events
15 *
16 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
17 * making it monotonic and keeping it within an expected window. This window
18 * is set up using jiffies.
19 *
20 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
21 * that is otherwise invisible (TSC gets stopped).
22 *
23 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
24 * consistent between cpus (never more than 1 jiffy difference).
25 */
26#include <linux/sched.h>
27#include <linux/percpu.h>
28#include <linux/spinlock.h>
29#include <linux/ktime.h>
30#include <linux/module.h>
31
32
33#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
34
35struct sched_clock_data {
36 /*
37 * Raw spinlock - this is a special case: this might be called
38 * from within instrumentation code so we don't want to do any
39 * instrumentation ourselves.
40 */
41 raw_spinlock_t lock;
42
43 unsigned long prev_jiffies;
44 u64 prev_raw;
45 u64 tick_raw;
46 u64 tick_gtod;
47 u64 clock;
48};
49
50static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
51
52static inline struct sched_clock_data *this_scd(void)
53{
54 return &__get_cpu_var(sched_clock_data);
55}
56
57static inline struct sched_clock_data *cpu_sdc(int cpu)
58{
59 return &per_cpu(sched_clock_data, cpu);
60}
61
62void sched_clock_init(void)
63{
64 u64 ktime_now = ktime_to_ns(ktime_get());
65 u64 now = 0;
66 int cpu;
67
68 for_each_possible_cpu(cpu) {
69 struct sched_clock_data *scd = cpu_sdc(cpu);
70
71 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
72 scd->prev_jiffies = jiffies;
73 scd->prev_raw = now;
74 scd->tick_raw = now;
75 scd->tick_gtod = ktime_now;
76 scd->clock = ktime_now;
77 }
78}
79
80/*
81 * update the percpu scd from the raw @now value
82 *
83 * - filter out backward motion
84 * - use jiffies to generate a min,max window to clip the raw values
85 */
86static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
87{
88 unsigned long now_jiffies = jiffies;
89 long delta_jiffies = now_jiffies - scd->prev_jiffies;
90 u64 clock = scd->clock;
91 u64 min_clock, max_clock;
92 s64 delta = now - scd->prev_raw;
93
94 WARN_ON_ONCE(!irqs_disabled());
95 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
96
97 if (unlikely(delta < 0)) {
98 clock++;
99 goto out;
100 }
101
102 max_clock = min_clock + TICK_NSEC;
103
104 if (unlikely(clock + delta > max_clock)) {
105 if (clock < max_clock)
106 clock = max_clock;
107 else
108 clock++;
109 } else {
110 clock += delta;
111 }
112
113 out:
114 if (unlikely(clock < min_clock))
115 clock = min_clock;
116
117 scd->prev_raw = now;
118 scd->prev_jiffies = now_jiffies;
119 scd->clock = clock;
120}
121
122static void lock_double_clock(struct sched_clock_data *data1,
123 struct sched_clock_data *data2)
124{
125 if (data1 < data2) {
126 __raw_spin_lock(&data1->lock);
127 __raw_spin_lock(&data2->lock);
128 } else {
129 __raw_spin_lock(&data2->lock);
130 __raw_spin_lock(&data1->lock);
131 }
132}
133
134u64 sched_clock_cpu(int cpu)
135{
136 struct sched_clock_data *scd = cpu_sdc(cpu);
137 u64 now, clock;
138
139 WARN_ON_ONCE(!irqs_disabled());
140 now = sched_clock();
141
142 if (cpu != raw_smp_processor_id()) {
143 /*
144 * in order to update a remote cpu's clock based on our
145 * unstable raw time rebase it against:
146 * tick_raw (offset between raw counters)
147 * tick_gotd (tick offset between cpus)
148 */
149 struct sched_clock_data *my_scd = this_scd();
150
151 lock_double_clock(scd, my_scd);
152
153 now -= my_scd->tick_raw;
154 now += scd->tick_raw;
155
156 now -= my_scd->tick_gtod;
157 now += scd->tick_gtod;
158
159 __raw_spin_unlock(&my_scd->lock);
160 } else {
161 __raw_spin_lock(&scd->lock);
162 }
163
164 __update_sched_clock(scd, now);
165 clock = scd->clock;
166
167 __raw_spin_unlock(&scd->lock);
168
169 return clock;
170}
171
172void sched_clock_tick(void)
173{
174 struct sched_clock_data *scd = this_scd();
175 u64 now, now_gtod;
176
177 WARN_ON_ONCE(!irqs_disabled());
178
179 now = sched_clock();
180 now_gtod = ktime_to_ns(ktime_get());
181
182 __raw_spin_lock(&scd->lock);
183 __update_sched_clock(scd, now);
184 /*
185 * update tick_gtod after __update_sched_clock() because that will
186 * already observe 1 new jiffy; adding a new tick_gtod to that would
187 * increase the clock by 2 jiffies.
188 */
189 scd->tick_raw = now;
190 scd->tick_gtod = now_gtod;
191 __raw_spin_unlock(&scd->lock);
192}
193
194/*
195 * We are going deep-idle (irqs are disabled):
196 */
197void sched_clock_idle_sleep_event(void)
198{
199 sched_clock_cpu(smp_processor_id());
200}
201EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
202
203/*
204 * We just idled delta nanoseconds (called with irqs disabled):
205 */
206void sched_clock_idle_wakeup_event(u64 delta_ns)
207{
208 struct sched_clock_data *scd = this_scd();
209 u64 now = sched_clock();
210
211 /*
212 * Override the previous timestamp and ignore all
213 * sched_clock() deltas that occurred while we idled,
214 * and use the PM-provided delta_ns to advance the
215 * rq clock:
216 */
217 __raw_spin_lock(&scd->lock);
218 scd->prev_raw = now;
219 scd->clock += delta_ns;
220 __raw_spin_unlock(&scd->lock);
221
222 touch_softlockup_watchdog();
223}
224EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
225
226#endif
227
228/*
229 * Scheduler clock - returns current time in nanosec units.
230 * This is the default implementation.
231 * Architectures and sub-architectures can override this.
232 */
233unsigned long long __attribute__((weak)) sched_clock(void)
234{
235 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
236}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6b4a12558e88..5f06118fbc31 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu)
204 PN(next_balance); 204 PN(next_balance);
205 P(curr->pid); 205 P(curr->pid);
206 PN(clock); 206 PN(clock);
207 PN(idle_clock);
208 PN(prev_clock_raw);
209 P(clock_warps);
210 P(clock_overflows);
211 P(clock_underflows);
212 P(clock_deep_idle_events);
213 PN(clock_max_delta);
214 P(cpu_load[0]); 207 P(cpu_load[0]);
215 P(cpu_load[1]); 208 P(cpu_load[1]);
216 P(cpu_load[2]); 209 P(cpu_load[2]);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d99e01f6929a..c863663d204d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -959,7 +959,7 @@ static void yield_task_fair(struct rq *rq)
959 return; 959 return;
960 960
961 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { 961 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
962 __update_rq_clock(rq); 962 update_rq_clock(rq);
963 /* 963 /*
964 * Update run-time statistics of the 'current'. 964 * Update run-time statistics of the 'current'.
965 */ 965 */