author     Jeff Garzik <jeff@garzik.org>    2007-02-17 15:11:43 -0500
committer  Jeff Garzik <jeff@garzik.org>    2007-02-17 15:11:43 -0500
commit     f630fe2817601314b2eb7ca5ddc23c7834646731 (patch)
tree       3bfb4939b7bbc3859575ca8b58fa3f929b015941 /kernel
parent     48c871c1f6a7c7044dd76774fb469e65c7e2e4e8 (diff)
parent     8a03d9a498eaf02c8a118752050a5154852c13bf (diff)
Merge branch 'master' into upstream
Diffstat (limited to 'kernel')
37 files changed, 4401 insertions, 931 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 14f4d45e0ae9..ac6b27abb1ad 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
47 | obj-$(CONFIG_SECCOMP) += seccomp.o | 47 | obj-$(CONFIG_SECCOMP) += seccomp.o |
48 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 48 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
49 | obj-$(CONFIG_RELAY) += relay.o | 49 | obj-$(CONFIG_RELAY) += relay.o |
50 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | ||
50 | obj-$(CONFIG_UTS_NS) += utsname.o | 51 | obj-$(CONFIG_UTS_NS) += utsname.o |
51 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 52 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
52 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 53 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
diff --git a/kernel/fork.c b/kernel/fork.c
index 0b6293d94d96..d154cc786489 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
858 | init_sigpending(&sig->shared_pending); | 858 | init_sigpending(&sig->shared_pending); |
859 | INIT_LIST_HEAD(&sig->posix_timers); | 859 | INIT_LIST_HEAD(&sig->posix_timers); |
860 | 860 | ||
861 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); | 861 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
862 | sig->it_real_incr.tv64 = 0; | 862 | sig->it_real_incr.tv64 = 0; |
863 | sig->real_timer.function = it_real_fn; | 863 | sig->real_timer.function = it_real_fn; |
864 | sig->tsk = tsk; | 864 | sig->tsk = tsk; |
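The fork.c hunk is part of a tree-wide rename in this merge: the mode constants HRTIMER_ABS and HRTIMER_REL become HRTIMER_MODE_ABS and HRTIMER_MODE_REL. A minimal sketch of the new spelling, using hypothetical demo_* names that are not part of this patch:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_fn(struct hrtimer *timer)
{
        /* one-shot: do not re-arm */
        return HRTIMER_NORESTART;
}

static void demo_arm(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        demo_timer.function = demo_fn;
        /* fire 100 ms from now, relative to the current time */
        hrtimer_start(&demo_timer, ktime_set(0, 100 * 1000 * 1000),
                      HRTIMER_MODE_REL);
}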
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a737de857d3..e749e7df14b1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1134 | 1134 | ||
1135 | if (sec != MAX_SCHEDULE_TIMEOUT) { | 1135 | if (sec != MAX_SCHEDULE_TIMEOUT) { |
1136 | to = &timeout; | 1136 | to = &timeout; |
1137 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | 1137 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); |
1138 | hrtimer_init_sleeper(to, current); | 1138 | hrtimer_init_sleeper(to, current); |
1139 | to->timer.expires = ktime_set(sec, nsec); | 1139 | to->timer.expires = ktime_set(sec, nsec); |
1140 | } | 1140 | } |
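futex_lock_pi arms its timeout as an absolute CLOCK_REALTIME expiry through an hrtimer_sleeper. The idiom, condensed from the hunk above (sec and nsec stand in for the caller's deadline; error handling omitted):

struct hrtimer_sleeper to;

hrtimer_init(&to.timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
hrtimer_init_sleeper(&to, current);      /* wake "current" on expiry */
to.timer.expires = ktime_set(sec, nsec); /* absolute deadline */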
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f44e499e8fca..476cb0c0b4a4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1,8 +1,9 @@
1 | /* | 1 | /* |
2 | * linux/kernel/hrtimer.c | 2 | * linux/kernel/hrtimer.c |
3 | * | 3 | * |
4 | * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar | 5 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar |
6 | * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner | ||
6 | * | 7 | * |
7 | * High-resolution kernel timers | 8 | * High-resolution kernel timers |
8 | * | 9 | * |
@@ -31,12 +32,17 @@
31 | */ | 32 | */ |
32 | 33 | ||
33 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/irq.h> | ||
34 | #include <linux/module.h> | 36 | #include <linux/module.h> |
35 | #include <linux/percpu.h> | 37 | #include <linux/percpu.h> |
36 | #include <linux/hrtimer.h> | 38 | #include <linux/hrtimer.h> |
37 | #include <linux/notifier.h> | 39 | #include <linux/notifier.h> |
38 | #include <linux/syscalls.h> | 40 | #include <linux/syscalls.h> |
41 | #include <linux/kallsyms.h> | ||
39 | #include <linux/interrupt.h> | 42 | #include <linux/interrupt.h> |
43 | #include <linux/tick.h> | ||
44 | #include <linux/seq_file.h> | ||
45 | #include <linux/err.h> | ||
40 | 46 | ||
41 | #include <asm/uaccess.h> | 47 | #include <asm/uaccess.h> |
42 | 48 | ||
@@ -45,7 +51,7 @@
45 | * | 51 | * |
46 | * returns the time in ktime_t format | 52 | * returns the time in ktime_t format |
47 | */ | 53 | */ |
48 | static ktime_t ktime_get(void) | 54 | ktime_t ktime_get(void) |
49 | { | 55 | { |
50 | struct timespec now; | 56 | struct timespec now; |
51 | 57 | ||
@@ -59,7 +65,7 @@ static ktime_t ktime_get(void)
59 | * | 65 | * |
60 | * returns the time in ktime_t format | 66 | * returns the time in ktime_t format |
61 | */ | 67 | */ |
62 | static ktime_t ktime_get_real(void) | 68 | ktime_t ktime_get_real(void) |
63 | { | 69 | { |
64 | struct timespec now; | 70 | struct timespec now; |
65 | 71 | ||
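Dropping the static qualifier turns ktime_get() and ktime_get_real() into interfaces the rest of the time code can call directly. A small sketch of typical use, relying only on helpers visible in this patch (demo_deadline is a hypothetical name):

#include <linux/ktime.h>

static ktime_t demo_deadline(void)
{
        ktime_t now = ktime_get();      /* CLOCK_MONOTONIC time */

        /* a point 5 ms into the future, still in ktime_t format */
        return ktime_add_ns(now, 5 * 1000 * 1000);
}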
@@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
79 | * This ensures that we capture erroneous accesses to these clock ids | 85 | * This ensures that we capture erroneous accesses to these clock ids |
80 | * rather than moving them into the range of valid clock id's. | 86 | * rather than moving them into the range of valid clock id's. |
81 | */ | 87 | */ |
82 | 88 | DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | |
83 | #define MAX_HRTIMER_BASES 2 | ||
84 | |||
85 | static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = | ||
86 | { | 89 | { |
90 | |||
91 | .clock_base = | ||
87 | { | 92 | { |
88 | .index = CLOCK_REALTIME, | 93 | { |
89 | .get_time = &ktime_get_real, | 94 | .index = CLOCK_REALTIME, |
90 | .resolution = KTIME_REALTIME_RES, | 95 | .get_time = &ktime_get_real, |
91 | }, | 96 | .resolution = KTIME_LOW_RES, |
92 | { | 97 | }, |
93 | .index = CLOCK_MONOTONIC, | 98 | { |
94 | .get_time = &ktime_get, | 99 | .index = CLOCK_MONOTONIC, |
95 | .resolution = KTIME_MONOTONIC_RES, | 100 | .get_time = &ktime_get, |
96 | }, | 101 | .resolution = KTIME_LOW_RES, |
102 | }, | ||
103 | } | ||
97 | }; | 104 | }; |
98 | 105 | ||
99 | /** | 106 | /** |
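The initializer above reflects the new layout: instead of a per-CPU array of independent hrtimer_base structs, each CPU now owns one hrtimer_cpu_base with a single lock and the clock bases embedded in it. Roughly, simplified from the corresponding include/linux/hrtimer.h change that is not shown in this hunk:

struct hrtimer_clock_base {
        struct hrtimer_cpu_base *cpu_base;  /* back pointer to the owner */
        clockid_t               index;      /* CLOCK_REALTIME / CLOCK_MONOTONIC */
        struct rb_root          active;     /* queued timers, expiry-ordered */
        struct rb_node          *first;     /* leftmost = earliest expiry */
        ktime_t                 resolution;
        ktime_t                 (*get_time)(void);
        ktime_t                 softirq_time;
        /* ... */
};

struct hrtimer_cpu_base {
        spinlock_t              lock;       /* one lock per CPU, not per clock */
        struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
        /* ... high resolution fields: expires_next, hres_active, cb_pending */
};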
@@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
125 | * Get the coarse grained time at the softirq based on xtime and | 132 | * Get the coarse grained time at the softirq based on xtime and |
126 | * wall_to_monotonic. | 133 | * wall_to_monotonic. |
127 | */ | 134 | */ |
128 | static void hrtimer_get_softirq_time(struct hrtimer_base *base) | 135 | static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) |
129 | { | 136 | { |
130 | ktime_t xtim, tomono; | 137 | ktime_t xtim, tomono; |
138 | struct timespec xts; | ||
131 | unsigned long seq; | 139 | unsigned long seq; |
132 | 140 | ||
133 | do { | 141 | do { |
134 | seq = read_seqbegin(&xtime_lock); | 142 | seq = read_seqbegin(&xtime_lock); |
135 | xtim = timespec_to_ktime(xtime); | 143 | #ifdef CONFIG_NO_HZ |
136 | tomono = timespec_to_ktime(wall_to_monotonic); | 144 | getnstimeofday(&xts); |
137 | 145 | #else | |
146 | xts = xtime; | ||
147 | #endif | ||
138 | } while (read_seqretry(&xtime_lock, seq)); | 148 | } while (read_seqretry(&xtime_lock, seq)); |
139 | 149 | ||
140 | base[CLOCK_REALTIME].softirq_time = xtim; | 150 | xtim = timespec_to_ktime(xts); |
141 | base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); | 151 | tomono = timespec_to_ktime(wall_to_monotonic); |
152 | base->clock_base[CLOCK_REALTIME].softirq_time = xtim; | ||
153 | base->clock_base[CLOCK_MONOTONIC].softirq_time = | ||
154 | ktime_add(xtim, tomono); | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Helper function to check, whether the timer is running the callback | ||
159 | * function | ||
160 | */ | ||
161 | static inline int hrtimer_callback_running(struct hrtimer *timer) | ||
162 | { | ||
163 | return timer->state & HRTIMER_STATE_CALLBACK; | ||
142 | } | 164 | } |
143 | 165 | ||
144 | /* | 166 | /* |
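hrtimer_get_softirq_time samples xtime under the xtime_lock seqlock and retries if a writer got in between. The reader side of that pattern in isolation (a generic sketch, not tied to xtime):

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(demo_lock);
static struct timespec demo_time;       /* updated elsewhere under demo_lock */

static struct timespec demo_read(void)
{
        struct timespec snapshot;
        unsigned long seq;

        do {
                seq = read_seqbegin(&demo_lock);
                snapshot = demo_time;             /* lock-free copy */
        } while (read_seqretry(&demo_lock, seq)); /* writer raced: retry */

        return snapshot;
}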
@@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
147 | */ | 169 | */ |
148 | #ifdef CONFIG_SMP | 170 | #ifdef CONFIG_SMP |
149 | 171 | ||
150 | #define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) | ||
151 | |||
152 | /* | 172 | /* |
153 | * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock | 173 | * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock |
154 | * means that all timers which are tied to this base via timer->base are | 174 | * means that all timers which are tied to this base via timer->base are |
@@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
161 | * possible to set timer->base = NULL and drop the lock: the timer remains | 181 | * possible to set timer->base = NULL and drop the lock: the timer remains |
162 | * locked. | 182 | * locked. |
163 | */ | 183 | */ |
164 | static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, | 184 | static |
165 | unsigned long *flags) | 185 | struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, |
186 | unsigned long *flags) | ||
166 | { | 187 | { |
167 | struct hrtimer_base *base; | 188 | struct hrtimer_clock_base *base; |
168 | 189 | ||
169 | for (;;) { | 190 | for (;;) { |
170 | base = timer->base; | 191 | base = timer->base; |
171 | if (likely(base != NULL)) { | 192 | if (likely(base != NULL)) { |
172 | spin_lock_irqsave(&base->lock, *flags); | 193 | spin_lock_irqsave(&base->cpu_base->lock, *flags); |
173 | if (likely(base == timer->base)) | 194 | if (likely(base == timer->base)) |
174 | return base; | 195 | return base; |
175 | /* The timer has migrated to another CPU: */ | 196 | /* The timer has migrated to another CPU: */ |
176 | spin_unlock_irqrestore(&base->lock, *flags); | 197 | spin_unlock_irqrestore(&base->cpu_base->lock, *flags); |
177 | } | 198 | } |
178 | cpu_relax(); | 199 | cpu_relax(); |
179 | } | 200 | } |
@@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
182 | /* | 203 | /* |
183 | * Switch the timer base to the current CPU when possible. | 204 | * Switch the timer base to the current CPU when possible. |
184 | */ | 205 | */ |
185 | static inline struct hrtimer_base * | 206 | static inline struct hrtimer_clock_base * |
186 | switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) | 207 | switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) |
187 | { | 208 | { |
188 | struct hrtimer_base *new_base; | 209 | struct hrtimer_clock_base *new_base; |
210 | struct hrtimer_cpu_base *new_cpu_base; | ||
189 | 211 | ||
190 | new_base = &__get_cpu_var(hrtimer_bases)[base->index]; | 212 | new_cpu_base = &__get_cpu_var(hrtimer_bases); |
213 | new_base = &new_cpu_base->clock_base[base->index]; | ||
191 | 214 | ||
192 | if (base != new_base) { | 215 | if (base != new_base) { |
193 | /* | 216 | /* |
@@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
199 | * completed. There is no conflict as we hold the lock until | 222 | * completed. There is no conflict as we hold the lock until |
200 | * the timer is enqueued. | 223 | * the timer is enqueued. |
201 | */ | 224 | */ |
202 | if (unlikely(base->curr_timer == timer)) | 225 | if (unlikely(hrtimer_callback_running(timer))) |
203 | return base; | 226 | return base; |
204 | 227 | ||
205 | /* See the comment in lock_timer_base() */ | 228 | /* See the comment in lock_timer_base() */ |
206 | timer->base = NULL; | 229 | timer->base = NULL; |
207 | spin_unlock(&base->lock); | 230 | spin_unlock(&base->cpu_base->lock); |
208 | spin_lock(&new_base->lock); | 231 | spin_lock(&new_base->cpu_base->lock); |
209 | timer->base = new_base; | 232 | timer->base = new_base; |
210 | } | 233 | } |
211 | return new_base; | 234 | return new_base; |
@@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
213 | 236 | ||
214 | #else /* CONFIG_SMP */ | 237 | #else /* CONFIG_SMP */ |
215 | 238 | ||
216 | #define set_curr_timer(b, t) do { } while (0) | 239 | static inline struct hrtimer_clock_base * |
217 | |||
218 | static inline struct hrtimer_base * | ||
219 | lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | 240 | lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) |
220 | { | 241 | { |
221 | struct hrtimer_base *base = timer->base; | 242 | struct hrtimer_clock_base *base = timer->base; |
222 | 243 | ||
223 | spin_lock_irqsave(&base->lock, *flags); | 244 | spin_lock_irqsave(&base->cpu_base->lock, *flags); |
224 | 245 | ||
225 | return base; | 246 | return base; |
226 | } | 247 | } |
227 | 248 | ||
228 | #define switch_hrtimer_base(t, b) (b) | 249 | # define switch_hrtimer_base(t, b) (b) |
229 | 250 | ||
230 | #endif /* !CONFIG_SMP */ | 251 | #endif /* !CONFIG_SMP */ |
231 | 252 | ||
@@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
256 | 277 | ||
257 | return ktime_add(kt, tmp); | 278 | return ktime_add(kt, tmp); |
258 | } | 279 | } |
259 | |||
260 | #else /* CONFIG_KTIME_SCALAR */ | ||
261 | |||
262 | # endif /* !CONFIG_KTIME_SCALAR */ | 280 | # endif /* !CONFIG_KTIME_SCALAR */ |
263 | 281 | ||
264 | /* | 282 | /* |
265 | * Divide a ktime value by a nanosecond value | 283 | * Divide a ktime value by a nanosecond value |
266 | */ | 284 | */ |
267 | static unsigned long ktime_divns(const ktime_t kt, s64 div) | 285 | unsigned long ktime_divns(const ktime_t kt, s64 div) |
268 | { | 286 | { |
269 | u64 dclc, inc, dns; | 287 | u64 dclc, inc, dns; |
270 | int sft = 0; | 288 | int sft = 0; |
@@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div)
281 | 299 | ||
282 | return (unsigned long) dclc; | 300 | return (unsigned long) dclc; |
283 | } | 301 | } |
284 | |||
285 | #else /* BITS_PER_LONG < 64 */ | ||
286 | # define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) | ||
287 | #endif /* BITS_PER_LONG >= 64 */ | 302 | #endif /* BITS_PER_LONG >= 64 */ |
288 | 303 | ||
304 | /* High resolution timer related functions */ | ||
305 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
306 | |||
307 | /* | ||
308 | * High resolution timer enabled ? | ||
309 | */ | ||
310 | static int hrtimer_hres_enabled __read_mostly = 1; | ||
311 | |||
312 | /* | ||
313 | * Enable / Disable high resolution mode | ||
314 | */ | ||
315 | static int __init setup_hrtimer_hres(char *str) | ||
316 | { | ||
317 | if (!strcmp(str, "off")) | ||
318 | hrtimer_hres_enabled = 0; | ||
319 | else if (!strcmp(str, "on")) | ||
320 | hrtimer_hres_enabled = 1; | ||
321 | else | ||
322 | return 0; | ||
323 | return 1; | ||
324 | } | ||
325 | |||
326 | __setup("highres=", setup_hrtimer_hres); | ||
327 | |||
328 | /* | ||
329 | * hrtimer_high_res_enabled - query, if the highres mode is enabled | ||
330 | */ | ||
331 | static inline int hrtimer_is_hres_enabled(void) | ||
332 | { | ||
333 | return hrtimer_hres_enabled; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Is the high resolution mode active ? | ||
338 | */ | ||
339 | static inline int hrtimer_hres_active(void) | ||
340 | { | ||
341 | return __get_cpu_var(hrtimer_bases).hres_active; | ||
342 | } | ||
343 | |||
344 | /* | ||
345 | * Reprogram the event source with checking both queues for the | ||
346 | * next event | ||
347 | * Called with interrupts disabled and base->lock held | ||
348 | */ | ||
349 | static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) | ||
350 | { | ||
351 | int i; | ||
352 | struct hrtimer_clock_base *base = cpu_base->clock_base; | ||
353 | ktime_t expires; | ||
354 | |||
355 | cpu_base->expires_next.tv64 = KTIME_MAX; | ||
356 | |||
357 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | ||
358 | struct hrtimer *timer; | ||
359 | |||
360 | if (!base->first) | ||
361 | continue; | ||
362 | timer = rb_entry(base->first, struct hrtimer, node); | ||
363 | expires = ktime_sub(timer->expires, base->offset); | ||
364 | if (expires.tv64 < cpu_base->expires_next.tv64) | ||
365 | cpu_base->expires_next = expires; | ||
366 | } | ||
367 | |||
368 | if (cpu_base->expires_next.tv64 != KTIME_MAX) | ||
369 | tick_program_event(cpu_base->expires_next, 1); | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * Shared reprogramming for clock_realtime and clock_monotonic | ||
374 | * | ||
375 | * When a timer is enqueued and expires earlier than the already enqueued | ||
376 | * timers, we have to check, whether it expires earlier than the timer for | ||
377 | * which the clock event device was armed. | ||
378 | * | ||
379 | * Called with interrupts disabled and base->cpu_base.lock held | ||
380 | */ | ||
381 | static int hrtimer_reprogram(struct hrtimer *timer, | ||
382 | struct hrtimer_clock_base *base) | ||
383 | { | ||
384 | ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; | ||
385 | ktime_t expires = ktime_sub(timer->expires, base->offset); | ||
386 | int res; | ||
387 | |||
388 | /* | ||
389 | * When the callback is running, we do not reprogram the clock event | ||
390 | * device. The timer callback is either running on a different CPU or | ||
391 | * the callback is executed in the hrtimer_interupt context. The | ||
392 | * reprogramming is handled either by the softirq, which called the | ||
393 | * callback or at the end of the hrtimer_interrupt. | ||
394 | */ | ||
395 | if (hrtimer_callback_running(timer)) | ||
396 | return 0; | ||
397 | |||
398 | if (expires.tv64 >= expires_next->tv64) | ||
399 | return 0; | ||
400 | |||
401 | /* | ||
402 | * Clockevents returns -ETIME, when the event was in the past. | ||
403 | */ | ||
404 | res = tick_program_event(expires, 0); | ||
405 | if (!IS_ERR_VALUE(res)) | ||
406 | *expires_next = expires; | ||
407 | return res; | ||
408 | } | ||
409 | |||
410 | |||
411 | /* | ||
412 | * Retrigger next event is called after clock was set | ||
413 | * | ||
414 | * Called with interrupts disabled via on_each_cpu() | ||
415 | */ | ||
416 | static void retrigger_next_event(void *arg) | ||
417 | { | ||
418 | struct hrtimer_cpu_base *base; | ||
419 | struct timespec realtime_offset; | ||
420 | unsigned long seq; | ||
421 | |||
422 | if (!hrtimer_hres_active()) | ||
423 | return; | ||
424 | |||
425 | do { | ||
426 | seq = read_seqbegin(&xtime_lock); | ||
427 | set_normalized_timespec(&realtime_offset, | ||
428 | -wall_to_monotonic.tv_sec, | ||
429 | -wall_to_monotonic.tv_nsec); | ||
430 | } while (read_seqretry(&xtime_lock, seq)); | ||
431 | |||
432 | base = &__get_cpu_var(hrtimer_bases); | ||
433 | |||
434 | /* Adjust CLOCK_REALTIME offset */ | ||
435 | spin_lock(&base->lock); | ||
436 | base->clock_base[CLOCK_REALTIME].offset = | ||
437 | timespec_to_ktime(realtime_offset); | ||
438 | |||
439 | hrtimer_force_reprogram(base); | ||
440 | spin_unlock(&base->lock); | ||
441 | } | ||
442 | |||
443 | /* | ||
444 | * Clock realtime was set | ||
445 | * | ||
446 | * Change the offset of the realtime clock vs. the monotonic | ||
447 | * clock. | ||
448 | * | ||
449 | * We might have to reprogram the high resolution timer interrupt. On | ||
450 | * SMP we call the architecture specific code to retrigger _all_ high | ||
451 | * resolution timer interrupts. On UP we just disable interrupts and | ||
452 | * call the high resolution interrupt code. | ||
453 | */ | ||
454 | void clock_was_set(void) | ||
455 | { | ||
456 | /* Retrigger the CPU local events everywhere */ | ||
457 | on_each_cpu(retrigger_next_event, NULL, 0, 1); | ||
458 | } | ||
459 | |||
460 | /* | ||
461 | * Check, whether the timer is on the callback pending list | ||
462 | */ | ||
463 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
464 | { | ||
465 | return timer->state & HRTIMER_STATE_PENDING; | ||
466 | } | ||
467 | |||
468 | /* | ||
469 | * Remove a timer from the callback pending list | ||
470 | */ | ||
471 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
472 | { | ||
473 | list_del_init(&timer->cb_entry); | ||
474 | } | ||
475 | |||
476 | /* | ||
477 | * Initialize the high resolution related parts of cpu_base | ||
478 | */ | ||
479 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | ||
480 | { | ||
481 | base->expires_next.tv64 = KTIME_MAX; | ||
482 | base->hres_active = 0; | ||
483 | INIT_LIST_HEAD(&base->cb_pending); | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Initialize the high resolution related parts of a hrtimer | ||
488 | */ | ||
489 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | ||
490 | { | ||
491 | INIT_LIST_HEAD(&timer->cb_entry); | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * When High resolution timers are active, try to reprogram. Note, that in case | ||
496 | * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry | ||
497 | * check happens. The timer gets enqueued into the rbtree. The reprogramming | ||
498 | * and expiry check is done in the hrtimer_interrupt or in the softirq. | ||
499 | */ | ||
500 | static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | ||
501 | struct hrtimer_clock_base *base) | ||
502 | { | ||
503 | if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { | ||
504 | |||
505 | /* Timer is expired, act upon the callback mode */ | ||
506 | switch(timer->cb_mode) { | ||
507 | case HRTIMER_CB_IRQSAFE_NO_RESTART: | ||
508 | /* | ||
509 | * We can call the callback from here. No restart | ||
510 | * happens, so no danger of recursion | ||
511 | */ | ||
512 | BUG_ON(timer->function(timer) != HRTIMER_NORESTART); | ||
513 | return 1; | ||
514 | case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: | ||
515 | /* | ||
516 | * This is solely for the sched tick emulation with | ||
517 | * dynamic tick support to ensure that we do not | ||
518 | * restart the tick right on the edge and end up with | ||
519 | * the tick timer in the softirq ! The calling site | ||
520 | * takes care of this. | ||
521 | */ | ||
522 | return 1; | ||
523 | case HRTIMER_CB_IRQSAFE: | ||
524 | case HRTIMER_CB_SOFTIRQ: | ||
525 | /* | ||
526 | * Move everything else into the softirq pending list ! | ||
527 | */ | ||
528 | list_add_tail(&timer->cb_entry, | ||
529 | &base->cpu_base->cb_pending); | ||
530 | timer->state = HRTIMER_STATE_PENDING; | ||
531 | raise_softirq(HRTIMER_SOFTIRQ); | ||
532 | return 1; | ||
533 | default: | ||
534 | BUG(); | ||
535 | } | ||
536 | } | ||
537 | return 0; | ||
538 | } | ||
539 | |||
540 | /* | ||
541 | * Switch to high resolution mode | ||
542 | */ | ||
543 | static void hrtimer_switch_to_hres(void) | ||
544 | { | ||
545 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | ||
546 | unsigned long flags; | ||
547 | |||
548 | if (base->hres_active) | ||
549 | return; | ||
550 | |||
551 | local_irq_save(flags); | ||
552 | |||
553 | if (tick_init_highres()) { | ||
554 | local_irq_restore(flags); | ||
555 | return; | ||
556 | } | ||
557 | base->hres_active = 1; | ||
558 | base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; | ||
559 | base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; | ||
560 | |||
561 | tick_setup_sched_timer(); | ||
562 | |||
563 | /* "Retrigger" the interrupt to get things going */ | ||
564 | retrigger_next_event(NULL); | ||
565 | local_irq_restore(flags); | ||
566 | printk(KERN_INFO "Switched to high resolution mode on CPU %d\n", | ||
567 | smp_processor_id()); | ||
568 | } | ||
569 | |||
570 | #else | ||
571 | |||
572 | static inline int hrtimer_hres_active(void) { return 0; } | ||
573 | static inline int hrtimer_is_hres_enabled(void) { return 0; } | ||
574 | static inline void hrtimer_switch_to_hres(void) { } | ||
575 | static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } | ||
576 | static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | ||
577 | struct hrtimer_clock_base *base) | ||
578 | { | ||
579 | return 0; | ||
580 | } | ||
581 | static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } | ||
582 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } | ||
583 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | ||
584 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | ||
585 | |||
586 | #endif /* CONFIG_HIGH_RES_TIMERS */ | ||
587 | |||
588 | #ifdef CONFIG_TIMER_STATS | ||
589 | void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) | ||
590 | { | ||
591 | if (timer->start_site) | ||
592 | return; | ||
593 | |||
594 | timer->start_site = addr; | ||
595 | memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); | ||
596 | timer->start_pid = current->pid; | ||
597 | } | ||
598 | #endif | ||
599 | |||
289 | /* | 600 | /* |
290 | * Counterpart to lock_timer_base above: | 601 | * Counterpart to lock_timer_base above: |
291 | */ | 602 | */ |
292 | static inline | 603 | static inline |
293 | void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | 604 | void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) |
294 | { | 605 | { |
295 | spin_unlock_irqrestore(&timer->base->lock, *flags); | 606 | spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); |
296 | } | 607 | } |
297 | 608 | ||
298 | /** | 609 | /** |
@@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
342 | * The timer is inserted in expiry order. Insertion into the | 653 | * The timer is inserted in expiry order. Insertion into the |
343 | * red black tree is O(log(n)). Must hold the base lock. | 654 | * red black tree is O(log(n)). Must hold the base lock. |
344 | */ | 655 | */ |
345 | static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | 656 | static void enqueue_hrtimer(struct hrtimer *timer, |
657 | struct hrtimer_clock_base *base, int reprogram) | ||
346 | { | 658 | { |
347 | struct rb_node **link = &base->active.rb_node; | 659 | struct rb_node **link = &base->active.rb_node; |
348 | struct rb_node *parent = NULL; | 660 | struct rb_node *parent = NULL; |
@@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
368 | * Insert the timer to the rbtree and check whether it | 680 | * Insert the timer to the rbtree and check whether it |
369 | * replaces the first pending timer | 681 | * replaces the first pending timer |
370 | */ | 682 | */ |
371 | rb_link_node(&timer->node, parent, link); | ||
372 | rb_insert_color(&timer->node, &base->active); | ||
373 | |||
374 | if (!base->first || timer->expires.tv64 < | 683 | if (!base->first || timer->expires.tv64 < |
375 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) | 684 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) { |
685 | /* | ||
686 | * Reprogram the clock event device. When the timer is already | ||
687 | * expired hrtimer_enqueue_reprogram has either called the | ||
688 | * callback or added it to the pending list and raised the | ||
689 | * softirq. | ||
690 | * | ||
691 | * This is a NOP for !HIGHRES | ||
692 | */ | ||
693 | if (reprogram && hrtimer_enqueue_reprogram(timer, base)) | ||
694 | return; | ||
695 | |||
376 | base->first = &timer->node; | 696 | base->first = &timer->node; |
697 | } | ||
698 | |||
699 | rb_link_node(&timer->node, parent, link); | ||
700 | rb_insert_color(&timer->node, &base->active); | ||
701 | /* | ||
702 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the | ||
703 | * state of a possibly running callback. | ||
704 | */ | ||
705 | timer->state |= HRTIMER_STATE_ENQUEUED; | ||
377 | } | 706 | } |
378 | 707 | ||
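enqueue_hrtimer keeps the rbtree ordered by expiry and caches the leftmost node in base->first, so the earliest timer is found in O(1). The walk-and-link idiom it builds on, reduced to a generic sketch over a ktime_t key:

#include <linux/rbtree.h>
#include <linux/ktime.h>

struct demo_node {
        struct rb_node  node;
        ktime_t         expires;        /* sort key */
};

static void demo_insert(struct rb_root *root, struct demo_node *new)
{
        struct rb_node **link = &root->rb_node, *parent = NULL;

        while (*link) {
                struct demo_node *entry;

                parent = *link;
                entry = rb_entry(parent, struct demo_node, node);
                /* equal keys go right, keeping earlier insertions first */
                if (new->expires.tv64 < entry->expires.tv64)
                        link = &(*link)->rb_left;
                else
                        link = &(*link)->rb_right;
        }
        rb_link_node(&new->node, parent, link);
        rb_insert_color(&new->node, root);      /* rebalance */
}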
379 | /* | 708 | /* |
380 | * __remove_hrtimer - internal function to remove a timer | 709 | * __remove_hrtimer - internal function to remove a timer |
381 | * | 710 | * |
382 | * Caller must hold the base lock. | 711 | * Caller must hold the base lock. |
712 | * | ||
713 | * High resolution timer mode reprograms the clock event device when the | ||
714 | * timer is the one which expires next. The caller can disable this by setting | ||
715 | * reprogram to zero. This is useful, when the context does a reprogramming | ||
716 | * anyway (e.g. timer interrupt) | ||
383 | */ | 717 | */ |
384 | static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | 718 | static void __remove_hrtimer(struct hrtimer *timer, |
719 | struct hrtimer_clock_base *base, | ||
720 | unsigned long newstate, int reprogram) | ||
385 | { | 721 | { |
386 | /* | 722 | /* High res. callback list. NOP for !HIGHRES */ |
387 | * Remove the timer from the rbtree and replace the | 723 | if (hrtimer_cb_pending(timer)) |
388 | * first entry pointer if necessary. | 724 | hrtimer_remove_cb_pending(timer); |
389 | */ | 725 | else { |
390 | if (base->first == &timer->node) | 726 | /* |
391 | base->first = rb_next(&timer->node); | 727 | * Remove the timer from the rbtree and replace the |
392 | rb_erase(&timer->node, &base->active); | 728 | * first entry pointer if necessary. |
393 | rb_set_parent(&timer->node, &timer->node); | 729 | */ |
730 | if (base->first == &timer->node) { | ||
731 | base->first = rb_next(&timer->node); | ||
732 | /* Reprogram the clock event device. if enabled */ | ||
733 | if (reprogram && hrtimer_hres_active()) | ||
734 | hrtimer_force_reprogram(base->cpu_base); | ||
735 | } | ||
736 | rb_erase(&timer->node, &base->active); | ||
737 | } | ||
738 | timer->state = newstate; | ||
394 | } | 739 | } |
395 | 740 | ||
396 | /* | 741 | /* |
397 | * remove hrtimer, called with base lock held | 742 | * remove hrtimer, called with base lock held |
398 | */ | 743 | */ |
399 | static inline int | 744 | static inline int |
400 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | 745 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) |
401 | { | 746 | { |
402 | if (hrtimer_active(timer)) { | 747 | if (hrtimer_is_queued(timer)) { |
403 | __remove_hrtimer(timer, base); | 748 | int reprogram; |
749 | |||
750 | /* | ||
751 | * Remove the timer and force reprogramming when high | ||
752 | * resolution mode is active and the timer is on the current | ||
753 | * CPU. If we remove a timer on another CPU, reprogramming is | ||
754 | * skipped. The interrupt event on this CPU is fired and | ||
755 | * reprogramming happens in the interrupt handler. This is a | ||
756 | * rare case and less expensive than a smp call. | ||
757 | */ | ||
758 | timer_stats_hrtimer_clear_start_info(timer); | ||
759 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); | ||
760 | __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, | ||
761 | reprogram); | ||
404 | return 1; | 762 | return 1; |
405 | } | 763 | } |
406 | return 0; | 764 | return 0; |
@@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
419 | int | 777 | int |
420 | hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | 778 | hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) |
421 | { | 779 | { |
422 | struct hrtimer_base *base, *new_base; | 780 | struct hrtimer_clock_base *base, *new_base; |
423 | unsigned long flags; | 781 | unsigned long flags; |
424 | int ret; | 782 | int ret; |
425 | 783 | ||
@@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
431 | /* Switch the timer base, if necessary: */ | 789 | /* Switch the timer base, if necessary: */ |
432 | new_base = switch_hrtimer_base(timer, base); | 790 | new_base = switch_hrtimer_base(timer, base); |
433 | 791 | ||
434 | if (mode == HRTIMER_REL) { | 792 | if (mode == HRTIMER_MODE_REL) { |
435 | tim = ktime_add(tim, new_base->get_time()); | 793 | tim = ktime_add(tim, new_base->get_time()); |
436 | /* | 794 | /* |
437 | * CONFIG_TIME_LOW_RES is a temporary way for architectures | 795 | * CONFIG_TIME_LOW_RES is a temporary way for architectures |
@@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
446 | } | 804 | } |
447 | timer->expires = tim; | 805 | timer->expires = tim; |
448 | 806 | ||
449 | enqueue_hrtimer(timer, new_base); | 807 | timer_stats_hrtimer_set_start_info(timer); |
808 | |||
809 | enqueue_hrtimer(timer, new_base, base == new_base); | ||
450 | 810 | ||
451 | unlock_hrtimer_base(timer, &flags); | 811 | unlock_hrtimer_base(timer, &flags); |
452 | 812 | ||
@@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
466 | */ | 826 | */ |
467 | int hrtimer_try_to_cancel(struct hrtimer *timer) | 827 | int hrtimer_try_to_cancel(struct hrtimer *timer) |
468 | { | 828 | { |
469 | struct hrtimer_base *base; | 829 | struct hrtimer_clock_base *base; |
470 | unsigned long flags; | 830 | unsigned long flags; |
471 | int ret = -1; | 831 | int ret = -1; |
472 | 832 | ||
473 | base = lock_hrtimer_base(timer, &flags); | 833 | base = lock_hrtimer_base(timer, &flags); |
474 | 834 | ||
475 | if (base->curr_timer != timer) | 835 | if (!hrtimer_callback_running(timer)) |
476 | ret = remove_hrtimer(timer, base); | 836 | ret = remove_hrtimer(timer, base); |
477 | 837 | ||
478 | unlock_hrtimer_base(timer, &flags); | 838 | unlock_hrtimer_base(timer, &flags); |
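With the per-CPU curr_timer pointer gone, cancellation keys off the timer's own state bits: hrtimer_try_to_cancel() returns -1 while the callback runs, otherwise 0 (was not queued) or 1 (dequeued). The synchronous hrtimer_cancel() is essentially a spin around it; a sketch of that loop:

static int demo_cancel_sync(struct hrtimer *timer)
{
        for (;;) {
                int ret = hrtimer_try_to_cancel(timer);

                if (ret >= 0)           /* 0: inactive, 1: dequeued */
                        return ret;
                cpu_relax();            /* callback running, retry */
        }
}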
@@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
508 | */ | 868 | */ |
509 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | 869 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) |
510 | { | 870 | { |
511 | struct hrtimer_base *base; | 871 | struct hrtimer_clock_base *base; |
512 | unsigned long flags; | 872 | unsigned long flags; |
513 | ktime_t rem; | 873 | ktime_t rem; |
514 | 874 | ||
515 | base = lock_hrtimer_base(timer, &flags); | 875 | base = lock_hrtimer_base(timer, &flags); |
516 | rem = ktime_sub(timer->expires, timer->base->get_time()); | 876 | rem = ktime_sub(timer->expires, base->get_time()); |
517 | unlock_hrtimer_base(timer, &flags); | 877 | unlock_hrtimer_base(timer, &flags); |
518 | 878 | ||
519 | return rem; | 879 | return rem; |
520 | } | 880 | } |
521 | EXPORT_SYMBOL_GPL(hrtimer_get_remaining); | 881 | EXPORT_SYMBOL_GPL(hrtimer_get_remaining); |
522 | 882 | ||
523 | #ifdef CONFIG_NO_IDLE_HZ | 883 | #if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) |
524 | /** | 884 | /** |
525 | * hrtimer_get_next_event - get the time until next expiry event | 885 | * hrtimer_get_next_event - get the time until next expiry event |
526 | * | 886 | * |
@@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
529 | */ | 889 | */ |
530 | ktime_t hrtimer_get_next_event(void) | 890 | ktime_t hrtimer_get_next_event(void) |
531 | { | 891 | { |
532 | struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); | 892 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
893 | struct hrtimer_clock_base *base = cpu_base->clock_base; | ||
533 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; | 894 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; |
534 | unsigned long flags; | 895 | unsigned long flags; |
535 | int i; | 896 | int i; |
536 | 897 | ||
537 | for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { | 898 | spin_lock_irqsave(&cpu_base->lock, flags); |
538 | struct hrtimer *timer; | ||
539 | 899 | ||
540 | spin_lock_irqsave(&base->lock, flags); | 900 | if (!hrtimer_hres_active()) { |
541 | if (!base->first) { | 901 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { |
542 | spin_unlock_irqrestore(&base->lock, flags); | 902 | struct hrtimer *timer; |
543 | continue; | 903 | |
904 | if (!base->first) | ||
905 | continue; | ||
906 | |||
907 | timer = rb_entry(base->first, struct hrtimer, node); | ||
908 | delta.tv64 = timer->expires.tv64; | ||
909 | delta = ktime_sub(delta, base->get_time()); | ||
910 | if (delta.tv64 < mindelta.tv64) | ||
911 | mindelta.tv64 = delta.tv64; | ||
544 | } | 912 | } |
545 | timer = rb_entry(base->first, struct hrtimer, node); | ||
546 | delta.tv64 = timer->expires.tv64; | ||
547 | spin_unlock_irqrestore(&base->lock, flags); | ||
548 | delta = ktime_sub(delta, base->get_time()); | ||
549 | if (delta.tv64 < mindelta.tv64) | ||
550 | mindelta.tv64 = delta.tv64; | ||
551 | } | 913 | } |
914 | |||
915 | spin_unlock_irqrestore(&cpu_base->lock, flags); | ||
916 | |||
552 | if (mindelta.tv64 < 0) | 917 | if (mindelta.tv64 < 0) |
553 | mindelta.tv64 = 0; | 918 | mindelta.tv64 = 0; |
554 | return mindelta; | 919 | return mindelta; |
@@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void)
564 | void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | 929 | void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, |
565 | enum hrtimer_mode mode) | 930 | enum hrtimer_mode mode) |
566 | { | 931 | { |
567 | struct hrtimer_base *bases; | 932 | struct hrtimer_cpu_base *cpu_base; |
568 | 933 | ||
569 | memset(timer, 0, sizeof(struct hrtimer)); | 934 | memset(timer, 0, sizeof(struct hrtimer)); |
570 | 935 | ||
571 | bases = __raw_get_cpu_var(hrtimer_bases); | 936 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); |
572 | 937 | ||
573 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) | 938 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) |
574 | clock_id = CLOCK_MONOTONIC; | 939 | clock_id = CLOCK_MONOTONIC; |
575 | 940 | ||
576 | timer->base = &bases[clock_id]; | 941 | timer->base = &cpu_base->clock_base[clock_id]; |
577 | rb_set_parent(&timer->node, &timer->node); | 942 | hrtimer_init_timer_hres(timer); |
943 | |||
944 | #ifdef CONFIG_TIMER_STATS | ||
945 | timer->start_site = NULL; | ||
946 | timer->start_pid = -1; | ||
947 | memset(timer->start_comm, 0, TASK_COMM_LEN); | ||
948 | #endif | ||
578 | } | 949 | } |
579 | EXPORT_SYMBOL_GPL(hrtimer_init); | 950 | EXPORT_SYMBOL_GPL(hrtimer_init); |
580 | 951 | ||
@@ -588,21 +959,159 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
588 | */ | 959 | */ |
589 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | 960 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) |
590 | { | 961 | { |
591 | struct hrtimer_base *bases; | 962 | struct hrtimer_cpu_base *cpu_base; |
592 | 963 | ||
593 | bases = __raw_get_cpu_var(hrtimer_bases); | 964 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); |
594 | *tp = ktime_to_timespec(bases[which_clock].resolution); | 965 | *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); |
595 | 966 | ||
596 | return 0; | 967 | return 0; |
597 | } | 968 | } |
598 | EXPORT_SYMBOL_GPL(hrtimer_get_res); | 969 | EXPORT_SYMBOL_GPL(hrtimer_get_res); |
599 | 970 | ||
971 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
972 | |||
973 | /* | ||
974 | * High resolution timer interrupt | ||
975 | * Called with interrupts disabled | ||
976 | */ | ||
977 | void hrtimer_interrupt(struct clock_event_device *dev) | ||
978 | { | ||
979 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
980 | struct hrtimer_clock_base *base; | ||
981 | ktime_t expires_next, now; | ||
982 | int i, raise = 0; | ||
983 | |||
984 | BUG_ON(!cpu_base->hres_active); | ||
985 | cpu_base->nr_events++; | ||
986 | dev->next_event.tv64 = KTIME_MAX; | ||
987 | |||
988 | retry: | ||
989 | now = ktime_get(); | ||
990 | |||
991 | expires_next.tv64 = KTIME_MAX; | ||
992 | |||
993 | base = cpu_base->clock_base; | ||
994 | |||
995 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | ||
996 | ktime_t basenow; | ||
997 | struct rb_node *node; | ||
998 | |||
999 | spin_lock(&cpu_base->lock); | ||
1000 | |||
1001 | basenow = ktime_add(now, base->offset); | ||
1002 | |||
1003 | while ((node = base->first)) { | ||
1004 | struct hrtimer *timer; | ||
1005 | |||
1006 | timer = rb_entry(node, struct hrtimer, node); | ||
1007 | |||
1008 | if (basenow.tv64 < timer->expires.tv64) { | ||
1009 | ktime_t expires; | ||
1010 | |||
1011 | expires = ktime_sub(timer->expires, | ||
1012 | base->offset); | ||
1013 | if (expires.tv64 < expires_next.tv64) | ||
1014 | expires_next = expires; | ||
1015 | break; | ||
1016 | } | ||
1017 | |||
1018 | /* Move softirq callbacks to the pending list */ | ||
1019 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { | ||
1020 | __remove_hrtimer(timer, base, | ||
1021 | HRTIMER_STATE_PENDING, 0); | ||
1022 | list_add_tail(&timer->cb_entry, | ||
1023 | &base->cpu_base->cb_pending); | ||
1024 | raise = 1; | ||
1025 | continue; | ||
1026 | } | ||
1027 | |||
1028 | __remove_hrtimer(timer, base, | ||
1029 | HRTIMER_STATE_CALLBACK, 0); | ||
1030 | timer_stats_account_hrtimer(timer); | ||
1031 | |||
1032 | /* | ||
1033 | * Note: We clear the CALLBACK bit after | ||
1034 | * enqueue_hrtimer to avoid reprogramming of | ||
1035 | * the event hardware. This happens at the end | ||
1036 | * of this function anyway. | ||
1037 | */ | ||
1038 | if (timer->function(timer) != HRTIMER_NORESTART) { | ||
1039 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
1040 | enqueue_hrtimer(timer, base, 0); | ||
1041 | } | ||
1042 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1043 | } | ||
1044 | spin_unlock(&cpu_base->lock); | ||
1045 | base++; | ||
1046 | } | ||
1047 | |||
1048 | cpu_base->expires_next = expires_next; | ||
1049 | |||
1050 | /* Reprogramming necessary ? */ | ||
1051 | if (expires_next.tv64 != KTIME_MAX) { | ||
1052 | if (tick_program_event(expires_next, 0)) | ||
1053 | goto retry; | ||
1054 | } | ||
1055 | |||
1056 | /* Raise softirq ? */ | ||
1057 | if (raise) | ||
1058 | raise_softirq(HRTIMER_SOFTIRQ); | ||
1059 | } | ||
1060 | |||
1061 | static void run_hrtimer_softirq(struct softirq_action *h) | ||
1062 | { | ||
1063 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1064 | |||
1065 | spin_lock_irq(&cpu_base->lock); | ||
1066 | |||
1067 | while (!list_empty(&cpu_base->cb_pending)) { | ||
1068 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1069 | struct hrtimer *timer; | ||
1070 | int restart; | ||
1071 | |||
1072 | timer = list_entry(cpu_base->cb_pending.next, | ||
1073 | struct hrtimer, cb_entry); | ||
1074 | |||
1075 | timer_stats_account_hrtimer(timer); | ||
1076 | |||
1077 | fn = timer->function; | ||
1078 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | ||
1079 | spin_unlock_irq(&cpu_base->lock); | ||
1080 | |||
1081 | restart = fn(timer); | ||
1082 | |||
1083 | spin_lock_irq(&cpu_base->lock); | ||
1084 | |||
1085 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1086 | if (restart == HRTIMER_RESTART) { | ||
1087 | BUG_ON(hrtimer_active(timer)); | ||
1088 | /* | ||
1089 | * Enqueue the timer, allow reprogramming of the event | ||
1090 | * device | ||
1091 | */ | ||
1092 | enqueue_hrtimer(timer, timer->base, 1); | ||
1093 | } else if (hrtimer_active(timer)) { | ||
1094 | /* | ||
1095 | * If the timer was rearmed on another CPU, reprogram | ||
1096 | * the event device. | ||
1097 | */ | ||
1098 | if (timer->base->first == &timer->node) | ||
1099 | hrtimer_reprogram(timer, timer->base); | ||
1100 | } | ||
1101 | } | ||
1102 | spin_unlock_irq(&cpu_base->lock); | ||
1103 | } | ||
1104 | |||
1105 | #endif /* CONFIG_HIGH_RES_TIMERS */ | ||
1106 | |||
600 | /* | 1107 | /* |
601 | * Expire the per base hrtimer-queue: | 1108 | * Expire the per base hrtimer-queue: |
602 | */ | 1109 | */ |
603 | static inline void run_hrtimer_queue(struct hrtimer_base *base) | 1110 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, |
1111 | int index) | ||
604 | { | 1112 | { |
605 | struct rb_node *node; | 1113 | struct rb_node *node; |
1114 | struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; | ||
606 | 1115 | ||
607 | if (!base->first) | 1116 | if (!base->first) |
608 | return; | 1117 | return; |
@@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
610 | if (base->get_softirq_time) | 1119 | if (base->get_softirq_time) |
611 | base->softirq_time = base->get_softirq_time(); | 1120 | base->softirq_time = base->get_softirq_time(); |
612 | 1121 | ||
613 | spin_lock_irq(&base->lock); | 1122 | spin_lock_irq(&cpu_base->lock); |
614 | 1123 | ||
615 | while ((node = base->first)) { | 1124 | while ((node = base->first)) { |
616 | struct hrtimer *timer; | 1125 | struct hrtimer *timer; |
617 | int (*fn)(struct hrtimer *); | 1126 | enum hrtimer_restart (*fn)(struct hrtimer *); |
618 | int restart; | 1127 | int restart; |
619 | 1128 | ||
620 | timer = rb_entry(node, struct hrtimer, node); | 1129 | timer = rb_entry(node, struct hrtimer, node); |
621 | if (base->softirq_time.tv64 <= timer->expires.tv64) | 1130 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
622 | break; | 1131 | break; |
623 | 1132 | ||
1133 | timer_stats_account_hrtimer(timer); | ||
1134 | |||
624 | fn = timer->function; | 1135 | fn = timer->function; |
625 | set_curr_timer(base, timer); | 1136 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); |
626 | __remove_hrtimer(timer, base); | 1137 | spin_unlock_irq(&cpu_base->lock); |
627 | spin_unlock_irq(&base->lock); | ||
628 | 1138 | ||
629 | restart = fn(timer); | 1139 | restart = fn(timer); |
630 | 1140 | ||
631 | spin_lock_irq(&base->lock); | 1141 | spin_lock_irq(&cpu_base->lock); |
632 | 1142 | ||
1143 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
633 | if (restart != HRTIMER_NORESTART) { | 1144 | if (restart != HRTIMER_NORESTART) { |
634 | BUG_ON(hrtimer_active(timer)); | 1145 | BUG_ON(hrtimer_active(timer)); |
635 | enqueue_hrtimer(timer, base); | 1146 | enqueue_hrtimer(timer, base, 0); |
636 | } | 1147 | } |
637 | } | 1148 | } |
638 | set_curr_timer(base, NULL); | 1149 | spin_unlock_irq(&cpu_base->lock); |
639 | spin_unlock_irq(&base->lock); | ||
640 | } | 1150 | } |
641 | 1151 | ||
642 | /* | 1152 | /* |
643 | * Called from timer softirq every jiffy, expire hrtimers: | 1153 | * Called from timer softirq every jiffy, expire hrtimers: |
1154 | * | ||
1155 | * For HRT its the fall back code to run the softirq in the timer | ||
1156 | * softirq context in case the hrtimer initialization failed or has | ||
1157 | * not been done yet. | ||
644 | */ | 1158 | */ |
645 | void hrtimer_run_queues(void) | 1159 | void hrtimer_run_queues(void) |
646 | { | 1160 | { |
647 | struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); | 1161 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
648 | int i; | 1162 | int i; |
649 | 1163 | ||
650 | hrtimer_get_softirq_time(base); | 1164 | if (hrtimer_hres_active()) |
1165 | return; | ||
1166 | |||
1167 | /* | ||
1168 | * This _is_ ugly: We have to check in the softirq context, | ||
1169 | * whether we can switch to highres and / or nohz mode. The | ||
1170 | * clocksource switch happens in the timer interrupt with | ||
1171 | * xtime_lock held. Notification from there only sets the | ||
1172 | * check bit in the tick_oneshot code, otherwise we might | ||
1173 | * deadlock vs. xtime_lock. | ||
1174 | */ | ||
1175 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
1176 | hrtimer_switch_to_hres(); | ||
651 | 1177 | ||
652 | for (i = 0; i < MAX_HRTIMER_BASES; i++) | 1178 | hrtimer_get_softirq_time(cpu_base); |
653 | run_hrtimer_queue(&base[i]); | 1179 | |
1180 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | ||
1181 | run_hrtimer_queue(cpu_base, i); | ||
654 | } | 1182 | } |
655 | 1183 | ||
656 | /* | 1184 | /* |
657 | * Sleep related functions: | 1185 | * Sleep related functions: |
658 | */ | 1186 | */ |
659 | static int hrtimer_wakeup(struct hrtimer *timer) | 1187 | static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) |
660 | { | 1188 | { |
661 | struct hrtimer_sleeper *t = | 1189 | struct hrtimer_sleeper *t = |
662 | container_of(timer, struct hrtimer_sleeper, timer); | 1190 | container_of(timer, struct hrtimer_sleeper, timer); |
@@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
673 | { | 1201 | { |
674 | sl->timer.function = hrtimer_wakeup; | 1202 | sl->timer.function = hrtimer_wakeup; |
675 | sl->task = task; | 1203 | sl->task = task; |
1204 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
1205 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; | ||
1206 | #endif | ||
676 | } | 1207 | } |
677 | 1208 | ||
678 | static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) | 1209 | static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) |
@@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
683 | set_current_state(TASK_INTERRUPTIBLE); | 1214 | set_current_state(TASK_INTERRUPTIBLE); |
684 | hrtimer_start(&t->timer, t->timer.expires, mode); | 1215 | hrtimer_start(&t->timer, t->timer.expires, mode); |
685 | 1216 | ||
686 | schedule(); | 1217 | if (likely(t->task)) |
1218 | schedule(); | ||
687 | 1219 | ||
688 | hrtimer_cancel(&t->timer); | 1220 | hrtimer_cancel(&t->timer); |
689 | mode = HRTIMER_ABS; | 1221 | mode = HRTIMER_MODE_ABS; |
690 | 1222 | ||
691 | } while (t->task && !signal_pending(current)); | 1223 | } while (t->task && !signal_pending(current)); |
692 | 1224 | ||
@@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
702 | 1234 | ||
703 | restart->fn = do_no_restart_syscall; | 1235 | restart->fn = do_no_restart_syscall; |
704 | 1236 | ||
705 | hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); | 1237 | hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); |
706 | t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; | 1238 | t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; |
707 | 1239 | ||
708 | if (do_nanosleep(&t, HRTIMER_ABS)) | 1240 | if (do_nanosleep(&t, HRTIMER_MODE_ABS)) |
709 | return 0; | 1241 | return 0; |
710 | 1242 | ||
711 | rmtp = (struct timespec __user *) restart->arg1; | 1243 | rmtp = (struct timespec __user *) restart->arg1; |
@@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
738 | return 0; | 1270 | return 0; |
739 | 1271 | ||
740 | /* Absolute timers do not update the rmtp value and restart: */ | 1272 | /* Absolute timers do not update the rmtp value and restart: */ |
741 | if (mode == HRTIMER_ABS) | 1273 | if (mode == HRTIMER_MODE_ABS) |
742 | return -ERESTARTNOHAND; | 1274 | return -ERESTARTNOHAND; |
743 | 1275 | ||
744 | if (rmtp) { | 1276 | if (rmtp) { |
@@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
771 | if (!timespec_valid(&tu)) | 1303 | if (!timespec_valid(&tu)) |
772 | return -EINVAL; | 1304 | return -EINVAL; |
773 | 1305 | ||
774 | return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); | 1306 | return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); |
775 | } | 1307 | } |
776 | 1308 | ||
777 | /* | 1309 | /* |
@@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
779 | */ | 1311 | */ |
780 | static void __devinit init_hrtimers_cpu(int cpu) | 1312 | static void __devinit init_hrtimers_cpu(int cpu) |
781 | { | 1313 | { |
782 | struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); | 1314 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); |
783 | int i; | 1315 | int i; |
784 | 1316 | ||
785 | for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { | 1317 | spin_lock_init(&cpu_base->lock); |
786 | spin_lock_init(&base->lock); | 1318 | lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key); |
787 | lockdep_set_class(&base->lock, &base->lock_key); | 1319 | |
788 | } | 1320 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
1321 | cpu_base->clock_base[i].cpu_base = cpu_base; | ||
1322 | |||
1323 | hrtimer_init_hres(cpu_base); | ||
789 | } | 1324 | } |
790 | 1325 | ||
791 | #ifdef CONFIG_HOTPLUG_CPU | 1326 | #ifdef CONFIG_HOTPLUG_CPU |
792 | 1327 | ||
793 | static void migrate_hrtimer_list(struct hrtimer_base *old_base, | 1328 | static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, |
794 | struct hrtimer_base *new_base) | 1329 | struct hrtimer_clock_base *new_base) |
795 | { | 1330 | { |
796 | struct hrtimer *timer; | 1331 | struct hrtimer *timer; |
797 | struct rb_node *node; | 1332 | struct rb_node *node; |
798 | 1333 | ||
799 | while ((node = rb_first(&old_base->active))) { | 1334 | while ((node = rb_first(&old_base->active))) { |
800 | timer = rb_entry(node, struct hrtimer, node); | 1335 | timer = rb_entry(node, struct hrtimer, node); |
801 | __remove_hrtimer(timer, old_base); | 1336 | BUG_ON(hrtimer_callback_running(timer)); |
1337 | __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); | ||
802 | timer->base = new_base; | 1338 | timer->base = new_base; |
803 | enqueue_hrtimer(timer, new_base); | 1339 | /* |
1340 | * Enqueue the timer. Allow reprogramming of the event device | ||
1341 | */ | ||
1342 | enqueue_hrtimer(timer, new_base, 1); | ||
804 | } | 1343 | } |
805 | } | 1344 | } |
806 | 1345 | ||
807 | static void migrate_hrtimers(int cpu) | 1346 | static void migrate_hrtimers(int cpu) |
808 | { | 1347 | { |
809 | struct hrtimer_base *old_base, *new_base; | 1348 | struct hrtimer_cpu_base *old_base, *new_base; |
810 | int i; | 1349 | int i; |
811 | 1350 | ||
812 | BUG_ON(cpu_online(cpu)); | 1351 | BUG_ON(cpu_online(cpu)); |
813 | old_base = per_cpu(hrtimer_bases, cpu); | 1352 | old_base = &per_cpu(hrtimer_bases, cpu); |
814 | new_base = get_cpu_var(hrtimer_bases); | 1353 | new_base = &get_cpu_var(hrtimer_bases); |
815 | |||
816 | local_irq_disable(); | ||
817 | 1354 | ||
818 | for (i = 0; i < MAX_HRTIMER_BASES; i++) { | 1355 | tick_cancel_sched_timer(cpu); |
819 | 1356 | ||
820 | spin_lock(&new_base->lock); | 1357 | local_irq_disable(); |
821 | spin_lock(&old_base->lock); | ||
822 | |||
823 | BUG_ON(old_base->curr_timer); | ||
824 | 1358 | ||
825 | migrate_hrtimer_list(old_base, new_base); | 1359 | spin_lock(&new_base->lock); |
1360 | spin_lock(&old_base->lock); | ||
826 | 1361 | ||
827 | spin_unlock(&old_base->lock); | 1362 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
828 | spin_unlock(&new_base->lock); | 1363 | migrate_hrtimer_list(&old_base->clock_base[i], |
829 | old_base++; | 1364 | &new_base->clock_base[i]); |
830 | new_base++; | ||
831 | } | 1365 | } |
1366 | spin_unlock(&old_base->lock); | ||
1367 | spin_unlock(&new_base->lock); | ||
832 | 1368 | ||
833 | local_irq_enable(); | 1369 | local_irq_enable(); |
834 | put_cpu_var(hrtimer_bases); | 1370 | put_cpu_var(hrtimer_bases); |
@@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
848 | 1384 | ||
849 | #ifdef CONFIG_HOTPLUG_CPU | 1385 | #ifdef CONFIG_HOTPLUG_CPU |
850 | case CPU_DEAD: | 1386 | case CPU_DEAD: |
1387 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); | ||
851 | migrate_hrtimers(cpu); | 1388 | migrate_hrtimers(cpu); |
852 | break; | 1389 | break; |
853 | #endif | 1390 | #endif |
@@ -868,5 +1405,8 @@ void __init hrtimers_init(void)
868 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, | 1405 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, |
869 | (void *)(long)smp_processor_id()); | 1406 | (void *)(long)smp_processor_id()); |
870 | register_cpu_notifier(&hrtimers_nb); | 1407 | register_cpu_notifier(&hrtimers_nb); |
1408 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
1409 | open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); | ||
1410 | #endif | ||
871 | } | 1411 | } |
872 | 1412 | ||
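The hotplug changes above cancel the per-CPU sched tick and notify the clockevents layer before the dead CPU's timers are migrated. For orientation, the notifier shape this plugs into (a generic CPU_DEAD handler with hypothetical demo names):

#include <linux/cpu.h>
#include <linux/notifier.h>

static int __cpuinit demo_cpu_notify(struct notifier_block *self,
                                     unsigned long action, void *hcpu)
{
        int cpu = (long)hcpu;

        switch (action) {
#ifdef CONFIG_HOTPLUG_CPU
        case CPU_DEAD:
                /* the CPU is gone: move its per-CPU state elsewhere */
                printk(KERN_INFO "demo: draining CPU %d\n", cpu);
                break;
#endif
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata demo_cpu_nb = {
        .notifier_call = demo_cpu_notify,
};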
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 475e8a71bcdc..0133f4f9e9f0 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -168,7 +168,7 @@ EXPORT_SYMBOL(set_irq_data);
168 | /** | 168 | /** |
169 | * set_irq_data - set irq type data for an irq | 169 | * set_irq_data - set irq type data for an irq |
170 | * @irq: Interrupt number | 170 | * @irq: Interrupt number |
171 | * @data: Pointer to interrupt specific data | 171 | * @entry: Pointer to MSI descriptor data |
172 | * | 172 | * |
173 | * Set the hardware irq controller data for an irq | 173 | * Set the hardware irq controller data for an irq |
174 | */ | 174 | */ |
@@ -230,10 +230,6 @@ static void default_enable(unsigned int irq)
230 | */ | 230 | */ |
231 | static void default_disable(unsigned int irq) | 231 | static void default_disable(unsigned int irq) |
232 | { | 232 | { |
233 | struct irq_desc *desc = irq_desc + irq; | ||
234 | |||
235 | if (!(desc->status & IRQ_DELAYED_DISABLE)) | ||
236 | desc->chip->mask(irq); | ||
237 | } | 233 | } |
238 | 234 | ||
239 | /* | 235 | /* |
@@ -298,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
298 | 294 | ||
299 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 295 | if (unlikely(desc->status & IRQ_INPROGRESS)) |
300 | goto out_unlock; | 296 | goto out_unlock; |
301 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
302 | kstat_cpu(cpu).irqs[irq]++; | 297 | kstat_cpu(cpu).irqs[irq]++; |
303 | 298 | ||
304 | action = desc->action; | 299 | action = desc->action; |
305 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | 300 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { |
301 | if (desc->chip->mask) | ||
302 | desc->chip->mask(irq); | ||
303 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
304 | desc->status |= IRQ_PENDING; | ||
306 | goto out_unlock; | 305 | goto out_unlock; |
306 | } | ||
307 | 307 | ||
308 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING); | ||
308 | desc->status |= IRQ_INPROGRESS; | 309 | desc->status |= IRQ_INPROGRESS; |
309 | spin_unlock(&desc->lock); | 310 | spin_unlock(&desc->lock); |
310 | 311 | ||
@@ -396,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
396 | 397 | ||
397 | /* | 398 | /* |
398 | * If it's disabled or no action available | 399 | * If it's disabled or no action available
399 | * keep it masked and get out of here | 400 | * then mask it and get out of here: |
400 | */ | 401 | */ |
401 | action = desc->action; | 402 | action = desc->action; |
402 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 403 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { |
403 | desc->status |= IRQ_PENDING; | 404 | desc->status |= IRQ_PENDING; |
405 | if (desc->chip->mask) | ||
406 | desc->chip->mask(irq); | ||
404 | goto out; | 407 | goto out; |
405 | } | 408 | } |
406 | 409 | ||
@@ -562,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
562 | 565 | ||
563 | /* Uninstall? */ | 566 | /* Uninstall? */ |
564 | if (handle == handle_bad_irq) { | 567 | if (handle == handle_bad_irq) { |
565 | if (desc->chip != &no_irq_chip) { | 568 | if (desc->chip != &no_irq_chip) |
566 | desc->chip->mask(irq); | 569 | mask_ack_irq(desc, irq); |
567 | desc->chip->ack(irq); | ||
568 | } | ||
569 | desc->status |= IRQ_DISABLED; | 570 | desc->status |= IRQ_DISABLED; |
570 | desc->depth = 1; | 571 | desc->depth = 1; |
571 | } | 572 | } |
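Net effect of the chip.c changes: default_disable() is now empty, so disable_irq() is lazy for every chip, and the flow handlers mask at the hardware only when an interrupt actually arrives on a disabled (or handler-less) line, recording it for a later replay. Condensed shape of the check the handlers now share (flag bookkeeping differs slightly between handle_simple_irq() and handle_fasteoi_irq()):

	/* desc, action, irq and the out label come from the handler. */
	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
		if (desc->chip->mask)		/* mask lazily, at the chip */
			desc->chip->mask(irq);
		desc->status |= IRQ_PENDING;	/* remember it for replay */
		goto out;
	}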
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7c85d69188ef..5597c157442a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq) | |||
38 | } | 38 | } |
39 | EXPORT_SYMBOL(synchronize_irq); | 39 | EXPORT_SYMBOL(synchronize_irq); |
40 | 40 | ||
41 | /** | ||
42 | * irq_can_set_affinity - Check if the affinity of a given irq can be set | ||
43 | * @irq: Interrupt to check | ||
44 | * | ||
45 | */ | ||
46 | int irq_can_set_affinity(unsigned int irq) | ||
47 | { | ||
48 | struct irq_desc *desc = irq_desc + irq; | ||
49 | |||
50 | if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || | ||
51 | !desc->chip->set_affinity) | ||
52 | return 0; | ||
53 | |||
54 | return 1; | ||
55 | } | ||
56 | |||
57 | /** | ||
58 | * irq_set_affinity - Set the irq affinity of a given irq | ||
59 | * @irq: Interrupt to set affinity | ||
60 | * @cpumask: cpumask | ||
61 | * | ||
62 | */ | ||
63 | int irq_set_affinity(unsigned int irq, cpumask_t cpumask) | ||
64 | { | ||
65 | struct irq_desc *desc = irq_desc + irq; | ||
66 | |||
67 | if (!desc->chip->set_affinity) | ||
68 | return -EINVAL; | ||
69 | |||
70 | set_balance_irq_affinity(irq, cpumask); | ||
71 | |||
72 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
73 | set_pending_irq(irq, cpumask); | ||
74 | #else | ||
75 | desc->affinity = cpumask; | ||
76 | desc->chip->set_affinity(irq, cpumask); | ||
77 | #endif | ||
78 | return 0; | ||
79 | } | ||
80 | |||
41 | #endif | 81 | #endif |
42 | 82 | ||
43 | /** | 83 | /** |
@@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new) | |||
281 | if (new->flags & IRQF_PERCPU) | 321 | if (new->flags & IRQF_PERCPU) |
282 | desc->status |= IRQ_PER_CPU; | 322 | desc->status |= IRQ_PER_CPU; |
283 | #endif | 323 | #endif |
324 | /* Exclude IRQ from balancing */ | ||
325 | if (new->flags & IRQF_NOBALANCING) | ||
326 | desc->status |= IRQ_NO_BALANCING; | ||
327 | |||
284 | if (!shared) { | 328 | if (!shared) { |
285 | irq_chip_set_defaults(desc->chip); | 329 | irq_chip_set_defaults(desc->chip); |
286 | 330 | ||
@@ -461,7 +505,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, | |||
461 | /* | 505 | /* |
462 | * Lockdep wants atomic interrupt handlers: | 506 | * Lockdep wants atomic interrupt handlers: |
463 | */ | 507 | */ |
464 | irqflags |= SA_INTERRUPT; | 508 | irqflags |= IRQF_DISABLED; |
465 | #endif | 509 | #endif |
466 | /* | 510 | /* |
467 | * Sanity-check: shared interrupts must pass in a real dev-ID, | 511 | * Sanity-check: shared interrupts must pass in a real dev-ID, |
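manage.c gains a kernel-internal affinity API (previously only the /proc write path could retarget an interrupt) plus the IRQF_NOBALANCING request flag. A hypothetical driver-side use, sketched against the 2.6.21-era cpumask interface:

	#include <linux/interrupt.h>
	#include <linux/irq.h>

	/* Illustrative only; not part of the diff. */
	static int pin_irq_to_cpu0(unsigned int irq)
	{
		if (!irq_can_set_affinity(irq))
			return -EINVAL;	/* per-CPU IRQ or no chip support */
		return irq_set_affinity(irq, cpumask_of_cpu(0));
	}

At setup time a timekeeping-critical line can opt out of balancing with request_irq(irq, my_handler, IRQF_NOBALANCING, "mydev", my_dev), where my_handler and my_dev are the driver's own.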
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6d3be06e8ce6..2db91eb54ad8 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir; | |||
16 | 16 | ||
17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
18 | 18 | ||
19 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
20 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | ||
21 | { | ||
22 | set_balance_irq_affinity(irq, mask_val); | ||
23 | |||
24 | /* | ||
25 | * Save these away for later use. Re-program when the | ||
26 | * interrupt is pending | ||
27 | */ | ||
28 | set_pending_irq(irq, mask_val); | ||
29 | } | ||
30 | #else | ||
31 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | ||
32 | { | ||
33 | set_balance_irq_affinity(irq, mask_val); | ||
34 | irq_desc[irq].affinity = mask_val; | ||
35 | irq_desc[irq].chip->set_affinity(irq, mask_val); | ||
36 | } | ||
37 | #endif | ||
38 | |||
39 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 19 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
40 | int count, int *eof, void *data) | 20 | int count, int *eof, void *data) |
41 | { | 21 | { |
@@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
55 | cpumask_t new_value, tmp; | 35 | cpumask_t new_value, tmp; |
56 | 36 | ||
57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || | 37 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || |
58 | CHECK_IRQ_PER_CPU(irq_desc[irq].status)) | 38 | irq_balancing_disabled(irq)) |
59 | return -EIO; | 39 | return -EIO; |
60 | 40 | ||
61 | err = cpumask_parse_user(buffer, count, new_value); | 41 | err = cpumask_parse_user(buffer, count, new_value); |
@@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
73 | code to set default SMP affinity. */ | 53 | code to set default SMP affinity. */ |
74 | return select_smp_affinity(irq) ? -EINVAL : full_count; | 54 | return select_smp_affinity(irq) ? -EINVAL : full_count; |
75 | 55 | ||
76 | proc_set_irq_affinity(irq, new_value); | 56 | irq_set_affinity(irq, new_value); |
77 | 57 | ||
78 | return full_count; | 58 | return full_count; |
79 | } | 59 | } |
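proc.c becomes a plain consumer of the new API: both proc_set_irq_affinity() variants collapse into irq_set_affinity(), and the write handler rejects balancing-excluded lines through irq_balancing_disabled(). The userspace interface is unchanged; a hypothetical test program (IRQ 19 is just an example):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fputs("2\n", f);	/* mask 0x2 = CPU 1 */
		/* buffered write errors (e.g. EIO for a no-balance
		 * IRQ) surface at fclose() */
		return fclose(f) != 0;
	}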
diff --git a/kernel/itimer.c b/kernel/itimer.c index 204ed7939e75..307c6a632ef6 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c | |||
@@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) | |||
128 | /* | 128 | /* |
129 | * The timer is automagically restarted when interval != 0 | 129 | * The timer is automagically restarted when interval != 0
130 | */ | 130 | */ |
131 | int it_real_fn(struct hrtimer *timer) | 131 | enum hrtimer_restart it_real_fn(struct hrtimer *timer) |
132 | { | 132 | { |
133 | struct signal_struct *sig = | 133 | struct signal_struct *sig = |
134 | container_of(timer, struct signal_struct, real_timer); | 134 | container_of(timer, struct signal_struct, real_timer); |
135 | 135 | ||
136 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); | 136 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); |
137 | 137 | ||
138 | if (sig->it_real_incr.tv64 != 0) { | ||
139 | hrtimer_forward(timer, timer->base->softirq_time, | ||
140 | sig->it_real_incr); | ||
141 | return HRTIMER_RESTART; | ||
142 | } | ||
143 | return HRTIMER_NORESTART; | 138 | return HRTIMER_NORESTART; |
144 | } | 139 | } |
145 | 140 | ||
@@ -231,11 +226,14 @@ again: | |||
231 | spin_unlock_irq(&tsk->sighand->siglock); | 226 | spin_unlock_irq(&tsk->sighand->siglock); |
232 | goto again; | 227 | goto again; |
233 | } | 228 | } |
234 | tsk->signal->it_real_incr = | ||
235 | timeval_to_ktime(value->it_interval); | ||
236 | expires = timeval_to_ktime(value->it_value); | 229 | expires = timeval_to_ktime(value->it_value); |
237 | if (expires.tv64 != 0) | 230 | if (expires.tv64 != 0) { |
238 | hrtimer_start(timer, expires, HRTIMER_REL); | 231 | tsk->signal->it_real_incr = |
232 | timeval_to_ktime(value->it_interval); | ||
233 | hrtimer_start(timer, expires, HRTIMER_MODE_REL); | ||
234 | } else | ||
235 | tsk->signal->it_real_incr.tv64 = 0; | ||
236 | |||
239 | spin_unlock_irq(&tsk->sighand->siglock); | 237 | spin_unlock_irq(&tsk->sighand->siglock); |
240 | break; | 238 | break; |
241 | case ITIMER_VIRTUAL: | 239 | case ITIMER_VIRTUAL: |
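it_real_fn() now returns enum hrtimer_restart and always answers HRTIMER_NORESTART: the periodic re-arm moves out of the callback and into SIGALRM delivery (see the signal.c hunk below), and sys_setitimer() records it_real_incr only when a timer is actually armed. Userspace semantics are unchanged; a minimal program exercising the periodic path:

	#include <signal.h>
	#include <sys/time.h>
	#include <unistd.h>

	static void on_alarm(int sig)
	{
		(void)sig;
		write(1, ".", 1);	/* async-signal-safe */
	}

	int main(void)
	{
		struct itimerval it = {
			.it_interval = { 0, 500000 },	/* 500 ms period */
			.it_value    = { 0, 500000 },
		};

		signal(SIGALRM, on_alarm);
		setitimer(ITIMER_REAL, &it, NULL);
		for (;;)
			pause();	/* one dot per SIGALRM */
	}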
diff --git a/kernel/kmod.c b/kernel/kmod.c index 3a7379aa31ca..796276141e51 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -217,7 +217,10 @@ static int wait_for_helper(void *data) | |||
217 | sub_info->retval = ret; | 217 | sub_info->retval = ret; |
218 | } | 218 | } |
219 | 219 | ||
220 | complete(sub_info->complete); | 220 | if (sub_info->wait < 0) |
221 | kfree(sub_info); | ||
222 | else | ||
223 | complete(sub_info->complete); | ||
221 | return 0; | 224 | return 0; |
222 | } | 225 | } |
223 | 226 | ||
@@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work) | |||
239 | pid = kernel_thread(____call_usermodehelper, sub_info, | 242 | pid = kernel_thread(____call_usermodehelper, sub_info, |
240 | CLONE_VFORK | SIGCHLD); | 243 | CLONE_VFORK | SIGCHLD); |
241 | 244 | ||
245 | if (wait < 0) | ||
246 | return; | ||
247 | |||
242 | if (pid < 0) { | 248 | if (pid < 0) { |
243 | sub_info->retval = pid; | 249 | sub_info->retval = pid; |
244 | complete(sub_info->complete); | 250 | complete(sub_info->complete); |
@@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work) | |||
253 | * @envp: null-terminated environment list | 259 | * @envp: null-terminated environment list |
254 | * @session_keyring: session keyring for process (NULL for an empty keyring) | 260 | * @session_keyring: session keyring for process (NULL for an empty keyring) |
255 | * @wait: wait for the application to finish and return status. | 261 | * @wait: wait for the application to finish and return status. |
262 | * when -1, don't wait at all, but you get no useful error back when | ||
263 | * the program couldn't be exec'ed. This makes it safe to call | ||
264 | * from interrupt context. | ||
256 | * | 265 | * |
257 | * Runs a user-space application. The application is started | 266 | * Runs a user-space application. The application is started |
258 | * asynchronously if wait is not set, and runs as a child of keventd. | 267 | * asynchronously if wait is not set, and runs as a child of keventd. |
@@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
265 | struct key *session_keyring, int wait) | 274 | struct key *session_keyring, int wait) |
266 | { | 275 | { |
267 | DECLARE_COMPLETION_ONSTACK(done); | 276 | DECLARE_COMPLETION_ONSTACK(done); |
268 | struct subprocess_info sub_info = { | 277 | struct subprocess_info *sub_info; |
269 | .work = __WORK_INITIALIZER(sub_info.work, | 278 | int retval; |
270 | __call_usermodehelper), | ||
271 | .complete = &done, | ||
272 | .path = path, | ||
273 | .argv = argv, | ||
274 | .envp = envp, | ||
275 | .ring = session_keyring, | ||
276 | .wait = wait, | ||
277 | .retval = 0, | ||
278 | }; | ||
279 | 279 | ||
280 | if (!khelper_wq) | 280 | if (!khelper_wq) |
281 | return -EBUSY; | 281 | return -EBUSY; |
@@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
283 | if (path[0] == '\0') | 283 | if (path[0] == '\0') |
284 | return 0; | 284 | return 0; |
285 | 285 | ||
286 | queue_work(khelper_wq, &sub_info.work); | 286 | sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); |
287 | if (!sub_info) | ||
288 | return -ENOMEM; | ||
289 | |||
290 | INIT_WORK(&sub_info->work, __call_usermodehelper); | ||
291 | sub_info->complete = &done; | ||
292 | sub_info->path = path; | ||
293 | sub_info->argv = argv; | ||
294 | sub_info->envp = envp; | ||
295 | sub_info->ring = session_keyring; | ||
296 | sub_info->wait = wait; | ||
297 | |||
298 | queue_work(khelper_wq, &sub_info->work); | ||
299 | if (wait < 0) /* task has freed sub_info */ | ||
300 | return 0; | ||
287 | wait_for_completion(&done); | 301 | wait_for_completion(&done); |
288 | return sub_info.retval; | 302 | retval = sub_info->retval; |
303 | kfree(sub_info); | ||
304 | return retval; | ||
289 | } | 305 | } |
290 | EXPORT_SYMBOL(call_usermodehelper_keys); | 306 | EXPORT_SYMBOL(call_usermodehelper_keys); |
291 | 307 | ||
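kmod.c moves subprocess_info from the caller's stack to the heap so a wait < 0 caller can return as soon as the work is queued; whichever side finishes last (wait_for_helper() for wait < 0, otherwise the caller) frees it. That makes fire-and-forget helper invocations safe from contexts that cannot block on the completion, at the price of losing the exit status. An illustrative caller (path and names hypothetical):

	#include <linux/kmod.h>

	static int poke_agent(void)
	{
		char *argv[] = { "/sbin/my-agent", "add", NULL };
		char *envp[] = { "HOME=/", "PATH=/sbin:/bin", NULL };

		/* wait = -1: returns once queued; no useful status. */
		return call_usermodehelper_keys("/sbin/my-agent", argv,
						envp, NULL, -1);
	}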
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 88fc611b3ae9..58f35e586ee3 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -10,7 +10,6 @@ | |||
10 | * Code for /proc/lockdep and /proc/lockdep_stats: | 10 | * Code for /proc/lockdep and /proc/lockdep_stats: |
11 | * | 11 | * |
12 | */ | 12 | */ |
13 | #include <linux/sched.h> | ||
14 | #include <linux/module.h> | 13 | #include <linux/module.h> |
15 | #include <linux/proc_fs.h> | 14 | #include <linux/proc_fs.h> |
16 | #include <linux/seq_file.h> | 15 | #include <linux/seq_file.h> |
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 841539d72c55..d17436cdea1b 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -13,7 +13,6 @@ | |||
13 | * Released under the General Public License (GPL). | 13 | * Released under the General Public License (GPL). |
14 | */ | 14 | */ |
15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
16 | #include <linux/sched.h> | ||
17 | #include <linux/delay.h> | 16 | #include <linux/delay.h> |
18 | #include <linux/module.h> | 17 | #include <linux/module.h> |
19 | #include <linux/poison.h> | 18 | #include <linux/poison.h> |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 7c3e1e6dfb5b..657f77697415 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | |||
304 | * should be able to see it. | 304 | * should be able to see it. |
305 | */ | 305 | */ |
306 | struct task_struct *p; | 306 | struct task_struct *p; |
307 | read_lock(&tasklist_lock); | 307 | rcu_read_lock(); |
308 | p = find_task_by_pid(pid); | 308 | p = find_task_by_pid(pid); |
309 | if (p) { | 309 | if (p) { |
310 | if (CPUCLOCK_PERTHREAD(which_clock)) { | 310 | if (CPUCLOCK_PERTHREAD(which_clock)) { |
@@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | |||
312 | error = cpu_clock_sample(which_clock, | 312 | error = cpu_clock_sample(which_clock, |
313 | p, &rtn); | 313 | p, &rtn); |
314 | } | 314 | } |
315 | } else if (p->tgid == pid && p->signal) { | 315 | } else { |
316 | error = cpu_clock_sample_group(which_clock, | 316 | read_lock(&tasklist_lock); |
317 | p, &rtn); | 317 | if (p->tgid == pid && p->signal) { |
318 | error = | ||
319 | cpu_clock_sample_group(which_clock, | ||
320 | p, &rtn); | ||
321 | } | ||
322 | read_unlock(&tasklist_lock); | ||
318 | } | 323 | } |
319 | } | 324 | } |
320 | read_unlock(&tasklist_lock); | 325 | rcu_read_unlock(); |
321 | } | 326 | } |
322 | 327 | ||
323 | if (error) | 328 | if (error) |
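posix_cpu_clock_get() stops taking the global tasklist_lock for the lookup itself: rcu_read_lock() is enough to keep the task_struct from vanishing under find_task_by_pid(), and the read lock shrinks to the process-wide branch, where ->signal must stay valid across the sample. The resulting shape, condensed from the hunk (p, pid, error, rtn and which_clock are the surrounding function's variables; the per-thread access check sits in the elided context):

	rcu_read_lock();
	p = find_task_by_pid(pid);
	if (p) {
		if (CPUCLOCK_PERTHREAD(which_clock)) {
			error = cpu_clock_sample(which_clock, p, &rtn);
		} else {
			read_lock(&tasklist_lock);  /* group case only */
			if (p->tgid == pid && p->signal)
				error = cpu_clock_sample_group(which_clock,
							       p, &rtn);
			read_unlock(&tasklist_lock);
		}
	}
	rcu_read_unlock();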
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a1bf61617839..44318ca71978 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int, | |||
145 | struct itimerspec *, struct itimerspec *); | 145 | struct itimerspec *, struct itimerspec *); |
146 | static int common_timer_del(struct k_itimer *timer); | 146 | static int common_timer_del(struct k_itimer *timer); |
147 | 147 | ||
148 | static int posix_timer_fn(struct hrtimer *data); | 148 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); |
149 | 149 | ||
150 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); | 150 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); |
151 | 151 | ||
@@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event); | |||
334 | 334 | ||
335 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. | 335 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. |
336 | */ | 336 | */ |
337 | static int posix_timer_fn(struct hrtimer *timer) | 337 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) |
338 | { | 338 | { |
339 | struct k_itimer *timr; | 339 | struct k_itimer *timr; |
340 | unsigned long flags; | 340 | unsigned long flags; |
341 | int si_private = 0; | 341 | int si_private = 0; |
342 | int ret = HRTIMER_NORESTART; | 342 | enum hrtimer_restart ret = HRTIMER_NORESTART; |
343 | 343 | ||
344 | timr = container_of(timer, struct k_itimer, it.real.timer); | 344 | timr = container_of(timer, struct k_itimer, it.real.timer); |
345 | spin_lock_irqsave(&timr->it_lock, flags); | 345 | spin_lock_irqsave(&timr->it_lock, flags); |
@@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer) | |||
356 | if (timr->it.real.interval.tv64 != 0) { | 356 | if (timr->it.real.interval.tv64 != 0) { |
357 | timr->it_overrun += | 357 | timr->it_overrun += |
358 | hrtimer_forward(timer, | 358 | hrtimer_forward(timer, |
359 | timer->base->softirq_time, | 359 | hrtimer_cb_get_time(timer), |
360 | timr->it.real.interval); | 360 | timr->it.real.interval); |
361 | ret = HRTIMER_RESTART; | 361 | ret = HRTIMER_RESTART; |
362 | ++timr->it_requeue_pending; | 362 | ++timr->it_requeue_pending; |
@@ -722,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags, | |||
722 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) | 722 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) |
723 | return 0; | 723 | return 0; |
724 | 724 | ||
725 | mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; | 725 | mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; |
726 | hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); | 726 | hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); |
727 | timr->it.real.timer.function = posix_timer_fn; | 727 | timr->it.real.timer.function = posix_timer_fn; |
728 | 728 | ||
@@ -734,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags, | |||
734 | /* SIGEV_NONE timers are not queued ! See common_timer_get */ | 734 | /* SIGEV_NONE timers are not queued ! See common_timer_get */ |
735 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { | 735 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { |
736 | /* Setup correct expiry time for relative timers */ | 736 | /* Setup correct expiry time for relative timers */ |
737 | if (mode == HRTIMER_REL) | 737 | if (mode == HRTIMER_MODE_REL) |
738 | timer->expires = ktime_add(timer->expires, | 738 | timer->expires = ktime_add(timer->expires, |
739 | timer->base->get_time()); | 739 | timer->base->get_time()); |
740 | return 0; | 740 | return 0; |
@@ -950,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags, | |||
950 | struct timespec *tsave, struct timespec __user *rmtp) | 950 | struct timespec *tsave, struct timespec __user *rmtp) |
951 | { | 951 | { |
952 | return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? | 952 | return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? |
953 | HRTIMER_ABS : HRTIMER_REL, which_clock); | 953 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, |
954 | which_clock); | ||
954 | } | 955 | } |
955 | 956 | ||
956 | asmlinkage long | 957 | asmlinkage long |
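Three mechanical changes run through posix-timers.c: callbacks return enum hrtimer_restart, HRTIMER_ABS/HRTIMER_REL become HRTIMER_MODE_ABS/HRTIMER_MODE_REL, and self-rearming callbacks read the current time through hrtimer_cb_get_time() rather than the removed base->softirq_time (callbacks may now run from hard interrupt context under high-resolution timers, so a softirq-time snapshot no longer exists). The resulting periodic-callback idiom, with hypothetical names:

	#include <linux/hrtimer.h>

	static ktime_t my_interval;	/* period, set elsewhere (assumed) */

	static enum hrtimer_restart my_periodic_fn(struct hrtimer *timer)
	{
		unsigned long overruns;

		overruns = hrtimer_forward(timer, hrtimer_cb_get_time(timer),
					   my_interval);
		/* account 'overruns' missed periods here */
		return HRTIMER_RESTART;
	}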
diff --git a/kernel/resource.c b/kernel/resource.c index 2a3f88636580..bdb55a33f969 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -8,7 +8,6 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/sched.h> | ||
12 | #include <linux/errno.h> | 11 | #include <linux/errno.h> |
13 | #include <linux/ioport.h> | 12 | #include <linux/ioport.h> |
14 | #include <linux/init.h> | 13 | #include <linux/init.h> |
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 4ab17da46fd8..180978cb2f75 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
625 | /* Setup the timer, when timeout != NULL */ | 625 | /* Setup the timer, when timeout != NULL */ |
626 | if (unlikely(timeout)) | 626 | if (unlikely(timeout)) |
627 | hrtimer_start(&timeout->timer, timeout->timer.expires, | 627 | hrtimer_start(&timeout->timer, timeout->timer.expires, |
628 | HRTIMER_ABS); | 628 | HRTIMER_MODE_ABS); |
629 | 629 | ||
630 | for (;;) { | 630 | for (;;) { |
631 | /* Try to acquire the lock: */ | 631 | /* Try to acquire the lock: */ |
diff --git a/kernel/sched.c b/kernel/sched.c index 08f86178aa34..0dc757246d89 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -1853,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
1853 | struct mm_struct *mm = next->mm; | 1853 | struct mm_struct *mm = next->mm; |
1854 | struct mm_struct *oldmm = prev->active_mm; | 1854 | struct mm_struct *oldmm = prev->active_mm; |
1855 | 1855 | ||
1856 | /* | ||
1857 | * For paravirt, this is coupled with an exit in switch_to to | ||
1858 | * combine the page table reload and the switch backend into | ||
1859 | * one hypercall. | ||
1860 | */ | ||
1861 | arch_enter_lazy_cpu_mode(); | ||
1862 | |||
1856 | if (!mm) { | 1863 | if (!mm) { |
1857 | next->active_mm = oldmm; | 1864 | next->active_mm = oldmm; |
1858 | atomic_inc(&oldmm->mm_count); | 1865 | atomic_inc(&oldmm->mm_count); |
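The lone scheduler hunk is a paravirtualization hook: arch_enter_lazy_cpu_mode() lets a hypervisor backend queue the page-table reload and the CPU-state updates done in switch_to(), flushing them as one hypercall at the matching exit. On bare metal it must compile to nothing; a sketch of the expected no-op fallback:

	/* Default when the architecture defines nothing (sketch). */
	#ifndef arch_enter_lazy_cpu_mode
	#define arch_enter_lazy_cpu_mode()	do { } while (0)
	#endif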
diff --git a/kernel/signal.c b/kernel/signal.c index 8072e568bbe0..e2a7d4bf7d57 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | |||
456 | int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | 456 | int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) |
457 | { | 457 | { |
458 | int signr = __dequeue_signal(&tsk->pending, mask, info); | 458 | int signr = __dequeue_signal(&tsk->pending, mask, info); |
459 | if (!signr) | 459 | if (!signr) { |
460 | signr = __dequeue_signal(&tsk->signal->shared_pending, | 460 | signr = __dequeue_signal(&tsk->signal->shared_pending, |
461 | mask, info); | 461 | mask, info); |
462 | /* | ||
463 | * itimer signal ? | ||
464 | * | ||
465 | * itimers are process shared and we restart periodic | ||
466 | * itimers in the signal delivery path to prevent DoS | ||
467 | * attacks in the high resolution timer case. This is | ||
468 | * compliant with the old way of self restarting | ||
469 | * itimers, as the SIGALRM is a legacy signal and only | ||
470 | * queued once. Changing the restart behaviour to | ||
471 | * restart the timer in the signal dequeue path is | ||
472 | * reducing the timer noise on heavy loaded !highres | ||
473 | * systems too. | ||
474 | */ | ||
475 | if (unlikely(signr == SIGALRM)) { | ||
476 | struct hrtimer *tmr = &tsk->signal->real_timer; | ||
477 | |||
478 | if (!hrtimer_is_queued(tmr) && | ||
479 | tsk->signal->it_real_incr.tv64 != 0) { | ||
480 | hrtimer_forward(tmr, tmr->base->get_time(), | ||
481 | tsk->signal->it_real_incr); | ||
482 | hrtimer_restart(tmr); | ||
483 | } | ||
484 | } | ||
485 | } | ||
462 | recalc_sigpending_tsk(tsk); | 486 | recalc_sigpending_tsk(tsk); |
463 | if (signr && unlikely(sig_kernel_stop(signr))) { | 487 | if (signr && unlikely(sig_kernel_stop(signr))) { |
464 | /* | 488 | /* |
465 | * Set a marker that we have dequeued a stop signal. Our | 489 | * Set a marker that we have dequeued a stop signal. Our |
466 | * caller might release the siglock and then the pending | 490 | * caller might release the siglock and then the pending |
467 | * stop signal it is about to process is no longer in the | 491 | * stop signal it is about to process is no longer in the |
468 | * pending bitmasks, but must still be cleared by a SIGCONT | 492 | * pending bitmasks, but must still be cleared by a SIGCONT |
469 | * (and overruled by a SIGKILL). So those cases clear this | 493 | * (and overruled by a SIGKILL). So those cases clear this |
470 | * shared flag after we've set it. Note that this flag may | 494 | * shared flag after we've set it. Note that this flag may |
471 | * remain set after the signal we return is ignored or | 495 | * remain set after the signal we return is ignored or |
472 | * handled. That doesn't matter because its only purpose | 496 | * handled. That doesn't matter because its only purpose |
473 | * is to alert stop-signal processing code when another | 497 | * is to alert stop-signal processing code when another |
474 | * processor has come along and cleared the flag. | 498 | * processor has come along and cleared the flag. |
475 | */ | 499 | */ |
476 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) | 500 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) |
477 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 501 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; |
478 | } | 502 | } |
479 | if ( signr && | 503 | if ( signr && |
480 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | 504 | ((info->si_code & __SI_MASK) == __SI_TIMER) && |
481 | info->si_sys_private){ | 505 | info->si_sys_private){ |
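The dequeue_signal() hunk is the other half of the itimer change: when SIGALRM is dequeued and the periodic hrtimer is not queued, the timer is forwarded past now and restarted. Because SIGALRM is a legacy signal queued at most once, re-arming at dequeue time bounds how fast even a pathologically small it_real_incr can fire. hrtimer_forward() behaves roughly like this loop (illustrative only; the real implementation divides instead of iterating across large gaps):

	#include <linux/hrtimer.h>

	static unsigned long forward_sketch(struct hrtimer *t, ktime_t now,
					    ktime_t period)
	{
		unsigned long skipped = 0;

		/* Push the expiry forward whole periods at a time until
		 * it lies in the future; missed periods collapse into
		 * a single queued signal. */
		while (t->expires.tv64 <= now.tv64) {
			t->expires = ktime_add(t->expires, period);
			skipped++;
		}
		return skipped;
	}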
diff --git a/kernel/softirq.c b/kernel/softirq.c index 918e52df090e..8b75008e2bd8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
18 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
20 | #include <linux/tick.h> | ||
20 | 21 | ||
21 | #include <asm/irq.h> | 22 | #include <asm/irq.h> |
22 | /* | 23 | /* |
@@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq); | |||
273 | 274 | ||
274 | #endif | 275 | #endif |
275 | 276 | ||
277 | /* | ||
278 | * Enter an interrupt context. | ||
279 | */ | ||
280 | void irq_enter(void) | ||
281 | { | ||
282 | __irq_enter(); | ||
283 | #ifdef CONFIG_NO_HZ | ||
284 | if (idle_cpu(smp_processor_id())) | ||
285 | tick_nohz_update_jiffies(); | ||
286 | #endif | ||
287 | } | ||
288 | |||
276 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | 289 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED |
277 | # define invoke_softirq() __do_softirq() | 290 | # define invoke_softirq() __do_softirq() |
278 | #else | 291 | #else |
@@ -289,6 +302,12 @@ void irq_exit(void) | |||
289 | sub_preempt_count(IRQ_EXIT_OFFSET); | 302 | sub_preempt_count(IRQ_EXIT_OFFSET); |
290 | if (!in_interrupt() && local_softirq_pending()) | 303 | if (!in_interrupt() && local_softirq_pending()) |
291 | invoke_softirq(); | 304 | invoke_softirq(); |
305 | |||
306 | #ifdef CONFIG_NO_HZ | ||
307 | /* Make sure that timer wheel updates are propagated */ | ||
308 | if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) | ||
309 | tick_nohz_stop_sched_tick(); | ||
310 | #endif | ||
292 | preempt_enable_no_resched(); | 311 | preempt_enable_no_resched(); |
293 | } | 312 | } |
294 | 313 | ||
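softirq.c picks up the dynticks glue: irq_enter() re-syncs jiffies when an interrupt wakes an idle CPU whose tick is stopped, and irq_exit() stops the tick again when the CPU returns to idle with no pending work. Both hooks assume an idle loop of roughly this shape (a sketch; the real loops live in each architecture's cpu_idle(), and safe_halt() stands in for the arch idle instruction):

	while (1) {
		tick_nohz_stop_sched_tick();	/* may switch the tick off */
		while (!need_resched())
			safe_halt();	/* irq_enter() fixes up jiffies
					 * if we slept tickless */
		tick_nohz_restart_sched_tick();
		schedule();
	}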
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e0ac6cd79fcf..3ca1d5ff0319 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -90,12 +90,6 @@ extern char modprobe_path[]; | |||
90 | #ifdef CONFIG_CHR_DEV_SG | 90 | #ifdef CONFIG_CHR_DEV_SG |
91 | extern int sg_big_buff; | 91 | extern int sg_big_buff; |
92 | #endif | 92 | #endif |
93 | #ifdef CONFIG_SYSVIPC | ||
94 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, | ||
95 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
96 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, | ||
97 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
98 | #endif | ||
99 | 93 | ||
100 | #ifdef __sparc__ | 94 | #ifdef __sparc__ |
101 | extern char reboot_command []; | 95 | extern char reboot_command []; |
@@ -135,18 +129,6 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, | |||
135 | void __user *, size_t, ctl_table *); | 129 | void __user *, size_t, ctl_table *); |
136 | #endif | 130 | #endif |
137 | 131 | ||
138 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
139 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
140 | |||
141 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
142 | void __user *oldval, size_t __user *oldlenp, | ||
143 | void __user *newval, size_t newlen); | ||
144 | |||
145 | #ifdef CONFIG_SYSVIPC | ||
146 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
147 | void __user *oldval, size_t __user *oldlenp, | ||
148 | void __user *newval, size_t newlen); | ||
149 | #endif | ||
150 | 132 | ||
151 | #ifdef CONFIG_PROC_SYSCTL | 133 | #ifdef CONFIG_PROC_SYSCTL |
152 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 134 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
@@ -177,60 +159,6 @@ int sysctl_legacy_va_layout; | |||
177 | #endif | 159 | #endif |
178 | 160 | ||
179 | 161 | ||
180 | static void *get_uts(ctl_table *table, int write) | ||
181 | { | ||
182 | char *which = table->data; | ||
183 | #ifdef CONFIG_UTS_NS | ||
184 | struct uts_namespace *uts_ns = current->nsproxy->uts_ns; | ||
185 | which = (which - (char *)&init_uts_ns) + (char *)uts_ns; | ||
186 | #endif | ||
187 | if (!write) | ||
188 | down_read(&uts_sem); | ||
189 | else | ||
190 | down_write(&uts_sem); | ||
191 | return which; | ||
192 | } | ||
193 | |||
194 | static void put_uts(ctl_table *table, int write, void *which) | ||
195 | { | ||
196 | if (!write) | ||
197 | up_read(&uts_sem); | ||
198 | else | ||
199 | up_write(&uts_sem); | ||
200 | } | ||
201 | |||
202 | #ifdef CONFIG_SYSVIPC | ||
203 | static void *get_ipc(ctl_table *table, int write) | ||
204 | { | ||
205 | char *which = table->data; | ||
206 | struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; | ||
207 | which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; | ||
208 | return which; | ||
209 | } | ||
210 | #else | ||
211 | #define get_ipc(T,W) ((T)->data) | ||
212 | #endif | ||
213 | |||
214 | /* /proc declarations: */ | ||
215 | |||
216 | #ifdef CONFIG_PROC_SYSCTL | ||
217 | |||
218 | static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); | ||
219 | static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); | ||
220 | static int proc_opensys(struct inode *, struct file *); | ||
221 | |||
222 | const struct file_operations proc_sys_file_operations = { | ||
223 | .open = proc_opensys, | ||
224 | .read = proc_readsys, | ||
225 | .write = proc_writesys, | ||
226 | }; | ||
227 | |||
228 | extern struct proc_dir_entry *proc_sys_root; | ||
229 | |||
230 | static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *); | ||
231 | static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); | ||
232 | #endif | ||
233 | |||
234 | /* The default sysctl tables: */ | 162 | /* The default sysctl tables: */ |
235 | 163 | ||
236 | static ctl_table root_table[] = { | 164 | static ctl_table root_table[] = { |
@@ -278,51 +206,6 @@ static ctl_table root_table[] = { | |||
278 | 206 | ||
279 | static ctl_table kern_table[] = { | 207 | static ctl_table kern_table[] = { |
280 | { | 208 | { |
281 | .ctl_name = KERN_OSTYPE, | ||
282 | .procname = "ostype", | ||
283 | .data = init_uts_ns.name.sysname, | ||
284 | .maxlen = sizeof(init_uts_ns.name.sysname), | ||
285 | .mode = 0444, | ||
286 | .proc_handler = &proc_do_uts_string, | ||
287 | .strategy = &sysctl_uts_string, | ||
288 | }, | ||
289 | { | ||
290 | .ctl_name = KERN_OSRELEASE, | ||
291 | .procname = "osrelease", | ||
292 | .data = init_uts_ns.name.release, | ||
293 | .maxlen = sizeof(init_uts_ns.name.release), | ||
294 | .mode = 0444, | ||
295 | .proc_handler = &proc_do_uts_string, | ||
296 | .strategy = &sysctl_uts_string, | ||
297 | }, | ||
298 | { | ||
299 | .ctl_name = KERN_VERSION, | ||
300 | .procname = "version", | ||
301 | .data = init_uts_ns.name.version, | ||
302 | .maxlen = sizeof(init_uts_ns.name.version), | ||
303 | .mode = 0444, | ||
304 | .proc_handler = &proc_do_uts_string, | ||
305 | .strategy = &sysctl_uts_string, | ||
306 | }, | ||
307 | { | ||
308 | .ctl_name = KERN_NODENAME, | ||
309 | .procname = "hostname", | ||
310 | .data = init_uts_ns.name.nodename, | ||
311 | .maxlen = sizeof(init_uts_ns.name.nodename), | ||
312 | .mode = 0644, | ||
313 | .proc_handler = &proc_do_uts_string, | ||
314 | .strategy = &sysctl_uts_string, | ||
315 | }, | ||
316 | { | ||
317 | .ctl_name = KERN_DOMAINNAME, | ||
318 | .procname = "domainname", | ||
319 | .data = init_uts_ns.name.domainname, | ||
320 | .maxlen = sizeof(init_uts_ns.name.domainname), | ||
321 | .mode = 0644, | ||
322 | .proc_handler = &proc_do_uts_string, | ||
323 | .strategy = &sysctl_uts_string, | ||
324 | }, | ||
325 | { | ||
326 | .ctl_name = KERN_PANIC, | 209 | .ctl_name = KERN_PANIC, |
327 | .procname = "panic", | 210 | .procname = "panic", |
328 | .data = &panic_timeout, | 211 | .data = &panic_timeout, |
@@ -478,71 +361,6 @@ static ctl_table kern_table[] = { | |||
478 | .proc_handler = &proc_dointvec, | 361 | .proc_handler = &proc_dointvec, |
479 | }, | 362 | }, |
480 | #endif | 363 | #endif |
481 | #ifdef CONFIG_SYSVIPC | ||
482 | { | ||
483 | .ctl_name = KERN_SHMMAX, | ||
484 | .procname = "shmmax", | ||
485 | .data = &init_ipc_ns.shm_ctlmax, | ||
486 | .maxlen = sizeof (init_ipc_ns.shm_ctlmax), | ||
487 | .mode = 0644, | ||
488 | .proc_handler = &proc_ipc_doulongvec_minmax, | ||
489 | .strategy = sysctl_ipc_data, | ||
490 | }, | ||
491 | { | ||
492 | .ctl_name = KERN_SHMALL, | ||
493 | .procname = "shmall", | ||
494 | .data = &init_ipc_ns.shm_ctlall, | ||
495 | .maxlen = sizeof (init_ipc_ns.shm_ctlall), | ||
496 | .mode = 0644, | ||
497 | .proc_handler = &proc_ipc_doulongvec_minmax, | ||
498 | .strategy = sysctl_ipc_data, | ||
499 | }, | ||
500 | { | ||
501 | .ctl_name = KERN_SHMMNI, | ||
502 | .procname = "shmmni", | ||
503 | .data = &init_ipc_ns.shm_ctlmni, | ||
504 | .maxlen = sizeof (init_ipc_ns.shm_ctlmni), | ||
505 | .mode = 0644, | ||
506 | .proc_handler = &proc_ipc_dointvec, | ||
507 | .strategy = sysctl_ipc_data, | ||
508 | }, | ||
509 | { | ||
510 | .ctl_name = KERN_MSGMAX, | ||
511 | .procname = "msgmax", | ||
512 | .data = &init_ipc_ns.msg_ctlmax, | ||
513 | .maxlen = sizeof (init_ipc_ns.msg_ctlmax), | ||
514 | .mode = 0644, | ||
515 | .proc_handler = &proc_ipc_dointvec, | ||
516 | .strategy = sysctl_ipc_data, | ||
517 | }, | ||
518 | { | ||
519 | .ctl_name = KERN_MSGMNI, | ||
520 | .procname = "msgmni", | ||
521 | .data = &init_ipc_ns.msg_ctlmni, | ||
522 | .maxlen = sizeof (init_ipc_ns.msg_ctlmni), | ||
523 | .mode = 0644, | ||
524 | .proc_handler = &proc_ipc_dointvec, | ||
525 | .strategy = sysctl_ipc_data, | ||
526 | }, | ||
527 | { | ||
528 | .ctl_name = KERN_MSGMNB, | ||
529 | .procname = "msgmnb", | ||
530 | .data = &init_ipc_ns.msg_ctlmnb, | ||
531 | .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), | ||
532 | .mode = 0644, | ||
533 | .proc_handler = &proc_ipc_dointvec, | ||
534 | .strategy = sysctl_ipc_data, | ||
535 | }, | ||
536 | { | ||
537 | .ctl_name = KERN_SEM, | ||
538 | .procname = "sem", | ||
539 | .data = &init_ipc_ns.sem_ctls, | ||
540 | .maxlen = 4*sizeof (int), | ||
541 | .mode = 0644, | ||
542 | .proc_handler = &proc_ipc_dointvec, | ||
543 | .strategy = sysctl_ipc_data, | ||
544 | }, | ||
545 | #endif | ||
546 | #ifdef CONFIG_MAGIC_SYSRQ | 364 | #ifdef CONFIG_MAGIC_SYSRQ |
547 | { | 365 | { |
548 | .ctl_name = KERN_SYSRQ, | 366 | .ctl_name = KERN_SYSRQ, |
@@ -1043,6 +861,12 @@ static ctl_table vm_table[] = { | |||
1043 | { .ctl_name = 0 } | 861 | { .ctl_name = 0 } |
1044 | }; | 862 | }; |
1045 | 863 | ||
864 | #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) | ||
865 | static ctl_table binfmt_misc_table[] = { | ||
866 | { .ctl_name = 0 } | ||
867 | }; | ||
868 | #endif | ||
869 | |||
1046 | static ctl_table fs_table[] = { | 870 | static ctl_table fs_table[] = { |
1047 | { | 871 | { |
1048 | .ctl_name = FS_NRINODE, | 872 | .ctl_name = FS_NRINODE, |
@@ -1166,6 +990,14 @@ static ctl_table fs_table[] = { | |||
1166 | .mode = 0644, | 990 | .mode = 0644, |
1167 | .proc_handler = &proc_dointvec, | 991 | .proc_handler = &proc_dointvec, |
1168 | }, | 992 | }, |
993 | #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) | ||
994 | { | ||
995 | .ctl_name = CTL_UNNUMBERED, | ||
996 | .procname = "binfmt_misc", | ||
997 | .mode = 0555, | ||
998 | .child = binfmt_misc_table, | ||
999 | }, | ||
1000 | #endif | ||
1169 | { .ctl_name = 0 } | 1001 | { .ctl_name = 0 } |
1170 | }; | 1002 | }; |
1171 | 1003 | ||
@@ -1177,8 +1009,6 @@ static ctl_table dev_table[] = { | |||
1177 | { .ctl_name = 0 } | 1009 | { .ctl_name = 0 } |
1178 | }; | 1010 | }; |
1179 | 1011 | ||
1180 | extern void init_irq_proc (void); | ||
1181 | |||
1182 | static DEFINE_SPINLOCK(sysctl_lock); | 1012 | static DEFINE_SPINLOCK(sysctl_lock); |
1183 | 1013 | ||
1184 | /* called under sysctl_lock */ | 1014 | /* called under sysctl_lock */ |
@@ -1220,19 +1050,47 @@ static void start_unregistering(struct ctl_table_header *p) | |||
1220 | list_del_init(&p->ctl_entry); | 1050 | list_del_init(&p->ctl_entry); |
1221 | } | 1051 | } |
1222 | 1052 | ||
1223 | void __init sysctl_init(void) | 1053 | void sysctl_head_finish(struct ctl_table_header *head) |
1224 | { | 1054 | { |
1225 | #ifdef CONFIG_PROC_SYSCTL | 1055 | if (!head) |
1226 | register_proc_table(root_table, proc_sys_root, &root_table_header); | 1056 | return; |
1227 | init_irq_proc(); | 1057 | spin_lock(&sysctl_lock); |
1228 | #endif | 1058 | unuse_table(head); |
1059 | spin_unlock(&sysctl_lock); | ||
1060 | } | ||
1061 | |||
1062 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | ||
1063 | { | ||
1064 | struct ctl_table_header *head; | ||
1065 | struct list_head *tmp; | ||
1066 | spin_lock(&sysctl_lock); | ||
1067 | if (prev) { | ||
1068 | tmp = &prev->ctl_entry; | ||
1069 | unuse_table(prev); | ||
1070 | goto next; | ||
1071 | } | ||
1072 | tmp = &root_table_header.ctl_entry; | ||
1073 | for (;;) { | ||
1074 | head = list_entry(tmp, struct ctl_table_header, ctl_entry); | ||
1075 | |||
1076 | if (!use_table(head)) | ||
1077 | goto next; | ||
1078 | spin_unlock(&sysctl_lock); | ||
1079 | return head; | ||
1080 | next: | ||
1081 | tmp = tmp->next; | ||
1082 | if (tmp == &root_table_header.ctl_entry) | ||
1083 | break; | ||
1084 | } | ||
1085 | spin_unlock(&sysctl_lock); | ||
1086 | return NULL; | ||
1229 | } | 1087 | } |
1230 | 1088 | ||
1231 | #ifdef CONFIG_SYSCTL_SYSCALL | 1089 | #ifdef CONFIG_SYSCTL_SYSCALL |
1232 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, | 1090 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, |
1233 | void __user *newval, size_t newlen) | 1091 | void __user *newval, size_t newlen) |
1234 | { | 1092 | { |
1235 | struct list_head *tmp; | 1093 | struct ctl_table_header *head; |
1236 | int error = -ENOTDIR; | 1094 | int error = -ENOTDIR; |
1237 | 1095 | ||
1238 | if (nlen <= 0 || nlen >= CTL_MAXNAME) | 1096 | if (nlen <= 0 || nlen >= CTL_MAXNAME) |
@@ -1242,26 +1100,16 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
1242 | if (!oldlenp || get_user(old_len, oldlenp)) | 1100 | if (!oldlenp || get_user(old_len, oldlenp)) |
1243 | return -EFAULT; | 1101 | return -EFAULT; |
1244 | } | 1102 | } |
1245 | spin_lock(&sysctl_lock); | ||
1246 | tmp = &root_table_header.ctl_entry; | ||
1247 | do { | ||
1248 | struct ctl_table_header *head = | ||
1249 | list_entry(tmp, struct ctl_table_header, ctl_entry); | ||
1250 | |||
1251 | if (!use_table(head)) | ||
1252 | continue; | ||
1253 | |||
1254 | spin_unlock(&sysctl_lock); | ||
1255 | 1103 | ||
1104 | for (head = sysctl_head_next(NULL); head; | ||
1105 | head = sysctl_head_next(head)) { | ||
1256 | error = parse_table(name, nlen, oldval, oldlenp, | 1106 | error = parse_table(name, nlen, oldval, oldlenp, |
1257 | newval, newlen, head->ctl_table); | 1107 | newval, newlen, head->ctl_table); |
1258 | 1108 | if (error != -ENOTDIR) { | |
1259 | spin_lock(&sysctl_lock); | 1109 | sysctl_head_finish(head); |
1260 | unuse_table(head); | ||
1261 | if (error != -ENOTDIR) | ||
1262 | break; | 1110 | break; |
1263 | } while ((tmp = tmp->next) != &root_table_header.ctl_entry); | 1111 | } |
1264 | spin_unlock(&sysctl_lock); | 1112 | } |
1265 | return error; | 1113 | return error; |
1266 | } | 1114 | } |
1267 | 1115 | ||
@@ -1282,7 +1130,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) | |||
1282 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 1130 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
1283 | 1131 | ||
1284 | /* | 1132 | /* |
1285 | * ctl_perm does NOT grant the superuser all rights automatically, because | 1133 | * sysctl_perm does NOT grant the superuser all rights automatically, because |
1286 | * some sysctl variables are readonly even to root. | 1134 | * some sysctl variables are readonly even to root. |
1287 | */ | 1135 | */ |
1288 | 1136 | ||
@@ -1297,7 +1145,7 @@ static int test_perm(int mode, int op) | |||
1297 | return -EACCES; | 1145 | return -EACCES; |
1298 | } | 1146 | } |
1299 | 1147 | ||
1300 | static inline int ctl_perm(ctl_table *table, int op) | 1148 | int sysctl_perm(ctl_table *table, int op) |
1301 | { | 1149 | { |
1302 | int error; | 1150 | int error; |
1303 | error = security_sysctl(table, op); | 1151 | error = security_sysctl(table, op); |
@@ -1321,19 +1169,11 @@ repeat: | |||
1321 | for ( ; table->ctl_name || table->procname; table++) { | 1169 | for ( ; table->ctl_name || table->procname; table++) { |
1322 | if (!table->ctl_name) | 1170 | if (!table->ctl_name) |
1323 | continue; | 1171 | continue; |
1324 | if (n == table->ctl_name || table->ctl_name == CTL_ANY) { | 1172 | if (n == table->ctl_name) { |
1325 | int error; | 1173 | int error; |
1326 | if (table->child) { | 1174 | if (table->child) { |
1327 | if (ctl_perm(table, 001)) | 1175 | if (sysctl_perm(table, 001)) |
1328 | return -EPERM; | 1176 | return -EPERM; |
1329 | if (table->strategy) { | ||
1330 | error = table->strategy( | ||
1331 | table, name, nlen, | ||
1332 | oldval, oldlenp, | ||
1333 | newval, newlen); | ||
1334 | if (error) | ||
1335 | return error; | ||
1336 | } | ||
1337 | name++; | 1177 | name++; |
1338 | nlen--; | 1178 | nlen--; |
1339 | table = table->child; | 1179 | table = table->child; |
@@ -1361,7 +1201,7 @@ int do_sysctl_strategy (ctl_table *table, | |||
1361 | op |= 004; | 1201 | op |= 004; |
1362 | if (newval) | 1202 | if (newval) |
1363 | op |= 002; | 1203 | op |= 002; |
1364 | if (ctl_perm(table, op)) | 1204 | if (sysctl_perm(table, op)) |
1365 | return -EPERM; | 1205 | return -EPERM; |
1366 | 1206 | ||
1367 | if (table->strategy) { | 1207 | if (table->strategy) { |
@@ -1400,10 +1240,26 @@ int do_sysctl_strategy (ctl_table *table, | |||
1400 | } | 1240 | } |
1401 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 1241 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
1402 | 1242 | ||
1243 | static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) | ||
1244 | { | ||
1245 | for (; table->ctl_name || table->procname; table++) { | ||
1246 | table->parent = parent; | ||
1247 | if (table->child) | ||
1248 | sysctl_set_parent(table, table->child); | ||
1249 | } | ||
1250 | } | ||
1251 | |||
1252 | static __init int sysctl_init(void) | ||
1253 | { | ||
1254 | sysctl_set_parent(NULL, root_table); | ||
1255 | return 0; | ||
1256 | } | ||
1257 | |||
1258 | core_initcall(sysctl_init); | ||
1259 | |||
1403 | /** | 1260 | /** |
1404 | * register_sysctl_table - register a sysctl hierarchy | 1261 | * register_sysctl_table - register a sysctl hierarchy |
1405 | * @table: the top-level table structure | 1262 | * @table: the top-level table structure |
1406 | * @insert_at_head: whether the entry should be inserted in front or at the end | ||
1407 | * | 1263 | * |
1408 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | 1264 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table |
1409 | * array. An entry with a ctl_name of 0 terminates the table. | 1265 | * array. An entry with a ctl_name of 0 terminates the table. |
@@ -1469,8 +1325,7 @@ int do_sysctl_strategy (ctl_table *table, | |||
1469 | * This routine returns %NULL on a failure to register, and a pointer | 1325 | * This routine returns %NULL on a failure to register, and a pointer |
1470 | * to the table header on success. | 1326 | * to the table header on success. |
1471 | */ | 1327 | */ |
1472 | struct ctl_table_header *register_sysctl_table(ctl_table * table, | 1328 | struct ctl_table_header *register_sysctl_table(ctl_table * table) |
1473 | int insert_at_head) | ||
1474 | { | 1329 | { |
1475 | struct ctl_table_header *tmp; | 1330 | struct ctl_table_header *tmp; |
1476 | tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); | 1331 | tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); |
@@ -1480,15 +1335,10 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, | |||
1480 | INIT_LIST_HEAD(&tmp->ctl_entry); | 1335 | INIT_LIST_HEAD(&tmp->ctl_entry); |
1481 | tmp->used = 0; | 1336 | tmp->used = 0; |
1482 | tmp->unregistering = NULL; | 1337 | tmp->unregistering = NULL; |
1338 | sysctl_set_parent(NULL, table); | ||
1483 | spin_lock(&sysctl_lock); | 1339 | spin_lock(&sysctl_lock); |
1484 | if (insert_at_head) | 1340 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); |
1485 | list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); | ||
1486 | else | ||
1487 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); | ||
1488 | spin_unlock(&sysctl_lock); | 1341 | spin_unlock(&sysctl_lock); |
1489 | #ifdef CONFIG_PROC_SYSCTL | ||
1490 | register_proc_table(table, proc_sys_root, tmp); | ||
1491 | #endif | ||
1492 | return tmp; | 1342 | return tmp; |
1493 | } | 1343 | } |
1494 | 1344 | ||
@@ -1504,9 +1354,6 @@ void unregister_sysctl_table(struct ctl_table_header * header) | |||
1504 | might_sleep(); | 1354 | might_sleep(); |
1505 | spin_lock(&sysctl_lock); | 1355 | spin_lock(&sysctl_lock); |
1506 | start_unregistering(header); | 1356 | start_unregistering(header); |
1507 | #ifdef CONFIG_PROC_SYSCTL | ||
1508 | unregister_proc_table(header->ctl_table, proc_sys_root); | ||
1509 | #endif | ||
1510 | spin_unlock(&sysctl_lock); | 1357 | spin_unlock(&sysctl_lock); |
1511 | kfree(header); | 1358 | kfree(header); |
1512 | } | 1359 | } |
@@ -1530,155 +1377,6 @@ void unregister_sysctl_table(struct ctl_table_header * table) | |||
1530 | 1377 | ||
1531 | #ifdef CONFIG_PROC_SYSCTL | 1378 | #ifdef CONFIG_PROC_SYSCTL |
1532 | 1379 | ||
1533 | /* Scan the sysctl entries in table and add them all into /proc */ | ||
1534 | static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) | ||
1535 | { | ||
1536 | struct proc_dir_entry *de; | ||
1537 | int len; | ||
1538 | mode_t mode; | ||
1539 | |||
1540 | for (; table->ctl_name || table->procname; table++) { | ||
1541 | /* Can't do anything without a proc name. */ | ||
1542 | if (!table->procname) | ||
1543 | continue; | ||
1544 | /* Maybe we can't do anything with it... */ | ||
1545 | if (!table->proc_handler && !table->child) { | ||
1546 | printk(KERN_WARNING "SYSCTL: Can't register %s\n", | ||
1547 | table->procname); | ||
1548 | continue; | ||
1549 | } | ||
1550 | |||
1551 | len = strlen(table->procname); | ||
1552 | mode = table->mode; | ||
1553 | |||
1554 | de = NULL; | ||
1555 | if (table->proc_handler) | ||
1556 | mode |= S_IFREG; | ||
1557 | else { | ||
1558 | mode |= S_IFDIR; | ||
1559 | for (de = root->subdir; de; de = de->next) { | ||
1560 | if (proc_match(len, table->procname, de)) | ||
1561 | break; | ||
1562 | } | ||
1563 | /* If the subdir exists already, de is non-NULL */ | ||
1564 | } | ||
1565 | |||
1566 | if (!de) { | ||
1567 | de = create_proc_entry(table->procname, mode, root); | ||
1568 | if (!de) | ||
1569 | continue; | ||
1570 | de->set = set; | ||
1571 | de->data = (void *) table; | ||
1572 | if (table->proc_handler) | ||
1573 | de->proc_fops = &proc_sys_file_operations; | ||
1574 | } | ||
1575 | table->de = de; | ||
1576 | if (de->mode & S_IFDIR) | ||
1577 | register_proc_table(table->child, de, set); | ||
1578 | } | ||
1579 | } | ||
1580 | |||
1581 | /* | ||
1582 | * Unregister a /proc sysctl table and any subdirectories. | ||
1583 | */ | ||
1584 | static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) | ||
1585 | { | ||
1586 | struct proc_dir_entry *de; | ||
1587 | for (; table->ctl_name || table->procname; table++) { | ||
1588 | if (!(de = table->de)) | ||
1589 | continue; | ||
1590 | if (de->mode & S_IFDIR) { | ||
1591 | if (!table->child) { | ||
1592 | printk (KERN_ALERT "Help - malformed sysctl tree on free\n"); | ||
1593 | continue; | ||
1594 | } | ||
1595 | unregister_proc_table(table->child, de); | ||
1596 | |||
1597 | /* Don't unregister directories which still have entries.. */ | ||
1598 | if (de->subdir) | ||
1599 | continue; | ||
1600 | } | ||
1601 | |||
1602 | /* | ||
1603 | * In any case, mark the entry as goner; we'll keep it | ||
1604 | * around if it's busy, but we'll know to do nothing with | ||
1605 | * its fields. We are under sysctl_lock here. | ||
1606 | */ | ||
1607 | de->data = NULL; | ||
1608 | |||
1609 | /* Don't unregister proc entries that are still being used.. */ | ||
1610 | if (atomic_read(&de->count)) | ||
1611 | continue; | ||
1612 | |||
1613 | table->de = NULL; | ||
1614 | remove_proc_entry(table->procname, root); | ||
1615 | } | ||
1616 | } | ||
1617 | |||
1618 | static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, | ||
1619 | size_t count, loff_t *ppos) | ||
1620 | { | ||
1621 | int op; | ||
1622 | struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); | ||
1623 | struct ctl_table *table; | ||
1624 | size_t res; | ||
1625 | ssize_t error = -ENOTDIR; | ||
1626 | |||
1627 | spin_lock(&sysctl_lock); | ||
1628 | if (de && de->data && use_table(de->set)) { | ||
1629 | /* | ||
1630 | * at that point we know that sysctl was not unregistered | ||
1631 | * and won't be until we finish | ||
1632 | */ | ||
1633 | spin_unlock(&sysctl_lock); | ||
1634 | table = (struct ctl_table *) de->data; | ||
1635 | if (!table || !table->proc_handler) | ||
1636 | goto out; | ||
1637 | error = -EPERM; | ||
1638 | op = (write ? 002 : 004); | ||
1639 | if (ctl_perm(table, op)) | ||
1640 | goto out; | ||
1641 | |||
1642 | /* careful: calling conventions are nasty here */ | ||
1643 | res = count; | ||
1644 | error = (*table->proc_handler)(table, write, file, | ||
1645 | buf, &res, ppos); | ||
1646 | if (!error) | ||
1647 | error = res; | ||
1648 | out: | ||
1649 | spin_lock(&sysctl_lock); | ||
1650 | unuse_table(de->set); | ||
1651 | } | ||
1652 | spin_unlock(&sysctl_lock); | ||
1653 | return error; | ||
1654 | } | ||
1655 | |||
1656 | static int proc_opensys(struct inode *inode, struct file *file) | ||
1657 | { | ||
1658 | if (file->f_mode & FMODE_WRITE) { | ||
1659 | /* | ||
1660 | * sysctl entries that are not writable, | ||
1661 | * are _NOT_ writable, capabilities or not. | ||
1662 | */ | ||
1663 | if (!(inode->i_mode & S_IWUSR)) | ||
1664 | return -EPERM; | ||
1665 | } | ||
1666 | |||
1667 | return 0; | ||
1668 | } | ||
1669 | |||
1670 | static ssize_t proc_readsys(struct file * file, char __user * buf, | ||
1671 | size_t count, loff_t *ppos) | ||
1672 | { | ||
1673 | return do_rw_proc(0, file, buf, count, ppos); | ||
1674 | } | ||
1675 | |||
1676 | static ssize_t proc_writesys(struct file * file, const char __user * buf, | ||
1677 | size_t count, loff_t *ppos) | ||
1678 | { | ||
1679 | return do_rw_proc(1, file, (char __user *) buf, count, ppos); | ||
1680 | } | ||
1681 | |||
1682 | static int _proc_do_string(void* data, int maxlen, int write, | 1380 | static int _proc_do_string(void* data, int maxlen, int write, |
1683 | struct file *filp, void __user *buffer, | 1381 | struct file *filp, void __user *buffer, |
1684 | size_t *lenp, loff_t *ppos) | 1382 | size_t *lenp, loff_t *ppos) |
@@ -1762,21 +1460,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, | |||
1762 | buffer, lenp, ppos); | 1460 | buffer, lenp, ppos); |
1763 | } | 1461 | } |
1764 | 1462 | ||
1765 | /* | ||
1766 | * Special case of dostring for the UTS structure. This has locks | ||
1767 | * to observe. Should this be in kernel/sys.c ???? | ||
1768 | */ | ||
1769 | |||
1770 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
1771 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1772 | { | ||
1773 | int r; | ||
1774 | void *which; | ||
1775 | which = get_uts(table, write); | ||
1776 | r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); | ||
1777 | put_uts(table, write, which); | ||
1778 | return r; | ||
1779 | } | ||
1780 | 1463 | ||
1781 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, | 1464 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, |
1782 | int *valp, | 1465 | int *valp, |
@@ -2362,27 +2045,6 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, | |||
2362 | do_proc_dointvec_ms_jiffies_conv, NULL); | 2045 | do_proc_dointvec_ms_jiffies_conv, NULL); |
2363 | } | 2046 | } |
2364 | 2047 | ||
2365 | #ifdef CONFIG_SYSVIPC | ||
2366 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, | ||
2367 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2368 | { | ||
2369 | void *which; | ||
2370 | which = get_ipc(table, write); | ||
2371 | return __do_proc_dointvec(which, table, write, filp, buffer, | ||
2372 | lenp, ppos, NULL, NULL); | ||
2373 | } | ||
2374 | |||
2375 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
2376 | struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2377 | { | ||
2378 | void *which; | ||
2379 | which = get_ipc(table, write); | ||
2380 | return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, | ||
2381 | lenp, ppos, 1l, 1l); | ||
2382 | } | ||
2383 | |||
2384 | #endif | ||
2385 | |||
2386 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 2048 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
2387 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2049 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2388 | { | 2050 | { |
@@ -2413,31 +2075,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, | |||
2413 | return -ENOSYS; | 2075 | return -ENOSYS; |
2414 | } | 2076 | } |
2415 | 2077 | ||
2416 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
2417 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2418 | { | ||
2419 | return -ENOSYS; | ||
2420 | } | ||
2421 | |||
2422 | #ifdef CONFIG_SYSVIPC | ||
2423 | static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | ||
2424 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2425 | { | ||
2426 | return -ENOSYS; | ||
2427 | } | ||
2428 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, | ||
2429 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2430 | { | ||
2431 | return -ENOSYS; | ||
2432 | } | ||
2433 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
2434 | struct file *filp, void __user *buffer, | ||
2435 | size_t *lenp, loff_t *ppos) | ||
2436 | { | ||
2437 | return -ENOSYS; | ||
2438 | } | ||
2439 | #endif | ||
2440 | |||
2441 | int proc_dointvec(ctl_table *table, int write, struct file *filp, | 2078 | int proc_dointvec(ctl_table *table, int write, struct file *filp, |
2442 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2079 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2443 | { | 2080 | { |
@@ -2648,62 +2285,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
2648 | } | 2285 | } |
2649 | 2286 | ||
2650 | 2287 | ||
2651 | /* The generic string strategy routine: */ | ||
2652 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
2653 | void __user *oldval, size_t __user *oldlenp, | ||
2654 | void __user *newval, size_t newlen) | ||
2655 | { | ||
2656 | struct ctl_table uts_table; | ||
2657 | int r, write; | ||
2658 | write = newval && newlen; | ||
2659 | memcpy(&uts_table, table, sizeof(uts_table)); | ||
2660 | uts_table.data = get_uts(table, write); | ||
2661 | r = sysctl_string(&uts_table, name, nlen, | ||
2662 | oldval, oldlenp, newval, newlen); | ||
2663 | put_uts(table, write, uts_table.data); | ||
2664 | return r; | ||
2665 | } | ||
2666 | |||
2667 | #ifdef CONFIG_SYSVIPC | ||
2668 | /* The generic sysctl ipc data routine. */ | ||
2669 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
2670 | void __user *oldval, size_t __user *oldlenp, | ||
2671 | void __user *newval, size_t newlen) | ||
2672 | { | ||
2673 | size_t len; | ||
2674 | void *data; | ||
2675 | |||
2676 | /* Get out of I don't have a variable */ | ||
2677 | if (!table->data || !table->maxlen) | ||
2678 | return -ENOTDIR; | ||
2679 | |||
2680 | data = get_ipc(table, 1); | ||
2681 | if (!data) | ||
2682 | return -ENOTDIR; | ||
2683 | |||
2684 | if (oldval && oldlenp) { | ||
2685 | if (get_user(len, oldlenp)) | ||
2686 | return -EFAULT; | ||
2687 | if (len) { | ||
2688 | if (len > table->maxlen) | ||
2689 | len = table->maxlen; | ||
2690 | if (copy_to_user(oldval, data, len)) | ||
2691 | return -EFAULT; | ||
2692 | if (put_user(len, oldlenp)) | ||
2693 | return -EFAULT; | ||
2694 | } | ||
2695 | } | ||
2696 | |||
2697 | if (newval && newlen) { | ||
2698 | if (newlen > table->maxlen) | ||
2699 | newlen = table->maxlen; | ||
2700 | |||
2701 | if (copy_from_user(data, newval, newlen)) | ||
2702 | return -EFAULT; | ||
2703 | } | ||
2704 | return 1; | ||
2705 | } | ||
2706 | #endif | ||
2707 | 2288 | ||
2708 | #else /* CONFIG_SYSCTL_SYSCALL */ | 2289 | #else /* CONFIG_SYSCTL_SYSCALL */ |
2709 | 2290 | ||
@@ -2769,20 +2350,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
2769 | return -ENOSYS; | 2350 | return -ENOSYS; |
2770 | } | 2351 | } |
2771 | 2352 | ||
2772 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
2773 | void __user *oldval, size_t __user *oldlenp, | ||
2774 | void __user *newval, size_t newlen) | ||
2775 | { | ||
2776 | return -ENOSYS; | ||
2777 | } | ||
2778 | #ifdef CONFIG_SYSVIPC | ||
2779 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
2780 | void __user *oldval, size_t __user *oldlenp, | ||
2781 | void __user *newval, size_t newlen) | ||
2782 | { | ||
2783 | return -ENOSYS; | ||
2784 | } | ||
2785 | #endif | ||
2786 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 2353 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
2787 | 2354 | ||
2788 | /* | 2355 | /* |
diff --git a/kernel/time.c b/kernel/time.c index 0e017bff4c19..c6c80ea5d0ea 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec) | |||
470 | return tv; | 470 | return tv; |
471 | } | 471 | } |
472 | 472 | ||
473 | /* | ||
474 | * Convert jiffies to milliseconds and back. | ||
475 | * | ||
476 | * Avoid unnecessary multiplications/divisions in the | ||
477 | * two most common HZ cases: | ||
478 | */ | ||
479 | unsigned int jiffies_to_msecs(const unsigned long j) | ||
480 | { | ||
481 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | ||
482 | return (MSEC_PER_SEC / HZ) * j; | ||
483 | #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) | ||
484 | return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); | ||
485 | #else | ||
486 | return (j * MSEC_PER_SEC) / HZ; | ||
487 | #endif | ||
488 | } | ||
489 | EXPORT_SYMBOL(jiffies_to_msecs); | ||
490 | |||
491 | unsigned int jiffies_to_usecs(const unsigned long j) | ||
492 | { | ||
493 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | ||
494 | return (USEC_PER_SEC / HZ) * j; | ||
495 | #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) | ||
496 | return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); | ||
497 | #else | ||
498 | return (j * USEC_PER_SEC) / HZ; | ||
499 | #endif | ||
500 | } | ||
501 | EXPORT_SYMBOL(jiffies_to_usecs); | ||
502 | |||
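The three preprocessor branches pick the cheapest exact conversion for the configured HZ. A minimal userspace sketch of the common HZ=250 case (a single multiply by the constant factor; the HZ and MSEC_PER_SEC values are assumptions for illustration, not taken from this patch):

#include <stdio.h>

#define HZ           250          /* assumed config value */
#define MSEC_PER_SEC 1000L

/* mirrors the HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) branch */
static unsigned int jiffies_to_msecs_sketch(unsigned long j)
{
	return (MSEC_PER_SEC / HZ) * j;   /* 4 ms per jiffy */
}

int main(void)
{
	printf("%u ms\n", jiffies_to_msecs_sketch(25));   /* prints "100 ms" */
	return 0;
}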
503 | /* | ||
504 | * When we convert to jiffies then we interpret incoming values | ||
505 | * the following way: | ||
506 | * | ||
507 | * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) | ||
508 | * | ||
509 | * - 'too large' values [that would result in larger than | ||
510 | * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. | ||
511 | * | ||
512 | * - all other values are converted to jiffies by either multiplying | ||
513 | * the input value by a factor or dividing it by a factor | ||
514 | * | ||
515 | * We must also be careful about 32-bit overflows. | ||
516 | */ | ||
517 | unsigned long msecs_to_jiffies(const unsigned int m) | ||
518 | { | ||
519 | /* | ||
520 | * Negative value, means infinite timeout: | ||
521 | */ | ||
522 | if ((int)m < 0) | ||
523 | return MAX_JIFFY_OFFSET; | ||
524 | |||
525 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | ||
526 | /* | ||
527 | * HZ is equal to or smaller than 1000, and 1000 is a nice | ||
528 | * round multiple of HZ, so divide by the factor between them, | ||
529 | * but round upwards: | ||
530 | */ | ||
531 | return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); | ||
532 | #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) | ||
533 | /* | ||
534 | * HZ is larger than 1000, and HZ is a nice round multiple of | ||
535 | * 1000 - simply multiply by the factor between them. | ||
536 | * | ||
537 | * But first make sure the multiplication result cannot | ||
538 | * overflow: | ||
539 | */ | ||
540 | if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) | ||
541 | return MAX_JIFFY_OFFSET; | ||
542 | |||
543 | return m * (HZ / MSEC_PER_SEC); | ||
544 | #else | ||
545 | /* | ||
546 | * Generic case - multiply, round and divide. But first | ||
547 | * check that, if we are doing a net multiplication, | ||
548 | * we wouldn't overflow: | ||
549 | */ | ||
550 | if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) | ||
551 | return MAX_JIFFY_OFFSET; | ||
552 | |||
553 | return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; | ||
554 | #endif | ||
555 | } | ||
556 | EXPORT_SYMBOL(msecs_to_jiffies); | ||
557 | |||
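Rounding upwards matters here: a 1 ms timeout at HZ=250 must still wait one full jiffy rather than expire immediately. A hedged sketch of that branch (same assumed HZ as above):

#define HZ           250            /* assumed config value */
#define MSEC_PER_SEC 1000L

static unsigned long msecs_to_jiffies_sketch(unsigned int m)
{
	/* round up: short timeouts must never collapse to zero jiffies */
	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
}
/* msecs_to_jiffies_sketch(1) == 1, (4) == 1, (5) == 2 */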
558 | unsigned long usecs_to_jiffies(const unsigned int u) | ||
559 | { | ||
560 | if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) | ||
561 | return MAX_JIFFY_OFFSET; | ||
562 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | ||
563 | return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); | ||
564 | #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) | ||
565 | return u * (HZ / USEC_PER_SEC); | ||
566 | #else | ||
567 | return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; | ||
568 | #endif | ||
569 | } | ||
570 | EXPORT_SYMBOL(usecs_to_jiffies); | ||
571 | |||
572 | /* | ||
573 | * The TICK_NSEC - 1 rounds up the value to the next resolution. Note | ||
574 | * that a remainder subtract here would not do the right thing as the | ||
575 | * resolution values don't fall on second boundaries. I.e. the line: | ||
576 | * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. | ||
577 | * | ||
578 | * Rather, we just shift the bits off to the right. | ||
579 | * | ||
580 | * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec | ||
581 | * value to a scaled second value. | ||
582 | */ | ||
583 | unsigned long | ||
584 | timespec_to_jiffies(const struct timespec *value) | ||
585 | { | ||
586 | unsigned long sec = value->tv_sec; | ||
587 | long nsec = value->tv_nsec + TICK_NSEC - 1; | ||
588 | |||
589 | if (sec >= MAX_SEC_IN_JIFFIES){ | ||
590 | sec = MAX_SEC_IN_JIFFIES; | ||
591 | nsec = 0; | ||
592 | } | ||
593 | return (((u64)sec * SEC_CONVERSION) + | ||
594 | (((u64)nsec * NSEC_CONVERSION) >> | ||
595 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | ||
596 | |||
597 | } | ||
598 | EXPORT_SYMBOL(timespec_to_jiffies); | ||
599 | |||
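SEC_JIFFIE_SC and NSEC_JIFFIE_SC (from include/linux/jiffies.h) encode scaled reciprocals of the tick length, so the whole conversion is one 64-bit multiply-and-shift per component. Conceptually it computes sec * HZ plus the nanosecond part rounded up to whole ticks; a simplified sketch using a plain division instead of the scaled constants (HZ assumed, and the tick assumed exact for the sketch):

#include <stdint.h>

#define HZ        250
#define TICK_NSEC (1000000000ULL / HZ)   /* assumed exact tick length */

static unsigned long timespec_to_jiffies_sketch(long sec, long nsec)
{
	/* the "+ TICK_NSEC - 1" mirrors the round-up in the kernel version */
	return (unsigned long)sec * HZ + (nsec + TICK_NSEC - 1) / TICK_NSEC;
}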
600 | void | ||
601 | jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) | ||
602 | { | ||
603 | /* | ||
604 | * Convert jiffies to nanoseconds and separate with | ||
605 | * one divide. | ||
606 | */ | ||
607 | u64 nsec = (u64)jiffies * TICK_NSEC; | ||
608 | value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); | ||
609 | } | ||
610 | EXPORT_SYMBOL(jiffies_to_timespec); | ||
611 | |||
612 | /* Same for "timeval" | ||
613 | * | ||
614 | * Well, almost. The problem here is that the real system resolution is | ||
615 | * in nanoseconds and the value being converted is in microseconds. | ||
616 | * Also, for some machines (those that use HZ = 1024, in particular), | ||
617 | * there is a LARGE error in the tick size in microseconds. | ||
618 | * | ||
619 | * The solution we use is to do the rounding AFTER we convert the | ||
620 | * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. | ||
621 | * Instruction-wise, this should cost only an additional add-with-carry | ||
622 | * instruction over the way it was done above. | ||
623 | */ | ||
624 | unsigned long | ||
625 | timeval_to_jiffies(const struct timeval *value) | ||
626 | { | ||
627 | unsigned long sec = value->tv_sec; | ||
628 | long usec = value->tv_usec; | ||
629 | |||
630 | if (sec >= MAX_SEC_IN_JIFFIES){ | ||
631 | sec = MAX_SEC_IN_JIFFIES; | ||
632 | usec = 0; | ||
633 | } | ||
634 | return (((u64)sec * SEC_CONVERSION) + | ||
635 | (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> | ||
636 | (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | ||
637 | } | ||
638 | |||
639 | void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) | ||
640 | { | ||
641 | /* | ||
642 | * Convert jiffies to nanoseconds and separate with | ||
643 | * one divide. | ||
644 | */ | ||
645 | u64 nsec = (u64)jiffies * TICK_NSEC; | ||
646 | long tv_usec; | ||
647 | |||
648 | value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); | ||
649 | tv_usec /= NSEC_PER_USEC; | ||
650 | value->tv_usec = tv_usec; | ||
651 | } | ||
652 | |||
653 | /* | ||
654 | * Convert jiffies/jiffies_64 to clock_t and back. | ||
655 | */ | ||
656 | clock_t jiffies_to_clock_t(long x) | ||
657 | { | ||
658 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 | ||
659 | return x / (HZ / USER_HZ); | ||
660 | #else | ||
661 | u64 tmp = (u64)x * TICK_NSEC; | ||
662 | do_div(tmp, (NSEC_PER_SEC / USER_HZ)); | ||
663 | return (long)tmp; | ||
664 | #endif | ||
665 | } | ||
666 | EXPORT_SYMBOL(jiffies_to_clock_t); | ||
667 | |||
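do_div() divides a 64-bit value in place by a 32-bit divisor and returns the remainder, which is why tmp is modified rather than assigned. Outside the kernel the same computation is a plain 64-bit division, as in this sketch of the inexact-tick branch (HZ=1024 and USER_HZ=100 are assumed values):

#include <stdint.h>

#define USER_HZ      100
#define NSEC_PER_SEC 1000000000ULL
#define TICK_NSEC    (NSEC_PER_SEC / 1024)   /* HZ = 1024 */

static long jiffies_to_clock_t_sketch(long x)
{
	uint64_t tmp = (uint64_t)x * TICK_NSEC;
	return (long)(tmp / (NSEC_PER_SEC / USER_HZ)); /* do_div() in-kernel */
}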
668 | unsigned long clock_t_to_jiffies(unsigned long x) | ||
669 | { | ||
670 | #if (HZ % USER_HZ)==0 | ||
671 | if (x >= ~0UL / (HZ / USER_HZ)) | ||
672 | return ~0UL; | ||
673 | return x * (HZ / USER_HZ); | ||
674 | #else | ||
675 | u64 jif; | ||
676 | |||
677 | /* Don't worry about loss of precision here .. */ | ||
678 | if (x >= ~0UL / HZ * USER_HZ) | ||
679 | return ~0UL; | ||
680 | |||
681 | /* .. but do try to contain it here */ | ||
682 | jif = x * (u64) HZ; | ||
683 | do_div(jif, USER_HZ); | ||
684 | return jif; | ||
685 | #endif | ||
686 | } | ||
687 | EXPORT_SYMBOL(clock_t_to_jiffies); | ||
688 | |||
689 | u64 jiffies_64_to_clock_t(u64 x) | ||
690 | { | ||
691 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 | ||
692 | do_div(x, HZ / USER_HZ); | ||
693 | #else | ||
694 | /* | ||
695 | * There are better ways that don't overflow early, | ||
696 | * but even this doesn't overflow in hundreds of years | ||
697 | * in 64 bits, so.. | ||
698 | */ | ||
699 | x *= TICK_NSEC; | ||
700 | do_div(x, (NSEC_PER_SEC / USER_HZ)); | ||
701 | #endif | ||
702 | return x; | ||
703 | } | ||
704 | |||
705 | EXPORT_SYMBOL(jiffies_64_to_clock_t); | ||
706 | |||
707 | u64 nsec_to_clock_t(u64 x) | ||
708 | { | ||
709 | #if (NSEC_PER_SEC % USER_HZ) == 0 | ||
710 | do_div(x, (NSEC_PER_SEC / USER_HZ)); | ||
711 | #elif (USER_HZ % 512) == 0 | ||
712 | x *= USER_HZ/512; | ||
713 | do_div(x, (NSEC_PER_SEC / 512)); | ||
714 | #else | ||
715 | /* | ||
716 | * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, | ||
717 | * overflow after 64.99 years. | ||
718 | * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... | ||
719 | */ | ||
720 | x *= 9; | ||
721 | do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) / | ||
722 | USER_HZ)); | ||
723 | #endif | ||
724 | return x; | ||
725 | } | ||
726 | |||
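The multiply-by-9 in the fallback keeps the divisor an integer while keeping the rounding error tiny. For USER_HZ = 60, for instance, the divisor evaluates to (9e9 + 30)/60 = 150000000, and x*9/150000000 equals x*60/1e9 exactly. A self-contained sketch of that case:

#include <stdint.h>

#define USER_HZ      60
#define NSEC_PER_SEC 1000000000ULL

static uint64_t nsec_to_clock_t_sketch(uint64_t x)
{
	/* divisor: (9 * 1e9 + 30) / 60 == 150000000 */
	x *= 9;
	return x / ((9ULL * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
}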
473 | #if (BITS_PER_LONG < 64) | 727 | #if (BITS_PER_LONG < 64) |
474 | u64 get_jiffies_64(void) | 728 | u64 get_jiffies_64(void) |
475 | { | 729 | { |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig new file mode 100644 index 000000000000..f66351126544 --- /dev/null +++ b/kernel/time/Kconfig | |||
@@ -0,0 +1,25 @@ | |||
1 | # | ||
2 | # Timer subsystem related configuration options | ||
3 | # | ||
4 | config TICK_ONESHOT | ||
5 | bool | ||
6 | default n | ||
7 | |||
8 | config NO_HZ | ||
9 | bool "Tickless System (Dynamic Ticks)" | ||
10 | depends on GENERIC_TIME && GENERIC_CLOCKEVENTS | ||
11 | select TICK_ONESHOT | ||
12 | help | ||
13 | This option enables a tickless system: timer interrupts will | ||
14 | only trigger on an as-needed basis both when the system is | ||
15 | busy and when the system is idle. | ||
16 | |||
17 | config HIGH_RES_TIMERS | ||
18 | bool "High Resolution Timer Support" | ||
19 | depends on GENERIC_TIME && GENERIC_CLOCKEVENTS | ||
20 | select TICK_ONESHOT | ||
21 | help | ||
22 | This option enables high resolution timer support. If your | ||
23 | hardware is not capable of high resolution timers, this | ||
24 | option only increases the size of the kernel image. | ||
25 | |||
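On an architecture that already provides GENERIC_TIME and GENERIC_CLOCKEVENTS, turning both features on yields a .config fragment like the following (TICK_ONESHOT is pulled in automatically by the two selects):

CONFIG_TICK_ONESHOT=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y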
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 61a3907d16fb..93bccba1f265 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1 +1,8 @@ | |||
1 | obj-y += ntp.o clocksource.o jiffies.o | 1 | obj-y += ntp.o clocksource.o jiffies.o timer_list.o |
2 | |||
3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o | ||
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | ||
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o | ||
6 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | ||
7 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | ||
8 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | ||
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c new file mode 100644 index 000000000000..67932ea78c17 --- /dev/null +++ b/kernel/time/clockevents.c | |||
@@ -0,0 +1,345 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/clockevents.c | ||
3 | * | ||
4 | * This file contains functions which manage clock event devices. | ||
5 | * | ||
6 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | ||
7 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | ||
8 | * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner | ||
9 | * | ||
10 | * This code is licenced under the GPL version 2. For details see | ||
11 | * kernel-base/COPYING. | ||
12 | */ | ||
13 | |||
14 | #include <linux/clockchips.h> | ||
15 | #include <linux/hrtimer.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/notifier.h> | ||
19 | #include <linux/smp.h> | ||
20 | #include <linux/sysdev.h> | ||
21 | |||
22 | /* The registered clock event devices */ | ||
23 | static LIST_HEAD(clockevent_devices); | ||
24 | static LIST_HEAD(clockevents_released); | ||
25 | |||
26 | /* Notification for clock events */ | ||
27 | static RAW_NOTIFIER_HEAD(clockevents_chain); | ||
28 | |||
29 | /* Protection for the above */ | ||
30 | static DEFINE_SPINLOCK(clockevents_lock); | ||
31 | |||
32 | /** | ||
33 | * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds | ||
34 | * @latch: value to convert | ||
35 | * @evt: pointer to clock event device descriptor | ||
36 | * | ||
37 | * Math helper, returns latch value converted to nanoseconds (bound checked) | ||
38 | */ | ||
39 | unsigned long clockevent_delta2ns(unsigned long latch, | ||
40 | struct clock_event_device *evt) | ||
41 | { | ||
42 | u64 clc = ((u64) latch << evt->shift); | ||
43 | |||
44 | do_div(clc, evt->mult); | ||
45 | if (clc < 1000) | ||
46 | clc = 1000; | ||
47 | if (clc > LONG_MAX) | ||
48 | clc = LONG_MAX; | ||
49 | |||
50 | return (unsigned long) clc; | ||
51 | } | ||
52 | |||
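mult/shift is the usual kernel fixed-point pair: the programming path computes ticks = (ns * mult) >> shift, so this helper applies the inverse, (ticks << shift) / mult. A sketch with a hypothetical 1 MHz event timer (mult ~ 2^32/1000 at shift 32, i.e. 1000 ns per tick; the device parameters are made up for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t mult = 4294967;      /* ~2^32/1000: hypothetical 1 MHz timer */
	uint32_t shift = 32;
	uint64_t latch = 1000;        /* device ticks */

	uint64_t ns = ((uint64_t)latch << shift) / mult;
	printf("%llu ns\n", (unsigned long long)ns);   /* ~1000000 */
	return 0;
}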
53 | /** | ||
54 | * clockevents_set_mode - set the operating mode of a clock event device | ||
55 | * @dev: device to modify | ||
56 | * @mode: new mode | ||
57 | * | ||
58 | * Must be called with interrupts disabled! | ||
59 | */ | ||
60 | void clockevents_set_mode(struct clock_event_device *dev, | ||
61 | enum clock_event_mode mode) | ||
62 | { | ||
63 | if (dev->mode != mode) { | ||
64 | dev->set_mode(mode, dev); | ||
65 | dev->mode = mode; | ||
66 | } | ||
67 | } | ||
68 | |||
69 | /** | ||
70 | * clockevents_program_event - Reprogram the clock event device. | ||
71 | * @expires: absolute expiry time (monotonic clock) | ||
72 | * | ||
73 | * Returns 0 on success, -ETIME when the event is in the past. | ||
74 | */ | ||
75 | int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | ||
76 | ktime_t now) | ||
77 | { | ||
78 | unsigned long long clc; | ||
79 | int64_t delta; | ||
80 | |||
81 | delta = ktime_to_ns(ktime_sub(expires, now)); | ||
82 | |||
83 | if (delta <= 0) | ||
84 | return -ETIME; | ||
85 | |||
86 | dev->next_event = expires; | ||
87 | |||
88 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | ||
89 | return 0; | ||
90 | |||
91 | if (delta > dev->max_delta_ns) | ||
92 | delta = dev->max_delta_ns; | ||
93 | if (delta < dev->min_delta_ns) | ||
94 | delta = dev->min_delta_ns; | ||
95 | |||
96 | clc = delta * dev->mult; | ||
97 | clc >>= dev->shift; | ||
98 | |||
99 | return dev->set_next_event((unsigned long) clc, dev); | ||
100 | } | ||
101 | |||
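The interesting step above is the clamping of the requested delta into the device's [min_delta_ns, max_delta_ns] window before the nanosecond-to-tick conversion, the inverse of clockevent_delta2ns(). A self-contained sketch of just that step (parameter names are illustrative, not kernel API):

#include <stdint.h>

/* sketch: clamp the delta, then convert ns -> device ticks */
static uint64_t delta_to_ticks(int64_t delta_ns, int64_t min_ns, int64_t max_ns,
			       uint32_t mult, uint32_t shift)
{
	if (delta_ns > max_ns)
		delta_ns = max_ns;
	if (delta_ns < min_ns)
		delta_ns = min_ns;
	return ((uint64_t)delta_ns * mult) >> shift;
}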
102 | /** | ||
103 | * clockevents_register_notifier - register a clock events change listener | ||
104 | */ | ||
105 | int clockevents_register_notifier(struct notifier_block *nb) | ||
106 | { | ||
107 | int ret; | ||
108 | |||
109 | spin_lock(&clockevents_lock); | ||
110 | ret = raw_notifier_chain_register(&clockevents_chain, nb); | ||
111 | spin_unlock(&clockevents_lock); | ||
112 | |||
113 | return ret; | ||
114 | } | ||
115 | |||
116 | /** | ||
117 | * clockevents_unregister_notifier - unregister a clock events change listener | ||
118 | */ | ||
119 | void clockevents_unregister_notifier(struct notifier_block *nb) | ||
120 | { | ||
121 | spin_lock(&clockevents_lock); | ||
122 | raw_notifier_chain_unregister(&clockevents_chain, nb); | ||
123 | spin_unlock(&clockevents_lock); | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Notify about a clock event change. Called with clockevents_lock | ||
128 | * held. | ||
129 | */ | ||
130 | static void clockevents_do_notify(unsigned long reason, void *dev) | ||
131 | { | ||
132 | raw_notifier_call_chain(&clockevents_chain, reason, dev); | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * Called after a notify add to make devices available which were | ||
137 | * released from the notifier call. | ||
138 | */ | ||
139 | static void clockevents_notify_released(void) | ||
140 | { | ||
141 | struct clock_event_device *dev; | ||
142 | |||
143 | while (!list_empty(&clockevents_released)) { | ||
144 | dev = list_entry(clockevents_released.next, | ||
145 | struct clock_event_device, list); | ||
146 | list_del(&dev->list); | ||
147 | list_add(&dev->list, &clockevent_devices); | ||
148 | clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); | ||
149 | } | ||
150 | } | ||
151 | |||
152 | /** | ||
153 | * clockevents_register_device - register a clock event device | ||
154 | * @dev: device to register | ||
155 | */ | ||
156 | void clockevents_register_device(struct clock_event_device *dev) | ||
157 | { | ||
158 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
159 | |||
160 | spin_lock(&clockevents_lock); | ||
161 | |||
162 | list_add(&dev->list, &clockevent_devices); | ||
163 | clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); | ||
164 | clockevents_notify_released(); | ||
165 | |||
166 | spin_unlock(&clockevents_lock); | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * Noop handler when we shut down an event device | ||
171 | */ | ||
172 | static void clockevents_handle_noop(struct clock_event_device *dev) | ||
173 | { | ||
174 | } | ||
175 | |||
176 | /** | ||
177 | * clockevents_exchange_device - release and request clock devices | ||
178 | * @old: device to release (can be NULL) | ||
179 | * @new: device to request (can be NULL) | ||
180 | * | ||
181 | * Called from the notifier chain. clockevents_lock is held already | ||
182 | */ | ||
183 | void clockevents_exchange_device(struct clock_event_device *old, | ||
184 | struct clock_event_device *new) | ||
185 | { | ||
186 | unsigned long flags; | ||
187 | |||
188 | local_irq_save(flags); | ||
189 | /* | ||
190 | * Caller releases a clock event device. We queue it into the | ||
191 | * released list and do a notify add later. | ||
192 | */ | ||
193 | if (old) { | ||
194 | old->event_handler = clockevents_handle_noop; | ||
195 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); | ||
196 | list_del(&old->list); | ||
197 | list_add(&old->list, &clockevents_released); | ||
198 | } | ||
199 | |||
200 | if (new) { | ||
201 | BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); | ||
202 | clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); | ||
203 | } | ||
204 | local_irq_restore(flags); | ||
205 | } | ||
206 | |||
207 | /** | ||
208 | * clockevents_request_device | ||
209 | */ | ||
210 | struct clock_event_device *clockevents_request_device(unsigned int features, | ||
211 | cpumask_t cpumask) | ||
212 | { | ||
213 | struct clock_event_device *cur, *dev = NULL; | ||
214 | struct list_head *tmp; | ||
215 | |||
216 | spin_lock(&clockevents_lock); | ||
217 | |||
218 | list_for_each(tmp, &clockevent_devices) { | ||
219 | cur = list_entry(tmp, struct clock_event_device, list); | ||
220 | |||
221 | if ((cur->features & features) == features && | ||
222 | cpus_equal(cpumask, cur->cpumask)) { | ||
223 | if (!dev || dev->rating < cur->rating) | ||
224 | dev = cur; | ||
225 | } | ||
226 | } | ||
227 | |||
228 | clockevents_exchange_device(NULL, dev); | ||
229 | |||
230 | spin_unlock(&clockevents_lock); | ||
231 | |||
232 | return dev; | ||
233 | } | ||
234 | |||
235 | /** | ||
236 | * clockevents_release_device | ||
237 | */ | ||
238 | void clockevents_release_device(struct clock_event_device *dev) | ||
239 | { | ||
240 | spin_lock(&clockevents_lock); | ||
241 | |||
242 | clockevents_exchange_device(dev, NULL); | ||
243 | clockevents_notify_released(); | ||
244 | |||
245 | spin_unlock(&clockevents_lock); | ||
246 | } | ||
247 | |||
248 | /** | ||
249 | * clockevents_notify - notification about relevant events | ||
250 | */ | ||
251 | void clockevents_notify(unsigned long reason, void *arg) | ||
252 | { | ||
253 | spin_lock(&clockevents_lock); | ||
254 | clockevents_do_notify(reason, arg); | ||
255 | |||
256 | switch (reason) { | ||
257 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | ||
258 | /* | ||
259 | * Unregister the clock event devices which were | ||
260 | * released from the users in the notify chain. | ||
261 | */ | ||
262 | while (!list_empty(&clockevents_released)) { | ||
263 | struct clock_event_device *dev; | ||
264 | |||
265 | dev = list_entry(clockevents_released.next, | ||
266 | struct clock_event_device, list); | ||
267 | list_del(&dev->list); | ||
268 | } | ||
269 | break; | ||
270 | default: | ||
271 | break; | ||
272 | } | ||
273 | spin_unlock(&clockevents_lock); | ||
274 | } | ||
275 | EXPORT_SYMBOL_GPL(clockevents_notify); | ||
276 | |||
277 | #ifdef CONFIG_SYSFS | ||
278 | |||
279 | /** | ||
280 | * clockevents_show_registered - sysfs interface for listing clockevents | ||
281 | * @dev: unused | ||
282 | * @buf: char buffer to be filled with clock events list | ||
283 | * | ||
284 | * Provides sysfs interface for listing registered clock event devices | ||
285 | */ | ||
286 | static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf) | ||
287 | { | ||
288 | struct list_head *tmp; | ||
289 | char *p = buf; | ||
290 | int cpu; | ||
291 | |||
292 | spin_lock(&clockevents_lock); | ||
293 | |||
294 | list_for_each(tmp, &clockevent_devices) { | ||
295 | struct clock_event_device *ce; | ||
296 | |||
297 | ce = list_entry(tmp, struct clock_event_device, list); | ||
298 | p += sprintf(p, "%-20s F:%04x M:%d", ce->name, | ||
299 | ce->features, ce->mode); | ||
300 | p += sprintf(p, " C:"); | ||
301 | if (!cpus_equal(ce->cpumask, cpu_possible_map)) { | ||
302 | for_each_cpu_mask(cpu, ce->cpumask) | ||
303 | p += sprintf(p, " %d", cpu); | ||
304 | } else { | ||
305 | /* | ||
306 | * FIXME: Add the cpu which is handling this sucker | ||
307 | */ | ||
308 | } | ||
309 | p += sprintf(p, "\n"); | ||
310 | } | ||
311 | |||
312 | spin_unlock(&clockevents_lock); | ||
313 | |||
314 | return p - buf; | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * Sysfs setup bits: | ||
319 | */ | ||
320 | static SYSDEV_ATTR(registered, 0600, | ||
321 | clockevents_show_registered, NULL); | ||
322 | |||
323 | static struct sysdev_class clockevents_sysclass = { | ||
324 | set_kset_name("clockevents"), | ||
325 | }; | ||
326 | |||
327 | static struct sys_device clockevents_sys_device = { | ||
328 | .id = 0, | ||
329 | .cls = &clockevents_sysclass, | ||
330 | }; | ||
331 | |||
332 | static int __init clockevents_sysfs_init(void) | ||
333 | { | ||
334 | int error = sysdev_class_register(&clockevents_sysclass); | ||
335 | |||
336 | if (!error) | ||
337 | error = sysdev_register(&clockevents_sys_device); | ||
338 | if (!error) | ||
339 | error = sysdev_create_file( | ||
340 | &clockevents_sys_device, | ||
341 | &attr_registered); | ||
342 | return error; | ||
343 | } | ||
344 | device_initcall(clockevents_sysfs_init); | ||
345 | #endif | ||
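With this initcall in place the registered-device list is readable from userspace, e.g. via cat /sys/devices/system/clockevents/clockevents0/registered (path assumed from the sysdev class name and device id above; the attribute is mode 0600, so root only).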
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d9ef176c4e09..193a0793af95 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/init.h> | 29 | #include <linux/init.h> |
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ | 31 | #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ |
32 | #include <linux/tick.h> | ||
32 | 33 | ||
33 | /* XXX - Would like a better way for initializing curr_clocksource */ | 34 | /* XXX - Would like a better way for initializing curr_clocksource */ |
34 | extern struct clocksource clocksource_jiffies; | 35 | extern struct clocksource clocksource_jiffies; |
@@ -48,6 +49,7 @@ extern struct clocksource clocksource_jiffies; | |||
48 | */ | 49 | */ |
49 | static struct clocksource *curr_clocksource = &clocksource_jiffies; | 50 | static struct clocksource *curr_clocksource = &clocksource_jiffies; |
50 | static struct clocksource *next_clocksource; | 51 | static struct clocksource *next_clocksource; |
52 | static struct clocksource *clocksource_override; | ||
51 | static LIST_HEAD(clocksource_list); | 53 | static LIST_HEAD(clocksource_list); |
52 | static DEFINE_SPINLOCK(clocksource_lock); | 54 | static DEFINE_SPINLOCK(clocksource_lock); |
53 | static char override_name[32]; | 55 | static char override_name[32]; |
@@ -62,9 +64,123 @@ static int __init clocksource_done_booting(void) | |||
62 | finished_booting = 1; | 64 | finished_booting = 1; |
63 | return 0; | 65 | return 0; |
64 | } | 66 | } |
65 | |||
66 | late_initcall(clocksource_done_booting); | 67 | late_initcall(clocksource_done_booting); |
67 | 68 | ||
69 | #ifdef CONFIG_CLOCKSOURCE_WATCHDOG | ||
70 | static LIST_HEAD(watchdog_list); | ||
71 | static struct clocksource *watchdog; | ||
72 | static struct timer_list watchdog_timer; | ||
73 | static DEFINE_SPINLOCK(watchdog_lock); | ||
74 | static cycle_t watchdog_last; | ||
75 | /* | ||
76 | * Interval: 0.5 sec, threshold: 0.0625 sec | ||
77 | */ | ||
78 | #define WATCHDOG_INTERVAL (HZ >> 1) | ||
79 | #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) | ||
80 | |||
81 | static void clocksource_ratewd(struct clocksource *cs, int64_t delta) | ||
82 | { | ||
83 | if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) | ||
84 | return; | ||
85 | |||
86 | printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", | ||
87 | cs->name, delta); | ||
88 | cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); | ||
89 | clocksource_change_rating(cs, 0); | ||
90 | cs->flags &= ~CLOCK_SOURCE_WATCHDOG; | ||
91 | list_del(&cs->wd_list); | ||
92 | } | ||
93 | |||
94 | static void clocksource_watchdog(unsigned long data) | ||
95 | { | ||
96 | struct clocksource *cs, *tmp; | ||
97 | cycle_t csnow, wdnow; | ||
98 | int64_t wd_nsec, cs_nsec; | ||
99 | |||
100 | spin_lock(&watchdog_lock); | ||
101 | |||
102 | wdnow = watchdog->read(); | ||
103 | wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); | ||
104 | watchdog_last = wdnow; | ||
105 | |||
106 | list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { | ||
107 | csnow = cs->read(); | ||
108 | /* Initialized? */ | ||
109 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { | ||
110 | if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && | ||
111 | (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { | ||
112 | cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; | ||
113 | /* | ||
114 | * We just marked the clocksource as | ||
115 | * highres-capable, notify the rest of the | ||
116 | * system as well so that we transition | ||
117 | * into high-res mode: | ||
118 | */ | ||
119 | tick_clock_notify(); | ||
120 | } | ||
121 | cs->flags |= CLOCK_SOURCE_WATCHDOG; | ||
122 | cs->wd_last = csnow; | ||
123 | } else { | ||
124 | cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); | ||
125 | cs->wd_last = csnow; | ||
126 | /* Check the delta. Might remove it from the list! */ | ||
127 | clocksource_ratewd(cs, cs_nsec - wd_nsec); | ||
128 | } | ||
129 | } | ||
130 | |||
131 | if (!list_empty(&watchdog_list)) { | ||
132 | __mod_timer(&watchdog_timer, | ||
133 | watchdog_timer.expires + WATCHDOG_INTERVAL); | ||
134 | } | ||
135 | spin_unlock(&watchdog_lock); | ||
136 | } | ||
137 | static void clocksource_check_watchdog(struct clocksource *cs) | ||
138 | { | ||
139 | struct clocksource *cse; | ||
140 | unsigned long flags; | ||
141 | |||
142 | spin_lock_irqsave(&watchdog_lock, flags); | ||
143 | if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { | ||
144 | int started = !list_empty(&watchdog_list); | ||
145 | |||
146 | list_add(&cs->wd_list, &watchdog_list); | ||
147 | if (!started && watchdog) { | ||
148 | watchdog_last = watchdog->read(); | ||
149 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | ||
150 | add_timer(&watchdog_timer); | ||
151 | } | ||
152 | } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) { | ||
153 | cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; | ||
154 | |||
155 | if (!watchdog || cs->rating > watchdog->rating) { | ||
156 | if (watchdog) | ||
157 | del_timer(&watchdog_timer); | ||
158 | watchdog = cs; | ||
159 | init_timer(&watchdog_timer); | ||
160 | watchdog_timer.function = clocksource_watchdog; | ||
161 | |||
162 | /* Reset watchdog cycles */ | ||
163 | list_for_each_entry(cse, &watchdog_list, wd_list) | ||
164 | cse->flags &= ~CLOCK_SOURCE_WATCHDOG; | ||
165 | /* Start if list is not empty */ | ||
166 | if (!list_empty(&watchdog_list)) { | ||
167 | watchdog_last = watchdog->read(); | ||
168 | watchdog_timer.expires = | ||
169 | jiffies + WATCHDOG_INTERVAL; | ||
170 | add_timer(&watchdog_timer); | ||
171 | } | ||
172 | } | ||
173 | } | ||
174 | spin_unlock_irqrestore(&watchdog_lock, flags); | ||
175 | } | ||
176 | #else | ||
177 | static void clocksource_check_watchdog(struct clocksource *cs) | ||
178 | { | ||
179 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) | ||
180 | cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; | ||
181 | } | ||
182 | #endif | ||
183 | |||
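Both the watchdog delta and the per-clocksource delta use the (now - last) & mask idiom, which stays correct across wraparound of counters narrower than 64 bits. A self-contained sketch with a 32-bit counter:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mask = 0xffffffffULL;         /* 32-bit counter */
	uint64_t last = 0xfffffff0ULL;         /* just before the wrap */
	uint64_t now  = 0x00000010ULL;         /* just after the wrap */

	uint64_t delta = (now - last) & mask;  /* 0x20 cycles, not garbage */
	printf("delta = %llu cycles\n", (unsigned long long)delta);
	return 0;
}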
68 | /** | 184 | /** |
69 | * clocksource_get_next - Returns the selected clocksource | 185 | * clocksource_get_next - Returns the selected clocksource |
70 | * | 186 | * |
@@ -84,60 +200,54 @@ struct clocksource *clocksource_get_next(void) | |||
84 | } | 200 | } |
85 | 201 | ||
86 | /** | 202 | /** |
87 | * select_clocksource - Finds the best registered clocksource. | 203 | * select_clocksource - Selects the best registered clocksource. |
88 | * | 204 | * |
89 | * Private function. Must hold clocksource_lock when called. | 205 | * Private function. Must hold clocksource_lock when called. |
90 | * | 206 | * |
91 | * Looks through the list of registered clocksources, returning | 207 | * Select the clocksource with the best rating, or the one |
92 | * the one with the highest rating value. If there is a clocksource | 208 | * selected by userspace override. |
93 | * name that matches the override string, it returns that clocksource. | ||
94 | */ | 209 | */ |
95 | static struct clocksource *select_clocksource(void) | 210 | static struct clocksource *select_clocksource(void) |
96 | { | 211 | { |
97 | struct clocksource *best = NULL; | 212 | struct clocksource *next; |
98 | struct list_head *tmp; | ||
99 | 213 | ||
100 | list_for_each(tmp, &clocksource_list) { | 214 | if (list_empty(&clocksource_list)) |
101 | struct clocksource *src; | 215 | return NULL; |
102 | 216 | ||
103 | src = list_entry(tmp, struct clocksource, list); | 217 | if (clocksource_override) |
104 | if (!best) | 218 | next = clocksource_override; |
105 | best = src; | 219 | else |
106 | 220 | next = list_entry(clocksource_list.next, struct clocksource, | |
107 | /* check for override: */ | 221 | list); |
108 | if (strlen(src->name) == strlen(override_name) && | 222 | |
109 | !strcmp(src->name, override_name)) { | 223 | if (next == curr_clocksource) |
110 | best = src; | 224 | return NULL; |
111 | break; | ||
112 | } | ||
113 | /* pick the highest rating: */ | ||
114 | if (src->rating > best->rating) | ||
115 | best = src; | ||
116 | } | ||
117 | 225 | ||
118 | return best; | 226 | return next; |
119 | } | 227 | } |
120 | 228 | ||
121 | /** | 229 | /* |
122 | * is_registered_source - Checks if clocksource is registered | 230 | * Enqueue the clocksource sorted by rating |
123 | * @c: pointer to a clocksource | ||
124 | * | ||
125 | * Private helper function. Must hold clocksource_lock when called. | ||
126 | * | ||
127 | * Returns one if the clocksource is already registered, zero otherwise. | ||
128 | */ | 231 | */ |
129 | static int is_registered_source(struct clocksource *c) | 232 | static int clocksource_enqueue(struct clocksource *c) |
130 | { | 233 | { |
131 | int len = strlen(c->name); | 234 | struct list_head *tmp, *entry = &clocksource_list; |
132 | struct list_head *tmp; | ||
133 | 235 | ||
134 | list_for_each(tmp, &clocksource_list) { | 236 | list_for_each(tmp, &clocksource_list) { |
135 | struct clocksource *src; | 237 | struct clocksource *cs; |
136 | 238 | ||
137 | src = list_entry(tmp, struct clocksource, list); | 239 | cs = list_entry(tmp, struct clocksource, list); |
138 | if (strlen(src->name) == len && !strcmp(src->name, c->name)) | 240 | if (cs == c) |
139 | return 1; | 241 | return -EBUSY; |
242 | /* Keep track of the place where to insert */ | ||
243 | if (cs->rating >= c->rating) | ||
244 | entry = tmp; | ||
140 | } | 245 | } |
246 | list_add(&c->list, entry); | ||
247 | |||
248 | if (strlen(c->name) == strlen(override_name) && | ||
249 | !strcmp(c->name, override_name)) | ||
250 | clocksource_override = c; | ||
141 | 251 | ||
142 | return 0; | 252 | return 0; |
143 | } | 253 | } |
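The enqueue keeps the list sorted by descending rating, with new entries placed after equal ratings, so select_clocksource() can simply take the list head. The same invariant in a self-contained sketch using a plain singly linked list instead of the kernel list API:

#include <stdio.h>

struct cs { const char *name; int rating; struct cs *next; };

/* insert keeping descending rating order; equal ratings go behind */
static void enqueue(struct cs **head, struct cs *c)
{
	while (*head && (*head)->rating >= c->rating)
		head = &(*head)->next;
	c->next = *head;
	*head = c;
}

int main(void)
{
	struct cs jif = { "jiffies", 1,   NULL };
	struct cs tsc = { "tsc",     300, NULL };
	struct cs *head = NULL, *p;

	enqueue(&head, &jif);
	enqueue(&head, &tsc);
	for (p = head; p; p = p->next)
		printf("%s %d\n", p->name, p->rating); /* tsc 300, jiffies 1 */
	return 0;
}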
@@ -150,42 +260,35 @@ static int is_registered_source(struct clocksource *c) | |||
150 | */ | 260 | */ |
151 | int clocksource_register(struct clocksource *c) | 261 | int clocksource_register(struct clocksource *c) |
152 | { | 262 | { |
153 | int ret = 0; | ||
154 | unsigned long flags; | 263 | unsigned long flags; |
264 | int ret; | ||
155 | 265 | ||
156 | spin_lock_irqsave(&clocksource_lock, flags); | 266 | spin_lock_irqsave(&clocksource_lock, flags); |
157 | /* check if clocksource is already registered */ | 267 | ret = clocksource_enqueue(c); |
158 | if (is_registered_source(c)) { | 268 | if (!ret) |
159 | printk("register_clocksource: Cannot register %s. " | ||
160 | "Already registered!", c->name); | ||
161 | ret = -EBUSY; | ||
162 | } else { | ||
163 | /* register it */ | ||
164 | list_add(&c->list, &clocksource_list); | ||
165 | /* scan the registered clocksources, and pick the best one */ | ||
166 | next_clocksource = select_clocksource(); | 269 | next_clocksource = select_clocksource(); |
167 | } | ||
168 | spin_unlock_irqrestore(&clocksource_lock, flags); | 270 | spin_unlock_irqrestore(&clocksource_lock, flags); |
271 | if (!ret) | ||
272 | clocksource_check_watchdog(c); | ||
169 | return ret; | 273 | return ret; |
170 | } | 274 | } |
171 | EXPORT_SYMBOL(clocksource_register); | 275 | EXPORT_SYMBOL(clocksource_register); |
172 | 276 | ||
173 | /** | 277 | /** |
174 | * clocksource_reselect - Rescan list for next clocksource | 278 | * clocksource_change_rating - Change the rating of a registered clocksource |
175 | * | 279 | * |
176 | * A quick helper function to be used if a clocksource changes its | ||
177 | * rating. Forces the clocksource list to be re-scanned for the best | ||
178 | * clocksource. | ||
179 | */ | 280 | */ |
180 | void clocksource_reselect(void) | 281 | void clocksource_change_rating(struct clocksource *cs, int rating) |
181 | { | 282 | { |
182 | unsigned long flags; | 283 | unsigned long flags; |
183 | 284 | ||
184 | spin_lock_irqsave(&clocksource_lock, flags); | 285 | spin_lock_irqsave(&clocksource_lock, flags); |
286 | list_del(&cs->list); | ||
287 | cs->rating = rating; | ||
288 | clocksource_enqueue(cs); | ||
185 | next_clocksource = select_clocksource(); | 289 | next_clocksource = select_clocksource(); |
186 | spin_unlock_irqrestore(&clocksource_lock, flags); | 290 | spin_unlock_irqrestore(&clocksource_lock, flags); |
187 | } | 291 | } |
188 | EXPORT_SYMBOL(clocksource_reselect); | ||
189 | 292 | ||
190 | #ifdef CONFIG_SYSFS | 293 | #ifdef CONFIG_SYSFS |
191 | /** | 294 | /** |
@@ -221,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf) | |||
221 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, | 324 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, |
222 | const char *buf, size_t count) | 325 | const char *buf, size_t count) |
223 | { | 326 | { |
327 | struct clocksource *ovr = NULL; | ||
328 | struct list_head *tmp; | ||
224 | size_t ret = count; | 329 | size_t ret = count; |
330 | int len; | ||
331 | |||
225 | /* strings from sysfs write are not 0 terminated! */ | 332 | /* strings from sysfs write are not 0 terminated! */ |
226 | if (count >= sizeof(override_name)) | 333 | if (count >= sizeof(override_name)) |
227 | return -EINVAL; | 334 | return -EINVAL; |
@@ -229,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, | |||
229 | /* strip off \n: */ | 336 | /* strip off \n: */
230 | if (buf[count-1] == '\n') | 337 | if (buf[count-1] == '\n') |
231 | count--; | 338 | count--; |
232 | if (count < 1) | ||
233 | return -EINVAL; | ||
234 | 339 | ||
235 | spin_lock_irq(&clocksource_lock); | 340 | spin_lock_irq(&clocksource_lock); |
236 | 341 | ||
237 | /* copy the name given: */ | 342 | if (count > 0) |
238 | memcpy(override_name, buf, count); | 343 | memcpy(override_name, buf, count); |
239 | override_name[count] = 0; | 344 | override_name[count] = 0; |
240 | 345 | ||
241 | /* try to select it: */ | 346 | len = strlen(override_name); |
242 | next_clocksource = select_clocksource(); | 347 | if (len) { |
348 | ovr = clocksource_override; | ||
349 | /* try to select it: */ | ||
350 | list_for_each(tmp, &clocksource_list) { | ||
351 | struct clocksource *cs; | ||
352 | |||
353 | cs = list_entry(tmp, struct clocksource, list); | ||
354 | if (strlen(cs->name) == len && | ||
355 | !strcmp(cs->name, override_name)) | ||
356 | ovr = cs; | ||
357 | } | ||
358 | } | ||
359 | |||
360 | /* Reselect when the override name has changed */ | ||
361 | if (ovr != clocksource_override) { | ||
362 | clocksource_override = ovr; | ||
363 | next_clocksource = select_clocksource(); | ||
364 | } | ||
243 | 365 | ||
244 | spin_unlock_irq(&clocksource_lock); | 366 | spin_unlock_irq(&clocksource_lock); |
245 | 367 | ||
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a99b2a6e6a07..3be8da8fed7e 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = { | |||
62 | .mask = 0xffffffff, /*32bits*/ | 62 | .mask = 0xffffffff, /*32bits*/ |
63 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 63 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ |
64 | .shift = JIFFIES_SHIFT, | 64 | .shift = JIFFIES_SHIFT, |
65 | .is_continuous = 0, /* tick based, not free running */ | ||
66 | }; | 65 | }; |
67 | 66 | ||
68 | static int __init init_jiffies_clocksource(void) | 67 | static int __init init_jiffies_clocksource(void) |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3afeaa3a73f9..eb12509e00bd 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base; | |||
24 | 24 | ||
25 | #define MAX_TICKADJ 500 /* microsecs */ | 25 | #define MAX_TICKADJ 500 /* microsecs */ |
26 | #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ | 26 | #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ |
27 | TICK_LENGTH_SHIFT) / HZ) | 27 | TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) |
28 | 28 | ||
29 | /* | 29 | /* |
30 | * phase-lock loop variables | 30 | * phase-lock loop variables |
@@ -46,13 +46,17 @@ long time_adjust; | |||
46 | 46 | ||
47 | static void ntp_update_frequency(void) | 47 | static void ntp_update_frequency(void) |
48 | { | 48 | { |
49 | tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; | 49 | u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) |
50 | tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; | 50 | << TICK_LENGTH_SHIFT; |
51 | tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); | 51 | second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; |
52 | second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); | ||
52 | 53 | ||
53 | do_div(tick_length_base, HZ); | 54 | tick_length_base = second_length; |
54 | 55 | ||
55 | tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; | 56 | do_div(second_length, HZ); |
57 | tick_nsec = second_length >> TICK_LENGTH_SHIFT; | ||
58 | |||
59 | do_div(tick_length_base, NTP_INTERVAL_FREQ); | ||
56 | } | 60 | } |
57 | 61 | ||
58 | /** | 62 | /** |
@@ -162,7 +166,7 @@ void second_overflow(void) | |||
162 | tick_length -= MAX_TICKADJ_SCALED; | 166 | tick_length -= MAX_TICKADJ_SCALED; |
163 | } else { | 167 | } else { |
164 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / | 168 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / |
165 | HZ) << TICK_LENGTH_SHIFT; | 169 | NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; |
166 | time_adjust = 0; | 170 | time_adjust = 0; |
167 | } | 171 | } |
168 | } | 172 | } |
@@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc) | |||
239 | result = -EINVAL; | 243 | result = -EINVAL; |
240 | goto leave; | 244 | goto leave; |
241 | } | 245 | } |
242 | time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); | 246 | time_freq = ((s64)txc->freq * NSEC_PER_USEC) |
247 | >> (SHIFT_USEC - SHIFT_NSEC); | ||
243 | } | 248 | } |
244 | 249 | ||
245 | if (txc->modes & ADJ_MAXERROR) { | 250 | if (txc->modes & ADJ_MAXERROR) { |
@@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc) | |||
309 | freq_adj += time_freq; | 314 | freq_adj += time_freq; |
310 | freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); | 315 | freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); |
311 | time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); | 316 | time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); |
312 | time_offset = (time_offset / HZ) << SHIFT_UPDATE; | 317 | time_offset = (time_offset / NTP_INTERVAL_FREQ) |
318 | << SHIFT_UPDATE; | ||
313 | } /* STA_PLL */ | 319 | } /* STA_PLL */ |
314 | } /* txc->modes & ADJ_OFFSET */ | 320 | } /* txc->modes & ADJ_OFFSET */ |
315 | if (txc->modes & ADJ_TICK) | 321 | if (txc->modes & ADJ_TICK) |
@@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) | |||
324 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | 330 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) |
325 | txc->offset = save_adjust; | 331 | txc->offset = save_adjust; |
326 | else | 332 | else |
327 | txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; | 333 | txc->offset = shift_right(time_offset, SHIFT_UPDATE) |
328 | txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); | 334 | * NTP_INTERVAL_FREQ / 1000; |
335 | txc->freq = (time_freq / NSEC_PER_USEC) | ||
336 | << (SHIFT_USEC - SHIFT_NSEC); | ||
329 | txc->maxerror = time_maxerror; | 337 | txc->maxerror = time_maxerror; |
330 | txc->esterror = time_esterror; | 338 | txc->esterror = time_esterror; |
331 | txc->status = time_status; | 339 | txc->status = time_status; |
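tick_length is maintained in <<TICK_LENGTH_SHIFT fixed point so that frequency corrections far below one nanosecond per tick still accumulate without drift. A sketch of the split done in ntp_update_frequency(), assuming TICK_LENGTH_SHIFT = 32, NTP_INTERVAL_FREQ = HZ, and a nominal 10^9 ns second:

#include <stdint.h>
#include <stdio.h>

#define TICK_LENGTH_SHIFT 32
#define HZ                250
#define NTP_INTERVAL_FREQ HZ   /* assumed: one NTP interval per tick */

int main(void)
{
	uint64_t second_length = 1000000000ULL << TICK_LENGTH_SHIFT;

	uint64_t tick_length_base = second_length / NTP_INTERVAL_FREQ;
	uint64_t tick_nsec = (second_length / HZ) >> TICK_LENGTH_SHIFT;

	printf("tick_nsec = %llu\n", (unsigned long long)tick_nsec); /* 4000000 */
	printf("tick_length_base = %llu (ns << 32 per tick)\n",
	       (unsigned long long)tick_length_base);
	return 0;
}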
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c new file mode 100644 index 000000000000..12b3efeb9f6f --- /dev/null +++ b/kernel/time/tick-broadcast.c | |||
@@ -0,0 +1,480 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/tick-broadcast.c | ||
3 | * | ||
4 | * This file contains functions which emulate a local clock-event | ||
5 | * device via a broadcast event source. | ||
6 | * | ||
7 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | ||
8 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | ||
9 | * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner | ||
10 | * | ||
11 | * This code is licenced under the GPL version 2. For details see | ||
12 | * kernel-base/COPYING. | ||
13 | */ | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <linux/hrtimer.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/percpu.h> | ||
19 | #include <linux/profile.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/tick.h> | ||
22 | |||
23 | #include "tick-internal.h" | ||
24 | |||
25 | /* | ||
26 | * Broadcast support for broken x86 hardware, where the local apic | ||
27 | * timer stops in C3 state. | ||
28 | */ | ||
29 | |||
30 | struct tick_device tick_broadcast_device; | ||
31 | static cpumask_t tick_broadcast_mask; | ||
32 | static DEFINE_SPINLOCK(tick_broadcast_lock); | ||
33 | |||
34 | /* | ||
35 | * Debugging: see timer_list.c | ||
36 | */ | ||
37 | struct tick_device *tick_get_broadcast_device(void) | ||
38 | { | ||
39 | return &tick_broadcast_device; | ||
40 | } | ||
41 | |||
42 | cpumask_t *tick_get_broadcast_mask(void) | ||
43 | { | ||
44 | return &tick_broadcast_mask; | ||
45 | } | ||
46 | |||
47 | /* | ||
48 | * Start the device in periodic mode | ||
49 | */ | ||
50 | static void tick_broadcast_start_periodic(struct clock_event_device *bc) | ||
51 | { | ||
52 | if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) | ||
53 | tick_setup_periodic(bc, 1); | ||
54 | } | ||
55 | |||
56 | /* | ||
57 | * Check if the device can be utilized as the broadcast device: | ||
58 | */ | ||
59 | int tick_check_broadcast_device(struct clock_event_device *dev) | ||
60 | { | ||
61 | if (tick_broadcast_device.evtdev || | ||
62 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
63 | return 0; | ||
64 | |||
65 | clockevents_exchange_device(NULL, dev); | ||
66 | tick_broadcast_device.evtdev = dev; | ||
67 | if (!cpus_empty(tick_broadcast_mask)) | ||
68 | tick_broadcast_start_periodic(dev); | ||
69 | return 1; | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Check, if the device is the broadcast device | ||
74 | */ | ||
75 | int tick_is_broadcast_device(struct clock_event_device *dev) | ||
76 | { | ||
77 | return (dev && tick_broadcast_device.evtdev == dev); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Check if the device is dysfunctional and a placeholder which | ||
82 | * needs to be handled by the broadcast device. | ||
83 | */ | ||
84 | int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | ||
85 | { | ||
86 | unsigned long flags; | ||
87 | int ret = 0; | ||
88 | |||
89 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
90 | |||
91 | /* | ||
92 | * Devices might be registered with both periodic and oneshot | ||
93 | * mode disabled. This signals that the device needs to be | ||
94 | * operated from the broadcast device and is a placeholder for | ||
95 | * the cpu local device. | ||
96 | */ | ||
97 | if (!tick_device_is_functional(dev)) { | ||
98 | dev->event_handler = tick_handle_periodic; | ||
99 | cpu_set(cpu, tick_broadcast_mask); | ||
100 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); | ||
101 | ret = 1; | ||
102 | } | ||
103 | |||
104 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
105 | return ret; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Broadcast the event to the CPUs which are set in the mask | ||
110 | */ | ||
111 | int tick_do_broadcast(cpumask_t mask) | ||
112 | { | ||
113 | int ret = 0, cpu = smp_processor_id(); | ||
114 | struct tick_device *td; | ||
115 | |||
116 | /* | ||
117 | * Check if the current CPU is in the mask | ||
118 | */ | ||
119 | if (cpu_isset(cpu, mask)) { | ||
120 | cpu_clear(cpu, mask); | ||
121 | td = &per_cpu(tick_cpu_device, cpu); | ||
122 | td->evtdev->event_handler(td->evtdev); | ||
123 | ret = 1; | ||
124 | } | ||
125 | |||
126 | if (!cpus_empty(mask)) { | ||
127 | /* | ||
128 | * It might be necessary to actually check whether the devices | ||
129 | * have different broadcast functions. For now, just use that | ||
130 | * of the first device. This works as long as we have this | ||
131 | * misfeature only on x86 (lapic). | ||
132 | */ | ||
133 | cpu = first_cpu(mask); | ||
134 | td = &per_cpu(tick_cpu_device, cpu); | ||
135 | td->evtdev->broadcast(mask); | ||
136 | ret = 1; | ||
137 | } | ||
138 | return ret; | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * Periodic broadcast: | ||
143 | * - invoke the broadcast handlers | ||
144 | */ | ||
145 | static void tick_do_periodic_broadcast(void) | ||
146 | { | ||
147 | cpumask_t mask; | ||
148 | |||
149 | spin_lock(&tick_broadcast_lock); | ||
150 | |||
151 | cpus_and(mask, cpu_online_map, tick_broadcast_mask); | ||
152 | tick_do_broadcast(mask); | ||
153 | |||
154 | spin_unlock(&tick_broadcast_lock); | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Event handler for periodic broadcast ticks | ||
159 | */ | ||
160 | static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | ||
161 | { | ||
162 | dev->next_event.tv64 = KTIME_MAX; | ||
163 | |||
164 | tick_do_periodic_broadcast(); | ||
165 | |||
166 | /* | ||
167 | * The device is in periodic mode. No reprogramming necessary: | ||
168 | */ | ||
169 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | ||
170 | return; | ||
171 | |||
172 | /* | ||
173 | * Setup the next period for devices, which do not have | ||
174 | * periodic mode: | ||
175 | */ | ||
176 | for (;;) { | ||
177 | ktime_t next = ktime_add(dev->next_event, tick_period); | ||
178 | |||
179 | if (!clockevents_program_event(dev, next, ktime_get())) | ||
180 | return; | ||
181 | tick_do_periodic_broadcast(); | ||
182 | } | ||
183 | } | ||
184 | |||
185 | /* | ||
186 | * Powerstate information: The system enters/leaves a state where | ||
187 | * affected devices might stop. | ||
188 | */ | ||
189 | static void tick_do_broadcast_on_off(void *why) | ||
190 | { | ||
191 | struct clock_event_device *bc, *dev; | ||
192 | struct tick_device *td; | ||
193 | unsigned long flags, *reason = why; | ||
194 | int cpu; | ||
195 | |||
196 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
197 | |||
198 | cpu = smp_processor_id(); | ||
199 | td = &per_cpu(tick_cpu_device, cpu); | ||
200 | dev = td->evtdev; | ||
201 | bc = tick_broadcast_device.evtdev; | ||
202 | |||
203 | /* | ||
204 | * Is the device in broadcast mode forever or is it not | ||
205 | * affected by the power state? | ||
206 | */ | ||
207 | if (!dev || !tick_device_is_functional(dev) || | ||
208 | !(dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
209 | goto out; | ||
210 | |||
211 | if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { | ||
212 | if (!cpu_isset(cpu, tick_broadcast_mask)) { | ||
213 | cpu_set(cpu, tick_broadcast_mask); | ||
214 | if (td->mode == TICKDEV_MODE_PERIODIC) | ||
215 | clockevents_set_mode(dev, | ||
216 | CLOCK_EVT_MODE_SHUTDOWN); | ||
217 | } | ||
218 | } else { | ||
219 | if (cpu_isset(cpu, tick_broadcast_mask)) { | ||
220 | cpu_clear(cpu, tick_broadcast_mask); | ||
221 | if (td->mode == TICKDEV_MODE_PERIODIC) | ||
222 | tick_setup_periodic(dev, 0); | ||
223 | } | ||
224 | } | ||
225 | |||
226 | if (cpus_empty(tick_broadcast_mask)) | ||
227 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | ||
228 | else { | ||
229 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) | ||
230 | tick_broadcast_start_periodic(bc); | ||
231 | else | ||
232 | tick_broadcast_setup_oneshot(bc); | ||
233 | } | ||
234 | out: | ||
235 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
236 | } | ||
237 | |||
238 | /* | ||
239 | * Powerstate information: The system enters/leaves a state where | ||
240 | * affected devices might stop. | ||
241 | */ | ||
242 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) | ||
243 | { | ||
244 | int cpu = get_cpu(); | ||
245 | |||
246 | if (cpu == *oncpu) | ||
247 | tick_do_broadcast_on_off(&reason); | ||
248 | else | ||
249 | smp_call_function_single(*oncpu, tick_do_broadcast_on_off, | ||
250 | &reason, 1, 1); | ||
251 | put_cpu(); | ||
252 | } | ||
253 | |||
254 | /* | ||
255 | * Set the periodic handler depending on broadcast on/off | ||
256 | */ | ||
257 | void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | ||
258 | { | ||
259 | if (!broadcast) | ||
260 | dev->event_handler = tick_handle_periodic; | ||
261 | else | ||
262 | dev->event_handler = tick_handle_periodic_broadcast; | ||
263 | } | ||
264 | |||
265 | /* | ||
266 | * Remove a CPU from broadcasting | ||
267 | */ | ||
268 | void tick_shutdown_broadcast(unsigned int *cpup) | ||
269 | { | ||
270 | struct clock_event_device *bc; | ||
271 | unsigned long flags; | ||
272 | unsigned int cpu = *cpup; | ||
273 | |||
274 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
275 | |||
276 | bc = tick_broadcast_device.evtdev; | ||
277 | cpu_clear(cpu, tick_broadcast_mask); | ||
278 | |||
279 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { | ||
280 | if (bc && cpus_empty(tick_broadcast_mask)) | ||
281 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | ||
282 | } | ||
283 | |||
284 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
285 | } | ||
286 | |||
287 | #ifdef CONFIG_TICK_ONESHOT | ||
288 | |||
289 | static cpumask_t tick_broadcast_oneshot_mask; | ||
290 | |||
291 | /* | ||
292 | * Debugging: see timer_list.c | ||
293 | */ | ||
294 | cpumask_t *tick_get_broadcast_oneshot_mask(void) | ||
295 | { | ||
296 | return &tick_broadcast_oneshot_mask; | ||
297 | } | ||
298 | |||
299 | static int tick_broadcast_set_event(ktime_t expires, int force) | ||
300 | { | ||
301 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | ||
302 | ktime_t now = ktime_get(); | ||
303 | int res; | ||
304 | |||
305 | for (;;) { | ||
306 | res = clockevents_program_event(bc, expires, now); | ||
307 | if (!res || !force) | ||
308 | return res; | ||
309 | now = ktime_get(); | ||
310 | expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); | ||
311 | } | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * Reprogram the broadcast device: | ||
316 | * | ||
317 | * Called with tick_broadcast_lock held and interrupts disabled. | ||
318 | */ | ||
319 | static int tick_broadcast_reprogram(void) | ||
320 | { | ||
321 | ktime_t expires = { .tv64 = KTIME_MAX }; | ||
322 | struct tick_device *td; | ||
323 | int cpu; | ||
324 | |||
325 | /* | ||
326 | * Find the event which expires next: | ||
327 | */ | ||
328 | for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; | ||
329 | cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { | ||
330 | td = &per_cpu(tick_cpu_device, cpu); | ||
331 | if (td->evtdev->next_event.tv64 < expires.tv64) | ||
332 | expires = td->evtdev->next_event; | ||
333 | } | ||
334 | |||
335 | if (expires.tv64 == KTIME_MAX) | ||
336 | return 0; | ||
337 | |||
338 | return tick_broadcast_set_event(expires, 0); | ||
339 | } | ||
340 | |||
341 | /* | ||
342 | * Handle oneshot mode broadcasting | ||
343 | */ | ||
344 | static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) | ||
345 | { | ||
346 | struct tick_device *td; | ||
347 | cpumask_t mask; | ||
348 | ktime_t now; | ||
349 | int cpu; | ||
350 | |||
351 | spin_lock(&tick_broadcast_lock); | ||
352 | again: | ||
353 | dev->next_event.tv64 = KTIME_MAX; | ||
354 | mask = CPU_MASK_NONE; | ||
355 | now = ktime_get(); | ||
356 | /* Find all expired events */ | ||
357 | for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; | ||
358 | cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { | ||
359 | td = &per_cpu(tick_cpu_device, cpu); | ||
360 | if (td->evtdev->next_event.tv64 <= now.tv64) | ||
361 | cpu_set(cpu, mask); | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Wakeup the cpus which have an expired event. The broadcast | ||
366 | * device is reprogrammed in the return from idle code. | ||
367 | */ | ||
368 | if (!tick_do_broadcast(mask)) { | ||
369 | /* | ||
370 | * The global event did not expire any CPU local | ||
371 | * events. This happens in dyntick mode, as the | ||
372 | * maximum PIT delta is quite small. | ||
373 | */ | ||
374 | if (tick_broadcast_reprogram()) | ||
375 | goto again; | ||
376 | } | ||
377 | spin_unlock(&tick_broadcast_lock); | ||
378 | } | ||
379 | |||
380 | /* | ||
381 | * Powerstate information: The system enters/leaves a state where | ||
382 | * affected devices might stop. | ||
383 | */ | ||
384 | void tick_broadcast_oneshot_control(unsigned long reason) | ||
385 | { | ||
386 | struct clock_event_device *bc, *dev; | ||
387 | struct tick_device *td; | ||
388 | unsigned long flags; | ||
389 | int cpu; | ||
390 | |||
391 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
392 | |||
393 | /* | ||
394 | * Periodic mode does not care about the enter/exit of power | ||
395 | * states | ||
396 | */ | ||
397 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) | ||
398 | goto out; | ||
399 | |||
400 | bc = tick_broadcast_device.evtdev; | ||
401 | cpu = smp_processor_id(); | ||
402 | td = &per_cpu(tick_cpu_device, cpu); | ||
403 | dev = td->evtdev; | ||
404 | |||
405 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
406 | goto out; | ||
407 | |||
408 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | ||
409 | if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { | ||
410 | cpu_set(cpu, tick_broadcast_oneshot_mask); | ||
411 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | ||
412 | if (dev->next_event.tv64 < bc->next_event.tv64) | ||
413 | tick_broadcast_set_event(dev->next_event, 1); | ||
414 | } | ||
415 | } else { | ||
416 | if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { | ||
417 | cpu_clear(cpu, tick_broadcast_oneshot_mask); | ||
418 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | ||
419 | if (dev->next_event.tv64 != KTIME_MAX) | ||
420 | tick_program_event(dev->next_event, 1); | ||
421 | } | ||
422 | } | ||
423 | |||
424 | out: | ||
425 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
426 | } | ||
427 | |||
428 | /** | ||
429 | * tick_broadcast_setup_oneshot - setup the broadcast device for oneshot mode | ||
430 | */ | ||
431 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
432 | { | ||
433 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { | ||
434 | bc->event_handler = tick_handle_oneshot_broadcast; | ||
435 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | ||
436 | bc->next_event.tv64 = KTIME_MAX; | ||
437 | } | ||
438 | } | ||
439 | |||
440 | /* | ||
441 | * Select oneshot operating mode for the broadcast device | ||
442 | */ | ||
443 | void tick_broadcast_switch_to_oneshot(void) | ||
444 | { | ||
445 | struct clock_event_device *bc; | ||
446 | unsigned long flags; | ||
447 | |||
448 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
449 | |||
450 | tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; | ||
451 | bc = tick_broadcast_device.evtdev; | ||
452 | if (bc) | ||
453 | tick_broadcast_setup_oneshot(bc); | ||
454 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
455 | } | ||
456 | |||
457 | |||
458 | /* | ||
459 | * Remove a dead CPU from broadcasting | ||
460 | */ | ||
461 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | ||
462 | { | ||
463 | struct clock_event_device *bc; | ||
464 | unsigned long flags; | ||
465 | unsigned int cpu = *cpup; | ||
466 | |||
467 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
468 | |||
469 | bc = tick_broadcast_device.evtdev; | ||
470 | cpu_clear(cpu, tick_broadcast_oneshot_mask); | ||
471 | |||
472 | if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { | ||
473 | if (bc && cpus_empty(tick_broadcast_oneshot_mask)) | ||
474 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | ||
475 | } | ||
476 | |||
477 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
478 | } | ||
479 | |||
480 | #endif | ||
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c new file mode 100644 index 000000000000..4500e347f1bb --- /dev/null +++ b/kernel/time/tick-common.c | |||
@@ -0,0 +1,346 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/tick-common.c | ||
3 | * | ||
4 | * This file contains the base functions to manage periodic tick | ||
5 | * related events. | ||
6 | * | ||
7 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | ||
8 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | ||
9 | * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner | ||
10 | * | ||
11 | * This code is licenced under the GPL version 2. For details see | ||
12 | * kernel-base/COPYING. | ||
13 | */ | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <linux/hrtimer.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/percpu.h> | ||
19 | #include <linux/profile.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/tick.h> | ||
22 | |||
23 | #include "tick-internal.h" | ||
24 | |||
25 | /* | ||
26 | * Tick devices | ||
27 | */ | ||
28 | DEFINE_PER_CPU(struct tick_device, tick_cpu_device); | ||
29 | /* | ||
30 | * Tick next event: keeps track of the tick time | ||
31 | */ | ||
32 | ktime_t tick_next_period; | ||
33 | ktime_t tick_period; | ||
34 | static int tick_do_timer_cpu = -1; | ||
35 | DEFINE_SPINLOCK(tick_device_lock); | ||
36 | |||
37 | /* | ||
38 | * Debugging: see timer_list.c | ||
39 | */ | ||
40 | struct tick_device *tick_get_device(int cpu) | ||
41 | { | ||
42 | return &per_cpu(tick_cpu_device, cpu); | ||
43 | } | ||
44 | |||
45 | /** | ||
46 | * tick_is_oneshot_available - check for a oneshot capable event device | ||
47 | */ | ||
48 | int tick_is_oneshot_available(void) | ||
49 | { | ||
50 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | ||
51 | |||
52 | return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Periodic tick | ||
57 | */ | ||
58 | static void tick_periodic(int cpu) | ||
59 | { | ||
60 | if (tick_do_timer_cpu == cpu) { | ||
61 | write_seqlock(&xtime_lock); | ||
62 | |||
63 | /* Keep track of the next tick event */ | ||
64 | tick_next_period = ktime_add(tick_next_period, tick_period); | ||
65 | |||
66 | do_timer(1); | ||
67 | write_sequnlock(&xtime_lock); | ||
68 | } | ||
69 | |||
70 | update_process_times(user_mode(get_irq_regs())); | ||
71 | profile_tick(CPU_PROFILING); | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Event handler for periodic ticks | ||
76 | */ | ||
77 | void tick_handle_periodic(struct clock_event_device *dev) | ||
78 | { | ||
79 | int cpu = smp_processor_id(); | ||
80 | |||
81 | tick_periodic(cpu); | ||
82 | |||
83 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | ||
84 | return; | ||
85 | /* | ||
86 | * Setup the next period for devices, which do not have | ||
87 | * periodic mode: | ||
88 | */ | ||
89 | for (;;) { | ||
90 | ktime_t next = ktime_add(dev->next_event, tick_period); | ||
91 | |||
92 | if (!clockevents_program_event(dev, next, ktime_get())) | ||
93 | return; | ||
94 | tick_periodic(cpu); | ||
95 | } | ||
96 | } | ||
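
The loop above is how a oneshot-only device emulates a periodic tick: each period is programmed as a single event, and when programming fails because the deadline has already passed, the missed tick is accounted and the next period is tried. A minimal user-space sketch of that catch-up pattern (plain C; program_event() is a hypothetical stand-in for clockevents_program_event()):

    #include <stdio.h>

    /* Hypothetical hardware model: arming an event in the past fails,
     * much like clockevents_program_event() reporting an error. */
    static long hw_now = 27;

    static int program_event(long expires)
    {
        return expires <= hw_now ? -1 : 0;
    }

    int main(void)
    {
        long period = 10, next = 5;   /* the event at 5 was missed */

        for (;;) {
            next += period;           /* advance by one tick period */
            if (!program_event(next)) {
                printf("armed for %ld\n", next);
                return 0;
            }
            /* tick_periodic() equivalent: account the missed tick */
            printf("tick for period ending %ld\n", next);
        }
    }

With hw_now at 27, the sketch accounts the periods ending at 15 and 25 before successfully arming the event at 35, which mirrors how the handler catches up after a long interrupt-disabled section.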
97 | |||
98 | /* | ||
99 | * Setup the device for a periodic tick | ||
100 | */ | ||
101 | void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | ||
102 | { | ||
103 | tick_set_periodic_handler(dev, broadcast); | ||
104 | |||
105 | /* Broadcast setup? */ | ||
106 | if (!tick_device_is_functional(dev)) | ||
107 | return; | ||
108 | |||
109 | if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { | ||
110 | clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); | ||
111 | } else { | ||
112 | unsigned long seq; | ||
113 | ktime_t next; | ||
114 | |||
115 | do { | ||
116 | seq = read_seqbegin(&xtime_lock); | ||
117 | next = tick_next_period; | ||
118 | } while (read_seqretry(&xtime_lock, seq)); | ||
119 | |||
120 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | ||
121 | |||
122 | for (;;) { | ||
123 | if (!clockevents_program_event(dev, next, ktime_get())) | ||
124 | return; | ||
125 | next = ktime_add(next, tick_period); | ||
126 | } | ||
127 | } | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * Setup the tick device | ||
132 | */ | ||
133 | static void tick_setup_device(struct tick_device *td, | ||
134 | struct clock_event_device *newdev, int cpu, | ||
135 | cpumask_t cpumask) | ||
136 | { | ||
137 | ktime_t next_event; | ||
138 | void (*handler)(struct clock_event_device *) = NULL; | ||
139 | |||
140 | /* | ||
141 | * First device setup? | ||
142 | */ | ||
143 | if (!td->evtdev) { | ||
144 | /* | ||
145 | * If no cpu took the do_timer update, assign it to | ||
146 | * this cpu: | ||
147 | */ | ||
148 | if (tick_do_timer_cpu == -1) { | ||
149 | tick_do_timer_cpu = cpu; | ||
150 | tick_next_period = ktime_get(); | ||
151 | tick_period = ktime_set(0, NSEC_PER_SEC / HZ); | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * Startup in periodic mode first. | ||
156 | */ | ||
157 | td->mode = TICKDEV_MODE_PERIODIC; | ||
158 | } else { | ||
159 | handler = td->evtdev->event_handler; | ||
160 | next_event = td->evtdev->next_event; | ||
161 | } | ||
162 | |||
163 | td->evtdev = newdev; | ||
164 | |||
165 | /* | ||
166 | * When the device is not per cpu, pin the interrupt to the | ||
167 | * current cpu: | ||
168 | */ | ||
169 | if (!cpus_equal(newdev->cpumask, cpumask)) | ||
170 | irq_set_affinity(newdev->irq, cpumask); | ||
171 | |||
172 | /* | ||
173 | * When global broadcasting is active, check if the current | ||
174 | * device is registered as a placeholder for broadcast mode. | ||
175 | * This allows us to handle this x86 misfeature in a generic | ||
176 | * way. | ||
177 | */ | ||
178 | if (tick_device_uses_broadcast(newdev, cpu)) | ||
179 | return; | ||
180 | |||
181 | if (td->mode == TICKDEV_MODE_PERIODIC) | ||
182 | tick_setup_periodic(newdev, 0); | ||
183 | else | ||
184 | tick_setup_oneshot(newdev, handler, next_event); | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * Check if the newly registered device should be used. | ||
189 | */ | ||
190 | static int tick_check_new_device(struct clock_event_device *newdev) | ||
191 | { | ||
192 | struct clock_event_device *curdev; | ||
193 | struct tick_device *td; | ||
194 | int cpu, ret = NOTIFY_OK; | ||
195 | unsigned long flags; | ||
196 | cpumask_t cpumask; | ||
197 | |||
198 | spin_lock_irqsave(&tick_device_lock, flags); | ||
199 | |||
200 | cpu = smp_processor_id(); | ||
201 | if (!cpu_isset(cpu, newdev->cpumask)) | ||
202 | goto out; | ||
203 | |||
204 | td = &per_cpu(tick_cpu_device, cpu); | ||
205 | curdev = td->evtdev; | ||
206 | cpumask = cpumask_of_cpu(cpu); | ||
207 | |||
208 | /* cpu local device? */ | ||
209 | if (!cpus_equal(newdev->cpumask, cpumask)) { | ||
210 | |||
211 | /* | ||
212 | * If the cpu affinity of the device interrupt cannot | ||
213 | * be set, ignore it. | ||
214 | */ | ||
215 | if (!irq_can_set_affinity(newdev->irq)) | ||
216 | goto out_bc; | ||
217 | |||
218 | /* | ||
219 | * If we already have a cpu local device, do not replace it | ||
220 | * with a non cpu local device. | ||
221 | */ | ||
222 | if (curdev && cpus_equal(curdev->cpumask, cpumask)) | ||
223 | goto out_bc; | ||
224 | } | ||
225 | |||
226 | /* | ||
227 | * If we have an active device, then check the rating and the oneshot | ||
228 | * feature. | ||
229 | */ | ||
230 | if (curdev) { | ||
231 | /* | ||
232 | * Prefer oneshot capable devices! | ||
233 | */ | ||
234 | if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && | ||
235 | !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
236 | goto out_bc; | ||
237 | /* | ||
238 | * Check the rating | ||
239 | */ | ||
240 | if (curdev->rating >= newdev->rating) | ||
241 | goto out_bc; | ||
242 | } | ||
243 | |||
244 | /* | ||
245 | * Replace the existing device, if any, with the new | ||
246 | * device. If the current device is the broadcast device, do | ||
247 | * not give it back to the clockevents layer! | ||
248 | */ | ||
249 | if (tick_is_broadcast_device(curdev)) { | ||
250 | clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); | ||
251 | curdev = NULL; | ||
252 | } | ||
253 | clockevents_exchange_device(curdev, newdev); | ||
254 | tick_setup_device(td, newdev, cpu, cpumask); | ||
255 | if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) | ||
256 | tick_oneshot_notify(); | ||
257 | |||
258 | spin_unlock_irqrestore(&tick_device_lock, flags); | ||
259 | return NOTIFY_STOP; | ||
260 | |||
261 | out_bc: | ||
262 | /* | ||
263 | * Can the new device be used as a broadcast device? | ||
264 | */ | ||
265 | if (tick_check_broadcast_device(newdev)) | ||
266 | ret = NOTIFY_STOP; | ||
267 | out: | ||
268 | spin_unlock_irqrestore(&tick_device_lock, flags); | ||
269 | |||
270 | return ret; | ||
271 | } | ||
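
For context, a per-CPU clock event driver reaches the notifier path above by registering its device with the clockevents core; that raises CLOCK_EVT_NOTIFY_ADD, which lands in tick_check_new_device(). A hedged sketch, assuming clockevents_register_device() as the registration entry point from the same patch series; my_set_next_event() and my_set_mode() are hypothetical driver callbacks:

    /*
     * Sketch of a per-CPU clock event device registration.  The field
     * names are the ones tick_check_new_device() inspects above.
     */
    static struct clock_event_device my_dev = {
        .name           = "my-timer",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .rating         = 300,               /* outranks PIT-class devices */
        .irq            = -1,                /* per-cpu, no affinity fixup */
        .set_next_event = my_set_next_event, /* hypothetical callbacks */
        .set_mode       = my_set_mode,
    };

    static void __init my_timer_init(void)
    {
        my_dev.cpumask = cpumask_of_cpu(smp_processor_id());
        /* Raises CLOCK_EVT_NOTIFY_ADD -> tick_check_new_device() above */
        clockevents_register_device(&my_dev);
    }

Because the oneshot feature and the rating both beat a typical PIT, a device like this would replace the boot-time tick device via clockevents_exchange_device() as shown above.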
272 | |||
273 | /* | ||
274 | * Shut down an event device on a given cpu: | ||
275 | * | ||
276 | * This is called on a live CPU, when a CPU is dead. So we cannot | ||
277 | * access the hardware device itself. | ||
278 | * We just set the mode and remove it from the lists. | ||
279 | */ | ||
280 | static void tick_shutdown(unsigned int *cpup) | ||
281 | { | ||
282 | struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); | ||
283 | struct clock_event_device *dev = td->evtdev; | ||
284 | unsigned long flags; | ||
285 | |||
286 | spin_lock_irqsave(&tick_device_lock, flags); | ||
287 | td->mode = TICKDEV_MODE_PERIODIC; | ||
288 | if (dev) { | ||
289 | /* | ||
290 | * Prevent the clock events layer from trying to call | ||
291 | * the set mode function! | ||
292 | */ | ||
293 | dev->mode = CLOCK_EVT_MODE_UNUSED; | ||
294 | clockevents_exchange_device(dev, NULL); | ||
295 | td->evtdev = NULL; | ||
296 | } | ||
297 | spin_unlock_irqrestore(&tick_device_lock, flags); | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * Notification about clock event devices | ||
302 | */ | ||
303 | static int tick_notify(struct notifier_block *nb, unsigned long reason, | ||
304 | void *dev) | ||
305 | { | ||
306 | switch (reason) { | ||
307 | |||
308 | case CLOCK_EVT_NOTIFY_ADD: | ||
309 | return tick_check_new_device(dev); | ||
310 | |||
311 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | ||
312 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | ||
313 | tick_broadcast_on_off(reason, dev); | ||
314 | break; | ||
315 | |||
316 | case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: | ||
317 | case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: | ||
318 | tick_broadcast_oneshot_control(reason); | ||
319 | break; | ||
320 | |||
321 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | ||
322 | tick_shutdown_broadcast_oneshot(dev); | ||
323 | tick_shutdown_broadcast(dev); | ||
324 | tick_shutdown(dev); | ||
325 | break; | ||
326 | |||
327 | default: | ||
328 | break; | ||
329 | } | ||
330 | |||
331 | return NOTIFY_OK; | ||
332 | } | ||
333 | |||
334 | static struct notifier_block tick_notifier = { | ||
335 | .notifier_call = tick_notify, | ||
336 | }; | ||
337 | |||
338 | /** | ||
339 | * tick_init - initialize the tick control | ||
340 | * | ||
341 | * Register the notifier with the clockevents framework | ||
342 | */ | ||
343 | void __init tick_init(void) | ||
344 | { | ||
345 | clockevents_register_notifier(&tick_notifier); | ||
346 | } | ||
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h new file mode 100644 index 000000000000..54861a0f29ff --- /dev/null +++ b/kernel/time/tick-internal.h | |||
@@ -0,0 +1,110 @@ | |||
1 | /* | ||
2 | * tick internal variables and functions used by low/high res code | ||
3 | */ | ||
4 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); | ||
5 | extern spinlock_t tick_device_lock; | ||
6 | extern ktime_t tick_next_period; | ||
7 | extern ktime_t tick_period; | ||
8 | |||
9 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | ||
10 | extern void tick_handle_periodic(struct clock_event_device *dev); | ||
11 | |||
12 | /* | ||
13 | * NO_HZ / high resolution timer shared code | ||
14 | */ | ||
15 | #ifdef CONFIG_TICK_ONESHOT | ||
16 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | ||
17 | void (*handler)(struct clock_event_device *), | ||
18 | ktime_t nextevt); | ||
19 | extern int tick_program_event(ktime_t expires, int force); | ||
20 | extern void tick_oneshot_notify(void); | ||
21 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | ||
22 | |||
23 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
24 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | ||
25 | extern void tick_broadcast_oneshot_control(unsigned long reason); | ||
26 | extern void tick_broadcast_switch_to_oneshot(void); | ||
27 | extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | ||
28 | # else /* BROADCAST */ | ||
29 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
30 | { | ||
31 | BUG(); | ||
32 | } | ||
33 | static inline void tick_broadcast_oneshot_control(unsigned long reason) { } | ||
34 | static inline void tick_broadcast_switch_to_oneshot(void) { } | ||
35 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | ||
36 | # endif /* !BROADCAST */ | ||
37 | |||
38 | #else /* !ONESHOT */ | ||
39 | static inline | ||
40 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
41 | void (*handler)(struct clock_event_device *), | ||
42 | ktime_t nextevt) | ||
43 | { | ||
44 | BUG(); | ||
45 | } | ||
46 | static inline int tick_program_event(ktime_t expires, int force) | ||
47 | { | ||
48 | return 0; | ||
49 | } | ||
50 | static inline void tick_oneshot_notify(void) { } | ||
51 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
52 | { | ||
53 | BUG(); | ||
54 | } | ||
55 | static inline void tick_broadcast_oneshot_control(unsigned long reason) { } | ||
56 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | ||
57 | #endif /* !TICK_ONESHOT */ | ||
58 | |||
59 | /* | ||
60 | * Broadcasting support | ||
61 | */ | ||
62 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
63 | extern int tick_do_broadcast(cpumask_t mask); | ||
64 | |||
65 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | ||
66 | extern int tick_check_broadcast_device(struct clock_event_device *dev); | ||
67 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
68 | extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); | ||
69 | extern void tick_shutdown_broadcast(unsigned int *cpup); | ||
70 | |||
71 | extern void | ||
72 | tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
73 | |||
74 | #else /* !BROADCAST */ | ||
75 | |||
76 | static inline int tick_check_broadcast_device(struct clock_event_device *dev) | ||
77 | { | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) | ||
82 | { | ||
83 | return 0; | ||
84 | } | ||
85 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, | ||
86 | int cpu) | ||
87 | { | ||
88 | return 0; | ||
89 | } | ||
90 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
91 | static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } | ||
92 | static inline void tick_shutdown_broadcast(unsigned int *cpup) { } | ||
93 | |||
94 | /* | ||
95 | * Set the periodic handler in non-broadcast mode | ||
96 | */ | ||
97 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, | ||
98 | int broadcast) | ||
99 | { | ||
100 | dev->event_handler = tick_handle_periodic; | ||
101 | } | ||
102 | #endif /* !BROADCAST */ | ||
103 | |||
104 | /* | ||
105 | * Check if the device is functional or a dummy for broadcast | ||
106 | */ | ||
107 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
108 | { | ||
109 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
110 | } | ||
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c new file mode 100644 index 000000000000..2e8b7ff863cc --- /dev/null +++ b/kernel/time/tick-oneshot.c | |||
@@ -0,0 +1,84 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/tick-oneshot.c | ||
3 | * | ||
4 | * This file contains functions which manage high resolution tick | ||
5 | * related events. | ||
6 | * | ||
7 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | ||
8 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | ||
9 | * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner | ||
10 | * | ||
11 | * This code is licenced under the GPL version 2. For details see | ||
12 | * kernel-base/COPYING. | ||
13 | */ | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <linux/hrtimer.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/percpu.h> | ||
19 | #include <linux/profile.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/tick.h> | ||
22 | |||
23 | #include "tick-internal.h" | ||
24 | |||
25 | /** | ||
26 | * tick_program_event - program an event on the per-cpu tick device | ||
27 | */ | ||
28 | int tick_program_event(ktime_t expires, int force) | ||
29 | { | ||
30 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | ||
31 | ktime_t now = ktime_get(); | ||
32 | |||
33 | while (1) { | ||
34 | int ret = clockevents_program_event(dev, expires, now); | ||
35 | |||
36 | if (!ret || !force) | ||
37 | return ret; | ||
38 | now = ktime_get(); | ||
39 | expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); | ||
40 | } | ||
41 | } | ||
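
When force is set, a failed programming attempt is retried with the expiry pushed min_delta_ns past the current time, so the event is guaranteed to land in the device's programmable future. A standalone sketch of that clamp (plain C; the kernel re-reads ktime_get() on every pass, which a fixed now_ns only approximates here):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical device: cannot be armed closer than min_delta ns. */
    static const int64_t min_delta = 3000;
    static int64_t now_ns = 1000000;

    static int program(int64_t expires)
    {
        return (expires - now_ns) < min_delta ? -1 : 0;
    }

    int main(void)
    {
        int64_t expires = 999000;            /* already in the past */
        int force = 1;

        while (program(expires)) {
            if (!force)
                return 1;                    /* caller handles failure */
            expires = now_ns + min_delta;    /* push past min_delta */
            printf("retrying at %lld\n", (long long)expires);
        }
        printf("armed at %lld\n", (long long)expires);
        return 0;
    }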
42 | |||
43 | /** | ||
44 | * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) | ||
45 | */ | ||
46 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
47 | void (*handler)(struct clock_event_device *), | ||
48 | ktime_t next_event) | ||
49 | { | ||
50 | newdev->event_handler = handler; | ||
51 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); | ||
52 | clockevents_program_event(newdev, next_event, ktime_get()); | ||
53 | } | ||
54 | |||
55 | /** | ||
56 | * tick_switch_to_oneshot - switch to oneshot mode | ||
57 | */ | ||
58 | int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | ||
59 | { | ||
60 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | ||
61 | struct clock_event_device *dev = td->evtdev; | ||
62 | |||
63 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || | ||
64 | !tick_device_is_functional(dev)) | ||
65 | return -EINVAL; | ||
66 | |||
67 | td->mode = TICKDEV_MODE_ONESHOT; | ||
68 | dev->event_handler = handler; | ||
69 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | ||
70 | tick_broadcast_switch_to_oneshot(); | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
75 | /** | ||
76 | * tick_init_highres - switch to high resolution mode | ||
77 | * | ||
78 | * Called with interrupts disabled. | ||
79 | */ | ||
80 | int tick_init_highres(void) | ||
81 | { | ||
82 | return tick_switch_to_oneshot(hrtimer_interrupt); | ||
83 | } | ||
84 | #endif | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c new file mode 100644 index 000000000000..95e41f7f850b --- /dev/null +++ b/kernel/time/tick-sched.c | |||
@@ -0,0 +1,563 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/tick-sched.c | ||
3 | * | ||
4 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | ||
6 | * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner | ||
7 | * | ||
8 | * No-idle-tick (NO_HZ) implementation for low and high resolution timers | ||
9 | * | ||
10 | * Started by: Thomas Gleixner and Ingo Molnar | ||
11 | * | ||
12 | * For licencing details see kernel-base/COPYING | ||
13 | */ | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <linux/hrtimer.h> | ||
17 | #include <linux/interrupt.h> | ||
18 | #include <linux/kernel_stat.h> | ||
19 | #include <linux/percpu.h> | ||
20 | #include <linux/profile.h> | ||
21 | #include <linux/sched.h> | ||
22 | #include <linux/tick.h> | ||
23 | |||
24 | #include "tick-internal.h" | ||
25 | |||
26 | /* | ||
27 | * Per cpu nohz control structure | ||
28 | */ | ||
29 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | ||
30 | |||
31 | /* | ||
32 | * The time, when the last jiffy update happened. Protected by xtime_lock. | ||
33 | */ | ||
34 | static ktime_t last_jiffies_update; | ||
35 | |||
36 | struct tick_sched *tick_get_tick_sched(int cpu) | ||
37 | { | ||
38 | return &per_cpu(tick_cpu_sched, cpu); | ||
39 | } | ||
40 | |||
41 | /* | ||
42 | * Must be called with interrupts disabled! | ||
43 | */ | ||
44 | static void tick_do_update_jiffies64(ktime_t now) | ||
45 | { | ||
46 | unsigned long ticks = 0; | ||
47 | ktime_t delta; | ||
48 | |||
49 | /* Reevaluate with xtime_lock held */ | ||
50 | write_seqlock(&xtime_lock); | ||
51 | |||
52 | delta = ktime_sub(now, last_jiffies_update); | ||
53 | if (delta.tv64 >= tick_period.tv64) { | ||
54 | |||
55 | delta = ktime_sub(delta, tick_period); | ||
56 | last_jiffies_update = ktime_add(last_jiffies_update, | ||
57 | tick_period); | ||
58 | |||
59 | /* Slow path for long timeouts */ | ||
60 | if (unlikely(delta.tv64 >= tick_period.tv64)) { | ||
61 | s64 incr = ktime_to_ns(tick_period); | ||
62 | |||
63 | ticks = ktime_divns(delta, incr); | ||
64 | |||
65 | last_jiffies_update = ktime_add_ns(last_jiffies_update, | ||
66 | incr * ticks); | ||
67 | } | ||
68 | do_timer(++ticks); | ||
69 | } | ||
70 | write_sequnlock(&xtime_lock); | ||
71 | } | ||
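
Worked example of the slow path above: with HZ=1000 (a 1,000,000 ns tick_period) and a 4.6 ms gap since the last update, the first subtraction covers one tick, ktime_divns() finds three more whole periods, and do_timer() is called with 4; the remaining 0.6 ms stays pending in last_jiffies_update. The same arithmetic as a standalone C sketch:

    #include <stdio.h>

    int main(void)
    {
        long long period = 1000000;   /* 1 ms tick period at HZ=1000  */
        long long delta  = 4600000;   /* 4.6 ms since the last update */
        long long ticks  = 0;

        if (delta >= period) {
            delta -= period;              /* the regular one-tick path */
            if (delta >= period)          /* slow path for long sleeps */
                ticks = delta / period;   /* ktime_divns() equivalent  */
            printf("do_timer(%lld)\n", ++ticks);   /* -> do_timer(4)   */
        }
        return 0;
    }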
72 | |||
73 | /* | ||
74 | * Initialize and return the time of the last jiffies update. | ||
75 | */ | ||
76 | static ktime_t tick_init_jiffy_update(void) | ||
77 | { | ||
78 | ktime_t period; | ||
79 | |||
80 | write_seqlock(&xtime_lock); | ||
81 | /* Did we start the jiffies update yet? */ | ||
82 | if (last_jiffies_update.tv64 == 0) | ||
83 | last_jiffies_update = tick_next_period; | ||
84 | period = last_jiffies_update; | ||
85 | write_sequnlock(&xtime_lock); | ||
86 | return period; | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * NOHZ - aka dynamic tick functionality | ||
91 | */ | ||
92 | #ifdef CONFIG_NO_HZ | ||
93 | /* | ||
94 | * NO HZ enabled? | ||
95 | */ | ||
96 | static int tick_nohz_enabled __read_mostly = 1; | ||
97 | |||
98 | /* | ||
99 | * Enable / Disable tickless mode | ||
100 | */ | ||
101 | static int __init setup_tick_nohz(char *str) | ||
102 | { | ||
103 | if (!strcmp(str, "off")) | ||
104 | tick_nohz_enabled = 0; | ||
105 | else if (!strcmp(str, "on")) | ||
106 | tick_nohz_enabled = 1; | ||
107 | else | ||
108 | return 0; | ||
109 | return 1; | ||
110 | } | ||
111 | |||
112 | __setup("nohz=", setup_tick_nohz); | ||
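
The __setup() hook above wires the parser to the kernel command line, so dynticks can be disabled at boot:

    nohz=off

Only the literal strings "on" and "off" are accepted; anything else makes setup_tick_nohz() return 0, i.e. the argument is not consumed.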
113 | |||
114 | /** | ||
115 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | ||
116 | * | ||
117 | * Called from interrupt entry when the CPU was idle | ||
118 | * | ||
119 | * In case the sched_tick was stopped on this CPU, we have to check if jiffies | ||
120 | * must be updated. Otherwise an interrupt handler could use a stale jiffy | ||
121 | * value. We do this unconditionally on any cpu, as we don't know whether the | ||
122 | * cpu which has the update task assigned is in a long sleep. | ||
123 | */ | ||
124 | void tick_nohz_update_jiffies(void) | ||
125 | { | ||
126 | int cpu = smp_processor_id(); | ||
127 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
128 | unsigned long flags; | ||
129 | ktime_t now; | ||
130 | |||
131 | if (!ts->tick_stopped) | ||
132 | return; | ||
133 | |||
134 | cpu_clear(cpu, nohz_cpu_mask); | ||
135 | now = ktime_get(); | ||
136 | |||
137 | local_irq_save(flags); | ||
138 | tick_do_update_jiffies64(now); | ||
139 | local_irq_restore(flags); | ||
140 | } | ||
141 | |||
142 | /** | ||
143 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task | ||
144 | * | ||
145 | * When the next event is more than a tick into the future, stop the idle tick. | ||
146 | * Called either from the idle loop or from irq_exit() when an idle period was | ||
147 | * just interrupted by an interrupt which did not cause a reschedule. | ||
148 | */ | ||
149 | void tick_nohz_stop_sched_tick(void) | ||
150 | { | ||
151 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | ||
152 | struct tick_sched *ts; | ||
153 | ktime_t last_update, expires, now, delta; | ||
154 | int cpu; | ||
155 | |||
156 | local_irq_save(flags); | ||
157 | |||
158 | cpu = smp_processor_id(); | ||
159 | ts = &per_cpu(tick_cpu_sched, cpu); | ||
160 | |||
161 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | ||
162 | goto end; | ||
163 | |||
164 | if (need_resched()) | ||
165 | goto end; | ||
166 | |||
167 | cpu = smp_processor_id(); | ||
168 | BUG_ON(local_softirq_pending()); | ||
169 | |||
170 | now = ktime_get(); | ||
171 | /* | ||
172 | * When called from irq_exit we need to account the idle sleep time | ||
173 | * correctly. | ||
174 | */ | ||
175 | if (ts->tick_stopped) { | ||
176 | delta = ktime_sub(now, ts->idle_entrytime); | ||
177 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
178 | } | ||
179 | |||
180 | ts->idle_entrytime = now; | ||
181 | ts->idle_calls++; | ||
182 | |||
183 | /* Read jiffies and the time when jiffies were updated last */ | ||
184 | do { | ||
185 | seq = read_seqbegin(&xtime_lock); | ||
186 | last_update = last_jiffies_update; | ||
187 | last_jiffies = jiffies; | ||
188 | } while (read_seqretry(&xtime_lock, seq)); | ||
189 | |||
190 | /* Get the next timer wheel timer */ | ||
191 | next_jiffies = get_next_timer_interrupt(last_jiffies); | ||
192 | delta_jiffies = next_jiffies - last_jiffies; | ||
193 | |||
194 | /* | ||
195 | * Do not stop the tick if we are only one jiffy off | ||
196 | * or if the cpu is required for rcu. | ||
197 | */ | ||
198 | if (!ts->tick_stopped && (delta_jiffies == 1 || rcu_needs_cpu(cpu))) | ||
199 | goto out; | ||
200 | |||
201 | /* Schedule the tick if we are at least one jiffy off */ | ||
202 | if ((long)delta_jiffies >= 1) { | ||
203 | |||
204 | if (rcu_needs_cpu(cpu)) | ||
205 | delta_jiffies = 1; | ||
206 | else | ||
207 | cpu_set(cpu, nohz_cpu_mask); | ||
208 | /* | ||
209 | * tick_nohz_stop_sched_tick() can be called several times before | ||
210 | * tick_nohz_restart_sched_tick() is called. This happens when | ||
211 | * interrupts arrive which do not cause a reschedule. In the | ||
212 | * first call we save the current tick time, so we can restart | ||
213 | * the scheduler tick in tick_nohz_restart_sched_tick(). | ||
214 | */ | ||
215 | if (!ts->tick_stopped) { | ||
216 | ts->idle_tick = ts->sched_timer.expires; | ||
217 | ts->tick_stopped = 1; | ||
218 | ts->idle_jiffies = last_jiffies; | ||
219 | } | ||
220 | /* | ||
221 | * calculate the expiry time for the next timer wheel | ||
222 | * timer | ||
223 | */ | ||
224 | expires = ktime_add_ns(last_update, tick_period.tv64 * | ||
225 | delta_jiffies); | ||
226 | ts->idle_expires = expires; | ||
227 | ts->idle_sleeps++; | ||
228 | |||
229 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { | ||
230 | hrtimer_start(&ts->sched_timer, expires, | ||
231 | HRTIMER_MODE_ABS); | ||
232 | /* Check if the timer was already in the past */ | ||
233 | if (hrtimer_active(&ts->sched_timer)) | ||
234 | goto out; | ||
235 | } else if (!tick_program_event(expires, 0)) | ||
236 | goto out; | ||
237 | /* | ||
238 | * We are past the event already. So we crossed a | ||
239 | * jiffy boundary. Update jiffies and raise the | ||
240 | * softirq. | ||
241 | */ | ||
242 | tick_do_update_jiffies64(ktime_get()); | ||
243 | cpu_clear(cpu, nohz_cpu_mask); | ||
244 | } | ||
245 | raise_softirq_irqoff(TIMER_SOFTIRQ); | ||
246 | out: | ||
247 | ts->next_jiffies = next_jiffies; | ||
248 | ts->last_jiffies = last_jiffies; | ||
249 | end: | ||
250 | local_irq_restore(flags); | ||
251 | } | ||
252 | |||
253 | /** | ||
254 | * tick_nohz_restart_sched_tick - restart the idle tick from the idle task | ||
255 | * | ||
256 | * Restart the idle tick when the CPU is woken up from idle | ||
257 | */ | ||
258 | void tick_nohz_restart_sched_tick(void) | ||
259 | { | ||
260 | int cpu = smp_processor_id(); | ||
261 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
262 | unsigned long ticks; | ||
263 | ktime_t now, delta; | ||
264 | |||
265 | if (!ts->tick_stopped) | ||
266 | return; | ||
267 | |||
268 | /* Update jiffies first */ | ||
269 | now = ktime_get(); | ||
270 | |||
271 | local_irq_disable(); | ||
272 | tick_do_update_jiffies64(now); | ||
273 | cpu_clear(cpu, nohz_cpu_mask); | ||
274 | |||
275 | /* Account the idle time */ | ||
276 | delta = ktime_sub(now, ts->idle_entrytime); | ||
277 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
278 | |||
279 | /* | ||
280 | * We stopped the tick in idle. update_process_times() would miss the | ||
281 | * time we slept, as it only does one tick worth of | ||
282 | * accounting. Enforce that this is accounted to idle! | ||
283 | */ | ||
284 | ticks = jiffies - ts->idle_jiffies; | ||
285 | /* | ||
286 | * We might be one tick off. Do not randomly account a huge number of ticks! | ||
287 | */ | ||
288 | if (ticks && ticks < LONG_MAX) { | ||
289 | add_preempt_count(HARDIRQ_OFFSET); | ||
290 | account_system_time(current, HARDIRQ_OFFSET, | ||
291 | jiffies_to_cputime(ticks)); | ||
292 | sub_preempt_count(HARDIRQ_OFFSET); | ||
293 | } | ||
294 | |||
295 | /* | ||
296 | * Cancel the scheduled timer and restore the tick | ||
297 | */ | ||
298 | ts->tick_stopped = 0; | ||
299 | hrtimer_cancel(&ts->sched_timer); | ||
300 | ts->sched_timer.expires = ts->idle_tick; | ||
301 | |||
302 | while (1) { | ||
303 | /* Forward the time to expire in the future */ | ||
304 | hrtimer_forward(&ts->sched_timer, now, tick_period); | ||
305 | |||
306 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { | ||
307 | hrtimer_start(&ts->sched_timer, | ||
308 | ts->sched_timer.expires, | ||
309 | HRTIMER_MODE_ABS); | ||
310 | /* Check if the timer was already in the past */ | ||
311 | if (hrtimer_active(&ts->sched_timer)) | ||
312 | break; | ||
313 | } else { | ||
314 | if (!tick_program_event(ts->sched_timer.expires, 0)) | ||
315 | break; | ||
316 | } | ||
317 | /* Update jiffies and reread time */ | ||
318 | tick_do_update_jiffies64(now); | ||
319 | now = ktime_get(); | ||
320 | } | ||
321 | local_irq_enable(); | ||
322 | } | ||
323 | |||
324 | static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) | ||
325 | { | ||
326 | hrtimer_forward(&ts->sched_timer, now, tick_period); | ||
327 | return tick_program_event(ts->sched_timer.expires, 0); | ||
328 | } | ||
329 | |||
330 | /* | ||
331 | * The nohz low res interrupt handler | ||
332 | */ | ||
333 | static void tick_nohz_handler(struct clock_event_device *dev) | ||
334 | { | ||
335 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
336 | struct pt_regs *regs = get_irq_regs(); | ||
337 | ktime_t now = ktime_get(); | ||
338 | |||
339 | dev->next_event.tv64 = KTIME_MAX; | ||
340 | |||
341 | /* Check if jiffies needs an update */ | ||
342 | tick_do_update_jiffies64(now); | ||
343 | |||
344 | /* | ||
345 | * When we are idle and the tick is stopped, we have to touch | ||
346 | * the watchdog as we might not schedule for a really long | ||
347 | * time. This happens on complete idle SMP systems while | ||
348 | * waiting on the login prompt. We also increment the "start | ||
349 | * of idle" jiffy stamp so the idle accounting adjustment we | ||
350 | * do when we go busy again does not account too many ticks. | ||
351 | */ | ||
352 | if (ts->tick_stopped) { | ||
353 | touch_softlockup_watchdog(); | ||
354 | ts->idle_jiffies++; | ||
355 | } | ||
356 | |||
357 | update_process_times(user_mode(regs)); | ||
358 | profile_tick(CPU_PROFILING); | ||
359 | |||
360 | /* Do not restart when we are in the idle loop */ | ||
361 | if (ts->tick_stopped) | ||
362 | return; | ||
363 | |||
364 | while (tick_nohz_reprogram(ts, now)) { | ||
365 | now = ktime_get(); | ||
366 | tick_do_update_jiffies64(now); | ||
367 | } | ||
368 | } | ||
369 | |||
370 | /** | ||
371 | * tick_nohz_switch_to_nohz - switch to nohz mode | ||
372 | */ | ||
373 | static void tick_nohz_switch_to_nohz(void) | ||
374 | { | ||
375 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
376 | ktime_t next; | ||
377 | |||
378 | if (!tick_nohz_enabled) | ||
379 | return; | ||
380 | |||
381 | local_irq_disable(); | ||
382 | if (tick_switch_to_oneshot(tick_nohz_handler)) { | ||
383 | local_irq_enable(); | ||
384 | return; | ||
385 | } | ||
386 | |||
387 | ts->nohz_mode = NOHZ_MODE_LOWRES; | ||
388 | |||
389 | /* | ||
390 | * Recycle the hrtimer in ts, so we can share the | ||
391 | * hrtimer_forward with the highres code. | ||
392 | */ | ||
393 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
394 | /* Get the next period */ | ||
395 | next = tick_init_jiffy_update(); | ||
396 | |||
397 | for (;;) { | ||
398 | ts->sched_timer.expires = next; | ||
399 | if (!tick_program_event(next, 0)) | ||
400 | break; | ||
401 | next = ktime_add(next, tick_period); | ||
402 | } | ||
403 | local_irq_enable(); | ||
404 | |||
405 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", | ||
406 | smp_processor_id()); | ||
407 | } | ||
408 | |||
409 | #else | ||
410 | |||
411 | static inline void tick_nohz_switch_to_nohz(void) { } | ||
412 | |||
413 | #endif /* NO_HZ */ | ||
414 | |||
415 | /* | ||
416 | * High resolution timer specific code | ||
417 | */ | ||
418 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
419 | /* | ||
420 | * We rearm the timer until we get disabled by the idle code. | ||
421 | * Called with interrupts disabled and timer->base->cpu_base->lock held. | ||
422 | */ | ||
423 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | ||
424 | { | ||
425 | struct tick_sched *ts = | ||
426 | container_of(timer, struct tick_sched, sched_timer); | ||
427 | struct hrtimer_cpu_base *base = timer->base->cpu_base; | ||
428 | struct pt_regs *regs = get_irq_regs(); | ||
429 | ktime_t now = ktime_get(); | ||
430 | |||
431 | /* Check if jiffies needs an update */ | ||
432 | tick_do_update_jiffies64(now); | ||
433 | |||
434 | /* | ||
435 | * Do not call when we are not in irq context and have | ||
436 | * no valid regs pointer: | ||
437 | */ | ||
438 | if (regs) { | ||
439 | /* | ||
440 | * When we are idle and the tick is stopped, we have to touch | ||
441 | * the watchdog as we might not schedule for a really long | ||
442 | * time. This happens on complete idle SMP systems while | ||
443 | * waiting on the login prompt. We also increment the "start of | ||
444 | * idle" jiffy stamp so the idle accounting adjustment we do | ||
445 | * when we go busy again does not account too many ticks. | ||
446 | */ | ||
447 | if (ts->tick_stopped) { | ||
448 | touch_softlockup_watchdog(); | ||
449 | ts->idle_jiffies++; | ||
450 | } | ||
451 | /* | ||
452 | * update_process_times() might take tasklist_lock, hence | ||
453 | * drop the base lock. sched-tick hrtimers are per-CPU and | ||
454 | * never accessible by userspace APIs, so this is safe to do. | ||
455 | */ | ||
456 | spin_unlock(&base->lock); | ||
457 | update_process_times(user_mode(regs)); | ||
458 | profile_tick(CPU_PROFILING); | ||
459 | spin_lock(&base->lock); | ||
460 | } | ||
461 | |||
462 | /* Do not restart when we are in the idle loop */ | ||
463 | if (ts->tick_stopped) | ||
464 | return HRTIMER_NORESTART; | ||
465 | |||
466 | hrtimer_forward(timer, now, tick_period); | ||
467 | |||
468 | return HRTIMER_RESTART; | ||
469 | } | ||
470 | |||
471 | /** | ||
472 | * tick_setup_sched_timer - setup the tick emulation timer | ||
473 | */ | ||
474 | void tick_setup_sched_timer(void) | ||
475 | { | ||
476 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
477 | ktime_t now = ktime_get(); | ||
478 | |||
479 | /* | ||
480 | * Emulate tick processing via per-CPU hrtimers: | ||
481 | */ | ||
482 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
483 | ts->sched_timer.function = tick_sched_timer; | ||
484 | ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
485 | |||
486 | /* Get the next period */ | ||
487 | ts->sched_timer.expires = tick_init_jiffy_update(); | ||
488 | |||
489 | for (;;) { | ||
490 | hrtimer_forward(&ts->sched_timer, now, tick_period); | ||
491 | hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, | ||
492 | HRTIMER_MODE_ABS); | ||
493 | /* Check if the timer was already in the past */ | ||
494 | if (hrtimer_active(&ts->sched_timer)) | ||
495 | break; | ||
496 | now = ktime_get(); | ||
497 | } | ||
498 | |||
499 | #ifdef CONFIG_NO_HZ | ||
500 | if (tick_nohz_enabled) | ||
501 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | ||
502 | #endif | ||
503 | } | ||
504 | |||
505 | void tick_cancel_sched_timer(int cpu) | ||
506 | { | ||
507 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
508 | |||
509 | if (ts->sched_timer.base) | ||
510 | hrtimer_cancel(&ts->sched_timer); | ||
511 | ts->tick_stopped = 0; | ||
512 | ts->nohz_mode = NOHZ_MODE_INACTIVE; | ||
513 | } | ||
514 | #endif /* HIGH_RES_TIMERS */ | ||
515 | |||
516 | /** | ||
517 | * Async notification about clocksource changes | ||
518 | */ | ||
519 | void tick_clock_notify(void) | ||
520 | { | ||
521 | int cpu; | ||
522 | |||
523 | for_each_possible_cpu(cpu) | ||
524 | set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); | ||
525 | } | ||
526 | |||
527 | /* | ||
528 | * Async notification about clock event changes | ||
529 | */ | ||
530 | void tick_oneshot_notify(void) | ||
531 | { | ||
532 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
533 | |||
534 | set_bit(0, &ts->check_clocks); | ||
535 | } | ||
536 | |||
537 | /** | ||
538 | * Check if a change happened which makes oneshot possible. | ||
539 | * | ||
540 | * Called cyclically from the hrtimer softirq (driven by the timer | ||
541 | * softirq). allow_nohz signals that we can switch into low-res nohz | ||
542 | * mode, because high resolution timers are disabled (either at compile | ||
543 | * time or at runtime). | ||
544 | */ | ||
545 | int tick_check_oneshot_change(int allow_nohz) | ||
546 | { | ||
547 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
548 | |||
549 | if (!test_and_clear_bit(0, &ts->check_clocks)) | ||
550 | return 0; | ||
551 | |||
552 | if (ts->nohz_mode != NOHZ_MODE_INACTIVE) | ||
553 | return 0; | ||
554 | |||
555 | if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) | ||
556 | return 0; | ||
557 | |||
558 | if (!allow_nohz) | ||
559 | return 1; | ||
560 | |||
561 | tick_nohz_switch_to_nohz(); | ||
562 | return 0; | ||
563 | } | ||
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c new file mode 100644 index 000000000000..f82c635c3d5c --- /dev/null +++ b/kernel/time/timer_list.c | |||
@@ -0,0 +1,287 @@ | |||
1 | /* | ||
2 | * kernel/time/timer_list.c | ||
3 | * | ||
4 | * List pending timers | ||
5 | * | ||
6 | * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License version 2 as | ||
10 | * published by the Free Software Foundation. | ||
11 | */ | ||
12 | |||
13 | #include <linux/proc_fs.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/spinlock.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/seq_file.h> | ||
18 | #include <linux/kallsyms.h> | ||
19 | #include <linux/tick.h> | ||
20 | |||
21 | #include <asm/uaccess.h> | ||
22 | |||
23 | typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); | ||
24 | |||
25 | DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); | ||
26 | |||
27 | /* | ||
28 | * This allows printing both to /proc/timer_list and | ||
29 | * to the console (on SysRq-Q): | ||
30 | */ | ||
31 | #define SEQ_printf(m, x...) \ | ||
32 | do { \ | ||
33 | if (m) \ | ||
34 | seq_printf(m, x); \ | ||
35 | else \ | ||
36 | printk(x); \ | ||
37 | } while (0) | ||
38 | |||
39 | static void print_name_offset(struct seq_file *m, void *sym) | ||
40 | { | ||
41 | unsigned long addr = (unsigned long)sym; | ||
42 | char namebuf[KSYM_NAME_LEN+1]; | ||
43 | unsigned long size, offset; | ||
44 | const char *sym_name; | ||
45 | char *modname; | ||
46 | |||
47 | sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); | ||
48 | if (sym_name) | ||
49 | SEQ_printf(m, "%s", sym_name); | ||
50 | else | ||
51 | SEQ_printf(m, "<%p>", sym); | ||
52 | } | ||
53 | |||
54 | static void | ||
55 | print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) | ||
56 | { | ||
57 | #ifdef CONFIG_TIMER_STATS | ||
58 | char tmp[TASK_COMM_LEN + 1]; | ||
59 | #endif | ||
60 | SEQ_printf(m, " #%d: ", idx); | ||
61 | print_name_offset(m, timer); | ||
62 | SEQ_printf(m, ", "); | ||
63 | print_name_offset(m, timer->function); | ||
64 | SEQ_printf(m, ", S:%02lx", timer->state); | ||
65 | #ifdef CONFIG_TIMER_STATS | ||
66 | SEQ_printf(m, ", "); | ||
67 | print_name_offset(m, timer->start_site); | ||
68 | memcpy(tmp, timer->start_comm, TASK_COMM_LEN); | ||
69 | tmp[TASK_COMM_LEN] = 0; | ||
70 | SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); | ||
71 | #endif | ||
72 | SEQ_printf(m, "\n"); | ||
73 | SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", | ||
74 | (unsigned long long)ktime_to_ns(timer->expires), | ||
75 | (unsigned long long)(ktime_to_ns(timer->expires) - now)); | ||
76 | } | ||
77 | |||
78 | static void | ||
79 | print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, | ||
80 | u64 now) | ||
81 | { | ||
82 | struct hrtimer *timer, tmp; | ||
83 | unsigned long next = 0, i; | ||
84 | struct rb_node *curr; | ||
85 | unsigned long flags; | ||
86 | |||
87 | next_one: | ||
88 | i = 0; | ||
89 | spin_lock_irqsave(&base->cpu_base->lock, flags); | ||
90 | |||
91 | curr = base->first; | ||
92 | /* | ||
93 | * Crude but we have to do this O(N*N) thing, because | ||
94 | * we have to unlock the base when printing: | ||
95 | */ | ||
96 | while (curr && i < next) { | ||
97 | curr = rb_next(curr); | ||
98 | i++; | ||
99 | } | ||
100 | |||
101 | if (curr) { | ||
102 | |||
103 | timer = rb_entry(curr, struct hrtimer, node); | ||
104 | tmp = *timer; | ||
105 | spin_unlock_irqrestore(&base->cpu_base->lock, flags); | ||
106 | |||
107 | print_timer(m, &tmp, i, now); | ||
108 | next++; | ||
109 | goto next_one; | ||
110 | } | ||
111 | spin_unlock_irqrestore(&base->cpu_base->lock, flags); | ||
112 | } | ||
113 | |||
114 | static void | ||
115 | print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) | ||
116 | { | ||
117 | SEQ_printf(m, " .index: %d\n", | ||
118 | base->index); | ||
119 | SEQ_printf(m, " .resolution: %Ld nsecs\n", | ||
120 | (unsigned long long)ktime_to_ns(base->resolution)); | ||
121 | SEQ_printf(m, " .get_time: "); | ||
122 | print_name_offset(m, base->get_time); | ||
123 | SEQ_printf(m, "\n"); | ||
124 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
125 | SEQ_printf(m, " .offset: %Ld nsecs\n", | ||
126 | ktime_to_ns(base->offset)); | ||
127 | #endif | ||
128 | SEQ_printf(m, "active timers:\n"); | ||
129 | print_active_timers(m, base, now); | ||
130 | } | ||
131 | |||
132 | static void print_cpu(struct seq_file *m, int cpu, u64 now) | ||
133 | { | ||
134 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); | ||
135 | int i; | ||
136 | |||
137 | SEQ_printf(m, "\ncpu: %d\n", cpu); | ||
138 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | ||
139 | SEQ_printf(m, " clock %d:\n", i); | ||
140 | print_base(m, cpu_base->clock_base + i, now); | ||
141 | } | ||
142 | #define P(x) \ | ||
143 | SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) | ||
144 | #define P_ns(x) \ | ||
145 | SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ | ||
146 | (u64)(ktime_to_ns(cpu_base->x))) | ||
147 | |||
148 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
149 | P_ns(expires_next); | ||
150 | P(hres_active); | ||
151 | P(nr_events); | ||
152 | #endif | ||
153 | #undef P | ||
154 | #undef P_ns | ||
155 | |||
156 | #ifdef CONFIG_TICK_ONESHOT | ||
157 | # define P(x) \ | ||
158 | SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) | ||
159 | # define P_ns(x) \ | ||
160 | SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ | ||
161 | (u64)(ktime_to_ns(ts->x))) | ||
162 | { | ||
163 | struct tick_sched *ts = tick_get_tick_sched(cpu); | ||
164 | P(nohz_mode); | ||
165 | P_ns(idle_tick); | ||
166 | P(tick_stopped); | ||
167 | P(idle_jiffies); | ||
168 | P(idle_calls); | ||
169 | P(idle_sleeps); | ||
170 | P_ns(idle_entrytime); | ||
171 | P_ns(idle_sleeptime); | ||
172 | P(last_jiffies); | ||
173 | P(next_jiffies); | ||
174 | P_ns(idle_expires); | ||
175 | SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); | ||
176 | } | ||
177 | #endif | ||
178 | |||
179 | #undef P | ||
180 | #undef P_ns | ||
181 | } | ||
182 | |||
183 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | ||
184 | static void | ||
185 | print_tickdevice(struct seq_file *m, struct tick_device *td) | ||
186 | { | ||
187 | struct clock_event_device *dev = td->evtdev; | ||
188 | |||
189 | SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode); | ||
190 | |||
191 | SEQ_printf(m, "Clock Event Device: "); | ||
192 | if (!dev) { | ||
193 | SEQ_printf(m, "<NULL>\n"); | ||
194 | return; | ||
195 | } | ||
196 | SEQ_printf(m, "%s\n", dev->name); | ||
197 | SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns); | ||
198 | SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns); | ||
199 | SEQ_printf(m, " mult: %ld\n", dev->mult); | ||
200 | SEQ_printf(m, " shift: %d\n", dev->shift); | ||
201 | SEQ_printf(m, " mode: %d\n", dev->mode); | ||
202 | SEQ_printf(m, " next_event: %Ld nsecs\n", | ||
203 | (unsigned long long) ktime_to_ns(dev->next_event)); | ||
204 | |||
205 | SEQ_printf(m, " set_next_event: "); | ||
206 | print_name_offset(m, dev->set_next_event); | ||
207 | SEQ_printf(m, "\n"); | ||
208 | |||
209 | SEQ_printf(m, " set_mode: "); | ||
210 | print_name_offset(m, dev->set_mode); | ||
211 | SEQ_printf(m, "\n"); | ||
212 | |||
213 | SEQ_printf(m, " event_handler: "); | ||
214 | print_name_offset(m, dev->event_handler); | ||
215 | SEQ_printf(m, "\n"); | ||
216 | } | ||
217 | |||
218 | static void timer_list_show_tickdevices(struct seq_file *m) | ||
219 | { | ||
220 | int cpu; | ||
221 | |||
222 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
223 | print_tickdevice(m, tick_get_broadcast_device()); | ||
224 | SEQ_printf(m, "tick_broadcast_mask: %08lx\n", | ||
225 | tick_get_broadcast_mask()->bits[0]); | ||
226 | #ifdef CONFIG_TICK_ONESHOT | ||
227 | SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", | ||
228 | tick_get_broadcast_oneshot_mask()->bits[0]); | ||
229 | #endif | ||
230 | SEQ_printf(m, "\n"); | ||
231 | #endif | ||
232 | for_each_online_cpu(cpu) | ||
233 | print_tickdevice(m, tick_get_device(cpu)); | ||
234 | SEQ_printf(m, "\n"); | ||
235 | } | ||
236 | #else | ||
237 | static void timer_list_show_tickdevices(struct seq_file *m) { } | ||
238 | #endif | ||
239 | |||
240 | static int timer_list_show(struct seq_file *m, void *v) | ||
241 | { | ||
242 | u64 now = ktime_to_ns(ktime_get()); | ||
243 | int cpu; | ||
244 | |||
245 | SEQ_printf(m, "Timer List Version: v0.3\n"); | ||
246 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); | ||
247 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); | ||
248 | |||
249 | for_each_online_cpu(cpu) | ||
250 | print_cpu(m, cpu, now); | ||
251 | |||
252 | SEQ_printf(m, "\n"); | ||
253 | timer_list_show_tickdevices(m); | ||
254 | |||
255 | return 0; | ||
256 | } | ||
257 | |||
258 | void sysrq_timer_list_show(void) | ||
259 | { | ||
260 | timer_list_show(NULL, NULL); | ||
261 | } | ||
262 | |||
263 | static int timer_list_open(struct inode *inode, struct file *filp) | ||
264 | { | ||
265 | return single_open(filp, timer_list_show, NULL); | ||
266 | } | ||
267 | |||
268 | static struct file_operations timer_list_fops = { | ||
269 | .open = timer_list_open, | ||
270 | .read = seq_read, | ||
271 | .llseek = seq_lseek, | ||
272 | .release = seq_release, | ||
273 | }; | ||
274 | |||
275 | static int __init init_timer_list_procfs(void) | ||
276 | { | ||
277 | struct proc_dir_entry *pe; | ||
278 | |||
279 | pe = create_proc_entry("timer_list", 0644, NULL); | ||
280 | if (!pe) | ||
281 | return -ENOMEM; | ||
282 | |||
283 | pe->proc_fops = &timer_list_fops; | ||
284 | |||
285 | return 0; | ||
286 | } | ||
287 | __initcall(init_timer_list_procfs); | ||
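
Once the procfs entry is registered, the same dump that SysRq-Q prints to the console can be read at any time:

    # cat /proc/timer_list

Both paths share timer_list_show(); the sysrq path passes a NULL seq_file, so SEQ_printf() falls back to printk().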
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c new file mode 100644 index 000000000000..1bc4882e28e0 --- /dev/null +++ b/kernel/time/timer_stats.c | |||
@@ -0,0 +1,411 @@ | |||
1 | /* | ||
2 | * kernel/time/timer_stats.c | ||
3 | * | ||
4 | * Collect timer usage statistics. | ||
5 | * | ||
6 | * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar | ||
7 | * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * timer_stats is based on timer_top, a similar functionality which was part of | ||
10 | * Con Kolivas' dyntick patch set. It was developed by Daniel Petrini at the | ||
11 | * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based | ||
12 | * on dynamic allocation of the statistics entries and linear search based | ||
13 | * lookup combined with a global lock, rather than the static array, hash | ||
14 | * and per-CPU locking which is used by timer_stats. It was written for the | ||
15 | * pre-hrtimer kernel code and therefore did not take hrtimers into account. | ||
16 | * Nevertheless it provided the base for the timer_stats implementation and | ||
17 | * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks | ||
18 | * for this effort. | ||
19 | * | ||
20 | * timer_top.c is | ||
21 | * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus | ||
22 | * Written by Daniel Petrini <d.pensator@gmail.com> | ||
23 | * timer_top.c was released under the GNU General Public License version 2 | ||
24 | * | ||
25 | * We export the addresses and call counts of the timer functions being | ||
26 | * called, plus the pid and cmdline of the owner process if applicable. | ||
27 | * | ||
28 | * Start/stop data collection: | ||
29 | * # echo 1 >/proc/timer_stats   (echo 0 to stop) | ||
30 | * | ||
31 | * Display the information collected so far: | ||
32 | * # cat /proc/timer_stats | ||
33 | * | ||
34 | * This program is free software; you can redistribute it and/or modify | ||
35 | * it under the terms of the GNU General Public License version 2 as | ||
36 | * published by the Free Software Foundation. | ||
37 | */ | ||
38 | |||
39 | #include <linux/proc_fs.h> | ||
40 | #include <linux/module.h> | ||
41 | #include <linux/spinlock.h> | ||
42 | #include <linux/sched.h> | ||
43 | #include <linux/seq_file.h> | ||
44 | #include <linux/kallsyms.h> | ||
45 | |||
46 | #include <asm/uaccess.h> | ||
47 | |||
48 | /* | ||
49 | * This is our basic unit of interest: a timer expiry event identified | ||
50 | * by the timer, its start/expire functions and the PID of the task that | ||
51 | * started the timer. We count the number of times an event happens: | ||
52 | */ | ||
53 | struct entry { | ||
54 | /* | ||
55 | * Hash list: | ||
56 | */ | ||
57 | struct entry *next; | ||
58 | |||
59 | /* | ||
60 | * Hash keys: | ||
61 | */ | ||
62 | void *timer; | ||
63 | void *start_func; | ||
64 | void *expire_func; | ||
65 | pid_t pid; | ||
66 | |||
67 | /* | ||
68 | * Number of timeout events: | ||
69 | */ | ||
70 | unsigned long count; | ||
71 | |||
72 | /* | ||
73 | * We save the command-line string to preserve | ||
74 | * this information past task exit: | ||
75 | */ | ||
76 | char comm[TASK_COMM_LEN + 1]; | ||
77 | |||
78 | } ____cacheline_aligned_in_smp; | ||
79 | |||
80 | /* | ||
81 | * Spinlock protecting the tables - not taken during lookup: | ||
82 | */ | ||
83 | static DEFINE_SPINLOCK(table_lock); | ||
84 | |||
85 | /* | ||
86 | * Per-CPU lookup locks for fast hash lookup: | ||
87 | */ | ||
88 | static DEFINE_PER_CPU(spinlock_t, lookup_lock); | ||
89 | |||
90 | /* | ||
91 | * Mutex to serialize state changes with show-stats activities: | ||
92 | */ | ||
93 | static DEFINE_MUTEX(show_mutex); | ||
94 | |||
95 | /* | ||
96 | * Collection status, active/inactive: | ||
97 | */ | ||
98 | static int __read_mostly active; | ||
99 | |||
100 | /* | ||
101 | * Beginning/end timestamps of measurement: | ||
102 | */ | ||
103 | static ktime_t time_start, time_stop; | ||
104 | |||
105 | /* | ||
106 | * tstat entry structs only get allocated while collection is | ||
107 | * active and never freed during that time - this simplifies | ||
108 | * things quite a bit. | ||
109 | * | ||
110 | * They get reset when a new collection period is started. | ||
111 | */ | ||
112 | #define MAX_ENTRIES_BITS 10 | ||
113 | #define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) | ||
114 | |||
115 | static unsigned long nr_entries; | ||
116 | static struct entry entries[MAX_ENTRIES]; | ||
117 | |||
118 | static atomic_t overflow_count; | ||
119 | |||
120 | static void reset_entries(void) | ||
121 | { | ||
122 | nr_entries = 0; | ||
123 | memset(entries, 0, sizeof(entries)); | ||
124 | atomic_set(&overflow_count, 0); | ||
125 | } | ||
126 | |||
127 | static struct entry *alloc_entry(void) | ||
128 | { | ||
129 | if (nr_entries >= MAX_ENTRIES) | ||
130 | return NULL; | ||
131 | |||
132 | return entries + nr_entries++; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * The entries are in a hash-table, for fast lookup: | ||
137 | */ | ||
138 | #define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) | ||
139 | #define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) | ||
140 | #define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) | ||
141 | |||
142 | #define __tstat_hashfn(entry) \ | ||
143 | (((unsigned long)(entry)->timer ^ \ | ||
144 | (unsigned long)(entry)->start_func ^ \ | ||
145 | (unsigned long)(entry)->expire_func ^ \ | ||
146 | (unsigned long)(entry)->pid) & TSTAT_HASH_MASK) | ||
147 | |||
148 | #define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) | ||
149 | |||
150 | static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; | ||
151 | |||
152 | static int match_entries(struct entry *entry1, struct entry *entry2) | ||
153 | { | ||
154 | return entry1->timer == entry2->timer && | ||
155 | entry1->start_func == entry2->start_func && | ||
156 | entry1->expire_func == entry2->expire_func && | ||
157 | entry1->pid == entry2->pid; | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Look up whether an entry matching this item is present | ||
162 | * in the hash already. Must be called with irqs off and the | ||
163 | * lookup lock held: | ||
164 | */ | ||
165 | static struct entry *tstat_lookup(struct entry *entry, char *comm) | ||
166 | { | ||
167 | struct entry **head, *curr, *prev; | ||
168 | |||
169 | head = tstat_hashentry(entry); | ||
170 | curr = *head; | ||
171 | |||
172 | /* | ||
173 | * The fastpath is when the entry is already hashed; | ||
174 | * we walk the chain with the lookup lock held, but | ||
175 | * with the table lock not held: | ||
176 | */ | ||
177 | while (curr) { | ||
178 | if (match_entries(curr, entry)) | ||
179 | return curr; | ||
180 | |||
181 | curr = curr->next; | ||
182 | } | ||
183 | /* | ||
184 | * Slowpath: allocate, set up and link a new hash entry: | ||
185 | */ | ||
186 | prev = NULL; | ||
187 | curr = *head; | ||
188 | |||
189 | spin_lock(&table_lock); | ||
190 | /* | ||
191 | * Make sure we have not raced with another CPU: | ||
192 | */ | ||
193 | while (curr) { | ||
194 | if (match_entries(curr, entry)) | ||
195 | goto out_unlock; | ||
196 | |||
197 | prev = curr; | ||
198 | curr = curr->next; | ||
199 | } | ||
200 | |||
201 | curr = alloc_entry(); | ||
202 | if (curr) { | ||
203 | *curr = *entry; | ||
204 | curr->count = 0; | ||
205 | memcpy(curr->comm, comm, TASK_COMM_LEN); | ||
206 | if (prev) | ||
207 | prev->next = curr; | ||
208 | else | ||
209 | *head = curr; | ||
210 | curr->next = NULL; | ||
211 | } | ||
212 | out_unlock: | ||
213 | spin_unlock(&table_lock); | ||
214 | |||
215 | return curr; | ||
216 | } | ||
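tstat_lookup() is a textbook optimistic hash insert: an unlocked walk of the bucket first (made safe by the caller's per-CPU lookup lock and by ordered publication), then a locked re-walk under table_lock to catch a racing CPU before a new entry is linked in, fully initialized. A self-contained user-space sketch of the same shape, with a pthread mutex standing in for table_lock:

#include <pthread.h>
#include <stdlib.h>

struct node {
    struct node *next;
    long key;
    unsigned long count;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct node *lookup_or_insert(struct node **head, long key)
{
    struct node *curr, *prev = NULL;

    for (curr = *head; curr; curr = curr->next)     /* fastpath: no table_lock */
        if (curr->key == key)
            return curr;

    pthread_mutex_lock(&table_lock);                /* slowpath */
    for (curr = *head; curr; prev = curr, curr = curr->next)
        if (curr->key == key)
            goto out;                               /* another thread beat us */

    curr = calloc(1, sizeof(*curr));
    if (curr) {
        curr->key = key;
        curr->next = NULL;          /* fully initialize first ... */
        if (prev)
            prev->next = curr;      /* ... then publish */
        else
            *head = curr;
    }
out:
    pthread_mutex_unlock(&table_lock);
    return curr;
}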
217 | |||
218 | /** | ||
219 | * timer_stats_update_stats - Update the statistics for a timer. | ||
220 | * @timer: pointer to either a timer_list or a hrtimer | ||
221 | * @pid: the pid of the task which set up the timer | ||
222 | * @startf: pointer to the function which did the timer setup | ||
223 | * @timerf: pointer to the timer callback function of the timer | ||
224 | * @comm: name of the process which set up the timer | ||
225 | * | ||
226 | * If the timer is already registered, the event counter is | ||
227 | * incremented. Otherwise the timer is registered in a free slot. | ||
228 | */ | ||
229 | void timer_stats_update_stats(void *timer, pid_t pid, void *startf, | ||
230 | void *timerf, char *comm) | ||
231 | { | ||
232 | /* | ||
233 | * It doesn't matter which lock we take: | ||
234 | */ | ||
235 | spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id()); | ||
236 | struct entry *entry, input; | ||
237 | unsigned long flags; | ||
238 | |||
239 | input.timer = timer; | ||
240 | input.start_func = startf; | ||
241 | input.expire_func = timerf; | ||
242 | input.pid = pid; | ||
243 | |||
244 | spin_lock_irqsave(lock, flags); | ||
245 | if (!active) | ||
246 | goto out_unlock; | ||
247 | |||
248 | entry = tstat_lookup(&input, comm); | ||
249 | if (likely(entry)) | ||
250 | entry->count++; | ||
251 | else | ||
252 | atomic_inc(&overflow_count); | ||
253 | |||
254 | out_unlock: | ||
255 | spin_unlock_irqrestore(lock, flags); | ||
256 | } | ||
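The call sites added in kernel/timer.c below reach this function through small wrappers (timer_stats_account_timer() and friends) defined in a header that is not part of this diff. Their likely shape, reconstructed here purely as a sketch from the fields this patch adds to struct timer_list:

static inline void timer_stats_account_timer(struct timer_list *timer)
{
    timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
                             timer->function, timer->start_comm);
}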
257 | |||
258 | static void print_name_offset(struct seq_file *m, unsigned long addr) | ||
259 | { | ||
260 | char namebuf[KSYM_NAME_LEN+1]; | ||
261 | unsigned long size, offset; | ||
262 | const char *sym_name; | ||
263 | char *modname; | ||
264 | |||
265 | sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); | ||
266 | if (sym_name) | ||
267 | seq_printf(m, "%s", sym_name); | ||
268 | else | ||
269 | seq_printf(m, "<%p>", (void *)addr); | ||
270 | } | ||
271 | |||
272 | static int tstats_show(struct seq_file *m, void *v) | ||
273 | { | ||
274 | struct timespec period; | ||
275 | struct entry *entry; | ||
276 | unsigned long ms; | ||
277 | long events = 0; | ||
278 | ktime_t time; | ||
279 | int i; | ||
280 | |||
281 | mutex_lock(&show_mutex); | ||
282 | /* | ||
283 | * If still active then calculate up to now: | ||
284 | */ | ||
285 | if (active) | ||
286 | time_stop = ktime_get(); | ||
287 | |||
288 | time = ktime_sub(time_stop, time_start); | ||
289 | |||
290 | period = ktime_to_timespec(time); | ||
291 | ms = period.tv_nsec / 1000000; | ||
292 | |||
293 | seq_puts(m, "Timer Stats Version: v0.1\n"); | ||
294 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); | ||
295 | if (atomic_read(&overflow_count)) | ||
296 | seq_printf(m, "Overflow: %d entries\n", | ||
297 | atomic_read(&overflow_count)); | ||
298 | |||
299 | for (i = 0; i < nr_entries; i++) { | ||
300 | entry = entries + i; | ||
301 | seq_printf(m, "%4lu, %5d %-16s ", | ||
302 | entry->count, entry->pid, entry->comm); | ||
303 | |||
304 | print_name_offset(m, (unsigned long)entry->start_func); | ||
305 | seq_puts(m, " ("); | ||
306 | print_name_offset(m, (unsigned long)entry->expire_func); | ||
307 | seq_puts(m, ")\n"); | ||
308 | |||
309 | events += entry->count; | ||
310 | } | ||
311 | |||
312 | ms += period.tv_sec * 1000; | ||
313 | if (!ms) | ||
314 | ms = 1; | ||
315 | |||
316 | if (events && period.tv_sec) | ||
317 | seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", events, | ||
318 | events * 1000 / ms, (events * 1000000 / ms) % 1000); | ||
319 | else | ||
320 | seq_printf(m, "%ld total events\n", events); | ||
321 | |||
322 | mutex_unlock(&show_mutex); | ||
323 | |||
324 | return 0; | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | * After a state change, make sure all concurrent lookup/update | ||
329 | * activities have stopped: | ||
330 | */ | ||
331 | static void sync_access(void) | ||
332 | { | ||
333 | unsigned long flags; | ||
334 | int cpu; | ||
335 | |||
336 | for_each_online_cpu(cpu) { | ||
337 | spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); | ||
338 | /* nothing */ | ||
339 | spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); | ||
340 | } | ||
341 | } | ||
342 | |||
343 | static ssize_t tstats_write(struct file *file, const char __user *buf, | ||
344 | size_t count, loff_t *offs) | ||
345 | { | ||
346 | char ctl[2]; | ||
347 | |||
348 | if (count != 2 || *offs) | ||
349 | return -EINVAL; | ||
350 | |||
351 | if (copy_from_user(ctl, buf, count)) | ||
352 | return -EFAULT; | ||
353 | |||
354 | mutex_lock(&show_mutex); | ||
355 | switch (ctl[0]) { | ||
356 | case '0': | ||
357 | if (active) { | ||
358 | active = 0; | ||
359 | time_stop = ktime_get(); | ||
360 | sync_access(); | ||
361 | } | ||
362 | break; | ||
363 | case '1': | ||
364 | if (!active) { | ||
365 | reset_entries(); | ||
366 | time_start = ktime_get(); | ||
367 | active = 1; | ||
368 | } | ||
369 | break; | ||
370 | default: | ||
371 | count = -EINVAL; | ||
372 | } | ||
373 | mutex_unlock(&show_mutex); | ||
374 | |||
375 | return count; | ||
376 | } | ||
377 | |||
378 | static int tstats_open(struct inode *inode, struct file *filp) | ||
379 | { | ||
380 | return single_open(filp, tstats_show, NULL); | ||
381 | } | ||
382 | |||
383 | static struct file_operations tstats_fops = { | ||
384 | .open = tstats_open, | ||
385 | .read = seq_read, | ||
386 | .write = tstats_write, | ||
387 | .llseek = seq_lseek, | ||
388 | .release = seq_release, | ||
389 | }; | ||
390 | |||
391 | void __init init_timer_stats(void) | ||
392 | { | ||
393 | int cpu; | ||
394 | |||
395 | for_each_possible_cpu(cpu) | ||
396 | spin_lock_init(&per_cpu(lookup_lock, cpu)); | ||
397 | } | ||
398 | |||
399 | static int __init init_tstats_procfs(void) | ||
400 | { | ||
401 | struct proc_dir_entry *pe; | ||
402 | |||
403 | pe = create_proc_entry("timer_stats", 0644, NULL); | ||
404 | if (!pe) | ||
405 | return -ENOMEM; | ||
406 | |||
407 | pe->proc_fops = &tstats_fops; | ||
408 | |||
409 | return 0; | ||
410 | } | ||
411 | __initcall(init_tstats_procfs); | ||
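Putting it together, output from this file looks roughly like the following (values and function names are illustrative only; the layout follows the seq_printf() calls in tstats_show() above):

Timer Stats Version: v0.1
Sample period: 3.888 s
  15,     1 swapper          schedule_timeout (process_timeout)
  12,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)
   4,   959 kondemand/1      do_dbs_timer (delayed_work_timer_fn)
1359 total events, 349.537 events/sec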
diff --git a/kernel/timer.c b/kernel/timer.c index 8533c3796082..cb1b86a9c52f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -34,6 +34,8 @@ | |||
34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
36 | #include <linux/delay.h> | 36 | #include <linux/delay.h> |
37 | #include <linux/tick.h> | ||
38 | #include <linux/kallsyms.h> | ||
37 | 39 | ||
38 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
39 | #include <asm/unistd.h> | 41 | #include <asm/unistd.h> |
@@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | |||
262 | list_add_tail(&timer->entry, vec); | 264 | list_add_tail(&timer->entry, vec); |
263 | } | 265 | } |
264 | 266 | ||
267 | #ifdef CONFIG_TIMER_STATS | ||
268 | void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) | ||
269 | { | ||
270 | if (timer->start_site) | ||
271 | return; | ||
272 | |||
273 | timer->start_site = addr; | ||
274 | memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); | ||
275 | timer->start_pid = current->pid; | ||
276 | } | ||
277 | #endif | ||
278 | |||
265 | /** | 279 | /** |
266 | * init_timer - initialize a timer. | 280 | * init_timer - initialize a timer. |
267 | * @timer: the timer to be initialized | 281 | * @timer: the timer to be initialized |
@@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer) | |||
273 | { | 287 | { |
274 | timer->entry.next = NULL; | 288 | timer->entry.next = NULL; |
275 | timer->base = __raw_get_cpu_var(tvec_bases); | 289 | timer->base = __raw_get_cpu_var(tvec_bases); |
290 | #ifdef CONFIG_TIMER_STATS | ||
291 | timer->start_site = NULL; | ||
292 | timer->start_pid = -1; | ||
293 | memset(timer->start_comm, 0, TASK_COMM_LEN); | ||
294 | #endif | ||
276 | } | 295 | } |
277 | EXPORT_SYMBOL(init_timer); | 296 | EXPORT_SYMBOL(init_timer); |
278 | 297 | ||
279 | static inline void detach_timer(struct timer_list *timer, | 298 | static inline void detach_timer(struct timer_list *timer, |
280 | int clear_pending) | 299 | int clear_pending) |
281 | { | 300 | { |
282 | struct list_head *entry = &timer->entry; | 301 | struct list_head *entry = &timer->entry; |
283 | 302 | ||
@@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
324 | unsigned long flags; | 343 | unsigned long flags; |
325 | int ret = 0; | 344 | int ret = 0; |
326 | 345 | ||
346 | timer_stats_timer_set_start_info(timer); | ||
327 | BUG_ON(!timer->function); | 347 | BUG_ON(!timer->function); |
328 | 348 | ||
329 | base = lock_timer_base(timer, &flags); | 349 | base = lock_timer_base(timer, &flags); |
@@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
374 | tvec_base_t *base = per_cpu(tvec_bases, cpu); | 394 | tvec_base_t *base = per_cpu(tvec_bases, cpu); |
375 | unsigned long flags; | 395 | unsigned long flags; |
376 | 396 | ||
397 | timer_stats_timer_set_start_info(timer); | ||
377 | BUG_ON(timer_pending(timer) || !timer->function); | 398 | BUG_ON(timer_pending(timer) || !timer->function); |
378 | spin_lock_irqsave(&base->lock, flags); | 399 | spin_lock_irqsave(&base->lock, flags); |
379 | timer->base = base; | 400 | timer->base = base; |
@@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) | |||
406 | { | 427 | { |
407 | BUG_ON(!timer->function); | 428 | BUG_ON(!timer->function); |
408 | 429 | ||
430 | timer_stats_timer_set_start_info(timer); | ||
409 | /* | 431 | /* |
410 | * This is a common optimization triggered by the | 432 | * This is a common optimization triggered by the |
411 | * networking code - if the timer is re-modified | 433 | * networking code - if the timer is re-modified |
@@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer) | |||
436 | unsigned long flags; | 458 | unsigned long flags; |
437 | int ret = 0; | 459 | int ret = 0; |
438 | 460 | ||
461 | timer_stats_timer_clear_start_info(timer); | ||
439 | if (timer_pending(timer)) { | 462 | if (timer_pending(timer)) { |
440 | base = lock_timer_base(timer, &flags); | 463 | base = lock_timer_base(timer, &flags); |
441 | if (timer_pending(timer)) { | 464 | if (timer_pending(timer)) { |
@@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base) | |||
569 | fn = timer->function; | 592 | fn = timer->function; |
570 | data = timer->data; | 593 | data = timer->data; |
571 | 594 | ||
595 | timer_stats_account_timer(timer); | ||
596 | |||
572 | set_running_timer(base, timer); | 597 | set_running_timer(base, timer); |
573 | detach_timer(timer, 1); | 598 | detach_timer(timer, 1); |
574 | spin_unlock_irq(&base->lock); | 599 | spin_unlock_irq(&base->lock); |
@@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base) | |||
591 | spin_unlock_irq(&base->lock); | 616 | spin_unlock_irq(&base->lock); |
592 | } | 617 | } |
593 | 618 | ||
594 | #ifdef CONFIG_NO_IDLE_HZ | 619 | #if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) |
595 | /* | 620 | /* |
596 | * Find out when the next timer event is due to happen. This | 621 | * Find out when the next timer event is due to happen. This |
597 | * is used on S/390 to stop all activity when a CPU is idle. | 622 | * is used on S/390 to stop all activity when a CPU is idle. |
598 | * This function needs to be called with interrupts disabled. | 623 | * This function needs to be called with interrupts disabled. |
599 | */ | 624 | */ |
600 | unsigned long next_timer_interrupt(void) | 625 | static unsigned long __next_timer_interrupt(tvec_base_t *base) |
601 | { | 626 | { |
602 | tvec_base_t *base; | 627 | unsigned long timer_jiffies = base->timer_jiffies; |
603 | struct list_head *list; | 628 | unsigned long expires = timer_jiffies + (LONG_MAX >> 1); |
629 | int index, slot, array, found = 0; | ||
604 | struct timer_list *nte; | 630 | struct timer_list *nte; |
605 | unsigned long expires; | ||
606 | unsigned long hr_expires = MAX_JIFFY_OFFSET; | ||
607 | ktime_t hr_delta; | ||
608 | tvec_t *varray[4]; | 631 | tvec_t *varray[4]; |
609 | int i, j; | ||
610 | |||
611 | hr_delta = hrtimer_get_next_event(); | ||
612 | if (hr_delta.tv64 != KTIME_MAX) { | ||
613 | struct timespec tsdelta; | ||
614 | tsdelta = ktime_to_timespec(hr_delta); | ||
615 | hr_expires = timespec_to_jiffies(&tsdelta); | ||
616 | if (hr_expires < 3) | ||
617 | return hr_expires + jiffies; | ||
618 | } | ||
619 | hr_expires += jiffies; | ||
620 | |||
621 | base = __get_cpu_var(tvec_bases); | ||
622 | spin_lock(&base->lock); | ||
623 | expires = base->timer_jiffies + (LONG_MAX >> 1); | ||
624 | list = NULL; | ||
625 | 632 | ||
626 | /* Look for timer events in tv1. */ | 633 | /* Look for timer events in tv1. */ |
627 | j = base->timer_jiffies & TVR_MASK; | 634 | index = slot = timer_jiffies & TVR_MASK; |
628 | do { | 635 | do { |
629 | list_for_each_entry(nte, base->tv1.vec + j, entry) { | 636 | list_for_each_entry(nte, base->tv1.vec + slot, entry) { |
637 | found = 1; | ||
630 | expires = nte->expires; | 638 | expires = nte->expires; |
631 | if (j < (base->timer_jiffies & TVR_MASK)) | 639 | /* Look at the cascade bucket(s)? */ |
632 | list = base->tv2.vec + (INDEX(0)); | 640 | if (!index || slot < index) |
633 | goto found; | 641 | goto cascade; |
642 | return expires; | ||
634 | } | 643 | } |
635 | j = (j + 1) & TVR_MASK; | 644 | slot = (slot + 1) & TVR_MASK; |
636 | } while (j != (base->timer_jiffies & TVR_MASK)); | 645 | } while (slot != index); |
646 | |||
647 | cascade: | ||
648 | /* Calculate the next cascade event */ | ||
649 | if (index) | ||
650 | timer_jiffies += TVR_SIZE - index; | ||
651 | timer_jiffies >>= TVR_BITS; | ||
637 | 652 | ||
638 | /* Check tv2-tv5. */ | 653 | /* Check tv2-tv5. */ |
639 | varray[0] = &base->tv2; | 654 | varray[0] = &base->tv2; |
640 | varray[1] = &base->tv3; | 655 | varray[1] = &base->tv3; |
641 | varray[2] = &base->tv4; | 656 | varray[2] = &base->tv4; |
642 | varray[3] = &base->tv5; | 657 | varray[3] = &base->tv5; |
643 | for (i = 0; i < 4; i++) { | 658 | |
644 | j = INDEX(i); | 659 | for (array = 0; array < 4; array++) { |
660 | tvec_t *varp = varray[array]; | ||
661 | |||
662 | index = slot = timer_jiffies & TVN_MASK; | ||
645 | do { | 663 | do { |
646 | if (list_empty(varray[i]->vec + j)) { | 664 | list_for_each_entry(nte, varp->vec + slot, entry) { |
647 | j = (j + 1) & TVN_MASK; | 665 | found = 1; |
648 | continue; | ||
649 | } | ||
650 | list_for_each_entry(nte, varray[i]->vec + j, entry) | ||
651 | if (time_before(nte->expires, expires)) | 666 | if (time_before(nte->expires, expires)) |
652 | expires = nte->expires; | 667 | expires = nte->expires; |
653 | if (j < (INDEX(i)) && i < 3) | 668 | } |
654 | list = varray[i + 1]->vec + (INDEX(i + 1)); | 669 | /* |
655 | goto found; | 670 | * Do we still search for the first timer or are |
656 | } while (j != (INDEX(i))); | 671 | * we looking up the cascade buckets ? |
657 | } | 672 | */ |
658 | found: | 673 | if (found) { |
659 | if (list) { | 674 | /* Look at the cascade bucket(s)? */ |
660 | /* | 675 | if (!index || slot < index) |
661 | * The search wrapped. We need to look at the next list | 676 | break; |
662 | * from next tv element that would cascade into tv element | 677 | return expires; |
663 | * where we found the timer element. | 678 | } |
664 | */ | 679 | slot = (slot + 1) & TVN_MASK; |
665 | list_for_each_entry(nte, list, entry) { | 680 | } while (slot != index); |
666 | if (time_before(nte->expires, expires)) | 681 | |
667 | expires = nte->expires; | 682 | if (index) |
668 | } | 683 | timer_jiffies += TVN_SIZE - index; |
684 | timer_jiffies >>= TVN_BITS; | ||
669 | } | 685 | } |
670 | spin_unlock(&base->lock); | 686 | return expires; |
687 | } | ||
671 | 688 | ||
672 | /* | 689 | /* |
673 | * It can happen that other CPUs service timer IRQs and increment | 690 | * Check, if the next hrtimer event is before the next timer wheel |
674 | * jiffies, but we have not yet got a local timer tick to process | 691 | * event: |
675 | * the timer wheels. In that case, the expiry time can be before | 692 | */ |
676 | * jiffies, but since the high-resolution timer here is relative to | 693 | static unsigned long cmp_next_hrtimer_event(unsigned long now, |
677 | * jiffies, the default expression when high-resolution timers are | 694 | unsigned long expires) |
678 | * not active, | 695 | { |
679 | * | 696 | ktime_t hr_delta = hrtimer_get_next_event(); |
680 | * time_before(MAX_JIFFY_OFFSET + jiffies, expires) | 697 | struct timespec tsdelta; |
681 | * | 698 | |
682 | * would falsely evaluate to true. If that is the case, just | 699 | if (hr_delta.tv64 == KTIME_MAX) |
683 | * return jiffies so that we can immediately fire the local timer | 700 | return expires; |
684 | */ | ||
685 | if (time_before(expires, jiffies)) | ||
686 | return jiffies; | ||
687 | 701 | ||
688 | if (time_before(hr_expires, expires)) | 702 | if (hr_delta.tv64 <= TICK_NSEC) |
689 | return hr_expires; | 703 | return now; |
690 | 704 | ||
705 | tsdelta = ktime_to_timespec(hr_delta); | ||
706 | now += timespec_to_jiffies(&tsdelta); | ||
707 | if (time_before(now, expires)) | ||
708 | return now; | ||
691 | return expires; | 709 | return expires; |
692 | } | 710 | } |
711 | |||
712 | /** | ||
713 | * get_next_timer_interrupt - return the jiffy of the next pending timer | ||
714 | */ | ||
715 | unsigned long get_next_timer_interrupt(unsigned long now) | ||
716 | { | ||
717 | tvec_base_t *base = __get_cpu_var(tvec_bases); | ||
718 | unsigned long expires; | ||
719 | |||
720 | spin_lock(&base->lock); | ||
721 | expires = __next_timer_interrupt(base); | ||
722 | spin_unlock(&base->lock); | ||
723 | |||
724 | if (time_before_eq(expires, now)) | ||
725 | return now; | ||
726 | |||
727 | return cmp_next_hrtimer_event(now, expires); | ||
728 | } | ||
729 | |||
730 | #ifdef CONFIG_NO_IDLE_HZ | ||
731 | unsigned long next_timer_interrupt(void) | ||
732 | { | ||
733 | return get_next_timer_interrupt(jiffies); | ||
734 | } | ||
735 | #endif | ||
736 | |||
693 | #endif | 737 | #endif |
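The slot arithmetic in __next_timer_interrupt() mirrors the wheel geometry used when timers are enqueued: tv1 resolves the low TVR_BITS of the expiry, and each further level covers another TVN_BITS, which is why the scan folds timer_jiffies with ">>= TVR_BITS" (then ">>= TVN_BITS") between levels. A self-contained sketch of that placement math, assuming the default TVR_BITS=8 / TVN_BITS=6 geometry and invented expiry values:

#include <stdio.h>

/* Default timer wheel geometry (CONFIG_BASE_SMALL=0). */
#define TVR_BITS 8
#define TVR_SIZE (1 << TVR_BITS)
#define TVR_MASK (TVR_SIZE - 1)
#define TVN_BITS 6
#define TVN_MASK ((1 << TVN_BITS) - 1)

int main(void)
{
    unsigned long timer_jiffies = 1000000;
    unsigned long expires[] = { 1000003, 1000300, 1070000 };

    for (int i = 0; i < 3; i++) {
        unsigned long delta = expires[i] - timer_jiffies;

        /* Which level and slot would hold this expiry? */
        if (delta < TVR_SIZE)
            printf("%lu: tv1 slot %lu\n", expires[i],
                   expires[i] & TVR_MASK);
        else if (delta < 1UL << (TVR_BITS + TVN_BITS))
            printf("%lu: tv2 slot %lu\n", expires[i],
                   (expires[i] >> TVR_BITS) & TVN_MASK);
        else
            printf("%lu: tv3 slot %lu\n", expires[i],
                   (expires[i] >> (TVR_BITS + TVN_BITS)) & TVN_MASK);
    }
    return 0;
}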
694 | 738 | ||
695 | /******************************************************************/ | 739 | /******************************************************************/ |
@@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday); | |||
832 | * | 876 | * |
833 | * Accumulates current time interval and initializes new clocksource | 877 | * Accumulates current time interval and initializes new clocksource |
834 | */ | 878 | */ |
835 | static int change_clocksource(void) | 879 | static void change_clocksource(void) |
836 | { | 880 | { |
837 | struct clocksource *new; | 881 | struct clocksource *new; |
838 | cycle_t now; | 882 | cycle_t now; |
839 | u64 nsec; | 883 | u64 nsec; |
884 | |||
840 | new = clocksource_get_next(); | 885 | new = clocksource_get_next(); |
841 | if (clock != new) { | 886 | |
842 | now = clocksource_read(new); | 887 | if (clock == new) |
843 | nsec = __get_nsec_offset(); | 888 | return; |
844 | timespec_add_ns(&xtime, nsec); | 889 | |
845 | 890 | now = clocksource_read(new); | |
846 | clock = new; | 891 | nsec = __get_nsec_offset(); |
847 | clock->cycle_last = now; | 892 | timespec_add_ns(&xtime, nsec); |
848 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | 893 | |
849 | clock->name); | 894 | clock = new; |
850 | return 1; | 895 | clock->cycle_last = now; |
851 | } else if (clock->update_callback) { | 896 | |
852 | return clock->update_callback(); | 897 | clock->error = 0; |
853 | } | 898 | clock->xtime_nsec = 0; |
854 | return 0; | 899 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); |
900 | |||
901 | tick_clock_notify(); | ||
902 | |||
903 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | ||
904 | clock->name); | ||
855 | } | 905 | } |
856 | #else | 906 | #else |
857 | static inline int change_clocksource(void) | 907 | static inline void change_clocksource(void) { } |
858 | { | ||
859 | return 0; | ||
860 | } | ||
861 | #endif | 908 | #endif |
862 | 909 | ||
863 | /** | 910 | /** |
@@ -871,33 +918,56 @@ int timekeeping_is_continuous(void) | |||
871 | do { | 918 | do { |
872 | seq = read_seqbegin(&xtime_lock); | 919 | seq = read_seqbegin(&xtime_lock); |
873 | 920 | ||
874 | ret = clock->is_continuous; | 921 | ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; |
875 | 922 | ||
876 | } while (read_seqretry(&xtime_lock, seq)); | 923 | } while (read_seqretry(&xtime_lock, seq)); |
877 | 924 | ||
878 | return ret; | 925 | return ret; |
879 | } | 926 | } |
880 | 927 | ||
928 | /** | ||
929 | * read_persistent_clock - Return time in seconds from the persistent clock. | ||
930 | * | ||
931 | * Weak dummy function for arches that do not yet support it. | ||
932 | * Returns seconds from epoch using the battery-backed persistent clock. | ||
933 | * Returns zero if unsupported. | ||
934 | * | ||
935 | * XXX - Be sure to remove it once all arches implement it. | ||
936 | */ | ||
937 | unsigned long __attribute__((weak)) read_persistent_clock(void) | ||
938 | { | ||
939 | return 0; | ||
940 | } | ||
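Because the symbol is weak, an architecture overrides it simply by defining a strong version that returns seconds since the epoch. A hypothetical sketch, where rtc_read_seconds() is an invented stand-in for whatever battery-backed clock accessor the platform provides:

unsigned long read_persistent_clock(void)
{
    return rtc_read_seconds();  /* invented helper: platform RTC, in seconds */
}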
941 | |||
881 | /* | 942 | /* |
882 | * timekeeping_init - Initializes the clocksource and common timekeeping values | 943 | * timekeeping_init - Initializes the clocksource and common timekeeping values |
883 | */ | 944 | */ |
884 | void __init timekeeping_init(void) | 945 | void __init timekeeping_init(void) |
885 | { | 946 | { |
886 | unsigned long flags; | 947 | unsigned long flags; |
948 | unsigned long sec = read_persistent_clock(); | ||
887 | 949 | ||
888 | write_seqlock_irqsave(&xtime_lock, flags); | 950 | write_seqlock_irqsave(&xtime_lock, flags); |
889 | 951 | ||
890 | ntp_clear(); | 952 | ntp_clear(); |
891 | 953 | ||
892 | clock = clocksource_get_next(); | 954 | clock = clocksource_get_next(); |
893 | clocksource_calculate_interval(clock, tick_nsec); | 955 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); |
894 | clock->cycle_last = clocksource_read(clock); | 956 | clock->cycle_last = clocksource_read(clock); |
895 | 957 | ||
958 | xtime.tv_sec = sec; | ||
959 | xtime.tv_nsec = 0; | ||
960 | set_normalized_timespec(&wall_to_monotonic, | ||
961 | -xtime.tv_sec, -xtime.tv_nsec); | ||
962 | |||
896 | write_sequnlock_irqrestore(&xtime_lock, flags); | 963 | write_sequnlock_irqrestore(&xtime_lock, flags); |
897 | } | 964 | } |
898 | 965 | ||
899 | 966 | /* flag indicating whether timekeeping is suspended */ |
900 | static int timekeeping_suspended; | 967 | static int timekeeping_suspended; |
968 | /* time in seconds when suspend began */ | ||
969 | static unsigned long timekeeping_suspend_time; | ||
970 | |||
901 | /** | 971 | /** |
902 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 972 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
903 | * @dev: unused | 973 | * @dev: unused |
@@ -909,13 +979,26 @@ static int timekeeping_suspended; | |||
909 | static int timekeeping_resume(struct sys_device *dev) | 979 | static int timekeeping_resume(struct sys_device *dev) |
910 | { | 980 | { |
911 | unsigned long flags; | 981 | unsigned long flags; |
982 | unsigned long now = read_persistent_clock(); | ||
912 | 983 | ||
913 | write_seqlock_irqsave(&xtime_lock, flags); | 984 | write_seqlock_irqsave(&xtime_lock, flags); |
914 | /* restart the last cycle value */ | 985 | |
986 | if (now && (now > timekeeping_suspend_time)) { | ||
987 | unsigned long sleep_length = now - timekeeping_suspend_time; | ||
988 | |||
989 | xtime.tv_sec += sleep_length; | ||
990 | wall_to_monotonic.tv_sec -= sleep_length; | ||
991 | } | ||
992 | /* re-base the last cycle value */ | ||
915 | clock->cycle_last = clocksource_read(clock); | 993 | clock->cycle_last = clocksource_read(clock); |
916 | clock->error = 0; | 994 | clock->error = 0; |
917 | timekeeping_suspended = 0; | 995 | timekeeping_suspended = 0; |
918 | write_sequnlock_irqrestore(&xtime_lock, flags); | 996 | write_sequnlock_irqrestore(&xtime_lock, flags); |
997 | |||
998 | touch_softlockup_watchdog(); | ||
999 | /* Resume hrtimers */ | ||
1000 | clock_was_set(); | ||
1001 | |||
919 | return 0; | 1002 | return 0; |
920 | } | 1003 | } |
921 | 1004 | ||
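The arithmetic above deliberately keeps the monotonic clock still across suspend: xtime gains sleep_length while wall_to_monotonic loses it, so their sum is unchanged and CLOCK_MONOTONIC does not jump. A worked sketch with invented numbers:

#include <stdio.h>

int main(void)
{
    /* Invented example values, in seconds. */
    long xtime = 1171700000;           /* wall clock at suspend */
    long wall_to_monotonic = -1171650000;
    long suspend_time = 1171700000;    /* read_persistent_clock() at suspend */
    long resume_time = 1171700060;     /* read_persistent_clock() at resume */
    long sleep_length = resume_time - suspend_time;

    printf("monotonic before: %ld\n", xtime + wall_to_monotonic);
    xtime += sleep_length;             /* wall clock catches up: +60s */
    wall_to_monotonic -= sleep_length; /* offset absorbs the jump */
    printf("monotonic after:  %ld\n", xtime + wall_to_monotonic);
    return 0;
}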
@@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
925 | 1008 | ||
926 | write_seqlock_irqsave(&xtime_lock, flags); | 1009 | write_seqlock_irqsave(&xtime_lock, flags); |
927 | timekeeping_suspended = 1; | 1010 | timekeeping_suspended = 1; |
1011 | timekeeping_suspend_time = read_persistent_clock(); | ||
928 | write_sequnlock_irqrestore(&xtime_lock, flags); | 1012 | write_sequnlock_irqrestore(&xtime_lock, flags); |
929 | return 0; | 1013 | return 0; |
930 | } | 1014 | } |
@@ -1089,11 +1173,8 @@ static void update_wall_time(void) | |||
1089 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; | 1173 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; |
1090 | 1174 | ||
1091 | /* check to see if there is a new clocksource to use */ | 1175 | /* check to see if there is a new clocksource to use */ |
1092 | if (change_clocksource()) { | 1176 | change_clocksource(); |
1093 | clock->error = 0; | 1177 | update_vsyscall(&xtime, clock); |
1094 | clock->xtime_nsec = 0; | ||
1095 | clocksource_calculate_interval(clock, tick_nsec); | ||
1096 | } | ||
1097 | } | 1178 | } |
1098 | 1179 | ||
1099 | /* | 1180 | /* |
@@ -1162,11 +1243,9 @@ static inline void calc_load(unsigned long ticks) | |||
1162 | * This read-write spinlock protects us from races in SMP while | 1243 | * This read-write spinlock protects us from races in SMP while |
1163 | * playing with xtime and avenrun. | 1244 | * playing with xtime and avenrun. |
1164 | */ | 1245 | */ |
1165 | #ifndef ARCH_HAVE_XTIME_LOCK | 1246 | __attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); |
1166 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | ||
1167 | 1247 | ||
1168 | EXPORT_SYMBOL(xtime_lock); | 1248 | EXPORT_SYMBOL(xtime_lock); |
1169 | #endif | ||
1170 | 1249 | ||
1171 | /* | 1250 | /* |
1172 | * This function runs timers and the timer-tq in bottom half context. | 1251 | * This function runs timers and the timer-tq in bottom half context. |
@@ -1175,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h) | |||
1175 | { | 1254 | { |
1176 | tvec_base_t *base = __get_cpu_var(tvec_bases); | 1255 | tvec_base_t *base = __get_cpu_var(tvec_bases); |
1177 | 1256 | ||
1178 | hrtimer_run_queues(); | 1257 | hrtimer_run_queues(); |
1258 | |||
1179 | if (time_after_eq(jiffies, base->timer_jiffies)) | 1259 | if (time_after_eq(jiffies, base->timer_jiffies)) |
1180 | __run_timers(base); | 1260 | __run_timers(base); |
1181 | } | 1261 | } |
@@ -1621,6 +1701,8 @@ void __init init_timers(void) | |||
1621 | int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1701 | int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, |
1622 | (void *)(long)smp_processor_id()); | 1702 | (void *)(long)smp_processor_id()); |
1623 | 1703 | ||
1704 | init_timer_stats(); | ||
1705 | |||
1624 | BUG_ON(err == NOTIFY_BAD); | 1706 | BUG_ON(err == NOTIFY_BAD); |
1625 | register_cpu_notifier(&timers_nb); | 1707 | register_cpu_notifier(&timers_nb); |
1626 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); | 1708 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index baacc3691415..658f638c402c 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -22,8 +22,6 @@ | |||
22 | #include <linux/acct.h> | 22 | #include <linux/acct.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | 24 | ||
25 | |||
26 | #define USEC_PER_TICK (USEC_PER_SEC/HZ) | ||
27 | /* | 25 | /* |
28 | * fill in basic accounting fields | 26 | * fill in basic accounting fields |
29 | */ | 27 | */ |
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c new file mode 100644 index 000000000000..f22b9dbd2a9c --- /dev/null +++ b/kernel/utsname_sysctl.c | |||
@@ -0,0 +1,146 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 | ||
3 | * | ||
4 | * Author: Eric Biederman <ebiederm@xmision.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License as | ||
8 | * published by the Free Software Foundation, version 2 of the | ||
9 | * License. | ||
10 | */ | ||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <linux/uts.h> | ||
14 | #include <linux/utsname.h> | ||
15 | #include <linux/version.h> | ||
16 | #include <linux/sysctl.h> | ||
17 | |||
18 | static void *get_uts(ctl_table *table, int write) | ||
19 | { | ||
20 | char *which = table->data; | ||
21 | #ifdef CONFIG_UTS_NS | ||
22 | struct uts_namespace *uts_ns = current->nsproxy->uts_ns; | ||
23 | which = (which - (char *)&init_uts_ns) + (char *)uts_ns; | ||
24 | #endif | ||
25 | if (!write) | ||
26 | down_read(&uts_sem); | ||
27 | else | ||
28 | down_write(&uts_sem); | ||
29 | return which; | ||
30 | } | ||
31 | |||
32 | static void put_uts(ctl_table *table, int write, void *which) | ||
33 | { | ||
34 | if (!write) | ||
35 | up_read(&uts_sem); | ||
36 | else | ||
37 | up_write(&uts_sem); | ||
38 | } | ||
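The pointer arithmetic in get_uts() deserves a second look: table->data holds the address of a field inside init_uts_ns, and subtracting the base of init_uts_ns leaves the field's byte offset, which is then reapplied to the current task's namespace copy. The same rebasing trick in a self-contained sketch:

#include <stdio.h>

struct name_block {
    char sysname[65];
    char nodename[65];
};

static struct name_block init_ns = { "Linux", "init-host" };
static struct name_block other_ns = { "Linux", "container-host" };

int main(void)
{
    /* A "table" records the field's address inside init_ns... */
    char *which = init_ns.nodename;
    /* ...and get_uts()-style arithmetic rebases it onto other_ns. */
    char *rebased = (which - (char *)&init_ns) + (char *)&other_ns;

    printf("%s\n", rebased);    /* prints "container-host" */
    return 0;
}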
39 | |||
40 | #ifdef CONFIG_PROC_FS | ||
41 | /* | ||
42 | * Special case of dostring for the UTS structure. This has locks | ||
43 | * to observe. Should this be in kernel/sys.c? | ||
44 | */ | ||
45 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
46 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
47 | { | ||
48 | struct ctl_table uts_table; | ||
49 | int r; | ||
50 | memcpy(&uts_table, table, sizeof(uts_table)); | ||
51 | uts_table.data = get_uts(table, write); | ||
52 | r = proc_dostring(&uts_table, write, filp, buffer, lenp, ppos); | ||
53 | put_uts(table, write, uts_table.data); | ||
54 | return r; | ||
55 | } | ||
56 | #else | ||
57 | #define proc_do_uts_string NULL | ||
58 | #endif | ||
59 | |||
60 | |||
61 | #ifdef CONFIG_SYSCTL_SYSCALL | ||
62 | /* The generic string strategy routine: */ | ||
63 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
64 | void __user *oldval, size_t __user *oldlenp, | ||
65 | void __user *newval, size_t newlen) | ||
66 | { | ||
67 | struct ctl_table uts_table; | ||
68 | int r, write; | ||
69 | write = newval && newlen; | ||
70 | memcpy(&uts_table, table, sizeof(uts_table)); | ||
71 | uts_table.data = get_uts(table, write); | ||
72 | r = sysctl_string(&uts_table, name, nlen, | ||
73 | oldval, oldlenp, newval, newlen); | ||
74 | put_uts(table, write, uts_table.data); | ||
75 | return r; | ||
76 | } | ||
77 | #else | ||
78 | #define sysctl_uts_string NULL | ||
79 | #endif | ||
80 | |||
81 | static struct ctl_table uts_kern_table[] = { | ||
82 | { | ||
83 | .ctl_name = KERN_OSTYPE, | ||
84 | .procname = "ostype", | ||
85 | .data = init_uts_ns.name.sysname, | ||
86 | .maxlen = sizeof(init_uts_ns.name.sysname), | ||
87 | .mode = 0444, | ||
88 | .proc_handler = proc_do_uts_string, | ||
89 | .strategy = sysctl_uts_string, | ||
90 | }, | ||
91 | { | ||
92 | .ctl_name = KERN_OSRELEASE, | ||
93 | .procname = "osrelease", | ||
94 | .data = init_uts_ns.name.release, | ||
95 | .maxlen = sizeof(init_uts_ns.name.release), | ||
96 | .mode = 0444, | ||
97 | .proc_handler = proc_do_uts_string, | ||
98 | .strategy = sysctl_uts_string, | ||
99 | }, | ||
100 | { | ||
101 | .ctl_name = KERN_VERSION, | ||
102 | .procname = "version", | ||
103 | .data = init_uts_ns.name.version, | ||
104 | .maxlen = sizeof(init_uts_ns.name.version), | ||
105 | .mode = 0444, | ||
106 | .proc_handler = proc_do_uts_string, | ||
107 | .strategy = sysctl_uts_string, | ||
108 | }, | ||
109 | { | ||
110 | .ctl_name = KERN_NODENAME, | ||
111 | .procname = "hostname", | ||
112 | .data = init_uts_ns.name.nodename, | ||
113 | .maxlen = sizeof(init_uts_ns.name.nodename), | ||
114 | .mode = 0644, | ||
115 | .proc_handler = proc_do_uts_string, | ||
116 | .strategy = sysctl_uts_string, | ||
117 | }, | ||
118 | { | ||
119 | .ctl_name = KERN_DOMAINNAME, | ||
120 | .procname = "domainname", | ||
121 | .data = init_uts_ns.name.domainname, | ||
122 | .maxlen = sizeof(init_uts_ns.name.domainname), | ||
123 | .mode = 0644, | ||
124 | .proc_handler = proc_do_uts_string, | ||
125 | .strategy = sysctl_uts_string, | ||
126 | }, | ||
127 | {} | ||
128 | }; | ||
129 | |||
130 | static struct ctl_table uts_root_table[] = { | ||
131 | { | ||
132 | .ctl_name = CTL_KERN, | ||
133 | .procname = "kernel", | ||
134 | .mode = 0555, | ||
135 | .child = uts_kern_table, | ||
136 | }, | ||
137 | {} | ||
138 | }; | ||
139 | |||
140 | static int __init utsname_sysctl_init(void) | ||
141 | { | ||
142 | register_sysctl_table(uts_root_table); | ||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | __initcall(utsname_sysctl_init); | ||
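With the tables registered, the values surface under /proc/sys/kernel/. A small sketch reading the nodename back through the handler above (with CONFIG_UTS_NS, get_uts() makes this return the calling task's namespace value):

#include <stdio.h>

int main(void)
{
    char hostname[65];
    FILE *f = fopen("/proc/sys/kernel/hostname", "r");

    if (!f || !fgets(hostname, sizeof(hostname), f)) {
        perror("hostname");
        return 1;
    }
    printf("%s", hostname);
    fclose(f);
    return 0;
}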
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 020d1fff57dc..b6fa5e63085d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
218 | } | 218 | } |
219 | EXPORT_SYMBOL_GPL(queue_work); | 219 | EXPORT_SYMBOL_GPL(queue_work); |
220 | 220 | ||
221 | static void delayed_work_timer_fn(unsigned long __data) | 221 | void delayed_work_timer_fn(unsigned long __data) |
222 | { | 222 | { |
223 | struct delayed_work *dwork = (struct delayed_work *)__data; | 223 | struct delayed_work *dwork = (struct delayed_work *)__data; |
224 | struct workqueue_struct *wq = get_wq_data(&dwork->work); | 224 | struct workqueue_struct *wq = get_wq_data(&dwork->work); |
@@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, | |||
245 | struct timer_list *timer = &dwork->timer; | 245 | struct timer_list *timer = &dwork->timer; |
246 | struct work_struct *work = &dwork->work; | 246 | struct work_struct *work = &dwork->work; |
247 | 247 | ||
248 | timer_stats_timer_set_start_info(timer); | ||
248 | if (delay == 0) | 249 | if (delay == 0) |
249 | return queue_work(wq, work); | 250 | return queue_work(wq, work); |
250 | 251 | ||
@@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work); | |||
593 | * After waiting for a given time this puts a job in the kernel-global | 594 | * After waiting for a given time this puts a job in the kernel-global |
594 | * workqueue. | 595 | * workqueue. |
595 | */ | 596 | */ |
596 | int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) | 597 | int fastcall schedule_delayed_work(struct delayed_work *dwork, |
598 | unsigned long delay) | ||
597 | { | 599 | { |
600 | timer_stats_timer_set_start_info(&dwork->timer); | ||
598 | return queue_delayed_work(keventd_wq, dwork, delay); | 601 | return queue_delayed_work(keventd_wq, dwork, delay); |
599 | } | 602 | } |
600 | EXPORT_SYMBOL(schedule_delayed_work); | 603 | EXPORT_SYMBOL(schedule_delayed_work); |
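For context on why these workqueue call sites are instrumented: delayed work ultimately arms dwork->timer, and because __timer_stats_timer_set_start_info() keeps the first recorded site, tagging the timer here lets the eventual expiry be attributed near the real submitter in /proc/timer_stats rather than to workqueue internals. A minimal module-style sketch of that call path (standard delayed-work boilerplate, not part of this patch):

#include <linux/module.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
    printk(KERN_INFO "delayed work ran\n");
}

static DECLARE_DELAYED_WORK(my_work, my_work_fn);

static int __init my_init(void)
{
    /* schedule_delayed_work() records start-site info on
     * dwork->timer before queueing (see the hunk above). */
    schedule_delayed_work(&my_work, 5 * HZ);
    return 0;
}

static void __exit my_exit(void)
{
    cancel_delayed_work(&my_work);
    flush_scheduled_work();
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");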