author     Jeff Garzik <jeff@garzik.org>  2007-02-17 15:11:43 -0500
committer  Jeff Garzik <jeff@garzik.org>  2007-02-17 15:11:43 -0500
commit     f630fe2817601314b2eb7ca5ddc23c7834646731 (patch)
tree       3bfb4939b7bbc3859575ca8b58fa3f929b015941 /kernel
parent     48c871c1f6a7c7044dd76774fb469e65c7e2e4e8 (diff)
parent     8a03d9a498eaf02c8a118752050a5154852c13bf (diff)
Merge branch 'master' into upstream
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                |    1
-rw-r--r--  kernel/fork.c                  |    2
-rw-r--r--  kernel/futex.c                 |    2
-rw-r--r--  kernel/hrtimer.c               |  824
-rw-r--r--  kernel/irq/chip.c              |   25
-rw-r--r--  kernel/irq/manage.c            |   46
-rw-r--r--  kernel/irq/proc.c              |   24
-rw-r--r--  kernel/itimer.c                |   18
-rw-r--r--  kernel/kmod.c                  |   44
-rw-r--r--  kernel/lockdep_proc.c          |    1
-rw-r--r--  kernel/mutex-debug.c           |    1
-rw-r--r--  kernel/posix-cpu-timers.c      |   15
-rw-r--r--  kernel/posix-timers.c          |   15
-rw-r--r--  kernel/resource.c              |    1
-rw-r--r--  kernel/rtmutex.c               |    2
-rw-r--r--  kernel/sched.c                 |    7
-rw-r--r--  kernel/signal.c                |   58
-rw-r--r--  kernel/softirq.c               |   19
-rw-r--r--  kernel/sysctl.c                |  591
-rw-r--r--  kernel/time.c                  |  254
-rw-r--r--  kernel/time/Kconfig            |   25
-rw-r--r--  kernel/time/Makefile           |    9
-rw-r--r--  kernel/time/clockevents.c      |  345
-rw-r--r--  kernel/time/clocksource.c      |  246
-rw-r--r--  kernel/time/jiffies.c          |    1
-rw-r--r--  kernel/time/ntp.c              |   30
-rw-r--r--  kernel/time/tick-broadcast.c   |  480
-rw-r--r--  kernel/time/tick-common.c      |  346
-rw-r--r--  kernel/time/tick-internal.h    |  110
-rw-r--r--  kernel/time/tick-oneshot.c     |   84
-rw-r--r--  kernel/time/tick-sched.c       |  563
-rw-r--r--  kernel/time/timer_list.c       |  287
-rw-r--r--  kernel/time/timer_stats.c      |  411
-rw-r--r--  kernel/timer.c                 |  290
-rw-r--r--  kernel/tsacct.c                |    2
-rw-r--r--  kernel/utsname_sysctl.c        |  146
-rw-r--r--  kernel/workqueue.c             |    7
37 files changed, 4401 insertions, 931 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 14f4d45e0ae9..ac6b27abb1ad 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
47obj-$(CONFIG_SECCOMP) += seccomp.o 47obj-$(CONFIG_SECCOMP) += seccomp.o
48obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 48obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
49obj-$(CONFIG_RELAY) += relay.o 49obj-$(CONFIG_RELAY) += relay.o
50obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
50obj-$(CONFIG_UTS_NS) += utsname.o 51obj-$(CONFIG_UTS_NS) += utsname.o
51obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 52obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
52obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 53obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
diff --git a/kernel/fork.c b/kernel/fork.c
index 0b6293d94d96..d154cc786489 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
858 init_sigpending(&sig->shared_pending); 858 init_sigpending(&sig->shared_pending);
859 INIT_LIST_HEAD(&sig->posix_timers); 859 INIT_LIST_HEAD(&sig->posix_timers);
860 860
861 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); 861 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
862 sig->it_real_incr.tv64 = 0; 862 sig->it_real_incr.tv64 = 0;
863 sig->real_timer.function = it_real_fn; 863 sig->real_timer.function = it_real_fn;
864 sig->tsk = tsk; 864 sig->tsk = tsk;
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a737de857d3..e749e7df14b1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1134 1134
1135 if (sec != MAX_SCHEDULE_TIMEOUT) { 1135 if (sec != MAX_SCHEDULE_TIMEOUT) {
1136 to = &timeout; 1136 to = &timeout;
1137 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); 1137 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
1138 hrtimer_init_sleeper(to, current); 1138 hrtimer_init_sleeper(to, current);
1139 to->timer.expires = ktime_set(sec, nsec); 1139 to->timer.expires = ktime_set(sec, nsec);
1140 } 1140 }
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f44e499e8fca..476cb0c0b4a4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1,8 +1,9 @@
1/* 1/*
2 * linux/kernel/hrtimer.c 2 * linux/kernel/hrtimer.c
3 * 3 *
4 * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> 4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar 5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
6 * 7 *
7 * High-resolution kernel timers 8 * High-resolution kernel timers
8 * 9 *
@@ -31,12 +32,17 @@
31 */ 32 */
32 33
33#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/irq.h>
34#include <linux/module.h> 36#include <linux/module.h>
35#include <linux/percpu.h> 37#include <linux/percpu.h>
36#include <linux/hrtimer.h> 38#include <linux/hrtimer.h>
37#include <linux/notifier.h> 39#include <linux/notifier.h>
38#include <linux/syscalls.h> 40#include <linux/syscalls.h>
41#include <linux/kallsyms.h>
39#include <linux/interrupt.h> 42#include <linux/interrupt.h>
43#include <linux/tick.h>
44#include <linux/seq_file.h>
45#include <linux/err.h>
40 46
41#include <asm/uaccess.h> 47#include <asm/uaccess.h>
42 48
@@ -45,7 +51,7 @@
45 * 51 *
46 * returns the time in ktime_t format 52 * returns the time in ktime_t format
47 */ 53 */
48static ktime_t ktime_get(void) 54ktime_t ktime_get(void)
49{ 55{
50 struct timespec now; 56 struct timespec now;
51 57
@@ -59,7 +65,7 @@ static ktime_t ktime_get(void)
59 * 65 *
60 * returns the time in ktime_t format 66 * returns the time in ktime_t format
61 */ 67 */
62static ktime_t ktime_get_real(void) 68ktime_t ktime_get_real(void)
63{ 69{
64 struct timespec now; 70 struct timespec now;
65 71
@@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
79 * This ensures that we capture erroneous accesses to these clock ids 85 * This ensures that we capture erroneous accesses to these clock ids
80 * rather than moving them into the range of valid clock id's. 86 * rather than moving them into the range of valid clock id's.
81 */ 87 */
82 88DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
83#define MAX_HRTIMER_BASES 2
84
85static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
86{ 89{
90
91 .clock_base =
87 { 92 {
88 .index = CLOCK_REALTIME, 93 {
89 .get_time = &ktime_get_real, 94 .index = CLOCK_REALTIME,
90 .resolution = KTIME_REALTIME_RES, 95 .get_time = &ktime_get_real,
91 }, 96 .resolution = KTIME_LOW_RES,
92 { 97 },
93 .index = CLOCK_MONOTONIC, 98 {
94 .get_time = &ktime_get, 99 .index = CLOCK_MONOTONIC,
95 .resolution = KTIME_MONOTONIC_RES, 100 .get_time = &ktime_get,
96 }, 101 .resolution = KTIME_LOW_RES,
102 },
103 }
97}; 104};
98 105
99/** 106/**
@@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
125 * Get the coarse grained time at the softirq based on xtime and 132 * Get the coarse grained time at the softirq based on xtime and
126 * wall_to_monotonic. 133 * wall_to_monotonic.
127 */ 134 */
128static void hrtimer_get_softirq_time(struct hrtimer_base *base) 135static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
129{ 136{
130 ktime_t xtim, tomono; 137 ktime_t xtim, tomono;
138 struct timespec xts;
131 unsigned long seq; 139 unsigned long seq;
132 140
133 do { 141 do {
134 seq = read_seqbegin(&xtime_lock); 142 seq = read_seqbegin(&xtime_lock);
135 xtim = timespec_to_ktime(xtime); 143#ifdef CONFIG_NO_HZ
136 tomono = timespec_to_ktime(wall_to_monotonic); 144 getnstimeofday(&xts);
137 145#else
146 xts = xtime;
147#endif
138 } while (read_seqretry(&xtime_lock, seq)); 148 } while (read_seqretry(&xtime_lock, seq));
139 149
140 base[CLOCK_REALTIME].softirq_time = xtim; 150 xtim = timespec_to_ktime(xts);
141 base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); 151 tomono = timespec_to_ktime(wall_to_monotonic);
152 base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
153 base->clock_base[CLOCK_MONOTONIC].softirq_time =
154 ktime_add(xtim, tomono);
155}
156
157/*
158 * Helper function to check, whether the timer is running the callback
159 * function
160 */
161static inline int hrtimer_callback_running(struct hrtimer *timer)
162{
163 return timer->state & HRTIMER_STATE_CALLBACK;
142} 164}
143 165
144/* 166/*
@@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
147 */ 169 */
148#ifdef CONFIG_SMP 170#ifdef CONFIG_SMP
149 171
150#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0)
151
152/* 172/*
153 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock 173 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
154 * means that all timers which are tied to this base via timer->base are 174 * means that all timers which are tied to this base via timer->base are
@@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
161 * possible to set timer->base = NULL and drop the lock: the timer remains 181 * possible to set timer->base = NULL and drop the lock: the timer remains
162 * locked. 182 * locked.
163 */ 183 */
164static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, 184static
165 unsigned long *flags) 185struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
186 unsigned long *flags)
166{ 187{
167 struct hrtimer_base *base; 188 struct hrtimer_clock_base *base;
168 189
169 for (;;) { 190 for (;;) {
170 base = timer->base; 191 base = timer->base;
171 if (likely(base != NULL)) { 192 if (likely(base != NULL)) {
172 spin_lock_irqsave(&base->lock, *flags); 193 spin_lock_irqsave(&base->cpu_base->lock, *flags);
173 if (likely(base == timer->base)) 194 if (likely(base == timer->base))
174 return base; 195 return base;
175 /* The timer has migrated to another CPU: */ 196 /* The timer has migrated to another CPU: */
176 spin_unlock_irqrestore(&base->lock, *flags); 197 spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
177 } 198 }
178 cpu_relax(); 199 cpu_relax();
179 } 200 }
@@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
182/* 203/*
183 * Switch the timer base to the current CPU when possible. 204 * Switch the timer base to the current CPU when possible.
184 */ 205 */
185static inline struct hrtimer_base * 206static inline struct hrtimer_clock_base *
186switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) 207switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
187{ 208{
188 struct hrtimer_base *new_base; 209 struct hrtimer_clock_base *new_base;
210 struct hrtimer_cpu_base *new_cpu_base;
189 211
190 new_base = &__get_cpu_var(hrtimer_bases)[base->index]; 212 new_cpu_base = &__get_cpu_var(hrtimer_bases);
213 new_base = &new_cpu_base->clock_base[base->index];
191 214
192 if (base != new_base) { 215 if (base != new_base) {
193 /* 216 /*
@@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
199 * completed. There is no conflict as we hold the lock until 222 * completed. There is no conflict as we hold the lock until
200 * the timer is enqueued. 223 * the timer is enqueued.
201 */ 224 */
202 if (unlikely(base->curr_timer == timer)) 225 if (unlikely(hrtimer_callback_running(timer)))
203 return base; 226 return base;
204 227
205 /* See the comment in lock_timer_base() */ 228 /* See the comment in lock_timer_base() */
206 timer->base = NULL; 229 timer->base = NULL;
207 spin_unlock(&base->lock); 230 spin_unlock(&base->cpu_base->lock);
208 spin_lock(&new_base->lock); 231 spin_lock(&new_base->cpu_base->lock);
209 timer->base = new_base; 232 timer->base = new_base;
210 } 233 }
211 return new_base; 234 return new_base;
@@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
213 236
214#else /* CONFIG_SMP */ 237#else /* CONFIG_SMP */
215 238
216#define set_curr_timer(b, t) do { } while (0) 239static inline struct hrtimer_clock_base *
217
218static inline struct hrtimer_base *
219lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 240lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
220{ 241{
221 struct hrtimer_base *base = timer->base; 242 struct hrtimer_clock_base *base = timer->base;
222 243
223 spin_lock_irqsave(&base->lock, *flags); 244 spin_lock_irqsave(&base->cpu_base->lock, *flags);
224 245
225 return base; 246 return base;
226} 247}
227 248
228#define switch_hrtimer_base(t, b) (b) 249# define switch_hrtimer_base(t, b) (b)
229 250
230#endif /* !CONFIG_SMP */ 251#endif /* !CONFIG_SMP */
231 252
@@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
256 277
257 return ktime_add(kt, tmp); 278 return ktime_add(kt, tmp);
258} 279}
259
260#else /* CONFIG_KTIME_SCALAR */
261
262# endif /* !CONFIG_KTIME_SCALAR */ 280# endif /* !CONFIG_KTIME_SCALAR */
263 281
264/* 282/*
265 * Divide a ktime value by a nanosecond value 283 * Divide a ktime value by a nanosecond value
266 */ 284 */
267static unsigned long ktime_divns(const ktime_t kt, s64 div) 285unsigned long ktime_divns(const ktime_t kt, s64 div)
268{ 286{
269 u64 dclc, inc, dns; 287 u64 dclc, inc, dns;
270 int sft = 0; 288 int sft = 0;
@@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div)
281 299
282 return (unsigned long) dclc; 300 return (unsigned long) dclc;
283} 301}
284
285#else /* BITS_PER_LONG < 64 */
286# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
287#endif /* BITS_PER_LONG >= 64 */ 302#endif /* BITS_PER_LONG >= 64 */
288 303
304/* High resolution timer related functions */
305#ifdef CONFIG_HIGH_RES_TIMERS
306
307/*
308 * High resolution timer enabled ?
309 */
310static int hrtimer_hres_enabled __read_mostly = 1;
311
312/*
313 * Enable / Disable high resolution mode
314 */
315static int __init setup_hrtimer_hres(char *str)
316{
317 if (!strcmp(str, "off"))
318 hrtimer_hres_enabled = 0;
319 else if (!strcmp(str, "on"))
320 hrtimer_hres_enabled = 1;
321 else
322 return 0;
323 return 1;
324}
325
326__setup("highres=", setup_hrtimer_hres);
327
328/*
329 * hrtimer_high_res_enabled - query, if the highres mode is enabled
330 */
331static inline int hrtimer_is_hres_enabled(void)
332{
333 return hrtimer_hres_enabled;
334}
335
336/*
337 * Is the high resolution mode active ?
338 */
339static inline int hrtimer_hres_active(void)
340{
341 return __get_cpu_var(hrtimer_bases).hres_active;
342}
343
344/*
345 * Reprogram the event source with checking both queues for the
346 * next event
347 * Called with interrupts disabled and base->lock held
348 */
349static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
350{
351 int i;
352 struct hrtimer_clock_base *base = cpu_base->clock_base;
353 ktime_t expires;
354
355 cpu_base->expires_next.tv64 = KTIME_MAX;
356
357 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
358 struct hrtimer *timer;
359
360 if (!base->first)
361 continue;
362 timer = rb_entry(base->first, struct hrtimer, node);
363 expires = ktime_sub(timer->expires, base->offset);
364 if (expires.tv64 < cpu_base->expires_next.tv64)
365 cpu_base->expires_next = expires;
366 }
367
368 if (cpu_base->expires_next.tv64 != KTIME_MAX)
369 tick_program_event(cpu_base->expires_next, 1);
370}
371
372/*
373 * Shared reprogramming for clock_realtime and clock_monotonic
374 *
375 * When a timer is enqueued and expires earlier than the already enqueued
376 * timers, we have to check, whether it expires earlier than the timer for
377 * which the clock event device was armed.
378 *
379 * Called with interrupts disabled and base->cpu_base.lock held
380 */
381static int hrtimer_reprogram(struct hrtimer *timer,
382 struct hrtimer_clock_base *base)
383{
384 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
385 ktime_t expires = ktime_sub(timer->expires, base->offset);
386 int res;
387
388 /*
389 * When the callback is running, we do not reprogram the clock event
390 * device. The timer callback is either running on a different CPU or
391 * the callback is executed in the hrtimer_interupt context. The
392 * reprogramming is handled either by the softirq, which called the
393 * callback or at the end of the hrtimer_interrupt.
394 */
395 if (hrtimer_callback_running(timer))
396 return 0;
397
398 if (expires.tv64 >= expires_next->tv64)
399 return 0;
400
401 /*
402 * Clockevents returns -ETIME, when the event was in the past.
403 */
404 res = tick_program_event(expires, 0);
405 if (!IS_ERR_VALUE(res))
406 *expires_next = expires;
407 return res;
408}
409
410
411/*
412 * Retrigger next event is called after clock was set
413 *
414 * Called with interrupts disabled via on_each_cpu()
415 */
416static void retrigger_next_event(void *arg)
417{
418 struct hrtimer_cpu_base *base;
419 struct timespec realtime_offset;
420 unsigned long seq;
421
422 if (!hrtimer_hres_active())
423 return;
424
425 do {
426 seq = read_seqbegin(&xtime_lock);
427 set_normalized_timespec(&realtime_offset,
428 -wall_to_monotonic.tv_sec,
429 -wall_to_monotonic.tv_nsec);
430 } while (read_seqretry(&xtime_lock, seq));
431
432 base = &__get_cpu_var(hrtimer_bases);
433
434 /* Adjust CLOCK_REALTIME offset */
435 spin_lock(&base->lock);
436 base->clock_base[CLOCK_REALTIME].offset =
437 timespec_to_ktime(realtime_offset);
438
439 hrtimer_force_reprogram(base);
440 spin_unlock(&base->lock);
441}
442
443/*
444 * Clock realtime was set
445 *
446 * Change the offset of the realtime clock vs. the monotonic
447 * clock.
448 *
449 * We might have to reprogram the high resolution timer interrupt. On
450 * SMP we call the architecture specific code to retrigger _all_ high
451 * resolution timer interrupts. On UP we just disable interrupts and
452 * call the high resolution interrupt code.
453 */
454void clock_was_set(void)
455{
456 /* Retrigger the CPU local events everywhere */
457 on_each_cpu(retrigger_next_event, NULL, 0, 1);
458}
459
460/*
461 * Check, whether the timer is on the callback pending list
462 */
463static inline int hrtimer_cb_pending(const struct hrtimer *timer)
464{
465 return timer->state & HRTIMER_STATE_PENDING;
466}
467
468/*
469 * Remove a timer from the callback pending list
470 */
471static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
472{
473 list_del_init(&timer->cb_entry);
474}
475
476/*
477 * Initialize the high resolution related parts of cpu_base
478 */
479static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
480{
481 base->expires_next.tv64 = KTIME_MAX;
482 base->hres_active = 0;
483 INIT_LIST_HEAD(&base->cb_pending);
484}
485
486/*
487 * Initialize the high resolution related parts of a hrtimer
488 */
489static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
490{
491 INIT_LIST_HEAD(&timer->cb_entry);
492}
493
494/*
495 * When High resolution timers are active, try to reprogram. Note, that in case
496 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
497 * check happens. The timer gets enqueued into the rbtree. The reprogramming
498 * and expiry check is done in the hrtimer_interrupt or in the softirq.
499 */
500static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
501 struct hrtimer_clock_base *base)
502{
503 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
504
505 /* Timer is expired, act upon the callback mode */
506 switch(timer->cb_mode) {
507 case HRTIMER_CB_IRQSAFE_NO_RESTART:
508 /*
509 * We can call the callback from here. No restart
510 * happens, so no danger of recursion
511 */
512 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
513 return 1;
514 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
515 /*
516 * This is solely for the sched tick emulation with
517 * dynamic tick support to ensure that we do not
518 * restart the tick right on the edge and end up with
519 * the tick timer in the softirq ! The calling site
520 * takes care of this.
521 */
522 return 1;
523 case HRTIMER_CB_IRQSAFE:
524 case HRTIMER_CB_SOFTIRQ:
525 /*
526 * Move everything else into the softirq pending list !
527 */
528 list_add_tail(&timer->cb_entry,
529 &base->cpu_base->cb_pending);
530 timer->state = HRTIMER_STATE_PENDING;
531 raise_softirq(HRTIMER_SOFTIRQ);
532 return 1;
533 default:
534 BUG();
535 }
536 }
537 return 0;
538}
539
540/*
541 * Switch to high resolution mode
542 */
543static void hrtimer_switch_to_hres(void)
544{
545 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
546 unsigned long flags;
547
548 if (base->hres_active)
549 return;
550
551 local_irq_save(flags);
552
553 if (tick_init_highres()) {
554 local_irq_restore(flags);
555 return;
556 }
557 base->hres_active = 1;
558 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
559 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
560
561 tick_setup_sched_timer();
562
563 /* "Retrigger" the interrupt to get things going */
564 retrigger_next_event(NULL);
565 local_irq_restore(flags);
566 printk(KERN_INFO "Switched to high resolution mode on CPU %d\n",
567 smp_processor_id());
568}
569
570#else
571
572static inline int hrtimer_hres_active(void) { return 0; }
573static inline int hrtimer_is_hres_enabled(void) { return 0; }
574static inline void hrtimer_switch_to_hres(void) { }
575static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
576static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
577 struct hrtimer_clock_base *base)
578{
579 return 0;
580}
581static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
582static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
583static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
584static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
585
586#endif /* CONFIG_HIGH_RES_TIMERS */
587
588#ifdef CONFIG_TIMER_STATS
589void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
590{
591 if (timer->start_site)
592 return;
593
594 timer->start_site = addr;
595 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
596 timer->start_pid = current->pid;
597}
598#endif
599
289/* 600/*
290 * Counterpart to lock_timer_base above: 601 * Counterpart to lock_timer_base above:
291 */ 602 */
292static inline 603static inline
293void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 604void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
294{ 605{
295 spin_unlock_irqrestore(&timer->base->lock, *flags); 606 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
296} 607}
297 608
298/** 609/**
@@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
342 * The timer is inserted in expiry order. Insertion into the 653 * The timer is inserted in expiry order. Insertion into the
343 * red black tree is O(log(n)). Must hold the base lock. 654 * red black tree is O(log(n)). Must hold the base lock.
344 */ 655 */
345static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 656static void enqueue_hrtimer(struct hrtimer *timer,
657 struct hrtimer_clock_base *base, int reprogram)
346{ 658{
347 struct rb_node **link = &base->active.rb_node; 659 struct rb_node **link = &base->active.rb_node;
348 struct rb_node *parent = NULL; 660 struct rb_node *parent = NULL;
@@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
368 * Insert the timer to the rbtree and check whether it 680 * Insert the timer to the rbtree and check whether it
369 * replaces the first pending timer 681 * replaces the first pending timer
370 */ 682 */
371 rb_link_node(&timer->node, parent, link);
372 rb_insert_color(&timer->node, &base->active);
373
374 if (!base->first || timer->expires.tv64 < 683 if (!base->first || timer->expires.tv64 <
375 rb_entry(base->first, struct hrtimer, node)->expires.tv64) 684 rb_entry(base->first, struct hrtimer, node)->expires.tv64) {
685 /*
686 * Reprogram the clock event device. When the timer is already
687 * expired hrtimer_enqueue_reprogram has either called the
688 * callback or added it to the pending list and raised the
689 * softirq.
690 *
691 * This is a NOP for !HIGHRES
692 */
693 if (reprogram && hrtimer_enqueue_reprogram(timer, base))
694 return;
695
376 base->first = &timer->node; 696 base->first = &timer->node;
697 }
698
699 rb_link_node(&timer->node, parent, link);
700 rb_insert_color(&timer->node, &base->active);
701 /*
702 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
703 * state of a possibly running callback.
704 */
705 timer->state |= HRTIMER_STATE_ENQUEUED;
377} 706}
378 707
379/* 708/*
380 * __remove_hrtimer - internal function to remove a timer 709 * __remove_hrtimer - internal function to remove a timer
381 * 710 *
382 * Caller must hold the base lock. 711 * Caller must hold the base lock.
712 *
713 * High resolution timer mode reprograms the clock event device when the
714 * timer is the one which expires next. The caller can disable this by setting
715 * reprogram to zero. This is useful, when the context does a reprogramming
716 * anyway (e.g. timer interrupt)
383 */ 717 */
384static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 718static void __remove_hrtimer(struct hrtimer *timer,
719 struct hrtimer_clock_base *base,
720 unsigned long newstate, int reprogram)
385{ 721{
386 /* 722 /* High res. callback list. NOP for !HIGHRES */
387 * Remove the timer from the rbtree and replace the 723 if (hrtimer_cb_pending(timer))
388 * first entry pointer if necessary. 724 hrtimer_remove_cb_pending(timer);
389 */ 725 else {
390 if (base->first == &timer->node) 726 /*
391 base->first = rb_next(&timer->node); 727 * Remove the timer from the rbtree and replace the
392 rb_erase(&timer->node, &base->active); 728 * first entry pointer if necessary.
393 rb_set_parent(&timer->node, &timer->node); 729 */
730 if (base->first == &timer->node) {
731 base->first = rb_next(&timer->node);
732 /* Reprogram the clock event device. if enabled */
733 if (reprogram && hrtimer_hres_active())
734 hrtimer_force_reprogram(base->cpu_base);
735 }
736 rb_erase(&timer->node, &base->active);
737 }
738 timer->state = newstate;
394} 739}
395 740
396/* 741/*
397 * remove hrtimer, called with base lock held 742 * remove hrtimer, called with base lock held
398 */ 743 */
399static inline int 744static inline int
400remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 745remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
401{ 746{
402 if (hrtimer_active(timer)) { 747 if (hrtimer_is_queued(timer)) {
403 __remove_hrtimer(timer, base); 748 int reprogram;
749
750 /*
751 * Remove the timer and force reprogramming when high
752 * resolution mode is active and the timer is on the current
753 * CPU. If we remove a timer on another CPU, reprogramming is
754 * skipped. The interrupt event on this CPU is fired and
755 * reprogramming happens in the interrupt handler. This is a
756 * rare case and less expensive than a smp call.
757 */
758 timer_stats_hrtimer_clear_start_info(timer);
759 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
760 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
761 reprogram);
404 return 1; 762 return 1;
405 } 763 }
406 return 0; 764 return 0;
@@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
419int 777int
420hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 778hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
421{ 779{
422 struct hrtimer_base *base, *new_base; 780 struct hrtimer_clock_base *base, *new_base;
423 unsigned long flags; 781 unsigned long flags;
424 int ret; 782 int ret;
425 783
@@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
431 /* Switch the timer base, if necessary: */ 789 /* Switch the timer base, if necessary: */
432 new_base = switch_hrtimer_base(timer, base); 790 new_base = switch_hrtimer_base(timer, base);
433 791
434 if (mode == HRTIMER_REL) { 792 if (mode == HRTIMER_MODE_REL) {
435 tim = ktime_add(tim, new_base->get_time()); 793 tim = ktime_add(tim, new_base->get_time());
436 /* 794 /*
437 * CONFIG_TIME_LOW_RES is a temporary way for architectures 795 * CONFIG_TIME_LOW_RES is a temporary way for architectures
@@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
446 } 804 }
447 timer->expires = tim; 805 timer->expires = tim;
448 806
449 enqueue_hrtimer(timer, new_base); 807 timer_stats_hrtimer_set_start_info(timer);
808
809 enqueue_hrtimer(timer, new_base, base == new_base);
450 810
451 unlock_hrtimer_base(timer, &flags); 811 unlock_hrtimer_base(timer, &flags);
452 812
@@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
466 */ 826 */
467int hrtimer_try_to_cancel(struct hrtimer *timer) 827int hrtimer_try_to_cancel(struct hrtimer *timer)
468{ 828{
469 struct hrtimer_base *base; 829 struct hrtimer_clock_base *base;
470 unsigned long flags; 830 unsigned long flags;
471 int ret = -1; 831 int ret = -1;
472 832
473 base = lock_hrtimer_base(timer, &flags); 833 base = lock_hrtimer_base(timer, &flags);
474 834
475 if (base->curr_timer != timer) 835 if (!hrtimer_callback_running(timer))
476 ret = remove_hrtimer(timer, base); 836 ret = remove_hrtimer(timer, base);
477 837
478 unlock_hrtimer_base(timer, &flags); 838 unlock_hrtimer_base(timer, &flags);
@@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
508 */ 868 */
509ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 869ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
510{ 870{
511 struct hrtimer_base *base; 871 struct hrtimer_clock_base *base;
512 unsigned long flags; 872 unsigned long flags;
513 ktime_t rem; 873 ktime_t rem;
514 874
515 base = lock_hrtimer_base(timer, &flags); 875 base = lock_hrtimer_base(timer, &flags);
516 rem = ktime_sub(timer->expires, timer->base->get_time()); 876 rem = ktime_sub(timer->expires, base->get_time());
517 unlock_hrtimer_base(timer, &flags); 877 unlock_hrtimer_base(timer, &flags);
518 878
519 return rem; 879 return rem;
520} 880}
521EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 881EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
522 882
523#ifdef CONFIG_NO_IDLE_HZ 883#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
524/** 884/**
525 * hrtimer_get_next_event - get the time until next expiry event 885 * hrtimer_get_next_event - get the time until next expiry event
526 * 886 *
@@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
529 */ 889 */
530ktime_t hrtimer_get_next_event(void) 890ktime_t hrtimer_get_next_event(void)
531{ 891{
532 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 892 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
893 struct hrtimer_clock_base *base = cpu_base->clock_base;
533 ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; 894 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
534 unsigned long flags; 895 unsigned long flags;
535 int i; 896 int i;
536 897
537 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { 898 spin_lock_irqsave(&cpu_base->lock, flags);
538 struct hrtimer *timer;
539 899
540 spin_lock_irqsave(&base->lock, flags); 900 if (!hrtimer_hres_active()) {
541 if (!base->first) { 901 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
542 spin_unlock_irqrestore(&base->lock, flags); 902 struct hrtimer *timer;
543 continue; 903
904 if (!base->first)
905 continue;
906
907 timer = rb_entry(base->first, struct hrtimer, node);
908 delta.tv64 = timer->expires.tv64;
909 delta = ktime_sub(delta, base->get_time());
910 if (delta.tv64 < mindelta.tv64)
911 mindelta.tv64 = delta.tv64;
544 } 912 }
545 timer = rb_entry(base->first, struct hrtimer, node);
546 delta.tv64 = timer->expires.tv64;
547 spin_unlock_irqrestore(&base->lock, flags);
548 delta = ktime_sub(delta, base->get_time());
549 if (delta.tv64 < mindelta.tv64)
550 mindelta.tv64 = delta.tv64;
551 } 913 }
914
915 spin_unlock_irqrestore(&cpu_base->lock, flags);
916
552 if (mindelta.tv64 < 0) 917 if (mindelta.tv64 < 0)
553 mindelta.tv64 = 0; 918 mindelta.tv64 = 0;
554 return mindelta; 919 return mindelta;
@@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void)
564void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 929void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
565 enum hrtimer_mode mode) 930 enum hrtimer_mode mode)
566{ 931{
567 struct hrtimer_base *bases; 932 struct hrtimer_cpu_base *cpu_base;
568 933
569 memset(timer, 0, sizeof(struct hrtimer)); 934 memset(timer, 0, sizeof(struct hrtimer));
570 935
571 bases = __raw_get_cpu_var(hrtimer_bases); 936 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
572 937
573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 938 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
574 clock_id = CLOCK_MONOTONIC; 939 clock_id = CLOCK_MONOTONIC;
575 940
576 timer->base = &bases[clock_id]; 941 timer->base = &cpu_base->clock_base[clock_id];
577 rb_set_parent(&timer->node, &timer->node); 942 hrtimer_init_timer_hres(timer);
943
944#ifdef CONFIG_TIMER_STATS
945 timer->start_site = NULL;
946 timer->start_pid = -1;
947 memset(timer->start_comm, 0, TASK_COMM_LEN);
948#endif
578} 949}
579EXPORT_SYMBOL_GPL(hrtimer_init); 950EXPORT_SYMBOL_GPL(hrtimer_init);
580 951
@@ -588,21 +959,159 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
588 */ 959 */
589int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 960int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
590{ 961{
591 struct hrtimer_base *bases; 962 struct hrtimer_cpu_base *cpu_base;
592 963
593 bases = __raw_get_cpu_var(hrtimer_bases); 964 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
594 *tp = ktime_to_timespec(bases[which_clock].resolution); 965 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
595 966
596 return 0; 967 return 0;
597} 968}
598EXPORT_SYMBOL_GPL(hrtimer_get_res); 969EXPORT_SYMBOL_GPL(hrtimer_get_res);
599 970
971#ifdef CONFIG_HIGH_RES_TIMERS
972
973/*
974 * High resolution timer interrupt
975 * Called with interrupts disabled
976 */
977void hrtimer_interrupt(struct clock_event_device *dev)
978{
979 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
980 struct hrtimer_clock_base *base;
981 ktime_t expires_next, now;
982 int i, raise = 0;
983
984 BUG_ON(!cpu_base->hres_active);
985 cpu_base->nr_events++;
986 dev->next_event.tv64 = KTIME_MAX;
987
988 retry:
989 now = ktime_get();
990
991 expires_next.tv64 = KTIME_MAX;
992
993 base = cpu_base->clock_base;
994
995 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
996 ktime_t basenow;
997 struct rb_node *node;
998
999 spin_lock(&cpu_base->lock);
1000
1001 basenow = ktime_add(now, base->offset);
1002
1003 while ((node = base->first)) {
1004 struct hrtimer *timer;
1005
1006 timer = rb_entry(node, struct hrtimer, node);
1007
1008 if (basenow.tv64 < timer->expires.tv64) {
1009 ktime_t expires;
1010
1011 expires = ktime_sub(timer->expires,
1012 base->offset);
1013 if (expires.tv64 < expires_next.tv64)
1014 expires_next = expires;
1015 break;
1016 }
1017
1018 /* Move softirq callbacks to the pending list */
1019 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1020 __remove_hrtimer(timer, base,
1021 HRTIMER_STATE_PENDING, 0);
1022 list_add_tail(&timer->cb_entry,
1023 &base->cpu_base->cb_pending);
1024 raise = 1;
1025 continue;
1026 }
1027
1028 __remove_hrtimer(timer, base,
1029 HRTIMER_STATE_CALLBACK, 0);
1030 timer_stats_account_hrtimer(timer);
1031
1032 /*
1033 * Note: We clear the CALLBACK bit after
1034 * enqueue_hrtimer to avoid reprogramming of
1035 * the event hardware. This happens at the end
1036 * of this function anyway.
1037 */
1038 if (timer->function(timer) != HRTIMER_NORESTART) {
1039 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1040 enqueue_hrtimer(timer, base, 0);
1041 }
1042 timer->state &= ~HRTIMER_STATE_CALLBACK;
1043 }
1044 spin_unlock(&cpu_base->lock);
1045 base++;
1046 }
1047
1048 cpu_base->expires_next = expires_next;
1049
1050 /* Reprogramming necessary ? */
1051 if (expires_next.tv64 != KTIME_MAX) {
1052 if (tick_program_event(expires_next, 0))
1053 goto retry;
1054 }
1055
1056 /* Raise softirq ? */
1057 if (raise)
1058 raise_softirq(HRTIMER_SOFTIRQ);
1059}
1060
1061static void run_hrtimer_softirq(struct softirq_action *h)
1062{
1063 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1064
1065 spin_lock_irq(&cpu_base->lock);
1066
1067 while (!list_empty(&cpu_base->cb_pending)) {
1068 enum hrtimer_restart (*fn)(struct hrtimer *);
1069 struct hrtimer *timer;
1070 int restart;
1071
1072 timer = list_entry(cpu_base->cb_pending.next,
1073 struct hrtimer, cb_entry);
1074
1075 timer_stats_account_hrtimer(timer);
1076
1077 fn = timer->function;
1078 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1079 spin_unlock_irq(&cpu_base->lock);
1080
1081 restart = fn(timer);
1082
1083 spin_lock_irq(&cpu_base->lock);
1084
1085 timer->state &= ~HRTIMER_STATE_CALLBACK;
1086 if (restart == HRTIMER_RESTART) {
1087 BUG_ON(hrtimer_active(timer));
1088 /*
1089 * Enqueue the timer, allow reprogramming of the event
1090 * device
1091 */
1092 enqueue_hrtimer(timer, timer->base, 1);
1093 } else if (hrtimer_active(timer)) {
1094 /*
1095 * If the timer was rearmed on another CPU, reprogram
1096 * the event device.
1097 */
1098 if (timer->base->first == &timer->node)
1099 hrtimer_reprogram(timer, timer->base);
1100 }
1101 }
1102 spin_unlock_irq(&cpu_base->lock);
1103}
1104
1105#endif /* CONFIG_HIGH_RES_TIMERS */
1106
600/* 1107/*
601 * Expire the per base hrtimer-queue: 1108 * Expire the per base hrtimer-queue:
602 */ 1109 */
603static inline void run_hrtimer_queue(struct hrtimer_base *base) 1110static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1111 int index)
604{ 1112{
605 struct rb_node *node; 1113 struct rb_node *node;
1114 struct hrtimer_clock_base *base = &cpu_base->clock_base[index];
606 1115
607 if (!base->first) 1116 if (!base->first)
608 return; 1117 return;
@@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
610 if (base->get_softirq_time) 1119 if (base->get_softirq_time)
611 base->softirq_time = base->get_softirq_time(); 1120 base->softirq_time = base->get_softirq_time();
612 1121
613 spin_lock_irq(&base->lock); 1122 spin_lock_irq(&cpu_base->lock);
614 1123
615 while ((node = base->first)) { 1124 while ((node = base->first)) {
616 struct hrtimer *timer; 1125 struct hrtimer *timer;
617 int (*fn)(struct hrtimer *); 1126 enum hrtimer_restart (*fn)(struct hrtimer *);
618 int restart; 1127 int restart;
619 1128
620 timer = rb_entry(node, struct hrtimer, node); 1129 timer = rb_entry(node, struct hrtimer, node);
621 if (base->softirq_time.tv64 <= timer->expires.tv64) 1130 if (base->softirq_time.tv64 <= timer->expires.tv64)
622 break; 1131 break;
623 1132
1133 timer_stats_account_hrtimer(timer);
1134
624 fn = timer->function; 1135 fn = timer->function;
625 set_curr_timer(base, timer); 1136 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
626 __remove_hrtimer(timer, base); 1137 spin_unlock_irq(&cpu_base->lock);
627 spin_unlock_irq(&base->lock);
628 1138
629 restart = fn(timer); 1139 restart = fn(timer);
630 1140
631 spin_lock_irq(&base->lock); 1141 spin_lock_irq(&cpu_base->lock);
632 1142
1143 timer->state &= ~HRTIMER_STATE_CALLBACK;
633 if (restart != HRTIMER_NORESTART) { 1144 if (restart != HRTIMER_NORESTART) {
634 BUG_ON(hrtimer_active(timer)); 1145 BUG_ON(hrtimer_active(timer));
635 enqueue_hrtimer(timer, base); 1146 enqueue_hrtimer(timer, base, 0);
636 } 1147 }
637 } 1148 }
638 set_curr_timer(base, NULL); 1149 spin_unlock_irq(&cpu_base->lock);
639 spin_unlock_irq(&base->lock);
640} 1150}
641 1151
642/* 1152/*
643 * Called from timer softirq every jiffy, expire hrtimers: 1153 * Called from timer softirq every jiffy, expire hrtimers:
1154 *
1155 * For HRT its the fall back code to run the softirq in the timer
1156 * softirq context in case the hrtimer initialization failed or has
1157 * not been done yet.
644 */ 1158 */
645void hrtimer_run_queues(void) 1159void hrtimer_run_queues(void)
646{ 1160{
647 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 1161 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
648 int i; 1162 int i;
649 1163
650 hrtimer_get_softirq_time(base); 1164 if (hrtimer_hres_active())
1165 return;
1166
1167 /*
1168 * This _is_ ugly: We have to check in the softirq context,
1169 * whether we can switch to highres and / or nohz mode. The
1170 * clocksource switch happens in the timer interrupt with
1171 * xtime_lock held. Notification from there only sets the
1172 * check bit in the tick_oneshot code, otherwise we might
1173 * deadlock vs. xtime_lock.
1174 */
1175 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1176 hrtimer_switch_to_hres();
651 1177
652 for (i = 0; i < MAX_HRTIMER_BASES; i++) 1178 hrtimer_get_softirq_time(cpu_base);
653 run_hrtimer_queue(&base[i]); 1179
1180 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1181 run_hrtimer_queue(cpu_base, i);
654} 1182}
655 1183
656/* 1184/*
657 * Sleep related functions: 1185 * Sleep related functions:
658 */ 1186 */
659static int hrtimer_wakeup(struct hrtimer *timer) 1187static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
660{ 1188{
661 struct hrtimer_sleeper *t = 1189 struct hrtimer_sleeper *t =
662 container_of(timer, struct hrtimer_sleeper, timer); 1190 container_of(timer, struct hrtimer_sleeper, timer);
@@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
673{ 1201{
674 sl->timer.function = hrtimer_wakeup; 1202 sl->timer.function = hrtimer_wakeup;
675 sl->task = task; 1203 sl->task = task;
1204#ifdef CONFIG_HIGH_RES_TIMERS
1205 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
1206#endif
676} 1207}
677 1208
678static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1209static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
683 set_current_state(TASK_INTERRUPTIBLE); 1214 set_current_state(TASK_INTERRUPTIBLE);
684 hrtimer_start(&t->timer, t->timer.expires, mode); 1215 hrtimer_start(&t->timer, t->timer.expires, mode);
685 1216
686 schedule(); 1217 if (likely(t->task))
1218 schedule();
687 1219
688 hrtimer_cancel(&t->timer); 1220 hrtimer_cancel(&t->timer);
689 mode = HRTIMER_ABS; 1221 mode = HRTIMER_MODE_ABS;
690 1222
691 } while (t->task && !signal_pending(current)); 1223 } while (t->task && !signal_pending(current));
692 1224
@@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
702 1234
703 restart->fn = do_no_restart_syscall; 1235 restart->fn = do_no_restart_syscall;
704 1236
705 hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); 1237 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS);
706 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; 1238 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
707 1239
708 if (do_nanosleep(&t, HRTIMER_ABS)) 1240 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
709 return 0; 1241 return 0;
710 1242
711 rmtp = (struct timespec __user *) restart->arg1; 1243 rmtp = (struct timespec __user *) restart->arg1;
@@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
738 return 0; 1270 return 0;
739 1271
740 /* Absolute timers do not update the rmtp value and restart: */ 1272 /* Absolute timers do not update the rmtp value and restart: */
741 if (mode == HRTIMER_ABS) 1273 if (mode == HRTIMER_MODE_ABS)
742 return -ERESTARTNOHAND; 1274 return -ERESTARTNOHAND;
743 1275
744 if (rmtp) { 1276 if (rmtp) {
@@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
771 if (!timespec_valid(&tu)) 1303 if (!timespec_valid(&tu))
772 return -EINVAL; 1304 return -EINVAL;
773 1305
774 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); 1306 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
775} 1307}
776 1308
777/* 1309/*
@@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
779 */ 1311 */
780static void __devinit init_hrtimers_cpu(int cpu) 1312static void __devinit init_hrtimers_cpu(int cpu)
781{ 1313{
782 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); 1314 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
783 int i; 1315 int i;
784 1316
785 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { 1317 spin_lock_init(&cpu_base->lock);
786 spin_lock_init(&base->lock); 1318 lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key);
787 lockdep_set_class(&base->lock, &base->lock_key); 1319
788 } 1320 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1321 cpu_base->clock_base[i].cpu_base = cpu_base;
1322
1323 hrtimer_init_hres(cpu_base);
789} 1324}
790 1325
791#ifdef CONFIG_HOTPLUG_CPU 1326#ifdef CONFIG_HOTPLUG_CPU
792 1327
793static void migrate_hrtimer_list(struct hrtimer_base *old_base, 1328static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
794 struct hrtimer_base *new_base) 1329 struct hrtimer_clock_base *new_base)
795{ 1330{
796 struct hrtimer *timer; 1331 struct hrtimer *timer;
797 struct rb_node *node; 1332 struct rb_node *node;
798 1333
799 while ((node = rb_first(&old_base->active))) { 1334 while ((node = rb_first(&old_base->active))) {
800 timer = rb_entry(node, struct hrtimer, node); 1335 timer = rb_entry(node, struct hrtimer, node);
801 __remove_hrtimer(timer, old_base); 1336 BUG_ON(hrtimer_callback_running(timer));
1337 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
802 timer->base = new_base; 1338 timer->base = new_base;
803 enqueue_hrtimer(timer, new_base); 1339 /*
1340 * Enqueue the timer. Allow reprogramming of the event device
1341 */
1342 enqueue_hrtimer(timer, new_base, 1);
804 } 1343 }
805} 1344}
806 1345
807static void migrate_hrtimers(int cpu) 1346static void migrate_hrtimers(int cpu)
808{ 1347{
809 struct hrtimer_base *old_base, *new_base; 1348 struct hrtimer_cpu_base *old_base, *new_base;
810 int i; 1349 int i;
811 1350
812 BUG_ON(cpu_online(cpu)); 1351 BUG_ON(cpu_online(cpu));
813 old_base = per_cpu(hrtimer_bases, cpu); 1352 old_base = &per_cpu(hrtimer_bases, cpu);
814 new_base = get_cpu_var(hrtimer_bases); 1353 new_base = &get_cpu_var(hrtimer_bases);
815
816 local_irq_disable();
817 1354
818 for (i = 0; i < MAX_HRTIMER_BASES; i++) { 1355 tick_cancel_sched_timer(cpu);
819 1356
820 spin_lock(&new_base->lock); 1357 local_irq_disable();
821 spin_lock(&old_base->lock);
822
823 BUG_ON(old_base->curr_timer);
824 1358
825 migrate_hrtimer_list(old_base, new_base); 1359 spin_lock(&new_base->lock);
1360 spin_lock(&old_base->lock);
826 1361
827 spin_unlock(&old_base->lock); 1362 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
828 spin_unlock(&new_base->lock); 1363 migrate_hrtimer_list(&old_base->clock_base[i],
829 old_base++; 1364 &new_base->clock_base[i]);
830 new_base++;
831 } 1365 }
1366 spin_unlock(&old_base->lock);
1367 spin_unlock(&new_base->lock);
832 1368
833 local_irq_enable(); 1369 local_irq_enable();
834 put_cpu_var(hrtimer_bases); 1370 put_cpu_var(hrtimer_bases);
@@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
848 1384
849#ifdef CONFIG_HOTPLUG_CPU 1385#ifdef CONFIG_HOTPLUG_CPU
850 case CPU_DEAD: 1386 case CPU_DEAD:
1387 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
851 migrate_hrtimers(cpu); 1388 migrate_hrtimers(cpu);
852 break; 1389 break;
853#endif 1390#endif
@@ -868,5 +1405,8 @@ void __init hrtimers_init(void)
868 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1405 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
869 (void *)(long)smp_processor_id()); 1406 (void *)(long)smp_processor_id());
870 register_cpu_notifier(&hrtimers_nb); 1407 register_cpu_notifier(&hrtimers_nb);
1408#ifdef CONFIG_HIGH_RES_TIMERS
1409 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
1410#endif
871} 1411}
872 1412
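For orientation, a minimal illustrative sketch (not part of the patch) of what a timer user looks like against the reworked hrtimer API above: the HRTIMER_REL/HRTIMER_ABS constants become HRTIMER_MODE_REL/HRTIMER_MODE_ABS, and callbacks now return enum hrtimer_restart. The my_timer names and the 100 ms expiry are hypothetical.

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer my_timer;			/* hypothetical example timer */

/* Callbacks return enum hrtimer_restart instead of a plain int. */
static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
	/* ... do the deferred work here ... */
	return HRTIMER_NORESTART;		/* one-shot: do not re-arm */
}

static void my_timer_setup(void)
{
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	my_timer.function = my_timer_fn;
	/* expire 100 ms from now, relative to the current time */
	hrtimer_start(&my_timer, ktime_set(0, 100 * 1000 * 1000), HRTIMER_MODE_REL);
}

With CONFIG_HIGH_RES_TIMERS enabled such a timer is driven from hrtimer_interrupt() via the clock event device; booting with "highres=off" (the new setup_hrtimer_hres() parameter) keeps the jiffies-resolution softirq path in hrtimer_run_queues().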
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 475e8a71bcdc..0133f4f9e9f0 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -168,7 +168,7 @@ EXPORT_SYMBOL(set_irq_data);
168/** 168/**
169 * set_irq_data - set irq type data for an irq 169 * set_irq_data - set irq type data for an irq
170 * @irq: Interrupt number 170 * @irq: Interrupt number
171 * @data: Pointer to interrupt specific data 171 * @entry: Pointer to MSI descriptor data
172 * 172 *
173 * Set the hardware irq controller data for an irq 173 * Set the hardware irq controller data for an irq
174 */ 174 */
@@ -230,10 +230,6 @@ static void default_enable(unsigned int irq)
230 */ 230 */
231static void default_disable(unsigned int irq) 231static void default_disable(unsigned int irq)
232{ 232{
233 struct irq_desc *desc = irq_desc + irq;
234
235 if (!(desc->status & IRQ_DELAYED_DISABLE))
236 desc->chip->mask(irq);
237} 233}
238 234
239/* 235/*
@@ -298,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
298 294
299 if (unlikely(desc->status & IRQ_INPROGRESS)) 295 if (unlikely(desc->status & IRQ_INPROGRESS))
300 goto out_unlock; 296 goto out_unlock;
301 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
302 kstat_cpu(cpu).irqs[irq]++; 297 kstat_cpu(cpu).irqs[irq]++;
303 298
304 action = desc->action; 299 action = desc->action;
305 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 300 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
301 if (desc->chip->mask)
302 desc->chip->mask(irq);
303 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
304 desc->status |= IRQ_PENDING;
306 goto out_unlock; 305 goto out_unlock;
306 }
307 307
308 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING);
308 desc->status |= IRQ_INPROGRESS; 309 desc->status |= IRQ_INPROGRESS;
309 spin_unlock(&desc->lock); 310 spin_unlock(&desc->lock);
310 311
@@ -396,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
396 397
397 /* 398 /*
398 * If its disabled or no action available 399 * If its disabled or no action available
399 * keep it masked and get out of here 400 * then mask it and get out of here:
400 */ 401 */
401 action = desc->action; 402 action = desc->action;
402 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 403 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
403 desc->status |= IRQ_PENDING; 404 desc->status |= IRQ_PENDING;
405 if (desc->chip->mask)
406 desc->chip->mask(irq);
404 goto out; 407 goto out;
405 } 408 }
406 409
@@ -562,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
562 565
563 /* Uninstall? */ 566 /* Uninstall? */
564 if (handle == handle_bad_irq) { 567 if (handle == handle_bad_irq) {
565 if (desc->chip != &no_irq_chip) { 568 if (desc->chip != &no_irq_chip)
566 desc->chip->mask(irq); 569 mask_ack_irq(desc, irq);
567 desc->chip->ack(irq);
568 }
569 desc->status |= IRQ_DISABLED; 570 desc->status |= IRQ_DISABLED;
570 desc->depth = 1; 571 desc->depth = 1;
571 } 572 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7c85d69188ef..5597c157442a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq)
38} 38}
39EXPORT_SYMBOL(synchronize_irq); 39EXPORT_SYMBOL(synchronize_irq);
40 40
41/**
42 * irq_can_set_affinity - Check if the affinity of a given irq can be set
43 * @irq: Interrupt to check
44 *
45 */
46int irq_can_set_affinity(unsigned int irq)
47{
48 struct irq_desc *desc = irq_desc + irq;
49
50 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
51 !desc->chip->set_affinity)
52 return 0;
53
54 return 1;
55}
56
57/**
58 * irq_set_affinity - Set the irq affinity of a given irq
59 * @irq: Interrupt to set affinity
60 * @cpumask: cpumask
61 *
62 */
63int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
64{
65 struct irq_desc *desc = irq_desc + irq;
66
67 if (!desc->chip->set_affinity)
68 return -EINVAL;
69
70 set_balance_irq_affinity(irq, cpumask);
71
72#ifdef CONFIG_GENERIC_PENDING_IRQ
73 set_pending_irq(irq, cpumask);
74#else
75 desc->affinity = cpumask;
76 desc->chip->set_affinity(irq, cpumask);
77#endif
78 return 0;
79}
80
41#endif 81#endif
42 82
43/** 83/**
@@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
281 if (new->flags & IRQF_PERCPU) 321 if (new->flags & IRQF_PERCPU)
282 desc->status |= IRQ_PER_CPU; 322 desc->status |= IRQ_PER_CPU;
283#endif 323#endif
324 /* Exclude IRQ from balancing */
325 if (new->flags & IRQF_NOBALANCING)
326 desc->status |= IRQ_NO_BALANCING;
327
284 if (!shared) { 328 if (!shared) {
285 irq_chip_set_defaults(desc->chip); 329 irq_chip_set_defaults(desc->chip);
286 330
@@ -461,7 +505,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
461 /* 505 /*
462 * Lockdep wants atomic interrupt handlers: 506 * Lockdep wants atomic interrupt handlers:
463 */ 507 */
464 irqflags |= SA_INTERRUPT; 508 irqflags |= IRQF_DISABLED;
465#endif 509#endif
466 /* 510 /*
467 * Sanity-check: shared interrupts must pass in a real dev-ID, 511 * Sanity-check: shared interrupts must pass in a real dev-ID,
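The two helpers added to kernel/irq/manage.c above give in-kernel users the same affinity control that /proc/irq/*/smp_affinity exposes to user space (irq/proc.c below now calls irq_set_affinity() as well). An illustrative, hedged use; the function name and the <linux/interrupt.h> include are assumptions, not taken from the patch:

#include <linux/interrupt.h>
#include <linux/cpumask.h>

static void pin_irq_to_cpu0(unsigned int irq)
{
	/* Per-CPU interrupts and chips without ->set_affinity cannot move. */
	if (!irq_can_set_affinity(irq))
		return;

	/* Route the interrupt to CPU 0 only. */
	irq_set_affinity(irq, cpumask_of_cpu(0));
}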
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6d3be06e8ce6..2db91eb54ad8 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir;
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19#ifdef CONFIG_GENERIC_PENDING_IRQ
20void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
21{
22 set_balance_irq_affinity(irq, mask_val);
23
24 /*
25 * Save these away for later use. Re-progam when the
26 * interrupt is pending
27 */
28 set_pending_irq(irq, mask_val);
29}
30#else
31void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
32{
33 set_balance_irq_affinity(irq, mask_val);
34 irq_desc[irq].affinity = mask_val;
35 irq_desc[irq].chip->set_affinity(irq, mask_val);
36}
37#endif
38
39static int irq_affinity_read_proc(char *page, char **start, off_t off, 19static int irq_affinity_read_proc(char *page, char **start, off_t off,
40 int count, int *eof, void *data) 20 int count, int *eof, void *data)
41{ 21{
@@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
55 cpumask_t new_value, tmp; 35 cpumask_t new_value, tmp;
56 36
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 37 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
58 CHECK_IRQ_PER_CPU(irq_desc[irq].status)) 38 irq_balancing_disabled(irq))
59 return -EIO; 39 return -EIO;
60 40
61 err = cpumask_parse_user(buffer, count, new_value); 41 err = cpumask_parse_user(buffer, count, new_value);
@@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
73 code to set default SMP affinity. */ 53 code to set default SMP affinity. */
74 return select_smp_affinity(irq) ? -EINVAL : full_count; 54 return select_smp_affinity(irq) ? -EINVAL : full_count;
75 55
76 proc_set_irq_affinity(irq, new_value); 56 irq_set_affinity(irq, new_value);
77 57
78 return full_count; 58 return full_count;
79} 59}
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 204ed7939e75..307c6a632ef6 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
128/* 128/*
129 * The timer is automagically restarted, when interval != 0 129 * The timer is automagically restarted, when interval != 0
130 */ 130 */
131int it_real_fn(struct hrtimer *timer) 131enum hrtimer_restart it_real_fn(struct hrtimer *timer)
132{ 132{
133 struct signal_struct *sig = 133 struct signal_struct *sig =
134 container_of(timer, struct signal_struct, real_timer); 134 container_of(timer, struct signal_struct, real_timer);
135 135
136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); 136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
137 137
138 if (sig->it_real_incr.tv64 != 0) {
139 hrtimer_forward(timer, timer->base->softirq_time,
140 sig->it_real_incr);
141 return HRTIMER_RESTART;
142 }
143 return HRTIMER_NORESTART; 138 return HRTIMER_NORESTART;
144} 139}
145 140
@@ -231,11 +226,14 @@ again:
231 spin_unlock_irq(&tsk->sighand->siglock); 226 spin_unlock_irq(&tsk->sighand->siglock);
232 goto again; 227 goto again;
233 } 228 }
234 tsk->signal->it_real_incr =
235 timeval_to_ktime(value->it_interval);
236 expires = timeval_to_ktime(value->it_value); 229 expires = timeval_to_ktime(value->it_value);
237 if (expires.tv64 != 0) 230 if (expires.tv64 != 0) {
238 hrtimer_start(timer, expires, HRTIMER_REL); 231 tsk->signal->it_real_incr =
232 timeval_to_ktime(value->it_interval);
233 hrtimer_start(timer, expires, HRTIMER_MODE_REL);
234 } else
235 tsk->signal->it_real_incr.tv64 = 0;
236
239 spin_unlock_irq(&tsk->sighand->siglock); 237 spin_unlock_irq(&tsk->sighand->siglock);
240 break; 238 break;
241 case ITIMER_VIRTUAL: 239 case ITIMER_VIRTUAL:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3a7379aa31ca..796276141e51 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -217,7 +217,10 @@ static int wait_for_helper(void *data)
217 sub_info->retval = ret; 217 sub_info->retval = ret;
218 } 218 }
219 219
220 complete(sub_info->complete); 220 if (sub_info->wait < 0)
221 kfree(sub_info);
222 else
223 complete(sub_info->complete);
221 return 0; 224 return 0;
222} 225}
223 226
@@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work)
239 pid = kernel_thread(____call_usermodehelper, sub_info, 242 pid = kernel_thread(____call_usermodehelper, sub_info,
240 CLONE_VFORK | SIGCHLD); 243 CLONE_VFORK | SIGCHLD);
241 244
245 if (wait < 0)
246 return;
247
242 if (pid < 0) { 248 if (pid < 0) {
243 sub_info->retval = pid; 249 sub_info->retval = pid;
244 complete(sub_info->complete); 250 complete(sub_info->complete);
@@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work)
253 * @envp: null-terminated environment list 259 * @envp: null-terminated environment list
254 * @session_keyring: session keyring for process (NULL for an empty keyring) 260 * @session_keyring: session keyring for process (NULL for an empty keyring)
255 * @wait: wait for the application to finish and return status. 261 * @wait: wait for the application to finish and return status.
262 * when -1 don't wait at all, but you get no useful error back when
263 * the program couldn't be exec'ed. This makes it safe to call
264 * from interrupt context.
256 * 265 *
257 * Runs a user-space application. The application is started 266 * Runs a user-space application. The application is started
258 * asynchronously if wait is not set, and runs as a child of keventd. 267 * asynchronously if wait is not set, and runs as a child of keventd.
@@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
265 struct key *session_keyring, int wait) 274 struct key *session_keyring, int wait)
266{ 275{
267 DECLARE_COMPLETION_ONSTACK(done); 276 DECLARE_COMPLETION_ONSTACK(done);
268 struct subprocess_info sub_info = { 277 struct subprocess_info *sub_info;
269 .work = __WORK_INITIALIZER(sub_info.work, 278 int retval;
270 __call_usermodehelper),
271 .complete = &done,
272 .path = path,
273 .argv = argv,
274 .envp = envp,
275 .ring = session_keyring,
276 .wait = wait,
277 .retval = 0,
278 };
279 279
280 if (!khelper_wq) 280 if (!khelper_wq)
281 return -EBUSY; 281 return -EBUSY;
@@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
283 if (path[0] == '\0') 283 if (path[0] == '\0')
284 return 0; 284 return 0;
285 285
286 queue_work(khelper_wq, &sub_info.work); 286 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
287 if (!sub_info)
288 return -ENOMEM;
289
290 INIT_WORK(&sub_info->work, __call_usermodehelper);
291 sub_info->complete = &done;
292 sub_info->path = path;
293 sub_info->argv = argv;
294 sub_info->envp = envp;
295 sub_info->ring = session_keyring;
296 sub_info->wait = wait;
297
298 queue_work(khelper_wq, &sub_info->work);
299 if (wait < 0) /* task has freed sub_info */
300 return 0;
287 wait_for_completion(&done); 301 wait_for_completion(&done);
288 return sub_info.retval; 302 retval = sub_info->retval;
303 kfree(sub_info);
304 return retval;
289} 305}
290EXPORT_SYMBOL(call_usermodehelper_keys); 306EXPORT_SYMBOL(call_usermodehelper_keys);
291 307
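
The rewrite replaces the on-stack subprocess_info with a heap allocation so a caller passing wait < 0 can return immediately while the helper frees the structure itself; that is what makes the no-wait case safe from atomic context, at the cost of losing the exec error code. A standalone sketch of the same ownership-handoff pattern using POSIX threads (not the kernel API; all names are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct request {
	int wait;		/* <0: fire and forget, >=0: caller waits */
	int retval;
	int finished;
	pthread_mutex_t lock;
	pthread_cond_t done;
};

static void *worker(void *arg)
{
	struct request *req = arg;

	req->retval = 42;	/* stand-in for the real work */

	if (req->wait < 0) {	/* nobody waits: the worker owns req */
		free(req);
		return NULL;
	}
	pthread_mutex_lock(&req->lock);
	req->finished = 1;
	pthread_cond_signal(&req->done);
	pthread_mutex_unlock(&req->lock);
	return NULL;
}

static int submit(int wait)
{
	struct request *req = calloc(1, sizeof(*req));
	pthread_t tid;
	int ret;

	if (!req)
		return -1;
	req->wait = wait;
	pthread_mutex_init(&req->lock, NULL);
	pthread_cond_init(&req->done, NULL);

	pthread_create(&tid, NULL, worker, req);
	pthread_detach(tid);

	if (wait < 0)		/* the worker will free req */
		return 0;

	pthread_mutex_lock(&req->lock);
	while (!req->finished)
		pthread_cond_wait(&req->done, &req->lock);
	pthread_mutex_unlock(&req->lock);
	ret = req->retval;
	free(req);
	return ret;
}

int main(void)
{
	printf("waited result: %d\n", submit(0));
	submit(-1);		/* fire and forget */
	return 0;
}
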
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 88fc611b3ae9..58f35e586ee3 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -10,7 +10,6 @@
10 * Code for /proc/lockdep and /proc/lockdep_stats: 10 * Code for /proc/lockdep and /proc/lockdep_stats:
11 * 11 *
12 */ 12 */
13#include <linux/sched.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/proc_fs.h> 14#include <linux/proc_fs.h>
16#include <linux/seq_file.h> 15#include <linux/seq_file.h>
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 841539d72c55..d17436cdea1b 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -13,7 +13,6 @@
13 * Released under the General Public License (GPL). 13 * Released under the General Public License (GPL).
14 */ 14 */
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <linux/sched.h>
17#include <linux/delay.h> 16#include <linux/delay.h>
18#include <linux/module.h> 17#include <linux/module.h>
19#include <linux/poison.h> 18#include <linux/poison.h>
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 7c3e1e6dfb5b..657f77697415 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
304 * should be able to see it. 304 * should be able to see it.
305 */ 305 */
306 struct task_struct *p; 306 struct task_struct *p;
307 read_lock(&tasklist_lock); 307 rcu_read_lock();
308 p = find_task_by_pid(pid); 308 p = find_task_by_pid(pid);
309 if (p) { 309 if (p) {
310 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 if (CPUCLOCK_PERTHREAD(which_clock)) {
@@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
312 error = cpu_clock_sample(which_clock, 312 error = cpu_clock_sample(which_clock,
313 p, &rtn); 313 p, &rtn);
314 } 314 }
315 } else if (p->tgid == pid && p->signal) { 315 } else {
316 error = cpu_clock_sample_group(which_clock, 316 read_lock(&tasklist_lock);
317 p, &rtn); 317 if (p->tgid == pid && p->signal) {
318 error =
319 cpu_clock_sample_group(which_clock,
320 p, &rtn);
321 }
322 read_unlock(&tasklist_lock);
318 } 323 }
319 } 324 }
320 read_unlock(&tasklist_lock); 325 rcu_read_unlock();
321 } 326 }
322 327
323 if (error) 328 if (error)
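
After this hunk the pid-to-task lookup is covered only by rcu_read_lock(), and tasklist_lock is narrowed to the process-wide sample that still needs it. A condensed kernel-context sketch of the resulting flow, assuming the 2.6.20-era primitives and the union cpu_time_count sample type used in this file (not a standalone program):

#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/posix-timers.h>

static int sample_clock_for_pid(const clockid_t which_clock, pid_t pid,
				union cpu_time_count *rtn)
{
	struct task_struct *p;
	int error = -EINVAL;

	rcu_read_lock();		/* keeps the task valid over the lookup */
	p = find_task_by_pid(pid);
	if (p) {
		if (CPUCLOCK_PERTHREAD(which_clock)) {
			if (p->tgid == current->tgid)
				error = cpu_clock_sample(which_clock, p, rtn);
		} else {
			read_lock(&tasklist_lock);	/* group sample only */
			if (p->tgid == pid && p->signal)
				error = cpu_clock_sample_group(which_clock,
							       p, rtn);
			read_unlock(&tasklist_lock);
		}
	}
	rcu_read_unlock();
	return error;
}
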
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a1bf61617839..44318ca71978 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
145 struct itimerspec *, struct itimerspec *); 145 struct itimerspec *, struct itimerspec *);
146static int common_timer_del(struct k_itimer *timer); 146static int common_timer_del(struct k_itimer *timer);
147 147
148static int posix_timer_fn(struct hrtimer *data); 148static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
149 149
150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
151 151
@@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
334 334
335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
336 */ 336 */
337static int posix_timer_fn(struct hrtimer *timer) 337static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
338{ 338{
339 struct k_itimer *timr; 339 struct k_itimer *timr;
340 unsigned long flags; 340 unsigned long flags;
341 int si_private = 0; 341 int si_private = 0;
342 int ret = HRTIMER_NORESTART; 342 enum hrtimer_restart ret = HRTIMER_NORESTART;
343 343
344 timr = container_of(timer, struct k_itimer, it.real.timer); 344 timr = container_of(timer, struct k_itimer, it.real.timer);
345 spin_lock_irqsave(&timr->it_lock, flags); 345 spin_lock_irqsave(&timr->it_lock, flags);
@@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer)
356 if (timr->it.real.interval.tv64 != 0) { 356 if (timr->it.real.interval.tv64 != 0) {
357 timr->it_overrun += 357 timr->it_overrun +=
358 hrtimer_forward(timer, 358 hrtimer_forward(timer,
359 timer->base->softirq_time, 359 hrtimer_cb_get_time(timer),
360 timr->it.real.interval); 360 timr->it.real.interval);
361 ret = HRTIMER_RESTART; 361 ret = HRTIMER_RESTART;
362 ++timr->it_requeue_pending; 362 ++timr->it_requeue_pending;
@@ -722,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags,
722 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) 722 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
723 return 0; 723 return 0;
724 724
725 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; 725 mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
726 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 726 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
727 timr->it.real.timer.function = posix_timer_fn; 727 timr->it.real.timer.function = posix_timer_fn;
728 728
@@ -734,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags,
734 /* SIGEV_NONE timers are not queued ! See common_timer_get */ 734 /* SIGEV_NONE timers are not queued ! See common_timer_get */
735 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 735 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
736 /* Setup correct expiry time for relative timers */ 736 /* Setup correct expiry time for relative timers */
737 if (mode == HRTIMER_REL) 737 if (mode == HRTIMER_MODE_REL)
738 timer->expires = ktime_add(timer->expires, 738 timer->expires = ktime_add(timer->expires,
739 timer->base->get_time()); 739 timer->base->get_time());
740 return 0; 740 return 0;
@@ -950,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags,
950 struct timespec *tsave, struct timespec __user *rmtp) 950 struct timespec *tsave, struct timespec __user *rmtp)
951{ 951{
952 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 952 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
953 HRTIMER_ABS : HRTIMER_REL, which_clock); 953 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
954 which_clock);
954} 955}
955 956
956asmlinkage long 957asmlinkage long
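
Two conventions from this hunk apply to every hrtimer user in this release: callbacks now return enum hrtimer_restart rather than int, and periodic forwarding uses hrtimer_cb_get_time() as the base so the same code is correct for both softirq and hard-irq (high resolution) expiry. A minimal periodic-timer sketch following that pattern; my_timer, my_period and the 100 ms interval are illustrative names and values, not kernel ones:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer my_timer;
static ktime_t my_period;

static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
	/* periodic work goes here ... */

	hrtimer_forward(timer, hrtimer_cb_get_time(timer), my_period);
	return HRTIMER_RESTART;
}

static void my_timer_setup(void)
{
	my_period = ktime_set(0, 100 * NSEC_PER_MSEC);	/* 100 ms, arbitrary */
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	my_timer.function = my_timer_fn;
	hrtimer_start(&my_timer, my_period, HRTIMER_MODE_REL);
}
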
diff --git a/kernel/resource.c b/kernel/resource.c
index 2a3f88636580..bdb55a33f969 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h>
12#include <linux/errno.h> 11#include <linux/errno.h>
13#include <linux/ioport.h> 12#include <linux/ioport.h>
14#include <linux/init.h> 13#include <linux/init.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 4ab17da46fd8..180978cb2f75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
625 /* Setup the timer, when timeout != NULL */ 625 /* Setup the timer, when timeout != NULL */
626 if (unlikely(timeout)) 626 if (unlikely(timeout))
627 hrtimer_start(&timeout->timer, timeout->timer.expires, 627 hrtimer_start(&timeout->timer, timeout->timer.expires,
628 HRTIMER_ABS); 628 HRTIMER_MODE_ABS);
629 629
630 for (;;) { 630 for (;;) {
631 /* Try to acquire the lock: */ 631 /* Try to acquire the lock: */
diff --git a/kernel/sched.c b/kernel/sched.c
index 08f86178aa34..0dc757246d89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1853,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1853 struct mm_struct *mm = next->mm; 1853 struct mm_struct *mm = next->mm;
1854 struct mm_struct *oldmm = prev->active_mm; 1854 struct mm_struct *oldmm = prev->active_mm;
1855 1855
1856 /*
1857 * For paravirt, this is coupled with an exit in switch_to to
1858 * combine the page table reload and the switch backend into
1859 * one hypercall.
1860 */
1861 arch_enter_lazy_cpu_mode();
1862
1856 if (!mm) { 1863 if (!mm) {
1857 next->active_mm = oldmm; 1864 next->active_mm = oldmm;
1858 atomic_inc(&oldmm->mm_count); 1865 atomic_inc(&oldmm->mm_count);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8072e568bbe0..e2a7d4bf7d57 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
456int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 456int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
457{ 457{
458 int signr = __dequeue_signal(&tsk->pending, mask, info); 458 int signr = __dequeue_signal(&tsk->pending, mask, info);
459 if (!signr) 459 if (!signr) {
460 signr = __dequeue_signal(&tsk->signal->shared_pending, 460 signr = __dequeue_signal(&tsk->signal->shared_pending,
461 mask, info); 461 mask, info);
462 /*
463 * itimer signal ?
464 *
465 * itimers are process shared and we restart periodic
466 * itimers in the signal delivery path to prevent DoS
467 * attacks in the high resolution timer case. This is
468 * compliant with the old way of self restarting
469 * itimers, as the SIGALRM is a legacy signal and only
470 * queued once. Changing the restart behaviour to
471 * restart the timer in the signal dequeue path is
472 * reducing the timer noise on heavy loaded !highres
473 * systems too.
474 */
475 if (unlikely(signr == SIGALRM)) {
476 struct hrtimer *tmr = &tsk->signal->real_timer;
477
478 if (!hrtimer_is_queued(tmr) &&
479 tsk->signal->it_real_incr.tv64 != 0) {
480 hrtimer_forward(tmr, tmr->base->get_time(),
481 tsk->signal->it_real_incr);
482 hrtimer_restart(tmr);
483 }
484 }
485 }
462 recalc_sigpending_tsk(tsk); 486 recalc_sigpending_tsk(tsk);
463 if (signr && unlikely(sig_kernel_stop(signr))) { 487 if (signr && unlikely(sig_kernel_stop(signr))) {
464 /* 488 /*
465 * Set a marker that we have dequeued a stop signal. Our 489 * Set a marker that we have dequeued a stop signal. Our
466 * caller might release the siglock and then the pending 490 * caller might release the siglock and then the pending
467 * stop signal it is about to process is no longer in the 491 * stop signal it is about to process is no longer in the
468 * pending bitmasks, but must still be cleared by a SIGCONT 492 * pending bitmasks, but must still be cleared by a SIGCONT
469 * (and overruled by a SIGKILL). So those cases clear this 493 * (and overruled by a SIGKILL). So those cases clear this
470 * shared flag after we've set it. Note that this flag may 494 * shared flag after we've set it. Note that this flag may
471 * remain set after the signal we return is ignored or 495 * remain set after the signal we return is ignored or
472 * handled. That doesn't matter because its only purpose 496 * handled. That doesn't matter because its only purpose
473 * is to alert stop-signal processing code when another 497 * is to alert stop-signal processing code when another
474 * processor has come along and cleared the flag. 498 * processor has come along and cleared the flag.
475 */ 499 */
476 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 500 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
477 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 501 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
478 } 502 }
479 if ( signr && 503 if ( signr &&
480 ((info->si_code & __SI_MASK) == __SI_TIMER) && 504 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
481 info->si_sys_private){ 505 info->si_sys_private){
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 918e52df090e..8b75008e2bd8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -17,6 +17,7 @@
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/tick.h>
20 21
21#include <asm/irq.h> 22#include <asm/irq.h>
22/* 23/*
@@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq);
273 274
274#endif 275#endif
275 276
277/*
278 * Enter an interrupt context.
279 */
280void irq_enter(void)
281{
282 __irq_enter();
283#ifdef CONFIG_NO_HZ
284 if (idle_cpu(smp_processor_id()))
285 tick_nohz_update_jiffies();
286#endif
287}
288
276#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 289#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
277# define invoke_softirq() __do_softirq() 290# define invoke_softirq() __do_softirq()
278#else 291#else
@@ -289,6 +302,12 @@ void irq_exit(void)
289 sub_preempt_count(IRQ_EXIT_OFFSET); 302 sub_preempt_count(IRQ_EXIT_OFFSET);
290 if (!in_interrupt() && local_softirq_pending()) 303 if (!in_interrupt() && local_softirq_pending())
291 invoke_softirq(); 304 invoke_softirq();
305
306#ifdef CONFIG_NO_HZ
307 /* Make sure that timer wheel updates are propagated */
308 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
309 tick_nohz_stop_sched_tick();
310#endif
292 preempt_enable_no_resched(); 311 preempt_enable_no_resched();
293} 312}
294 313
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e0ac6cd79fcf..3ca1d5ff0319 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -90,12 +90,6 @@ extern char modprobe_path[];
90#ifdef CONFIG_CHR_DEV_SG 90#ifdef CONFIG_CHR_DEV_SG
91extern int sg_big_buff; 91extern int sg_big_buff;
92#endif 92#endif
93#ifdef CONFIG_SYSVIPC
94static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
95 void __user *buffer, size_t *lenp, loff_t *ppos);
96static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
97 void __user *buffer, size_t *lenp, loff_t *ppos);
98#endif
99 93
100#ifdef __sparc__ 94#ifdef __sparc__
101extern char reboot_command []; 95extern char reboot_command [];
@@ -135,18 +129,6 @@ static int parse_table(int __user *, int, void __user *, size_t __user *,
135 void __user *, size_t, ctl_table *); 129 void __user *, size_t, ctl_table *);
136#endif 130#endif
137 131
138static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
139 void __user *buffer, size_t *lenp, loff_t *ppos);
140
141static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
142 void __user *oldval, size_t __user *oldlenp,
143 void __user *newval, size_t newlen);
144
145#ifdef CONFIG_SYSVIPC
146static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
147 void __user *oldval, size_t __user *oldlenp,
148 void __user *newval, size_t newlen);
149#endif
150 132
151#ifdef CONFIG_PROC_SYSCTL 133#ifdef CONFIG_PROC_SYSCTL
152static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 134static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
@@ -177,60 +159,6 @@ int sysctl_legacy_va_layout;
177#endif 159#endif
178 160
179 161
180static void *get_uts(ctl_table *table, int write)
181{
182 char *which = table->data;
183#ifdef CONFIG_UTS_NS
184 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
185 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
186#endif
187 if (!write)
188 down_read(&uts_sem);
189 else
190 down_write(&uts_sem);
191 return which;
192}
193
194static void put_uts(ctl_table *table, int write, void *which)
195{
196 if (!write)
197 up_read(&uts_sem);
198 else
199 up_write(&uts_sem);
200}
201
202#ifdef CONFIG_SYSVIPC
203static void *get_ipc(ctl_table *table, int write)
204{
205 char *which = table->data;
206 struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
207 which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
208 return which;
209}
210#else
211#define get_ipc(T,W) ((T)->data)
212#endif
213
214/* /proc declarations: */
215
216#ifdef CONFIG_PROC_SYSCTL
217
218static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
219static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
220static int proc_opensys(struct inode *, struct file *);
221
222const struct file_operations proc_sys_file_operations = {
223 .open = proc_opensys,
224 .read = proc_readsys,
225 .write = proc_writesys,
226};
227
228extern struct proc_dir_entry *proc_sys_root;
229
230static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *);
231static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
232#endif
233
234/* The default sysctl tables: */ 162/* The default sysctl tables: */
235 163
236static ctl_table root_table[] = { 164static ctl_table root_table[] = {
@@ -278,51 +206,6 @@ static ctl_table root_table[] = {
278 206
279static ctl_table kern_table[] = { 207static ctl_table kern_table[] = {
280 { 208 {
281 .ctl_name = KERN_OSTYPE,
282 .procname = "ostype",
283 .data = init_uts_ns.name.sysname,
284 .maxlen = sizeof(init_uts_ns.name.sysname),
285 .mode = 0444,
286 .proc_handler = &proc_do_uts_string,
287 .strategy = &sysctl_uts_string,
288 },
289 {
290 .ctl_name = KERN_OSRELEASE,
291 .procname = "osrelease",
292 .data = init_uts_ns.name.release,
293 .maxlen = sizeof(init_uts_ns.name.release),
294 .mode = 0444,
295 .proc_handler = &proc_do_uts_string,
296 .strategy = &sysctl_uts_string,
297 },
298 {
299 .ctl_name = KERN_VERSION,
300 .procname = "version",
301 .data = init_uts_ns.name.version,
302 .maxlen = sizeof(init_uts_ns.name.version),
303 .mode = 0444,
304 .proc_handler = &proc_do_uts_string,
305 .strategy = &sysctl_uts_string,
306 },
307 {
308 .ctl_name = KERN_NODENAME,
309 .procname = "hostname",
310 .data = init_uts_ns.name.nodename,
311 .maxlen = sizeof(init_uts_ns.name.nodename),
312 .mode = 0644,
313 .proc_handler = &proc_do_uts_string,
314 .strategy = &sysctl_uts_string,
315 },
316 {
317 .ctl_name = KERN_DOMAINNAME,
318 .procname = "domainname",
319 .data = init_uts_ns.name.domainname,
320 .maxlen = sizeof(init_uts_ns.name.domainname),
321 .mode = 0644,
322 .proc_handler = &proc_do_uts_string,
323 .strategy = &sysctl_uts_string,
324 },
325 {
326 .ctl_name = KERN_PANIC, 209 .ctl_name = KERN_PANIC,
327 .procname = "panic", 210 .procname = "panic",
328 .data = &panic_timeout, 211 .data = &panic_timeout,
@@ -478,71 +361,6 @@ static ctl_table kern_table[] = {
478 .proc_handler = &proc_dointvec, 361 .proc_handler = &proc_dointvec,
479 }, 362 },
480#endif 363#endif
481#ifdef CONFIG_SYSVIPC
482 {
483 .ctl_name = KERN_SHMMAX,
484 .procname = "shmmax",
485 .data = &init_ipc_ns.shm_ctlmax,
486 .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
487 .mode = 0644,
488 .proc_handler = &proc_ipc_doulongvec_minmax,
489 .strategy = sysctl_ipc_data,
490 },
491 {
492 .ctl_name = KERN_SHMALL,
493 .procname = "shmall",
494 .data = &init_ipc_ns.shm_ctlall,
495 .maxlen = sizeof (init_ipc_ns.shm_ctlall),
496 .mode = 0644,
497 .proc_handler = &proc_ipc_doulongvec_minmax,
498 .strategy = sysctl_ipc_data,
499 },
500 {
501 .ctl_name = KERN_SHMMNI,
502 .procname = "shmmni",
503 .data = &init_ipc_ns.shm_ctlmni,
504 .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
505 .mode = 0644,
506 .proc_handler = &proc_ipc_dointvec,
507 .strategy = sysctl_ipc_data,
508 },
509 {
510 .ctl_name = KERN_MSGMAX,
511 .procname = "msgmax",
512 .data = &init_ipc_ns.msg_ctlmax,
513 .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
514 .mode = 0644,
515 .proc_handler = &proc_ipc_dointvec,
516 .strategy = sysctl_ipc_data,
517 },
518 {
519 .ctl_name = KERN_MSGMNI,
520 .procname = "msgmni",
521 .data = &init_ipc_ns.msg_ctlmni,
522 .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
523 .mode = 0644,
524 .proc_handler = &proc_ipc_dointvec,
525 .strategy = sysctl_ipc_data,
526 },
527 {
528 .ctl_name = KERN_MSGMNB,
529 .procname = "msgmnb",
530 .data = &init_ipc_ns.msg_ctlmnb,
531 .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
532 .mode = 0644,
533 .proc_handler = &proc_ipc_dointvec,
534 .strategy = sysctl_ipc_data,
535 },
536 {
537 .ctl_name = KERN_SEM,
538 .procname = "sem",
539 .data = &init_ipc_ns.sem_ctls,
540 .maxlen = 4*sizeof (int),
541 .mode = 0644,
542 .proc_handler = &proc_ipc_dointvec,
543 .strategy = sysctl_ipc_data,
544 },
545#endif
546#ifdef CONFIG_MAGIC_SYSRQ 364#ifdef CONFIG_MAGIC_SYSRQ
547 { 365 {
548 .ctl_name = KERN_SYSRQ, 366 .ctl_name = KERN_SYSRQ,
@@ -1043,6 +861,12 @@ static ctl_table vm_table[] = {
1043 { .ctl_name = 0 } 861 { .ctl_name = 0 }
1044}; 862};
1045 863
864#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
865static ctl_table binfmt_misc_table[] = {
866 { .ctl_name = 0 }
867};
868#endif
869
1046static ctl_table fs_table[] = { 870static ctl_table fs_table[] = {
1047 { 871 {
1048 .ctl_name = FS_NRINODE, 872 .ctl_name = FS_NRINODE,
@@ -1166,6 +990,14 @@ static ctl_table fs_table[] = {
1166 .mode = 0644, 990 .mode = 0644,
1167 .proc_handler = &proc_dointvec, 991 .proc_handler = &proc_dointvec,
1168 }, 992 },
993#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
994 {
995 .ctl_name = CTL_UNNUMBERED,
996 .procname = "binfmt_misc",
997 .mode = 0555,
998 .child = binfmt_misc_table,
999 },
1000#endif
1169 { .ctl_name = 0 } 1001 { .ctl_name = 0 }
1170}; 1002};
1171 1003
@@ -1177,8 +1009,6 @@ static ctl_table dev_table[] = {
1177 { .ctl_name = 0 } 1009 { .ctl_name = 0 }
1178}; 1010};
1179 1011
1180extern void init_irq_proc (void);
1181
1182static DEFINE_SPINLOCK(sysctl_lock); 1012static DEFINE_SPINLOCK(sysctl_lock);
1183 1013
1184/* called under sysctl_lock */ 1014/* called under sysctl_lock */
@@ -1220,19 +1050,47 @@ static void start_unregistering(struct ctl_table_header *p)
1220 list_del_init(&p->ctl_entry); 1050 list_del_init(&p->ctl_entry);
1221} 1051}
1222 1052
1223void __init sysctl_init(void) 1053void sysctl_head_finish(struct ctl_table_header *head)
1224{ 1054{
1225#ifdef CONFIG_PROC_SYSCTL 1055 if (!head)
1226 register_proc_table(root_table, proc_sys_root, &root_table_header); 1056 return;
1227 init_irq_proc(); 1057 spin_lock(&sysctl_lock);
1228#endif 1058 unuse_table(head);
1059 spin_unlock(&sysctl_lock);
1060}
1061
1062struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1063{
1064 struct ctl_table_header *head;
1065 struct list_head *tmp;
1066 spin_lock(&sysctl_lock);
1067 if (prev) {
1068 tmp = &prev->ctl_entry;
1069 unuse_table(prev);
1070 goto next;
1071 }
1072 tmp = &root_table_header.ctl_entry;
1073 for (;;) {
1074 head = list_entry(tmp, struct ctl_table_header, ctl_entry);
1075
1076 if (!use_table(head))
1077 goto next;
1078 spin_unlock(&sysctl_lock);
1079 return head;
1080 next:
1081 tmp = tmp->next;
1082 if (tmp == &root_table_header.ctl_entry)
1083 break;
1084 }
1085 spin_unlock(&sysctl_lock);
1086 return NULL;
1229} 1087}
1230 1088
1231#ifdef CONFIG_SYSCTL_SYSCALL 1089#ifdef CONFIG_SYSCTL_SYSCALL
1232int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1090int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1233 void __user *newval, size_t newlen) 1091 void __user *newval, size_t newlen)
1234{ 1092{
1235 struct list_head *tmp; 1093 struct ctl_table_header *head;
1236 int error = -ENOTDIR; 1094 int error = -ENOTDIR;
1237 1095
1238 if (nlen <= 0 || nlen >= CTL_MAXNAME) 1096 if (nlen <= 0 || nlen >= CTL_MAXNAME)
@@ -1242,26 +1100,16 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1242 if (!oldlenp || get_user(old_len, oldlenp)) 1100 if (!oldlenp || get_user(old_len, oldlenp))
1243 return -EFAULT; 1101 return -EFAULT;
1244 } 1102 }
1245 spin_lock(&sysctl_lock);
1246 tmp = &root_table_header.ctl_entry;
1247 do {
1248 struct ctl_table_header *head =
1249 list_entry(tmp, struct ctl_table_header, ctl_entry);
1250
1251 if (!use_table(head))
1252 continue;
1253
1254 spin_unlock(&sysctl_lock);
1255 1103
1104 for (head = sysctl_head_next(NULL); head;
1105 head = sysctl_head_next(head)) {
1256 error = parse_table(name, nlen, oldval, oldlenp, 1106 error = parse_table(name, nlen, oldval, oldlenp,
1257 newval, newlen, head->ctl_table); 1107 newval, newlen, head->ctl_table);
1258 1108 if (error != -ENOTDIR) {
1259 spin_lock(&sysctl_lock); 1109 sysctl_head_finish(head);
1260 unuse_table(head);
1261 if (error != -ENOTDIR)
1262 break; 1110 break;
1263 } while ((tmp = tmp->next) != &root_table_header.ctl_entry); 1111 }
1264 spin_unlock(&sysctl_lock); 1112 }
1265 return error; 1113 return error;
1266} 1114}
1267 1115
@@ -1282,7 +1130,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
1282#endif /* CONFIG_SYSCTL_SYSCALL */ 1130#endif /* CONFIG_SYSCTL_SYSCALL */
1283 1131
1284/* 1132/*
1285 * ctl_perm does NOT grant the superuser all rights automatically, because 1133 * sysctl_perm does NOT grant the superuser all rights automatically, because
1286 * some sysctl variables are readonly even to root. 1134 * some sysctl variables are readonly even to root.
1287 */ 1135 */
1288 1136
@@ -1297,7 +1145,7 @@ static int test_perm(int mode, int op)
1297 return -EACCES; 1145 return -EACCES;
1298} 1146}
1299 1147
1300static inline int ctl_perm(ctl_table *table, int op) 1148int sysctl_perm(ctl_table *table, int op)
1301{ 1149{
1302 int error; 1150 int error;
1303 error = security_sysctl(table, op); 1151 error = security_sysctl(table, op);
@@ -1321,19 +1169,11 @@ repeat:
1321 for ( ; table->ctl_name || table->procname; table++) { 1169 for ( ; table->ctl_name || table->procname; table++) {
1322 if (!table->ctl_name) 1170 if (!table->ctl_name)
1323 continue; 1171 continue;
1324 if (n == table->ctl_name || table->ctl_name == CTL_ANY) { 1172 if (n == table->ctl_name) {
1325 int error; 1173 int error;
1326 if (table->child) { 1174 if (table->child) {
1327 if (ctl_perm(table, 001)) 1175 if (sysctl_perm(table, 001))
1328 return -EPERM; 1176 return -EPERM;
1329 if (table->strategy) {
1330 error = table->strategy(
1331 table, name, nlen,
1332 oldval, oldlenp,
1333 newval, newlen);
1334 if (error)
1335 return error;
1336 }
1337 name++; 1177 name++;
1338 nlen--; 1178 nlen--;
1339 table = table->child; 1179 table = table->child;
@@ -1361,7 +1201,7 @@ int do_sysctl_strategy (ctl_table *table,
1361 op |= 004; 1201 op |= 004;
1362 if (newval) 1202 if (newval)
1363 op |= 002; 1203 op |= 002;
1364 if (ctl_perm(table, op)) 1204 if (sysctl_perm(table, op))
1365 return -EPERM; 1205 return -EPERM;
1366 1206
1367 if (table->strategy) { 1207 if (table->strategy) {
@@ -1400,10 +1240,26 @@ int do_sysctl_strategy (ctl_table *table,
1400} 1240}
1401#endif /* CONFIG_SYSCTL_SYSCALL */ 1241#endif /* CONFIG_SYSCTL_SYSCALL */
1402 1242
1243static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1244{
1245 for (; table->ctl_name || table->procname; table++) {
1246 table->parent = parent;
1247 if (table->child)
1248 sysctl_set_parent(table, table->child);
1249 }
1250}
1251
1252static __init int sysctl_init(void)
1253{
1254 sysctl_set_parent(NULL, root_table);
1255 return 0;
1256}
1257
1258core_initcall(sysctl_init);
1259
1403/** 1260/**
1404 * register_sysctl_table - register a sysctl hierarchy 1261 * register_sysctl_table - register a sysctl hierarchy
1405 * @table: the top-level table structure 1262 * @table: the top-level table structure
1406 * @insert_at_head: whether the entry should be inserted in front or at the end
1407 * 1263 *
1408 * Register a sysctl table hierarchy. @table should be a filled in ctl_table 1264 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1409 * array. An entry with a ctl_name of 0 terminates the table. 1265 * array. An entry with a ctl_name of 0 terminates the table.
@@ -1469,8 +1325,7 @@ int do_sysctl_strategy (ctl_table *table,
1469 * This routine returns %NULL on a failure to register, and a pointer 1325 * This routine returns %NULL on a failure to register, and a pointer
1470 * to the table header on success. 1326 * to the table header on success.
1471 */ 1327 */
1472struct ctl_table_header *register_sysctl_table(ctl_table * table, 1328struct ctl_table_header *register_sysctl_table(ctl_table * table)
1473 int insert_at_head)
1474{ 1329{
1475 struct ctl_table_header *tmp; 1330 struct ctl_table_header *tmp;
1476 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); 1331 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
@@ -1480,15 +1335,10 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
1480 INIT_LIST_HEAD(&tmp->ctl_entry); 1335 INIT_LIST_HEAD(&tmp->ctl_entry);
1481 tmp->used = 0; 1336 tmp->used = 0;
1482 tmp->unregistering = NULL; 1337 tmp->unregistering = NULL;
1338 sysctl_set_parent(NULL, table);
1483 spin_lock(&sysctl_lock); 1339 spin_lock(&sysctl_lock);
1484 if (insert_at_head) 1340 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1485 list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
1486 else
1487 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1488 spin_unlock(&sysctl_lock); 1341 spin_unlock(&sysctl_lock);
1489#ifdef CONFIG_PROC_SYSCTL
1490 register_proc_table(table, proc_sys_root, tmp);
1491#endif
1492 return tmp; 1342 return tmp;
1493} 1343}
1494 1344
@@ -1504,9 +1354,6 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1504 might_sleep(); 1354 might_sleep();
1505 spin_lock(&sysctl_lock); 1355 spin_lock(&sysctl_lock);
1506 start_unregistering(header); 1356 start_unregistering(header);
1507#ifdef CONFIG_PROC_SYSCTL
1508 unregister_proc_table(header->ctl_table, proc_sys_root);
1509#endif
1510 spin_unlock(&sysctl_lock); 1357 spin_unlock(&sysctl_lock);
1511 kfree(header); 1358 kfree(header);
1512} 1359}
@@ -1530,155 +1377,6 @@ void unregister_sysctl_table(struct ctl_table_header * table)
1530 1377
1531#ifdef CONFIG_PROC_SYSCTL 1378#ifdef CONFIG_PROC_SYSCTL
1532 1379
1533/* Scan the sysctl entries in table and add them all into /proc */
1534static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
1535{
1536 struct proc_dir_entry *de;
1537 int len;
1538 mode_t mode;
1539
1540 for (; table->ctl_name || table->procname; table++) {
1541 /* Can't do anything without a proc name. */
1542 if (!table->procname)
1543 continue;
1544 /* Maybe we can't do anything with it... */
1545 if (!table->proc_handler && !table->child) {
1546 printk(KERN_WARNING "SYSCTL: Can't register %s\n",
1547 table->procname);
1548 continue;
1549 }
1550
1551 len = strlen(table->procname);
1552 mode = table->mode;
1553
1554 de = NULL;
1555 if (table->proc_handler)
1556 mode |= S_IFREG;
1557 else {
1558 mode |= S_IFDIR;
1559 for (de = root->subdir; de; de = de->next) {
1560 if (proc_match(len, table->procname, de))
1561 break;
1562 }
1563 /* If the subdir exists already, de is non-NULL */
1564 }
1565
1566 if (!de) {
1567 de = create_proc_entry(table->procname, mode, root);
1568 if (!de)
1569 continue;
1570 de->set = set;
1571 de->data = (void *) table;
1572 if (table->proc_handler)
1573 de->proc_fops = &proc_sys_file_operations;
1574 }
1575 table->de = de;
1576 if (de->mode & S_IFDIR)
1577 register_proc_table(table->child, de, set);
1578 }
1579}
1580
1581/*
1582 * Unregister a /proc sysctl table and any subdirectories.
1583 */
1584static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
1585{
1586 struct proc_dir_entry *de;
1587 for (; table->ctl_name || table->procname; table++) {
1588 if (!(de = table->de))
1589 continue;
1590 if (de->mode & S_IFDIR) {
1591 if (!table->child) {
1592 printk (KERN_ALERT "Help - malformed sysctl tree on free\n");
1593 continue;
1594 }
1595 unregister_proc_table(table->child, de);
1596
1597 /* Don't unregister directories which still have entries.. */
1598 if (de->subdir)
1599 continue;
1600 }
1601
1602 /*
1603 * In any case, mark the entry as goner; we'll keep it
1604 * around if it's busy, but we'll know to do nothing with
1605 * its fields. We are under sysctl_lock here.
1606 */
1607 de->data = NULL;
1608
1609 /* Don't unregister proc entries that are still being used.. */
1610 if (atomic_read(&de->count))
1611 continue;
1612
1613 table->de = NULL;
1614 remove_proc_entry(table->procname, root);
1615 }
1616}
1617
1618static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
1619 size_t count, loff_t *ppos)
1620{
1621 int op;
1622 struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
1623 struct ctl_table *table;
1624 size_t res;
1625 ssize_t error = -ENOTDIR;
1626
1627 spin_lock(&sysctl_lock);
1628 if (de && de->data && use_table(de->set)) {
1629 /*
1630 * at that point we know that sysctl was not unregistered
1631 * and won't be until we finish
1632 */
1633 spin_unlock(&sysctl_lock);
1634 table = (struct ctl_table *) de->data;
1635 if (!table || !table->proc_handler)
1636 goto out;
1637 error = -EPERM;
1638 op = (write ? 002 : 004);
1639 if (ctl_perm(table, op))
1640 goto out;
1641
1642 /* careful: calling conventions are nasty here */
1643 res = count;
1644 error = (*table->proc_handler)(table, write, file,
1645 buf, &res, ppos);
1646 if (!error)
1647 error = res;
1648 out:
1649 spin_lock(&sysctl_lock);
1650 unuse_table(de->set);
1651 }
1652 spin_unlock(&sysctl_lock);
1653 return error;
1654}
1655
1656static int proc_opensys(struct inode *inode, struct file *file)
1657{
1658 if (file->f_mode & FMODE_WRITE) {
1659 /*
1660 * sysctl entries that are not writable,
1661 * are _NOT_ writable, capabilities or not.
1662 */
1663 if (!(inode->i_mode & S_IWUSR))
1664 return -EPERM;
1665 }
1666
1667 return 0;
1668}
1669
1670static ssize_t proc_readsys(struct file * file, char __user * buf,
1671 size_t count, loff_t *ppos)
1672{
1673 return do_rw_proc(0, file, buf, count, ppos);
1674}
1675
1676static ssize_t proc_writesys(struct file * file, const char __user * buf,
1677 size_t count, loff_t *ppos)
1678{
1679 return do_rw_proc(1, file, (char __user *) buf, count, ppos);
1680}
1681
1682static int _proc_do_string(void* data, int maxlen, int write, 1380static int _proc_do_string(void* data, int maxlen, int write,
1683 struct file *filp, void __user *buffer, 1381 struct file *filp, void __user *buffer,
1684 size_t *lenp, loff_t *ppos) 1382 size_t *lenp, loff_t *ppos)
@@ -1762,21 +1460,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
1762 buffer, lenp, ppos); 1460 buffer, lenp, ppos);
1763} 1461}
1764 1462
1765/*
1766 * Special case of dostring for the UTS structure. This has locks
1767 * to observe. Should this be in kernel/sys.c ????
1768 */
1769
1770static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1771 void __user *buffer, size_t *lenp, loff_t *ppos)
1772{
1773 int r;
1774 void *which;
1775 which = get_uts(table, write);
1776 r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos);
1777 put_uts(table, write, which);
1778 return r;
1779}
1780 1463
1781static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 1464static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1782 int *valp, 1465 int *valp,
@@ -2362,27 +2045,6 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2362 do_proc_dointvec_ms_jiffies_conv, NULL); 2045 do_proc_dointvec_ms_jiffies_conv, NULL);
2363} 2046}
2364 2047
2365#ifdef CONFIG_SYSVIPC
2366static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2367 void __user *buffer, size_t *lenp, loff_t *ppos)
2368{
2369 void *which;
2370 which = get_ipc(table, write);
2371 return __do_proc_dointvec(which, table, write, filp, buffer,
2372 lenp, ppos, NULL, NULL);
2373}
2374
2375static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2376 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
2377{
2378 void *which;
2379 which = get_ipc(table, write);
2380 return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
2381 lenp, ppos, 1l, 1l);
2382}
2383
2384#endif
2385
2386static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 2048static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
2387 void __user *buffer, size_t *lenp, loff_t *ppos) 2049 void __user *buffer, size_t *lenp, loff_t *ppos)
2388{ 2050{
@@ -2413,31 +2075,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
2413 return -ENOSYS; 2075 return -ENOSYS;
2414} 2076}
2415 2077
2416static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
2417 void __user *buffer, size_t *lenp, loff_t *ppos)
2418{
2419 return -ENOSYS;
2420}
2421
2422#ifdef CONFIG_SYSVIPC
2423static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
2424 void __user *buffer, size_t *lenp, loff_t *ppos)
2425{
2426 return -ENOSYS;
2427}
2428static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2429 void __user *buffer, size_t *lenp, loff_t *ppos)
2430{
2431 return -ENOSYS;
2432}
2433static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2434 struct file *filp, void __user *buffer,
2435 size_t *lenp, loff_t *ppos)
2436{
2437 return -ENOSYS;
2438}
2439#endif
2440
2441int proc_dointvec(ctl_table *table, int write, struct file *filp, 2078int proc_dointvec(ctl_table *table, int write, struct file *filp,
2442 void __user *buffer, size_t *lenp, loff_t *ppos) 2079 void __user *buffer, size_t *lenp, loff_t *ppos)
2443{ 2080{
@@ -2648,62 +2285,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2648} 2285}
2649 2286
2650 2287
2651/* The generic string strategy routine: */
2652static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2653 void __user *oldval, size_t __user *oldlenp,
2654 void __user *newval, size_t newlen)
2655{
2656 struct ctl_table uts_table;
2657 int r, write;
2658 write = newval && newlen;
2659 memcpy(&uts_table, table, sizeof(uts_table));
2660 uts_table.data = get_uts(table, write);
2661 r = sysctl_string(&uts_table, name, nlen,
2662 oldval, oldlenp, newval, newlen);
2663 put_uts(table, write, uts_table.data);
2664 return r;
2665}
2666
2667#ifdef CONFIG_SYSVIPC
2668/* The generic sysctl ipc data routine. */
2669static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2670 void __user *oldval, size_t __user *oldlenp,
2671 void __user *newval, size_t newlen)
2672{
2673 size_t len;
2674 void *data;
2675
2676 /* Get out of I don't have a variable */
2677 if (!table->data || !table->maxlen)
2678 return -ENOTDIR;
2679
2680 data = get_ipc(table, 1);
2681 if (!data)
2682 return -ENOTDIR;
2683
2684 if (oldval && oldlenp) {
2685 if (get_user(len, oldlenp))
2686 return -EFAULT;
2687 if (len) {
2688 if (len > table->maxlen)
2689 len = table->maxlen;
2690 if (copy_to_user(oldval, data, len))
2691 return -EFAULT;
2692 if (put_user(len, oldlenp))
2693 return -EFAULT;
2694 }
2695 }
2696
2697 if (newval && newlen) {
2698 if (newlen > table->maxlen)
2699 newlen = table->maxlen;
2700
2701 if (copy_from_user(data, newval, newlen))
2702 return -EFAULT;
2703 }
2704 return 1;
2705}
2706#endif
2707 2288
2708#else /* CONFIG_SYSCTL_SYSCALL */ 2289#else /* CONFIG_SYSCTL_SYSCALL */
2709 2290
@@ -2769,20 +2350,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2769 return -ENOSYS; 2350 return -ENOSYS;
2770} 2351}
2771 2352
2772static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2773 void __user *oldval, size_t __user *oldlenp,
2774 void __user *newval, size_t newlen)
2775{
2776 return -ENOSYS;
2777}
2778#ifdef CONFIG_SYSVIPC
2779static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2780 void __user *oldval, size_t __user *oldlenp,
2781 void __user *newval, size_t newlen)
2782{
2783 return -ENOSYS;
2784}
2785#endif
2786#endif /* CONFIG_SYSCTL_SYSCALL */ 2353#endif /* CONFIG_SYSCTL_SYSCALL */
2787 2354
2788/* 2355/*
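
The net effect of this sysctl.c rework: the UTS and SysV IPC entries and their special-purpose handlers are dropped from this file, the old in-file /proc plumbing gives way to the sysctl_head_next()/sysctl_head_finish() iterator, and register_sysctl_table() loses its insert_at_head argument. A hedged sketch of a module registering one integer knob through the new single-argument interface, assuming the 2.6.21-era ctl_table layout; the "example" directory and knob are made-up names:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>

static int example_knob;

static ctl_table example_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table example_dir[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "example",
		.mode		= 0555,
		.child		= example_table,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	example_header = register_sysctl_table(example_dir);
	return example_header ? 0 : -ENOMEM;
}

static void __exit example_sysctl_exit(void)
{
	unregister_sysctl_table(example_header);
}

module_init(example_sysctl_init);
module_exit(example_sysctl_exit);
MODULE_LICENSE("GPL");
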
diff --git a/kernel/time.c b/kernel/time.c
index 0e017bff4c19..c6c80ea5d0ea 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec)
470 return tv; 470 return tv;
471} 471}
472 472
473/*
474 * Convert jiffies to milliseconds and back.
475 *
476 * Avoid unnecessary multiplications/divisions in the
477 * two most common HZ cases:
478 */
479unsigned int jiffies_to_msecs(const unsigned long j)
480{
481#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
482 return (MSEC_PER_SEC / HZ) * j;
483#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
484 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
485#else
486 return (j * MSEC_PER_SEC) / HZ;
487#endif
488}
489EXPORT_SYMBOL(jiffies_to_msecs);
490
491unsigned int jiffies_to_usecs(const unsigned long j)
492{
493#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
494 return (USEC_PER_SEC / HZ) * j;
495#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
496 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
497#else
498 return (j * USEC_PER_SEC) / HZ;
499#endif
500}
501EXPORT_SYMBOL(jiffies_to_usecs);
502
503/*
504 * When we convert to jiffies then we interpret incoming values
505 * the following way:
506 *
507 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
508 *
509 * - 'too large' values [that would result in larger than
510 * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
511 *
512 * - all other values are converted to jiffies by either multiplying
513 * the input value by a factor or dividing it with a factor
514 *
515 * We must also be careful about 32-bit overflows.
516 */
517unsigned long msecs_to_jiffies(const unsigned int m)
518{
519 /*
520 * Negative value, means infinite timeout:
521 */
522 if ((int)m < 0)
523 return MAX_JIFFY_OFFSET;
524
525#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
526 /*
527 * HZ is equal to or smaller than 1000, and 1000 is a nice
528 * round multiple of HZ, divide with the factor between them,
529 * but round upwards:
530 */
531 return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
532#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
533 /*
534 * HZ is larger than 1000, and HZ is a nice round multiple of
535 * 1000 - simply multiply with the factor between them.
536 *
537 * But first make sure the multiplication result cannot
538 * overflow:
539 */
540 if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
541 return MAX_JIFFY_OFFSET;
542
543 return m * (HZ / MSEC_PER_SEC);
544#else
545 /*
546 * Generic case - multiply, round and divide. But first
547 * check that, if we are doing a net multiplication, we
548 * wouldn't overflow:
549 */
550 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
551 return MAX_JIFFY_OFFSET;
552
553 return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC;
554#endif
555}
556EXPORT_SYMBOL(msecs_to_jiffies);
557
558unsigned long usecs_to_jiffies(const unsigned int u)
559{
560 if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
561 return MAX_JIFFY_OFFSET;
562#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
563 return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
564#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
565 return u * (HZ / USEC_PER_SEC);
566#else
567 return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC;
568#endif
569}
570EXPORT_SYMBOL(usecs_to_jiffies);
571
572/*
573 * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
574 * that a remainder subtract here would not do the right thing as the
575 * resolution values don't fall on second boundaries. I.e. the line:
576 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
577 *
578 * Rather, we just shift the bits off the right.
579 *
580 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
581 * value to a scaled second value.
582 */
583unsigned long
584timespec_to_jiffies(const struct timespec *value)
585{
586 unsigned long sec = value->tv_sec;
587 long nsec = value->tv_nsec + TICK_NSEC - 1;
588
589 if (sec >= MAX_SEC_IN_JIFFIES){
590 sec = MAX_SEC_IN_JIFFIES;
591 nsec = 0;
592 }
593 return (((u64)sec * SEC_CONVERSION) +
594 (((u64)nsec * NSEC_CONVERSION) >>
595 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
596
597}
598EXPORT_SYMBOL(timespec_to_jiffies);
599
600void
601jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
602{
603 /*
604 * Convert jiffies to nanoseconds and separate with
605 * one divide.
606 */
607 u64 nsec = (u64)jiffies * TICK_NSEC;
608 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec);
609}
610EXPORT_SYMBOL(jiffies_to_timespec);
611
612/* Same for "timeval"
613 *
614 * Well, almost. The problem here is that the real system resolution is
615 * in nanoseconds and the value being converted is in microseconds.
616 * Also for some machines (those that use HZ = 1024, in particular),
617 * there is a LARGE error in the tick size in microseconds.
618
619 * The solution we use is to do the rounding AFTER we convert the
620 * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
621 * Instruction wise, this should cost only an additional add with carry
622 * instruction above the way it was done above.
623 */
624unsigned long
625timeval_to_jiffies(const struct timeval *value)
626{
627 unsigned long sec = value->tv_sec;
628 long usec = value->tv_usec;
629
630 if (sec >= MAX_SEC_IN_JIFFIES){
631 sec = MAX_SEC_IN_JIFFIES;
632 usec = 0;
633 }
634 return (((u64)sec * SEC_CONVERSION) +
635 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
636 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
637}
638
639void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
640{
641 /*
642 * Convert jiffies to nanoseconds and separate with
643 * one divide.
644 */
645 u64 nsec = (u64)jiffies * TICK_NSEC;
646 long tv_usec;
647
648 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec);
649 tv_usec /= NSEC_PER_USEC;
650 value->tv_usec = tv_usec;
651}
652
653/*
654 * Convert jiffies/jiffies_64 to clock_t and back.
655 */
656clock_t jiffies_to_clock_t(long x)
657{
658#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
659 return x / (HZ / USER_HZ);
660#else
661 u64 tmp = (u64)x * TICK_NSEC;
662 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
663 return (long)tmp;
664#endif
665}
666EXPORT_SYMBOL(jiffies_to_clock_t);
667
668unsigned long clock_t_to_jiffies(unsigned long x)
669{
670#if (HZ % USER_HZ)==0
671 if (x >= ~0UL / (HZ / USER_HZ))
672 return ~0UL;
673 return x * (HZ / USER_HZ);
674#else
675 u64 jif;
676
677 /* Don't worry about loss of precision here .. */
678 if (x >= ~0UL / HZ * USER_HZ)
679 return ~0UL;
680
681 /* .. but do try to contain it here */
682 jif = x * (u64) HZ;
683 do_div(jif, USER_HZ);
684 return jif;
685#endif
686}
687EXPORT_SYMBOL(clock_t_to_jiffies);
688
689u64 jiffies_64_to_clock_t(u64 x)
690{
691#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
692 do_div(x, HZ / USER_HZ);
693#else
694 /*
695 * There are better ways that don't overflow early,
696 * but even this doesn't overflow in hundreds of years
697 * in 64 bits, so..
698 */
699 x *= TICK_NSEC;
700 do_div(x, (NSEC_PER_SEC / USER_HZ));
701#endif
702 return x;
703}
704
705EXPORT_SYMBOL(jiffies_64_to_clock_t);
706
707u64 nsec_to_clock_t(u64 x)
708{
709#if (NSEC_PER_SEC % USER_HZ) == 0
710 do_div(x, (NSEC_PER_SEC / USER_HZ));
711#elif (USER_HZ % 512) == 0
712 x *= USER_HZ/512;
713 do_div(x, (NSEC_PER_SEC / 512));
714#else
715 /*
716 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
717 * overflow after 64.99 years.
718 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
719 */
720 x *= 9;
721 do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
722 USER_HZ));
723#endif
724 return x;
725}
726
473#if (BITS_PER_LONG < 64) 727#if (BITS_PER_LONG < 64)
474u64 get_jiffies_64(void) 728u64 get_jiffies_64(void)
475{ 729{
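
The conversion helpers above pick one of three compile-time branches depending on whether HZ divides MSEC_PER_SEC (or USEC_PER_SEC) evenly, and msecs_to_jiffies() always rounds up so a nonzero timeout can never collapse to zero jiffies. A standalone user-space check of that rounding for HZ = 250, an arbitrary example configuration that takes the first branch; the negative and overflow clamping to MAX_JIFFY_OFFSET is omitted here:

#include <stdio.h>

#define HZ		250u
#define MSEC_PER_SEC	1000u

static unsigned int jiffies_to_msecs(unsigned long j)
{
	return (MSEC_PER_SEC / HZ) * j;			/* 4 ms per jiffy */
}

static unsigned long msecs_to_jiffies(unsigned int m)
{
	/* divide by the 4 ms factor, rounding up */
	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
}

int main(void)
{
	unsigned int samples[] = { 1, 3, 4, 5, 10, 250 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned long j = msecs_to_jiffies(samples[i]);

		printf("%3u ms -> %3lu jiffies -> %3u ms\n",
		       samples[i], j, jiffies_to_msecs(j));
	}
	return 0;
}
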
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
new file mode 100644
index 000000000000..f66351126544
--- /dev/null
+++ b/kernel/time/Kconfig
@@ -0,0 +1,25 @@
1#
2# Timer subsystem related configuration options
3#
4config TICK_ONESHOT
5 bool
6 default n
7
8config NO_HZ
9 bool "Tickless System (Dynamic Ticks)"
10 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
11 select TICK_ONESHOT
12 help
13 This option enables a tickless system: timer interrupts will
14 only trigger on an as-needed basis both when the system is
15 busy and when the system is idle.
16
17config HIGH_RES_TIMERS
18 bool "High Resolution Timer Support"
19 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
20 select TICK_ONESHOT
21 help
22 This option enables high resolution timer support. If your
23 hardware is not capable then this option only increases
24 the size of the kernel image.
25
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 61a3907d16fb..93bccba1f265 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1,8 @@
1obj-y += ntp.o clocksource.o jiffies.o 1obj-y += ntp.o clocksource.o jiffies.o timer_list.o
2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
7obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
8obj-$(CONFIG_TIMER_STATS) += timer_stats.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 000000000000..67932ea78c17
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,345 @@
1/*
2 * linux/kernel/time/clockevents.c
3 *
4 * This file contains functions which manage clock event devices.
5 *
6 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
7 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
8 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
9 *
10 * This code is licenced under the GPL version 2. For details see
11 * kernel-base/COPYING.
12 */
13
14#include <linux/clockchips.h>
15#include <linux/hrtimer.h>
16#include <linux/init.h>
17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/smp.h>
20#include <linux/sysdev.h>
21
22/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices);
24static LIST_HEAD(clockevents_released);
25
26/* Notification for clock events */
27static RAW_NOTIFIER_HEAD(clockevents_chain);
28
29/* Protection for the above */
30static DEFINE_SPINLOCK(clockevents_lock);
31
32/**
33 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
34 * @latch: value to convert
35 * @evt: pointer to clock event device descriptor
36 *
37 * Math helper, returns latch value converted to nanoseconds (bound checked)
38 */
39unsigned long clockevent_delta2ns(unsigned long latch,
40 struct clock_event_device *evt)
41{
42 u64 clc = ((u64) latch << evt->shift);
43
44 do_div(clc, evt->mult);
45 if (clc < 1000)
46 clc = 1000;
47 if (clc > LONG_MAX)
48 clc = LONG_MAX;
49
50 return (unsigned long) clc;
51}
52
53/**
54 * clockevents_set_mode - set the operating mode of a clock event device
55 * @dev: device to modify
56 * @mode: new mode
57 *
58 * Must be called with interrupts disabled !
59 */
60void clockevents_set_mode(struct clock_event_device *dev,
61 enum clock_event_mode mode)
62{
63 if (dev->mode != mode) {
64 dev->set_mode(mode, dev);
65 dev->mode = mode;
66 }
67}
68
69/**
70 * clockevents_program_event - Reprogram the clock event device.
71 * @expires: absolute expiry time (monotonic clock)
72 *
73 * Returns 0 on success, -ETIME when the event is in the past.
74 */
75int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
76 ktime_t now)
77{
78 unsigned long long clc;
79 int64_t delta;
80
81 delta = ktime_to_ns(ktime_sub(expires, now));
82
83 if (delta <= 0)
84 return -ETIME;
85
86 dev->next_event = expires;
87
88 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
89 return 0;
90
91 if (delta > dev->max_delta_ns)
92 delta = dev->max_delta_ns;
93 if (delta < dev->min_delta_ns)
94 delta = dev->min_delta_ns;
95
96 clc = delta * dev->mult;
97 clc >>= dev->shift;
98
99 return dev->set_next_event((unsigned long) clc, dev);
100}
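
/*
 * Worked example (standalone, made-up values; not kernel code):
 * clockevent_delta2ns() and clockevents_program_event() are inverse
 * scaled conversions, ns = (latch << shift) / mult and
 * ticks = (ns * mult) >> shift.  Here mult is derived for a
 * hypothetical 1 MHz event device as (freq << shift) / NSEC_PER_SEC,
 * the usual clockevents convention; integer truncation may cost one
 * tick on the round trip.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC	1000000000ULL

int main(void)
{
	unsigned int shift = 32;
	uint64_t freq  = 1000000;			/* 1 MHz, example  */
	uint64_t mult  = (freq << shift) / NSEC_PER_SEC;
	uint64_t latch = 12345;				/* device ticks    */

	uint64_t ns    = (latch << shift) / mult;	/* delta2ns()      */
	uint64_t ticks = (ns * mult) >> shift;		/* program_event() */

	printf("mult=%llu shift=%u\n", (unsigned long long)mult, shift);
	printf("%llu ticks -> %llu ns -> %llu ticks\n",
	       (unsigned long long)latch, (unsigned long long)ns,
	       (unsigned long long)ticks);
	return 0;
}
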
101
102/**
103 * clockevents_register_notifier - register a clock events change listener
104 */
105int clockevents_register_notifier(struct notifier_block *nb)
106{
107 int ret;
108
109 spin_lock(&clockevents_lock);
110 ret = raw_notifier_chain_register(&clockevents_chain, nb);
111 spin_unlock(&clockevents_lock);
112
113 return ret;
114}
115
116/**
117 * clockevents_unregister_notifier - unregister a clock events change listener
118 */
119void clockevents_unregister_notifier(struct notifier_block *nb)
120{
121 spin_lock(&clockevents_lock);
122 raw_notifier_chain_unregister(&clockevents_chain, nb);
123 spin_unlock(&clockevents_lock);
124}
125
126/*
127 * Notify about a clock event change. Called with clockevents_lock
128 * held.
129 */
130static void clockevents_do_notify(unsigned long reason, void *dev)
131{
132 raw_notifier_call_chain(&clockevents_chain, reason, dev);
133}
134
135/*
136 * Called after a notify add to make devices available which were
137 * released from the notifier call.
138 */
139static void clockevents_notify_released(void)
140{
141 struct clock_event_device *dev;
142
143 while (!list_empty(&clockevents_released)) {
144 dev = list_entry(clockevents_released.next,
145 struct clock_event_device, list);
146 list_del(&dev->list);
147 list_add(&dev->list, &clockevent_devices);
148 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
149 }
150}
151
152/**
153 * clockevents_register_device - register a clock event device
154 * @dev: device to register
155 */
156void clockevents_register_device(struct clock_event_device *dev)
157{
158 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
159
160 spin_lock(&clockevents_lock);
161
162 list_add(&dev->list, &clockevent_devices);
163 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
164 clockevents_notify_released();
165
166 spin_unlock(&clockevents_lock);
167}
168
169/*
170 * Noop handler when we shut down an event device
171 */
172static void clockevents_handle_noop(struct clock_event_device *dev)
173{
174}
175
176/**
177 * clockevents_exchange_device - release and request clock devices
178 * @old: device to release (can be NULL)
179 * @new: device to request (can be NULL)
180 *
181 * Called from the notifier chain. clockevents_lock is held already
182 */
183void clockevents_exchange_device(struct clock_event_device *old,
184 struct clock_event_device *new)
185{
186 unsigned long flags;
187
188 local_irq_save(flags);
189 /*
190 * Caller releases a clock event device. We queue it into the
191 * released list and do a notify add later.
192 */
193 if (old) {
194 old->event_handler = clockevents_handle_noop;
195 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
196 list_del(&old->list);
197 list_add(&old->list, &clockevents_released);
198 }
199
200 if (new) {
201 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
202 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
203 }
204 local_irq_restore(flags);
205}
206
207/**
208 * clockevents_request_device
209 */
210struct clock_event_device *clockevents_request_device(unsigned int features,
211 cpumask_t cpumask)
212{
213 struct clock_event_device *cur, *dev = NULL;
214 struct list_head *tmp;
215
216 spin_lock(&clockevents_lock);
217
218 list_for_each(tmp, &clockevent_devices) {
219 cur = list_entry(tmp, struct clock_event_device, list);
220
221 if ((cur->features & features) == features &&
222 cpus_equal(cpumask, cur->cpumask)) {
223 if (!dev || dev->rating < cur->rating)
224 dev = cur;
225 }
226 }
227
228 clockevents_exchange_device(NULL, dev);
229
230 spin_unlock(&clockevents_lock);
231
232 return dev;
233}
234
235/**
236 * clockevents_release_device
237 */
238void clockevents_release_device(struct clock_event_device *dev)
239{
240 spin_lock(&clockevents_lock);
241
242 clockevents_exchange_device(dev, NULL);
243 clockevents_notify_released();
244
245 spin_unlock(&clockevents_lock);
246}
247
248/**
249 * clockevents_notify - notification about relevant events
250 */
251void clockevents_notify(unsigned long reason, void *arg)
252{
253 spin_lock(&clockevents_lock);
254 clockevents_do_notify(reason, arg);
255
256 switch (reason) {
257 case CLOCK_EVT_NOTIFY_CPU_DEAD:
258 /*
259 * Unregister the clock event devices which were
260 * released from the users in the notify chain.
261 */
262 while (!list_empty(&clockevents_released)) {
263 struct clock_event_device *dev;
264
265 dev = list_entry(clockevents_released.next,
266 struct clock_event_device, list);
267 list_del(&dev->list);
268 }
269 break;
270 default:
271 break;
272 }
273 spin_unlock(&clockevents_lock);
274}
275EXPORT_SYMBOL_GPL(clockevents_notify);
276
277#ifdef CONFIG_SYSFS
278
279/**
280 * clockevents_show_registered - sysfs interface for listing clockevents
281 * @dev: unused
282 * @buf: char buffer to be filled with clock events list
283 *
284 * Provides sysfs interface for listing registered clock event devices
285 */
286static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf)
287{
288 struct list_head *tmp;
289 char *p = buf;
290 int cpu;
291
292 spin_lock(&clockevents_lock);
293
294 list_for_each(tmp, &clockevent_devices) {
295 struct clock_event_device *ce;
296
297 ce = list_entry(tmp, struct clock_event_device, list);
298 p += sprintf(p, "%-20s F:%04x M:%d", ce->name,
299 ce->features, ce->mode);
300 p += sprintf(p, " C:");
301 if (!cpus_equal(ce->cpumask, cpu_possible_map)) {
302 for_each_cpu_mask(cpu, ce->cpumask)
303 p += sprintf(p, " %d", cpu);
304 } else {
305 /*
306 * FIXME: Add the cpu which is handling this sucker
307 */
308 }
309 p += sprintf(p, "\n");
310 }
311
312 spin_unlock(&clockevents_lock);
313
314 return p - buf;
315}
316
317/*
318 * Sysfs setup bits:
319 */
320static SYSDEV_ATTR(registered, 0600,
321 clockevents_show_registered, NULL);
322
323static struct sysdev_class clockevents_sysclass = {
324 set_kset_name("clockevents"),
325};
326
327static struct sys_device clockevents_sys_device = {
328 .id = 0,
329 .cls = &clockevents_sysclass,
330};
331
332static int __init clockevents_sysfs_init(void)
333{
334 int error = sysdev_class_register(&clockevents_sysclass);
335
336 if (!error)
337 error = sysdev_register(&clockevents_sys_device);
338 if (!error)
339 error = sysdev_create_file(
340 &clockevents_sys_device,
341 &attr_registered);
342 return error;
343}
344device_initcall(clockevents_sysfs_init);
345#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d9ef176c4e09..193a0793af95 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -29,6 +29,7 @@
29#include <linux/init.h> 29#include <linux/init.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h>
32 33
33/* XXX - Would like a better way for initializing curr_clocksource */ 34/* XXX - Would like a better way for initializing curr_clocksource */
34extern struct clocksource clocksource_jiffies; 35extern struct clocksource clocksource_jiffies;
@@ -48,6 +49,7 @@ extern struct clocksource clocksource_jiffies;
48 */ 49 */
49static struct clocksource *curr_clocksource = &clocksource_jiffies; 50static struct clocksource *curr_clocksource = &clocksource_jiffies;
50static struct clocksource *next_clocksource; 51static struct clocksource *next_clocksource;
52static struct clocksource *clocksource_override;
51static LIST_HEAD(clocksource_list); 53static LIST_HEAD(clocksource_list);
52static DEFINE_SPINLOCK(clocksource_lock); 54static DEFINE_SPINLOCK(clocksource_lock);
53static char override_name[32]; 55static char override_name[32];
@@ -62,9 +64,123 @@ static int __init clocksource_done_booting(void)
62 finished_booting = 1; 64 finished_booting = 1;
63 return 0; 65 return 0;
64} 66}
65
66late_initcall(clocksource_done_booting); 67late_initcall(clocksource_done_booting);
67 68
69#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
70static LIST_HEAD(watchdog_list);
71static struct clocksource *watchdog;
72static struct timer_list watchdog_timer;
73static DEFINE_SPINLOCK(watchdog_lock);
74static cycle_t watchdog_last;
75/*
76 * Interval: 0.5sec Threshold: 0.0625s
77 */
78#define WATCHDOG_INTERVAL (HZ >> 1)
79#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4)
80
81static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
82{
83 if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD)
84 return;
85
86 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
87 cs->name, delta);
88 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
89 clocksource_change_rating(cs, 0);
90 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
91 list_del(&cs->wd_list);
92}
93
94static void clocksource_watchdog(unsigned long data)
95{
96 struct clocksource *cs, *tmp;
97 cycle_t csnow, wdnow;
98 int64_t wd_nsec, cs_nsec;
99
100 spin_lock(&watchdog_lock);
101
102 wdnow = watchdog->read();
103 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
104 watchdog_last = wdnow;
105
106 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
107 csnow = cs->read();
108 /* Initialized ? */
109 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
110 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
111 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
112 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
113 /*
114 * We just marked the clocksource as
115 * highres-capable, notify the rest of the
116 * system as well so that we transition
117 * into high-res mode:
118 */
119 tick_clock_notify();
120 }
121 cs->flags |= CLOCK_SOURCE_WATCHDOG;
122 cs->wd_last = csnow;
123 } else {
124 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
125 cs->wd_last = csnow;
126 /* Check the delta. Might remove from the list ! */
127 clocksource_ratewd(cs, cs_nsec - wd_nsec);
128 }
129 }
130
131 if (!list_empty(&watchdog_list)) {
132 __mod_timer(&watchdog_timer,
133 watchdog_timer.expires + WATCHDOG_INTERVAL);
134 }
135 spin_unlock(&watchdog_lock);
136}
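
The watchdog compares how many nanoseconds the watched clocksource and the watchdog clocksource each claim elapsed over the same interval, and marks the clocksource unstable when the difference crosses the threshold. A self-contained sketch of that comparison; the mult/shift pair and the cycle counts below are made up:

#include <stdint.h>
#include <stdio.h>

/* Same 1/16 s threshold as above. */
#define THRESHOLD_NS (1000000000LL >> 4)

/* cyc2ns-style conversion; mult/shift here mean roughly "1 cycle == 1 ns". */
static int64_t cyc2ns_demo(uint64_t cycles, uint32_t mult, uint32_t shift)
{
    return (int64_t)((cycles * mult) >> shift);
}

int main(void)
{
    /* Both clocks observed over the same 0.5 s interval; the second drifts. */
    int64_t wd_nsec = cyc2ns_demo(500000000ULL, 1 << 20, 20);
    int64_t cs_nsec = cyc2ns_demo(580000000ULL, 1 << 20, 20);
    int64_t delta = cs_nsec - wd_nsec;

    if (delta <= -THRESHOLD_NS || delta >= THRESHOLD_NS)
        printf("clocksource unstable (delta = %lld ns)\n", (long long)delta);
    else
        printf("clocksource within threshold\n");
    return 0;
}
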
137static void clocksource_check_watchdog(struct clocksource *cs)
138{
139 struct clocksource *cse;
140 unsigned long flags;
141
142 spin_lock_irqsave(&watchdog_lock, flags);
143 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
144 int started = !list_empty(&watchdog_list);
145
146 list_add(&cs->wd_list, &watchdog_list);
147 if (!started && watchdog) {
148 watchdog_last = watchdog->read();
149 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
150 add_timer(&watchdog_timer);
151 }
152 } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) {
153 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
154
155 if (!watchdog || cs->rating > watchdog->rating) {
156 if (watchdog)
157 del_timer(&watchdog_timer);
158 watchdog = cs;
159 init_timer(&watchdog_timer);
160 watchdog_timer.function = clocksource_watchdog;
161
162 /* Reset watchdog cycles */
163 list_for_each_entry(cse, &watchdog_list, wd_list)
164 cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
165 /* Start if list is not empty */
166 if (!list_empty(&watchdog_list)) {
167 watchdog_last = watchdog->read();
168 watchdog_timer.expires =
169 jiffies + WATCHDOG_INTERVAL;
170 add_timer(&watchdog_timer);
171 }
172 }
173 }
174 spin_unlock_irqrestore(&watchdog_lock, flags);
175}
176#else
177static void clocksource_check_watchdog(struct clocksource *cs)
178{
179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
180 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
181}
182#endif
183
68/** 184/**
69 * clocksource_get_next - Returns the selected clocksource 185 * clocksource_get_next - Returns the selected clocksource
70 * 186 *
@@ -84,60 +200,54 @@ struct clocksource *clocksource_get_next(void)
84} 200}
85 201
86/** 202/**
87 * select_clocksource - Finds the best registered clocksource. 203 * select_clocksource - Selects the best registered clocksource.
88 * 204 *
89 * Private function. Must hold clocksource_lock when called. 205 * Private function. Must hold clocksource_lock when called.
90 * 206 *
91 * Looks through the list of registered clocksources, returning 207 * Select the clocksource with the best rating, or the clocksource,
92 * the one with the highest rating value. If there is a clocksource 208 * which is selected by userspace override.
93 * name that matches the override string, it returns that clocksource.
94 */ 209 */
95static struct clocksource *select_clocksource(void) 210static struct clocksource *select_clocksource(void)
96{ 211{
97 struct clocksource *best = NULL; 212 struct clocksource *next;
98 struct list_head *tmp;
99 213
100 list_for_each(tmp, &clocksource_list) { 214 if (list_empty(&clocksource_list))
101 struct clocksource *src; 215 return NULL;
102 216
103 src = list_entry(tmp, struct clocksource, list); 217 if (clocksource_override)
104 if (!best) 218 next = clocksource_override;
105 best = src; 219 else
106 220 next = list_entry(clocksource_list.next, struct clocksource,
107 /* check for override: */ 221 list);
108 if (strlen(src->name) == strlen(override_name) && 222
109 !strcmp(src->name, override_name)) { 223 if (next == curr_clocksource)
110 best = src; 224 return NULL;
111 break;
112 }
113 /* pick the highest rating: */
114 if (src->rating > best->rating)
115 best = src;
116 }
117 225
118 return best; 226 return next;
119} 227}
120 228
121/** 229/*
122 * is_registered_source - Checks if clocksource is registered 230 * Enqueue the clocksource sorted by rating
123 * @c: pointer to a clocksource
124 *
125 * Private helper function. Must hold clocksource_lock when called.
126 *
127 * Returns one if the clocksource is already registered, zero otherwise.
128 */ 231 */
129static int is_registered_source(struct clocksource *c) 232static int clocksource_enqueue(struct clocksource *c)
130{ 233{
131 int len = strlen(c->name); 234 struct list_head *tmp, *entry = &clocksource_list;
132 struct list_head *tmp;
133 235
134 list_for_each(tmp, &clocksource_list) { 236 list_for_each(tmp, &clocksource_list) {
135 struct clocksource *src; 237 struct clocksource *cs;
136 238
137 src = list_entry(tmp, struct clocksource, list); 239 cs = list_entry(tmp, struct clocksource, list);
138 if (strlen(src->name) == len && !strcmp(src->name, c->name)) 240 if (cs == c)
139 return 1; 241 return -EBUSY;
242 /* Keep track of the place, where to insert */
243 if (cs->rating >= c->rating)
244 entry = tmp;
140 } 245 }
246 list_add(&c->list, entry);
247
248 if (strlen(c->name) == strlen(override_name) &&
249 !strcmp(c->name, override_name))
250 clocksource_override = c;
141 251
142 return 0; 252 return 0;
143} 253}
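
clocksource_enqueue() keeps the list ordered by descending rating, so select_clocksource() can simply pick the first entry unless an override is active. A rough userspace equivalent using a plain singly linked list; the clocksource names and ratings are illustrative only:

#include <stdio.h>

/* Toy stand-in for struct clocksource: only name and rating matter here. */
struct cs {
    const char *name;
    int rating;
    struct cs *next;
};

/* Insert so the list stays sorted by descending rating. */
static void enqueue(struct cs **head, struct cs *c)
{
    struct cs **pos = head;

    while (*pos && (*pos)->rating >= c->rating)
        pos = &(*pos)->next;
    c->next = *pos;
    *pos = c;
}

int main(void)
{
    struct cs jiffies_cs = { "jiffies", 1, NULL };
    struct cs pit = { "pit", 110, NULL };
    struct cs tsc = { "tsc", 300, NULL };
    struct cs *head = NULL, *p;

    enqueue(&head, &jiffies_cs);
    enqueue(&head, &pit);
    enqueue(&head, &tsc);

    /* The head is the best-rated entry. */
    for (p = head; p; p = p->next)
        printf("%-10s rating %d\n", p->name, p->rating);
    return 0;
}
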
@@ -150,42 +260,35 @@ static int is_registered_source(struct clocksource *c)
150 */ 260 */
151int clocksource_register(struct clocksource *c) 261int clocksource_register(struct clocksource *c)
152{ 262{
153 int ret = 0;
154 unsigned long flags; 263 unsigned long flags;
264 int ret;
155 265
156 spin_lock_irqsave(&clocksource_lock, flags); 266 spin_lock_irqsave(&clocksource_lock, flags);
157 /* check if clocksource is already registered */ 267 ret = clocksource_enqueue(c);
158 if (is_registered_source(c)) { 268 if (!ret)
159 printk("register_clocksource: Cannot register %s. "
160 "Already registered!", c->name);
161 ret = -EBUSY;
162 } else {
163 /* register it */
164 list_add(&c->list, &clocksource_list);
165 /* scan the registered clocksources, and pick the best one */
166 next_clocksource = select_clocksource(); 269 next_clocksource = select_clocksource();
167 }
168 spin_unlock_irqrestore(&clocksource_lock, flags); 270 spin_unlock_irqrestore(&clocksource_lock, flags);
271 if (!ret)
272 clocksource_check_watchdog(c);
169 return ret; 273 return ret;
170} 274}
171EXPORT_SYMBOL(clocksource_register); 275EXPORT_SYMBOL(clocksource_register);
172 276
173/** 277/**
174 * clocksource_reselect - Rescan list for next clocksource 278 * clocksource_change_rating - Change the rating of a registered clocksource
175 * 279 *
176 * A quick helper function to be used if a clocksource changes its
177 * rating. Forces the clocksource list to be re-scanned for the best
178 * clocksource.
179 */ 280 */
180void clocksource_reselect(void) 281void clocksource_change_rating(struct clocksource *cs, int rating)
181{ 282{
182 unsigned long flags; 283 unsigned long flags;
183 284
184 spin_lock_irqsave(&clocksource_lock, flags); 285 spin_lock_irqsave(&clocksource_lock, flags);
286 list_del(&cs->list);
287 cs->rating = rating;
288 clocksource_enqueue(cs);
185 next_clocksource = select_clocksource(); 289 next_clocksource = select_clocksource();
186 spin_unlock_irqrestore(&clocksource_lock, flags); 290 spin_unlock_irqrestore(&clocksource_lock, flags);
187} 291}
188EXPORT_SYMBOL(clocksource_reselect);
189 292
190#ifdef CONFIG_SYSFS 293#ifdef CONFIG_SYSFS
191/** 294/**
@@ -221,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
221static ssize_t sysfs_override_clocksource(struct sys_device *dev, 324static ssize_t sysfs_override_clocksource(struct sys_device *dev,
222 const char *buf, size_t count) 325 const char *buf, size_t count)
223{ 326{
327 struct clocksource *ovr = NULL;
328 struct list_head *tmp;
224 size_t ret = count; 329 size_t ret = count;
330 int len;
331
225 /* strings from sysfs write are not 0 terminated! */ 332 /* strings from sysfs write are not 0 terminated! */
226 if (count >= sizeof(override_name)) 333 if (count >= sizeof(override_name))
227 return -EINVAL; 334 return -EINVAL;
@@ -229,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
229 /* strip off \n: */ 336
230 if (buf[count-1] == '\n') 337 if (buf[count-1] == '\n')
231 count--; 338 count--;
232 if (count < 1)
233 return -EINVAL;
234 339
235 spin_lock_irq(&clocksource_lock); 340 spin_lock_irq(&clocksource_lock);
236 341
237 /* copy the name given: */ 342 if (count > 0)
238 memcpy(override_name, buf, count); 343 memcpy(override_name, buf, count);
239 override_name[count] = 0; 344 override_name[count] = 0;
240 345
241 /* try to select it: */ 346 len = strlen(override_name);
242 next_clocksource = select_clocksource(); 347 if (len) {
348 ovr = clocksource_override;
349 /* try to select it: */
350 list_for_each(tmp, &clocksource_list) {
351 struct clocksource *cs;
352
353 cs = list_entry(tmp, struct clocksource, list);
354 if (strlen(cs->name) == len &&
355 !strcmp(cs->name, override_name))
356 ovr = cs;
357 }
358 }
359
360 /* Reselect, when the override name has changed */
361 if (ovr != clocksource_override) {
362 clocksource_override = ovr;
363 next_clocksource = select_clocksource();
364 }
243 365
244 spin_unlock_irq(&clocksource_lock); 366 spin_unlock_irq(&clocksource_lock);
245 367
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a99b2a6e6a07..3be8da8fed7e 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = {
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66}; 65};
67 66
68static int __init init_jiffies_clocksource(void) 67static int __init init_jiffies_clocksource(void)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 3afeaa3a73f9..eb12509e00bd 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base;
24 24
25#define MAX_TICKADJ 500 /* microsecs */ 25#define MAX_TICKADJ 500 /* microsecs */
26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
27 TICK_LENGTH_SHIFT) / HZ) 27 TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ)
28 28
29/* 29/*
30 * phase-lock loop variables 30 * phase-lock loop variables
@@ -46,13 +46,17 @@ long time_adjust;
46 46
47static void ntp_update_frequency(void) 47static void ntp_update_frequency(void)
48{ 48{
49 tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; 49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
50 tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; 50 << TICK_LENGTH_SHIFT;
51 tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 51 second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
52 53
53 do_div(tick_length_base, HZ); 54 tick_length_base = second_length;
54 55
55 tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; 56 do_div(second_length, HZ);
57 tick_nsec = second_length >> TICK_LENGTH_SHIFT;
58
59 do_div(tick_length_base, NTP_INTERVAL_FREQ);
56} 60}
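
The reworked ntp_update_frequency() first builds the length of a full second in fixed point, derives tick_nsec from it, and only then scales down to the NTP interval. A userspace sketch of that arithmetic; USER_HZ, HZ, the 32-bit fixed-point shift and NTP_INTERVAL_FREQ == HZ are assumptions for the example, and the CLOCK_TICK_ADJUST and time_freq terms are omitted:

#include <stdint.h>
#include <stdio.h>

#define USER_HZ            100
#define HZ                 1000
#define NSEC_PER_USEC      1000ULL
#define TICK_LENGTH_SHIFT  32
#define NTP_INTERVAL_FREQ  HZ

int main(void)
{
    uint64_t tick_usec = 1000000 / USER_HZ;      /* 10000 us per USER_HZ tick */
    uint64_t second_length, tick_length_base, tick_nsec;

    /* One second, in <<TICK_LENGTH_SHIFT fixed point. */
    second_length = (tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;

    tick_length_base = second_length / NTP_INTERVAL_FREQ;
    tick_nsec = (second_length / HZ) >> TICK_LENGTH_SHIFT;

    printf("tick_nsec        = %llu ns\n", (unsigned long long)tick_nsec);
    printf("tick_length_base = %llu (fixed point)\n",
           (unsigned long long)tick_length_base);
    return 0;
}
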
57 61
58/** 62/**
@@ -162,7 +166,7 @@ void second_overflow(void)
162 tick_length -= MAX_TICKADJ_SCALED; 166 tick_length -= MAX_TICKADJ_SCALED;
163 } else { 167 } else {
164 tick_length += (s64)(time_adjust * NSEC_PER_USEC / 168 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
165 HZ) << TICK_LENGTH_SHIFT; 169 NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT;
166 time_adjust = 0; 170 time_adjust = 0;
167 } 171 }
168 } 172 }
@@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc)
239 result = -EINVAL; 243 result = -EINVAL;
240 goto leave; 244 goto leave;
241 } 245 }
242 time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); 246 time_freq = ((s64)txc->freq * NSEC_PER_USEC)
247 >> (SHIFT_USEC - SHIFT_NSEC);
243 } 248 }
244 249
245 if (txc->modes & ADJ_MAXERROR) { 250 if (txc->modes & ADJ_MAXERROR) {
@@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc)
309 freq_adj += time_freq; 314 freq_adj += time_freq;
310 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); 315 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
311 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); 316 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
312 time_offset = (time_offset / HZ) << SHIFT_UPDATE; 317 time_offset = (time_offset / NTP_INTERVAL_FREQ)
318 << SHIFT_UPDATE;
313 } /* STA_PLL */ 319 } /* STA_PLL */
314 } /* txc->modes & ADJ_OFFSET */ 320 } /* txc->modes & ADJ_OFFSET */
315 if (txc->modes & ADJ_TICK) 321 if (txc->modes & ADJ_TICK)
@@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
324 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 330 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
325 txc->offset = save_adjust; 331 txc->offset = save_adjust;
326 else 332 else
327 txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; 333 txc->offset = shift_right(time_offset, SHIFT_UPDATE)
328 txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); 334 * NTP_INTERVAL_FREQ / 1000;
335 txc->freq = (time_freq / NSEC_PER_USEC)
336 << (SHIFT_USEC - SHIFT_NSEC);
329 txc->maxerror = time_maxerror; 337 txc->maxerror = time_maxerror;
330 txc->esterror = time_esterror; 338 txc->esterror = time_esterror;
331 txc->status = time_status; 339 txc->status = time_status;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 000000000000..12b3efeb9f6f
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,480 @@
1/*
2 * linux/kernel/time/tick-broadcast.c
3 *
4 * This file contains functions which emulate a local clock-event
5 * device via a broadcast event source.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Broadcast support for broken x86 hardware, where the local apic
27 * timer stops in C3 state.
28 */
29
30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock);
33
34/*
35 * Debugging: see timer_list.c
36 */
37struct tick_device *tick_get_broadcast_device(void)
38{
39 return &tick_broadcast_device;
40}
41
42cpumask_t *tick_get_broadcast_mask(void)
43{
44 return &tick_broadcast_mask;
45}
46
47/*
48 * Start the device in periodic mode
49 */
50static void tick_broadcast_start_periodic(struct clock_event_device *bc)
51{
52 if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN)
53 tick_setup_periodic(bc, 1);
54}
55
56/*
57 * Check, if the device can be utilized as broadcast device:
58 */
59int tick_check_broadcast_device(struct clock_event_device *dev)
60{
61 if (tick_broadcast_device.evtdev ||
62 (dev->features & CLOCK_EVT_FEAT_C3STOP))
63 return 0;
64
65 clockevents_exchange_device(NULL, dev);
66 tick_broadcast_device.evtdev = dev;
67 if (!cpus_empty(tick_broadcast_mask))
68 tick_broadcast_start_periodic(dev);
69 return 1;
70}
71
72/*
73 * Check, if the device is the broadcast device
74 */
75int tick_is_broadcast_device(struct clock_event_device *dev)
76{
77 return (dev && tick_broadcast_device.evtdev == dev);
78}
79
80/*
81 * Check, if the device is dysfunctional and a placeholder, which
82 * needs to be handled by the broadcast device.
83 */
84int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
85{
86 unsigned long flags;
87 int ret = 0;
88
89 spin_lock_irqsave(&tick_broadcast_lock, flags);
90
91 /*
92 * Devices might be registered with both periodic and oneshot
93 * mode disabled. This signals, that the device needs to be
94 * operated from the broadcast device and is a placeholder for
95 * the cpu local device.
96 */
97 if (!tick_device_is_functional(dev)) {
98 dev->event_handler = tick_handle_periodic;
99 cpu_set(cpu, tick_broadcast_mask);
100 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
101 ret = 1;
102 }
103
104 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
105 return ret;
106}
107
108/*
109 * Broadcast the event to the cpus, which are set in the mask
110 */
111int tick_do_broadcast(cpumask_t mask)
112{
113 int ret = 0, cpu = smp_processor_id();
114 struct tick_device *td;
115
116 /*
117 * Check, if the current cpu is in the mask
118 */
119 if (cpu_isset(cpu, mask)) {
120 cpu_clear(cpu, mask);
121 td = &per_cpu(tick_cpu_device, cpu);
122 td->evtdev->event_handler(td->evtdev);
123 ret = 1;
124 }
125
126 if (!cpus_empty(mask)) {
127 /*
128 * It might be necessary to actually check whether the devices
129 * have different broadcast functions. For now, just use the
130 * one of the first device. This works as long as we have this
131 * misfeature only on x86 (lapic)
132 */
133 cpu = first_cpu(mask);
134 td = &per_cpu(tick_cpu_device, cpu);
135 td->evtdev->broadcast(mask);
136 ret = 1;
137 }
138 return ret;
139}
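
tick_do_broadcast() services the current CPU directly and sends a single broadcast for whatever remains in the mask. A sketch of that mask handling with a plain bitmask standing in for cpumask_t; local_tick() and broadcast_ipi() are hypothetical stand-ins for the event handler and the broadcast function:

#include <stdint.h>
#include <stdio.h>

static void local_tick(int cpu)        { printf("cpu %d: local handler\n", cpu); }
static void broadcast_ipi(uint32_t m)  { printf("IPI to mask 0x%x\n", (unsigned)m); }

/* Handle the current cpu directly, one broadcast for everyone else. */
static int do_broadcast(uint32_t mask, int this_cpu)
{
    int ret = 0;

    if (mask & (1u << this_cpu)) {
        mask &= ~(1u << this_cpu);
        local_tick(this_cpu);
        ret = 1;
    }
    if (mask) {
        broadcast_ipi(mask);
        ret = 1;
    }
    return ret;
}

int main(void)
{
    /* cpus 0, 2 and 3 need the event; we are running on cpu 2. */
    return !do_broadcast((1u << 0) | (1u << 2) | (1u << 3), 2);
}
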
140
141/*
142 * Periodic broadcast:
143 * - invoke the broadcast handlers
144 */
145static void tick_do_periodic_broadcast(void)
146{
147 cpumask_t mask;
148
149 spin_lock(&tick_broadcast_lock);
150
151 cpus_and(mask, cpu_online_map, tick_broadcast_mask);
152 tick_do_broadcast(mask);
153
154 spin_unlock(&tick_broadcast_lock);
155}
156
157/*
158 * Event handler for periodic broadcast ticks
159 */
160static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
161{
162 dev->next_event.tv64 = KTIME_MAX;
163
164 tick_do_periodic_broadcast();
165
166 /*
167 * The device is in periodic mode. No reprogramming necessary:
168 */
169 if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
170 return;
171
172 /*
173 * Setup the next period for devices, which do not have
174 * periodic mode:
175 */
176 for (;;) {
177 ktime_t next = ktime_add(dev->next_event, tick_period);
178
179 if (!clockevents_program_event(dev, next, ktime_get()))
180 return;
181 tick_do_periodic_broadcast();
182 }
183}
184
185/*
186 * Powerstate information: The system enters/leaves a state, where
187 * affected devices might stop
188 */
189static void tick_do_broadcast_on_off(void *why)
190{
191 struct clock_event_device *bc, *dev;
192 struct tick_device *td;
193 unsigned long flags, *reason = why;
194 int cpu;
195
196 spin_lock_irqsave(&tick_broadcast_lock, flags);
197
198 cpu = smp_processor_id();
199 td = &per_cpu(tick_cpu_device, cpu);
200 dev = td->evtdev;
201 bc = tick_broadcast_device.evtdev;
202
203 /*
204 * Is the device in broadcast mode forever or is it not
205 * affected by the powerstate ?
206 */
207 if (!dev || !tick_device_is_functional(dev) ||
208 !(dev->features & CLOCK_EVT_FEAT_C3STOP))
209 goto out;
210
211 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) {
212 if (!cpu_isset(cpu, tick_broadcast_mask)) {
213 cpu_set(cpu, tick_broadcast_mask);
214 if (td->mode == TICKDEV_MODE_PERIODIC)
215 clockevents_set_mode(dev,
216 CLOCK_EVT_MODE_SHUTDOWN);
217 }
218 } else {
219 if (cpu_isset(cpu, tick_broadcast_mask)) {
220 cpu_clear(cpu, tick_broadcast_mask);
221 if (td->mode == TICKDEV_MODE_PERIODIC)
222 tick_setup_periodic(dev, 0);
223 }
224 }
225
226 if (cpus_empty(tick_broadcast_mask))
227 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
228 else {
229 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
230 tick_broadcast_start_periodic(bc);
231 else
232 tick_broadcast_setup_oneshot(bc);
233 }
234out:
235 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
236}
237
238/*
239 * Powerstate information: The system enters/leaves a state, where
240 * affected devices might stop.
241 */
242void tick_broadcast_on_off(unsigned long reason, int *oncpu)
243{
244 int cpu = get_cpu();
245
246 if (cpu == *oncpu)
247 tick_do_broadcast_on_off(&reason);
248 else
249 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
250 &reason, 1, 1);
251 put_cpu();
252}
253
254/*
255 * Set the periodic handler depending on broadcast on/off
256 */
257void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
258{
259 if (!broadcast)
260 dev->event_handler = tick_handle_periodic;
261 else
262 dev->event_handler = tick_handle_periodic_broadcast;
263}
264
265/*
266 * Remove a CPU from broadcasting
267 */
268void tick_shutdown_broadcast(unsigned int *cpup)
269{
270 struct clock_event_device *bc;
271 unsigned long flags;
272 unsigned int cpu = *cpup;
273
274 spin_lock_irqsave(&tick_broadcast_lock, flags);
275
276 bc = tick_broadcast_device.evtdev;
277 cpu_clear(cpu, tick_broadcast_mask);
278
279 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
280 if (bc && cpus_empty(tick_broadcast_mask))
281 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
282 }
283
284 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
285}
286
287#ifdef CONFIG_TICK_ONESHOT
288
289static cpumask_t tick_broadcast_oneshot_mask;
290
291/*
292 * Debugging: see timer_list.c
293 */
294cpumask_t *tick_get_broadcast_oneshot_mask(void)
295{
296 return &tick_broadcast_oneshot_mask;
297}
298
299static int tick_broadcast_set_event(ktime_t expires, int force)
300{
301 struct clock_event_device *bc = tick_broadcast_device.evtdev;
302 ktime_t now = ktime_get();
303 int res;
304
305 for(;;) {
306 res = clockevents_program_event(bc, expires, now);
307 if (!res || !force)
308 return res;
309 now = ktime_get();
310 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
311 }
312}
313
314/*
315 * Reprogram the broadcast device:
316 *
317 * Called with tick_broadcast_lock held and interrupts disabled.
318 */
319static int tick_broadcast_reprogram(void)
320{
321 ktime_t expires = { .tv64 = KTIME_MAX };
322 struct tick_device *td;
323 int cpu;
324
325 /*
326 * Find the event which expires next:
327 */
328 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
329 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
330 td = &per_cpu(tick_cpu_device, cpu);
331 if (td->evtdev->next_event.tv64 < expires.tv64)
332 expires = td->evtdev->next_event;
333 }
334
335 if (expires.tv64 == KTIME_MAX)
336 return 0;
337
338 return tick_broadcast_set_event(expires, 0);
339}
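
Reprogramming simply scans the CPUs that handed their wakeup over to the broadcast device and arms it for the earliest pending expiry. A trivial sketch of that scan; the per-cpu expiry values are invented:

#include <stdint.h>
#include <stdio.h>

#define KTIME_MAX INT64_MAX

int main(void)
{
    /* Hypothetical next_event values (ns) of the sleeping CPUs. */
    int64_t next_event[] = { 5000000, 1200000, 9000000 };
    int64_t expires = KTIME_MAX;
    unsigned int cpu;

    for (cpu = 0; cpu < sizeof(next_event) / sizeof(next_event[0]); cpu++)
        if (next_event[cpu] < expires)
            expires = next_event[cpu];

    if (expires != KTIME_MAX)
        printf("program broadcast device for %lld ns\n", (long long)expires);
    return 0;
}
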
340
341/*
342 * Handle oneshot mode broadcasting
343 */
344static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
345{
346 struct tick_device *td;
347 cpumask_t mask;
348 ktime_t now;
349 int cpu;
350
351 spin_lock(&tick_broadcast_lock);
352again:
353 dev->next_event.tv64 = KTIME_MAX;
354 mask = CPU_MASK_NONE;
355 now = ktime_get();
356 /* Find all expired events */
357 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
358 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
359 td = &per_cpu(tick_cpu_device, cpu);
360 if (td->evtdev->next_event.tv64 <= now.tv64)
361 cpu_set(cpu, mask);
362 }
363
364 /*
365 * Wakeup the cpus which have an expired event. The broadcast
366 * device is reprogrammed in the return from idle code.
367 */
368 if (!tick_do_broadcast(mask)) {
369 /*
370 * The global event did not expire any CPU local
371 * events. This happens in dyntick mode, as the
372 * maximum PIT delta is quite small.
373 */
374 if (tick_broadcast_reprogram())
375 goto again;
376 }
377 spin_unlock(&tick_broadcast_lock);
378}
379
380/*
381 * Powerstate information: The system enters/leaves a state, where
382 * affected devices might stop
383 */
384void tick_broadcast_oneshot_control(unsigned long reason)
385{
386 struct clock_event_device *bc, *dev;
387 struct tick_device *td;
388 unsigned long flags;
389 int cpu;
390
391 spin_lock_irqsave(&tick_broadcast_lock, flags);
392
393 /*
394 * Periodic mode does not care about the enter/exit of power
395 * states
396 */
397 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
398 goto out;
399
400 bc = tick_broadcast_device.evtdev;
401 cpu = smp_processor_id();
402 td = &per_cpu(tick_cpu_device, cpu);
403 dev = td->evtdev;
404
405 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
406 goto out;
407
408 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
409 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
410 cpu_set(cpu, tick_broadcast_oneshot_mask);
411 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
412 if (dev->next_event.tv64 < bc->next_event.tv64)
413 tick_broadcast_set_event(dev->next_event, 1);
414 }
415 } else {
416 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
417 cpu_clear(cpu, tick_broadcast_oneshot_mask);
418 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
419 if (dev->next_event.tv64 != KTIME_MAX)
420 tick_program_event(dev->next_event, 1);
421 }
422 }
423
424out:
425 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
426}
427
428/**
429 * tick_broadcast_setup_oneshot - setup the broadcast device for oneshot mode
430 */
431void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
432{
433 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) {
434 bc->event_handler = tick_handle_oneshot_broadcast;
435 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
436 bc->next_event.tv64 = KTIME_MAX;
437 }
438}
439
440/*
441 * Select oneshot operating mode for the broadcast device
442 */
443void tick_broadcast_switch_to_oneshot(void)
444{
445 struct clock_event_device *bc;
446 unsigned long flags;
447
448 spin_lock_irqsave(&tick_broadcast_lock, flags);
449
450 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
451 bc = tick_broadcast_device.evtdev;
452 if (bc)
453 tick_broadcast_setup_oneshot(bc);
454 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
455}
456
457
458/*
459 * Remove a dead CPU from broadcasting
460 */
461void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
462{
463 struct clock_event_device *bc;
464 unsigned long flags;
465 unsigned int cpu = *cpup;
466
467 spin_lock_irqsave(&tick_broadcast_lock, flags);
468
469 bc = tick_broadcast_device.evtdev;
470 cpu_clear(cpu, tick_broadcast_oneshot_mask);
471
472 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
473 if (bc && cpus_empty(tick_broadcast_oneshot_mask))
474 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
475 }
476
477 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
478}
479
480#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
new file mode 100644
index 000000000000..4500e347f1bb
--- /dev/null
+++ b/kernel/time/tick-common.c
@@ -0,0 +1,346 @@
1/*
2 * linux/kernel/time/tick-common.c
3 *
4 * This file contains the base functions to manage periodic tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Tick devices
27 */
28DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
29/*
30 * Tick next event: keeps track of the tick time
31 */
32ktime_t tick_next_period;
33ktime_t tick_period;
34static int tick_do_timer_cpu = -1;
35DEFINE_SPINLOCK(tick_device_lock);
36
37/*
38 * Debugging: see timer_list.c
39 */
40struct tick_device *tick_get_device(int cpu)
41{
42 return &per_cpu(tick_cpu_device, cpu);
43}
44
45/**
46 * tick_is_oneshot_available - check for a oneshot capable event device
47 */
48int tick_is_oneshot_available(void)
49{
50 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
51
52 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
53}
54
55/*
56 * Periodic tick
57 */
58static void tick_periodic(int cpu)
59{
60 if (tick_do_timer_cpu == cpu) {
61 write_seqlock(&xtime_lock);
62
63 /* Keep track of the next tick event */
64 tick_next_period = ktime_add(tick_next_period, tick_period);
65
66 do_timer(1);
67 write_sequnlock(&xtime_lock);
68 }
69
70 update_process_times(user_mode(get_irq_regs()));
71 profile_tick(CPU_PROFILING);
72}
73
74/*
75 * Event handler for periodic ticks
76 */
77void tick_handle_periodic(struct clock_event_device *dev)
78{
79 int cpu = smp_processor_id();
80
81 tick_periodic(cpu);
82
83 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
84 return;
85 /*
86 * Setup the next period for devices, which do not have
87 * periodic mode:
88 */
89 for (;;) {
90 ktime_t next = ktime_add(dev->next_event, tick_period);
91
92 if (!clockevents_program_event(dev, next, ktime_get()))
93 return;
94 tick_periodic(cpu);
95 }
96}
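
For oneshot-only hardware the periodic tick is emulated: the next expiry is advanced by one period at a time, and if programming fails because that time already passed, the tick work is run again for the missed period. A simplified simulation of that catch-up loop with invented numbers:

#include <stdint.h>
#include <stdio.h>

#define PERIOD_NS 1000000LL            /* 1 ms tick, illustrative */

static int64_t now_ns;

/* Fails like clockevents_program_event() when the expiry is in the past. */
static int program_event(int64_t expires)
{
    return expires > now_ns ? 0 : -1;
}

static void periodic_tick(int64_t *next)
{
    printf("tick at %lld\n", (long long)*next);
    /* Advance one period at a time; run the tick for each missed period. */
    for (;;) {
        *next += PERIOD_NS;
        if (!program_event(*next))
            return;
        printf("catch-up tick at %lld\n", (long long)*next);
    }
}

int main(void)
{
    int64_t next = 0;

    now_ns = 3500000;                  /* the interrupt arrives 3.5 periods late */
    periodic_tick(&next);
    return 0;
}
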
97
98/*
99 * Setup the device for a periodic tick
100 */
101void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
102{
103 tick_set_periodic_handler(dev, broadcast);
104
105 /* Broadcast setup ? */
106 if (!tick_device_is_functional(dev))
107 return;
108
109 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
110 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
111 } else {
112 unsigned long seq;
113 ktime_t next;
114
115 do {
116 seq = read_seqbegin(&xtime_lock);
117 next = tick_next_period;
118 } while (read_seqretry(&xtime_lock, seq));
119
120 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
121
122 for (;;) {
123 if (!clockevents_program_event(dev, next, ktime_get()))
124 return;
125 next = ktime_add(next, tick_period);
126 }
127 }
128}
129
130/*
131 * Setup the tick device
132 */
133static void tick_setup_device(struct tick_device *td,
134 struct clock_event_device *newdev, int cpu,
135 cpumask_t cpumask)
136{
137 ktime_t next_event;
138 void (*handler)(struct clock_event_device *) = NULL;
139
140 /*
141 * First device setup ?
142 */
143 if (!td->evtdev) {
144 /*
145 * If no cpu took the do_timer update, assign it to
146 * this cpu:
147 */
148 if (tick_do_timer_cpu == -1) {
149 tick_do_timer_cpu = cpu;
150 tick_next_period = ktime_get();
151 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
152 }
153
154 /*
155 * Startup in periodic mode first.
156 */
157 td->mode = TICKDEV_MODE_PERIODIC;
158 } else {
159 handler = td->evtdev->event_handler;
160 next_event = td->evtdev->next_event;
161 }
162
163 td->evtdev = newdev;
164
165 /*
166 * When the device is not per cpu, pin the interrupt to the
167 * current cpu:
168 */
169 if (!cpus_equal(newdev->cpumask, cpumask))
170 irq_set_affinity(newdev->irq, cpumask);
171
172 /*
173 * When global broadcasting is active, check if the current
174 * device is registered as a placeholder for broadcast mode.
175 * This allows us to handle this x86 misfeature in a generic
176 * way.
177 */
178 if (tick_device_uses_broadcast(newdev, cpu))
179 return;
180
181 if (td->mode == TICKDEV_MODE_PERIODIC)
182 tick_setup_periodic(newdev, 0);
183 else
184 tick_setup_oneshot(newdev, handler, next_event);
185}
186
187/*
188 * Check, if the newly registered device should be used.
189 */
190static int tick_check_new_device(struct clock_event_device *newdev)
191{
192 struct clock_event_device *curdev;
193 struct tick_device *td;
194 int cpu, ret = NOTIFY_OK;
195 unsigned long flags;
196 cpumask_t cpumask;
197
198 spin_lock_irqsave(&tick_device_lock, flags);
199
200 cpu = smp_processor_id();
201 if (!cpu_isset(cpu, newdev->cpumask))
202 goto out;
203
204 td = &per_cpu(tick_cpu_device, cpu);
205 curdev = td->evtdev;
206 cpumask = cpumask_of_cpu(cpu);
207
208 /* cpu local device ? */
209 if (!cpus_equal(newdev->cpumask, cpumask)) {
210
211 /*
212 * If the cpu affinity of the device interrupt can not
213 * be set, ignore it.
214 */
215 if (!irq_can_set_affinity(newdev->irq))
216 goto out_bc;
217
218 /*
219 * If we have a cpu local device already, do not replace it
220 * by a non cpu local device
221 */
222 if (curdev && cpus_equal(curdev->cpumask, cpumask))
223 goto out_bc;
224 }
225
226 /*
227 * If we have an active device, then check the rating and the oneshot
228 * feature.
229 */
230 if (curdev) {
231 /*
232 * Prefer one shot capable devices !
233 */
234 if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
235 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
236 goto out_bc;
237 /*
238 * Check the rating
239 */
240 if (curdev->rating >= newdev->rating)
241 goto out_bc;
242 }
243
244 /*
245 * Replace the existing device, if any, by the new
246 * device. If the current device is the broadcast device, do
247 * not give it back to the clockevents layer !
248 */
249 if (tick_is_broadcast_device(curdev)) {
250 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
251 curdev = NULL;
252 }
253 clockevents_exchange_device(curdev, newdev);
254 tick_setup_device(td, newdev, cpu, cpumask);
255 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
256 tick_oneshot_notify();
257
258 spin_unlock_irqrestore(&tick_device_lock, flags);
259 return NOTIFY_STOP;
260
261out_bc:
262 /*
263 * Can the new device be used as a broadcast device ?
264 */
265 if (tick_check_broadcast_device(newdev))
266 ret = NOTIFY_STOP;
267out:
268 spin_unlock_irqrestore(&tick_device_lock, flags);
269
270 return ret;
271}
272
273/*
274 * Shutdown an event device on a given cpu:
275 *
276 * This is called on a live CPU, when a CPU is dead. So we cannot
277 * access the hardware device itself.
278 * We just set the mode and remove it from the lists.
279 */
280static void tick_shutdown(unsigned int *cpup)
281{
282 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
283 struct clock_event_device *dev = td->evtdev;
284 unsigned long flags;
285
286 spin_lock_irqsave(&tick_device_lock, flags);
287 td->mode = TICKDEV_MODE_PERIODIC;
288 if (dev) {
289 /*
290 * Prevent that the clock events layer tries to call
291 * the set mode function!
292 */
293 dev->mode = CLOCK_EVT_MODE_UNUSED;
294 clockevents_exchange_device(dev, NULL);
295 td->evtdev = NULL;
296 }
297 spin_unlock_irqrestore(&tick_device_lock, flags);
298}
299
300/*
301 * Notification about clock event devices
302 */
303static int tick_notify(struct notifier_block *nb, unsigned long reason,
304 void *dev)
305{
306 switch (reason) {
307
308 case CLOCK_EVT_NOTIFY_ADD:
309 return tick_check_new_device(dev);
310
311 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
312 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
313 tick_broadcast_on_off(reason, dev);
314 break;
315
316 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
317 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
318 tick_broadcast_oneshot_control(reason);
319 break;
320
321 case CLOCK_EVT_NOTIFY_CPU_DEAD:
322 tick_shutdown_broadcast_oneshot(dev);
323 tick_shutdown_broadcast(dev);
324 tick_shutdown(dev);
325 break;
326
327 default:
328 break;
329 }
330
331 return NOTIFY_OK;
332}
333
334static struct notifier_block tick_notifier = {
335 .notifier_call = tick_notify,
336};
337
338/**
339 * tick_init - initialize the tick control
340 *
341 * Register the notifier with the clockevents framework
342 */
343void __init tick_init(void)
344{
345 clockevents_register_notifier(&tick_notifier);
346}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
new file mode 100644
index 000000000000..54861a0f29ff
--- /dev/null
+++ b/kernel/time/tick-internal.h
@@ -0,0 +1,110 @@
1/*
2 * tick internal variables and functions used by low/high res code
3 */
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period;
7extern ktime_t tick_period;
8
9extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
10extern void tick_handle_periodic(struct clock_event_device *dev);
11
12/*
13 * NO_HZ / high resolution timer shared code
14 */
15#ifdef CONFIG_TICK_ONESHOT
16extern void tick_setup_oneshot(struct clock_event_device *newdev,
17 void (*handler)(struct clock_event_device *),
18 ktime_t nextevt);
19extern int tick_program_event(ktime_t expires, int force);
20extern void tick_oneshot_notify(void);
21extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
22
23# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
24extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
25extern void tick_broadcast_oneshot_control(unsigned long reason);
26extern void tick_broadcast_switch_to_oneshot(void);
27extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
28# else /* BROADCAST */
29static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
30{
31 BUG();
32}
33static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
34static inline void tick_broadcast_switch_to_oneshot(void) { }
35static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
36# endif /* !BROADCAST */
37
38#else /* !ONESHOT */
39static inline
40void tick_setup_oneshot(struct clock_event_device *newdev,
41 void (*handler)(struct clock_event_device *),
42 ktime_t nextevt)
43{
44 BUG();
45}
46static inline int tick_program_event(ktime_t expires, int force)
47{
48 return 0;
49}
50static inline void tick_oneshot_notify(void) { }
51static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
52{
53 BUG();
54}
55static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
56static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
57#endif /* !TICK_ONESHOT */
58
59/*
60 * Broadcasting support
61 */
62#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
63extern int tick_do_broadcast(cpumask_t mask);
64
65extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
66extern int tick_check_broadcast_device(struct clock_event_device *dev);
67extern int tick_is_broadcast_device(struct clock_event_device *dev);
68extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
69extern void tick_shutdown_broadcast(unsigned int *cpup);
70
71extern void
72tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
73
74#else /* !BROADCAST */
75
76static inline int tick_check_broadcast_device(struct clock_event_device *dev)
77{
78 return 0;
79}
80
81static inline int tick_is_broadcast_device(struct clock_event_device *dev)
82{
83 return 0;
84}
85static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
86 int cpu)
87{
88 return 0;
89}
90static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
91static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
92static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
93
94/*
95 * Set the periodic handler in non broadcast mode
96 */
97static inline void tick_set_periodic_handler(struct clock_event_device *dev,
98 int broadcast)
99{
100 dev->event_handler = tick_handle_periodic;
101}
102#endif /* !BROADCAST */
103
104/*
105 * Check, if the device is functional or a dummy for broadcast
106 */
107static inline int tick_device_is_functional(struct clock_event_device *dev)
108{
109 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
110}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
new file mode 100644
index 000000000000..2e8b7ff863cc
--- /dev/null
+++ b/kernel/time/tick-oneshot.c
@@ -0,0 +1,84 @@
1/*
2 * linux/kernel/time/tick-oneshot.c
3 *
4 * This file contains functions which manage high resolution tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/**
26 * tick_program_event
27 */
28int tick_program_event(ktime_t expires, int force)
29{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get();
32
33 while (1) {
34 int ret = clockevents_program_event(dev, expires, now);
35
36 if (!ret || !force)
37 return ret;
38 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
40 }
41}
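
When force is set, a failed programming attempt is retried with the expiry pushed min_delta_ns past the current time until the device accepts it. A small sketch of that retry loop; the device minimum and the times are made up:

#include <stdint.h>
#include <stdio.h>

#define MIN_DELTA_NS 2000              /* illustrative device minimum */

static int64_t now_ns;

static int program_event(int64_t expires)
{
    return expires > now_ns ? 0 : -1;  /* -ETIME stand-in */
}

static int program(int64_t expires, int force)
{
    for (;;) {
        int ret = program_event(expires);

        if (!ret || !force)
            return ret;
        now_ns += 100;                 /* time moves on while we retry */
        expires = now_ns + MIN_DELTA_NS;
    }
}

int main(void)
{
    now_ns = 1000000;
    printf("forced:   %d\n", program(999000, 1));   /* past, but forced */
    printf("unforced: %d\n", program(999000, 0));   /* past, fails */
    return 0;
}
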
42
43/**
44 * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
45 */
46void tick_setup_oneshot(struct clock_event_device *newdev,
47 void (*handler)(struct clock_event_device *),
48 ktime_t next_event)
49{
50 newdev->event_handler = handler;
51 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
52 clockevents_program_event(newdev, next_event, ktime_get());
53}
54
55/**
56 * tick_switch_to_oneshot - switch to oneshot mode
57 */
58int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
59{
60 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
61 struct clock_event_device *dev = td->evtdev;
62
63 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
64 !tick_device_is_functional(dev))
65 return -EINVAL;
66
67 td->mode = TICKDEV_MODE_ONESHOT;
68 dev->event_handler = handler;
69 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
70 tick_broadcast_switch_to_oneshot();
71 return 0;
72}
73
74#ifdef CONFIG_HIGH_RES_TIMERS
75/**
76 * tick_init_highres - switch to high resolution mode
77 *
78 * Called with interrupts disabled.
79 */
80int tick_init_highres(void)
81{
82 return tick_switch_to_oneshot(hrtimer_interrupt);
83}
84#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 000000000000..95e41f7f850b
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,563 @@
1/*
2 * linux/kernel/time/tick-sched.c
3 *
4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
7 *
8 * No idle tick implementation for low and high resolution timers
9 *
10 * Started by: Thomas Gleixner and Ingo Molnar
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/interrupt.h>
18#include <linux/kernel_stat.h>
19#include <linux/percpu.h>
20#include <linux/profile.h>
21#include <linux/sched.h>
22#include <linux/tick.h>
23
24#include "tick-internal.h"
25
26/*
27 * Per cpu nohz control structure
28 */
29static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
30
31/*
32 * The time, when the last jiffy update happened. Protected by xtime_lock.
33 */
34static ktime_t last_jiffies_update;
35
36struct tick_sched *tick_get_tick_sched(int cpu)
37{
38 return &per_cpu(tick_cpu_sched, cpu);
39}
40
41/*
42 * Must be called with interrupts disabled !
43 */
44static void tick_do_update_jiffies64(ktime_t now)
45{
46 unsigned long ticks = 0;
47 ktime_t delta;
48
49 /* Reevaluate with xtime_lock held */
50 write_seqlock(&xtime_lock);
51
52 delta = ktime_sub(now, last_jiffies_update);
53 if (delta.tv64 >= tick_period.tv64) {
54
55 delta = ktime_sub(delta, tick_period);
56 last_jiffies_update = ktime_add(last_jiffies_update,
57 tick_period);
58
59 /* Slow path for long timeouts */
60 if (unlikely(delta.tv64 >= tick_period.tv64)) {
61 s64 incr = ktime_to_ns(tick_period);
62
63 ticks = ktime_divns(delta, incr);
64
65 last_jiffies_update = ktime_add_ns(last_jiffies_update,
66 incr * ticks);
67 }
68 do_timer(++ticks);
69 }
70 write_sequnlock(&xtime_lock);
71}
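
In the common case one tick is accounted; after a long idle sleep the number of missed periods is computed with a single divide instead of looping once per tick. A userspace sketch of that catch-up arithmetic, assuming HZ=1000 for the period:

#include <stdint.h>
#include <stdio.h>

#define TICK_PERIOD_NS 1000000LL       /* HZ = 1000, illustrative */

static int64_t last_update_ns;
static uint64_t jiffies64;

static void update_jiffies64(int64_t now_ns)
{
    int64_t delta = now_ns - last_update_ns;
    uint64_t ticks = 0;

    if (delta < TICK_PERIOD_NS)
        return;

    delta -= TICK_PERIOD_NS;
    last_update_ns += TICK_PERIOD_NS;

    if (delta >= TICK_PERIOD_NS) {     /* slow path: long idle sleep */
        ticks = delta / TICK_PERIOD_NS;
        last_update_ns += ticks * TICK_PERIOD_NS;
    }
    jiffies64 += ticks + 1;
}

int main(void)
{
    update_jiffies64(1 * TICK_PERIOD_NS);            /* normal tick */
    update_jiffies64(500 * TICK_PERIOD_NS + 300);    /* woken after long idle */
    printf("jiffies64 = %llu\n", (unsigned long long)jiffies64);
    return 0;
}
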
72
73/*
74 * Initialize and return the time of the last jiffies update.
75 */
76static ktime_t tick_init_jiffy_update(void)
77{
78 ktime_t period;
79
80 write_seqlock(&xtime_lock);
81 /* Did we start the jiffies update yet ? */
82 if (last_jiffies_update.tv64 == 0)
83 last_jiffies_update = tick_next_period;
84 period = last_jiffies_update;
85 write_sequnlock(&xtime_lock);
86 return period;
87}
88
89/*
90 * NOHZ - aka dynamic tick functionality
91 */
92#ifdef CONFIG_NO_HZ
93/*
94 * NO HZ enabled ?
95 */
96static int tick_nohz_enabled __read_mostly = 1;
97
98/*
99 * Enable / Disable tickless mode
100 */
101static int __init setup_tick_nohz(char *str)
102{
103 if (!strcmp(str, "off"))
104 tick_nohz_enabled = 0;
105 else if (!strcmp(str, "on"))
106 tick_nohz_enabled = 1;
107 else
108 return 0;
109 return 1;
110}
111
112__setup("nohz=", setup_tick_nohz);
113
114/**
115 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
116 *
117 * Called from interrupt entry when the CPU was idle
118 *
119 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
120 * must be updated. Otherwise an interrupt handler could use a stale jiffy
121 * value. We do this unconditionally on any cpu, as we don't know whether the
122 * cpu which has the update task assigned is in a long sleep.
123 */
124void tick_nohz_update_jiffies(void)
125{
126 int cpu = smp_processor_id();
127 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
128 unsigned long flags;
129 ktime_t now;
130
131 if (!ts->tick_stopped)
132 return;
133
134 cpu_clear(cpu, nohz_cpu_mask);
135 now = ktime_get();
136
137 local_irq_save(flags);
138 tick_do_update_jiffies64(now);
139 local_irq_restore(flags);
140}
141
142/**
143 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
144 *
145 * When the next event is more than a tick into the future, stop the idle tick
146 * Called either from the idle loop or from irq_exit() when an idle period was
147 * just interrupted by an interrupt which did not cause a reschedule.
148 */
149void tick_nohz_stop_sched_tick(void)
150{
151 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
152 struct tick_sched *ts;
153 ktime_t last_update, expires, now, delta;
154 int cpu;
155
156 local_irq_save(flags);
157
158 cpu = smp_processor_id();
159 ts = &per_cpu(tick_cpu_sched, cpu);
160
161 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
162 goto end;
163
164 if (need_resched())
165 goto end;
166
167 cpu = smp_processor_id();
168 BUG_ON(local_softirq_pending());
169
170 now = ktime_get();
171 /*
172 * When called from irq_exit we need to account the idle sleep time
173 * correctly.
174 */
175 if (ts->tick_stopped) {
176 delta = ktime_sub(now, ts->idle_entrytime);
177 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
178 }
179
180 ts->idle_entrytime = now;
181 ts->idle_calls++;
182
183 /* Read jiffies and the time when jiffies were updated last */
184 do {
185 seq = read_seqbegin(&xtime_lock);
186 last_update = last_jiffies_update;
187 last_jiffies = jiffies;
188 } while (read_seqretry(&xtime_lock, seq));
189
190 /* Get the next timer wheel timer */
191 next_jiffies = get_next_timer_interrupt(last_jiffies);
192 delta_jiffies = next_jiffies - last_jiffies;
193
194 /*
195 * Do not stop the tick, if we are only one off
196 * or if the cpu is required for rcu
197 */
198 if (!ts->tick_stopped && (delta_jiffies == 1 || rcu_needs_cpu(cpu)))
199 goto out;
200
201 /* Schedule the tick, if we are at least one jiffie off */
202 if ((long)delta_jiffies >= 1) {
203
204 if (rcu_needs_cpu(cpu))
205 delta_jiffies = 1;
206 else
207 cpu_set(cpu, nohz_cpu_mask);
208 /*
209 * nohz_stop_sched_tick can be called several times before
210 * the nohz_restart_sched_tick is called. This happens when
211 * interrupts arrive which do not cause a reschedule. In the
212 * first call we save the current tick time, so we can restart
213 * the scheduler tick in nohz_restart_sched_tick.
214 */
215 if (!ts->tick_stopped) {
216 ts->idle_tick = ts->sched_timer.expires;
217 ts->tick_stopped = 1;
218 ts->idle_jiffies = last_jiffies;
219 }
220 /*
221 * calculate the expiry time for the next timer wheel
222 * timer
223 */
224 expires = ktime_add_ns(last_update, tick_period.tv64 *
225 delta_jiffies);
226 ts->idle_expires = expires;
227 ts->idle_sleeps++;
228
229 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
230 hrtimer_start(&ts->sched_timer, expires,
231 HRTIMER_MODE_ABS);
232 /* Check, if the timer was already in the past */
233 if (hrtimer_active(&ts->sched_timer))
234 goto out;
235 } else if(!tick_program_event(expires, 0))
236 goto out;
237 /*
238 * We are past the event already. So we crossed a
239 * jiffie boundary. Update jiffies and raise the
240 * softirq.
241 */
242 tick_do_update_jiffies64(ktime_get());
243 cpu_clear(cpu, nohz_cpu_mask);
244 }
245 raise_softirq_irqoff(TIMER_SOFTIRQ);
246out:
247 ts->next_jiffies = next_jiffies;
248 ts->last_jiffies = last_jiffies;
249end:
250 local_irq_restore(flags);
251}
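
The decision to stop the tick boils down to asking the timer wheel for the next pending expiry and programming the event device that many tick periods past the last jiffies update. A sketch of that computation with invented numbers; next_timer_jiffy() is a hypothetical stand-in for get_next_timer_interrupt():

#include <stdint.h>
#include <stdio.h>

#define TICK_PERIOD_NS 1000000LL       /* HZ = 1000, illustrative */

/* Hypothetical stand-in for get_next_timer_interrupt(). */
static unsigned long next_timer_jiffy(unsigned long now_jiffies)
{
    return now_jiffies + 250;          /* nothing due for 250 ticks */
}

int main(void)
{
    unsigned long last_jiffies = 100000;
    int64_t last_update_ns = 123450000000LL; /* when jiffies last advanced */
    unsigned long delta_jiffies;
    int64_t expires_ns;

    delta_jiffies = next_timer_jiffy(last_jiffies) - last_jiffies;

    /* The code above keeps the tick when the next timer is only one jiffy away. */
    if (delta_jiffies > 1) {
        expires_ns = last_update_ns + (int64_t)delta_jiffies * TICK_PERIOD_NS;
        printf("stop tick, next event in %lu jiffies at %lld ns\n",
               delta_jiffies, (long long)expires_ns);
    } else {
        printf("keep the periodic tick\n");
    }
    return 0;
}
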
252
253/**
254 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
255 *
256 * Restart the idle tick when the CPU is woken up from idle
257 */
258void tick_nohz_restart_sched_tick(void)
259{
260 int cpu = smp_processor_id();
261 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
262 unsigned long ticks;
263 ktime_t now, delta;
264
265 if (!ts->tick_stopped)
266 return;
267
268 /* Update jiffies first */
269 now = ktime_get();
270
271 local_irq_disable();
272 tick_do_update_jiffies64(now);
273 cpu_clear(cpu, nohz_cpu_mask);
274
275 /* Account the idle time */
276 delta = ktime_sub(now, ts->idle_entrytime);
277 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
278
279 /*
280 * We stopped the tick in idle. update_process_times() would miss the
281 * time we slept, as it accounts only a single tick per call.
282 * Make sure that this time is accounted to idle !
283 */
284 ticks = jiffies - ts->idle_jiffies;
285 /*
286 * We might be one off. Do not randomly account a huge number of ticks!
287 */
288 if (ticks && ticks < LONG_MAX) {
289 add_preempt_count(HARDIRQ_OFFSET);
290 account_system_time(current, HARDIRQ_OFFSET,
291 jiffies_to_cputime(ticks));
292 sub_preempt_count(HARDIRQ_OFFSET);
293 }
294
295 /*
296 * Cancel the scheduled timer and restore the tick
297 */
298 ts->tick_stopped = 0;
299 hrtimer_cancel(&ts->sched_timer);
300 ts->sched_timer.expires = ts->idle_tick;
301
302 while (1) {
303 /* Forward the time to expire in the future */
304 hrtimer_forward(&ts->sched_timer, now, tick_period);
305
306 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
307 hrtimer_start(&ts->sched_timer,
308 ts->sched_timer.expires,
309 HRTIMER_MODE_ABS);
310 /* Check, if the timer was already in the past */
311 if (hrtimer_active(&ts->sched_timer))
312 break;
313 } else {
314 if (!tick_program_event(ts->sched_timer.expires, 0))
315 break;
316 }
317 /* Update jiffies and reread time */
318 tick_do_update_jiffies64(now);
319 now = ktime_get();
320 }
321 local_irq_enable();
322}
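(Not part of the patch, just a worked example of the batched idle accounting above.) With an assumed HZ of 250, waking up 750 jiffies after the tick was stopped accounts three seconds to idle in a single call, instead of via 750 individual tick interrupts:

#include <stdio.h>

int main(void)
{
	unsigned long hz = 250;                 /* assumed tick rate                 */
	unsigned long idle_jiffies = 100000;    /* stamp taken when the tick stopped */
	unsigned long jiffies_now = 100750;     /* value observed on wakeup          */
	unsigned long ticks = jiffies_now - idle_jiffies;

	/* One accounting call covers the whole idle period: */
	printf("accounting %lu ticks (%.2f s) to idle in one go\n",
	       ticks, (double)ticks / hz);
	return 0;
}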
323
324static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
325{
326 hrtimer_forward(&ts->sched_timer, now, tick_period);
327 return tick_program_event(ts->sched_timer.expires, 0);
328}
329
330/*
331 * The nohz low res interrupt handler
332 */
333static void tick_nohz_handler(struct clock_event_device *dev)
334{
335 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
336 struct pt_regs *regs = get_irq_regs();
337 ktime_t now = ktime_get();
338
339 dev->next_event.tv64 = KTIME_MAX;
340
341 /* Check, if the jiffies need an update */
342 tick_do_update_jiffies64(now);
343
344 /*
345 * When we are idle and the tick is stopped, we have to touch
346 * the watchdog as we might not schedule for a really long
347 * time. This happens on completely idle SMP systems while
348 * waiting at the login prompt. We also increment the "start
349 * of idle" jiffy stamp so the idle accounting adjustment we
350 * do when we go busy again does not account too many ticks.
351 */
352 if (ts->tick_stopped) {
353 touch_softlockup_watchdog();
354 ts->idle_jiffies++;
355 }
356
357 update_process_times(user_mode(regs));
358 profile_tick(CPU_PROFILING);
359
360 /* Do not restart, when we are in the idle loop */
361 if (ts->tick_stopped)
362 return;
363
364 while (tick_nohz_reprogram(ts, now)) {
365 now = ktime_get();
366 tick_do_update_jiffies64(now);
367 }
368}
369
370/**
371 * tick_nohz_switch_to_nohz - switch to nohz mode
372 */
373static void tick_nohz_switch_to_nohz(void)
374{
375 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
376 ktime_t next;
377
378 if (!tick_nohz_enabled)
379 return;
380
381 local_irq_disable();
382 if (tick_switch_to_oneshot(tick_nohz_handler)) {
383 local_irq_enable();
384 return;
385 }
386
387 ts->nohz_mode = NOHZ_MODE_LOWRES;
388
389 /*
390 * Recycle the hrtimer in ts, so we can share the
391 * hrtimer_forward with the highres code.
392 */
393 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
394 /* Get the next period */
395 next = tick_init_jiffy_update();
396
397 for (;;) {
398 ts->sched_timer.expires = next;
399 if (!tick_program_event(next, 0))
400 break;
401 next = ktime_add(next, tick_period);
402 }
403 local_irq_enable();
404
405 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
406 smp_processor_id());
407}
408
409#else
410
411static inline void tick_nohz_switch_to_nohz(void) { }
412
413#endif /* NO_HZ */
414
415/*
416 * High resolution timer specific code
417 */
418#ifdef CONFIG_HIGH_RES_TIMERS
419/*
420 * We rearm the timer until we get disabled by the idle code.
421 * Called with interrupts disabled and timer->base->cpu_base->lock held.
422 */
423static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
424{
425 struct tick_sched *ts =
426 container_of(timer, struct tick_sched, sched_timer);
427 struct hrtimer_cpu_base *base = timer->base->cpu_base;
428 struct pt_regs *regs = get_irq_regs();
429 ktime_t now = ktime_get();
430
431 /* Check, if the jiffies need an update */
432 tick_do_update_jiffies64(now);
433
434 /*
435 * Do not call update_process_times() when we are not in irq
436 * context and have no valid regs pointer.
437 */
438 if (regs) {
439 /*
440 * When we are idle and the tick is stopped, we have to touch
441 * the watchdog as we might not schedule for a really long
442 * time. This happens on completely idle SMP systems while
443 * waiting at the login prompt. We also increment the "start of
444 * idle" jiffy stamp so the idle accounting adjustment we do
445 * when we go busy again does not account too many ticks.
446 */
447 if (ts->tick_stopped) {
448 touch_softlockup_watchdog();
449 ts->idle_jiffies++;
450 }
451 /*
452 * update_process_times() might take tasklist_lock, hence
453 * drop the base lock. sched-tick hrtimers are per-CPU and
454 * never accessible by userspace APIs, so this is safe to do.
455 */
456 spin_unlock(&base->lock);
457 update_process_times(user_mode(regs));
458 profile_tick(CPU_PROFILING);
459 spin_lock(&base->lock);
460 }
461
462 /* Do not restart, when we are in the idle loop */
463 if (ts->tick_stopped)
464 return HRTIMER_NORESTART;
465
466 hrtimer_forward(timer, now, tick_period);
467
468 return HRTIMER_RESTART;
469}
470
471/**
472 * tick_setup_sched_timer - setup the tick emulation timer
473 */
474void tick_setup_sched_timer(void)
475{
476 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
477 ktime_t now = ktime_get();
478
479 /*
480 * Emulate tick processing via per-CPU hrtimers:
481 */
482 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
483 ts->sched_timer.function = tick_sched_timer;
484 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
485
486 /* Get the next period */
487 ts->sched_timer.expires = tick_init_jiffy_update();
488
489 for (;;) {
490 hrtimer_forward(&ts->sched_timer, now, tick_period);
491 hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
492 HRTIMER_MODE_ABS);
493 /* Check, if the timer was already in the past */
494 if (hrtimer_active(&ts->sched_timer))
495 break;
496 now = ktime_get();
497 }
498
499#ifdef CONFIG_NO_HZ
500 if (tick_nohz_enabled)
501 ts->nohz_mode = NOHZ_MODE_HIGHRES;
502#endif
503}
504
505void tick_cancel_sched_timer(int cpu)
506{
507 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
508
509 if (ts->sched_timer.base)
510 hrtimer_cancel(&ts->sched_timer);
511 ts->tick_stopped = 0;
512 ts->nohz_mode = NOHZ_MODE_INACTIVE;
513}
514#endif /* CONFIG_HIGH_RES_TIMERS */
515
516/**
517 * Async notification about clocksource changes
518 */
519void tick_clock_notify(void)
520{
521 int cpu;
522
523 for_each_possible_cpu(cpu)
524 set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
525}
526
527/*
528 * Async notification about clock event changes
529 */
530void tick_oneshot_notify(void)
531{
532 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
533
534 set_bit(0, &ts->check_clocks);
535}
536
537/**
538 * Check whether a change happened which makes oneshot mode possible.
539 *
540 * Called cyclically from the hrtimer softirq (driven by the timer
541 * softirq). allow_nohz signals that we can switch into low-res nohz
542 * mode, because high resolution timers are disabled (either at
543 * compile time or at runtime).
544 */
545int tick_check_oneshot_change(int allow_nohz)
546{
547 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
548
549 if (!test_and_clear_bit(0, &ts->check_clocks))
550 return 0;
551
552 if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
553 return 0;
554
555 if (!timekeeping_is_continuous() || !tick_is_oneshot_available())
556 return 0;
557
558 if (!allow_nohz)
559 return 1;
560
561 tick_nohz_switch_to_nohz();
562 return 0;
563}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
new file mode 100644
index 000000000000..f82c635c3d5c
--- /dev/null
+++ b/kernel/time/timer_list.c
@@ -0,0 +1,287 @@
1/*
2 * kernel/time/timer_list.c
3 *
4 * List pending timers
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/module.h>
15#include <linux/spinlock.h>
16#include <linux/sched.h>
17#include <linux/seq_file.h>
18#include <linux/kallsyms.h>
19#include <linux/tick.h>
20
21#include <asm/uaccess.h>
22
23typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
24
25DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26
27/*
28 * This allows printing both to /proc/timer_list and
29 * to the console (on SysRq-Q):
30 */
31#define SEQ_printf(m, x...) \
32 do { \
33 if (m) \
34 seq_printf(m, x); \
35 else \
36 printk(x); \
37 } while (0)
38
39static void print_name_offset(struct seq_file *m, void *sym)
40{
41 unsigned long addr = (unsigned long)sym;
42 char namebuf[KSYM_NAME_LEN+1];
43 unsigned long size, offset;
44 const char *sym_name;
45 char *modname;
46
47 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
48 if (sym_name)
49 SEQ_printf(m, "%s", sym_name);
50 else
51 SEQ_printf(m, "<%p>", sym);
52}
53
54static void
55print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
56{
57#ifdef CONFIG_TIMER_STATS
58 char tmp[TASK_COMM_LEN + 1];
59#endif
60 SEQ_printf(m, " #%d: ", idx);
61 print_name_offset(m, timer);
62 SEQ_printf(m, ", ");
63 print_name_offset(m, timer->function);
64 SEQ_printf(m, ", S:%02lx", timer->state);
65#ifdef CONFIG_TIMER_STATS
66 SEQ_printf(m, ", ");
67 print_name_offset(m, timer->start_site);
68 memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
69 tmp[TASK_COMM_LEN] = 0;
70 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
71#endif
72 SEQ_printf(m, "\n");
73 SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n",
74 (unsigned long long)ktime_to_ns(timer->expires),
75 (unsigned long long)(ktime_to_ns(timer->expires) - now));
76}
77
78static void
79print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
80 u64 now)
81{
82 struct hrtimer *timer, tmp;
83 unsigned long next = 0, i;
84 struct rb_node *curr;
85 unsigned long flags;
86
87next_one:
88 i = 0;
89 spin_lock_irqsave(&base->cpu_base->lock, flags);
90
91 curr = base->first;
92 /*
93 * Crude but we have to do this O(N*N) thing, because
94 * we have to unlock the base when printing:
95 */
96 while (curr && i < next) {
97 curr = rb_next(curr);
98 i++;
99 }
100
101 if (curr) {
102
103 timer = rb_entry(curr, struct hrtimer, node);
104 tmp = *timer;
105 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
106
107 print_timer(m, &tmp, i, now);
108 next++;
109 goto next_one;
110 }
111 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
112}
113
114static void
115print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
116{
117 SEQ_printf(m, " .index: %d\n",
118 base->index);
119 SEQ_printf(m, " .resolution: %Ld nsecs\n",
120 (unsigned long long)ktime_to_ns(base->resolution));
121 SEQ_printf(m, " .get_time: ");
122 print_name_offset(m, base->get_time);
123 SEQ_printf(m, "\n");
124#ifdef CONFIG_HIGH_RES_TIMERS
125 SEQ_printf(m, " .offset: %Ld nsecs\n",
126 ktime_to_ns(base->offset));
127#endif
128 SEQ_printf(m, "active timers:\n");
129 print_active_timers(m, base, now);
130}
131
132static void print_cpu(struct seq_file *m, int cpu, u64 now)
133{
134 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
135 int i;
136
137 SEQ_printf(m, "\ncpu: %d\n", cpu);
138 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
139 SEQ_printf(m, " clock %d:\n", i);
140 print_base(m, cpu_base->clock_base + i, now);
141 }
142#define P(x) \
143 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x))
144#define P_ns(x) \
145 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
146 (u64)(ktime_to_ns(cpu_base->x)))
147
148#ifdef CONFIG_HIGH_RES_TIMERS
149 P_ns(expires_next);
150 P(hres_active);
151 P(nr_events);
152#endif
153#undef P
154#undef P_ns
155
156#ifdef CONFIG_TICK_ONESHOT
157# define P(x) \
158 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x))
159# define P_ns(x) \
160 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
161 (u64)(ktime_to_ns(ts->x)))
162 {
163 struct tick_sched *ts = tick_get_tick_sched(cpu);
164 P(nohz_mode);
165 P_ns(idle_tick);
166 P(tick_stopped);
167 P(idle_jiffies);
168 P(idle_calls);
169 P(idle_sleeps);
170 P_ns(idle_entrytime);
171 P_ns(idle_sleeptime);
172 P(last_jiffies);
173 P(next_jiffies);
174 P_ns(idle_expires);
175 SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies);
176 }
177#endif
178
179#undef P
180#undef P_ns
181}
182
183#ifdef CONFIG_GENERIC_CLOCKEVENTS
184static void
185print_tickdevice(struct seq_file *m, struct tick_device *td)
186{
187 struct clock_event_device *dev = td->evtdev;
188
189 SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode);
190
191 SEQ_printf(m, "Clock Event Device: ");
192 if (!dev) {
193 SEQ_printf(m, "<NULL>\n");
194 return;
195 }
196 SEQ_printf(m, "%s\n", dev->name);
197 SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns);
198 SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns);
199 SEQ_printf(m, " mult: %ld\n", dev->mult);
200 SEQ_printf(m, " shift: %d\n", dev->shift);
201 SEQ_printf(m, " mode: %d\n", dev->mode);
202 SEQ_printf(m, " next_event: %Ld nsecs\n",
203 (unsigned long long) ktime_to_ns(dev->next_event));
204
205 SEQ_printf(m, " set_next_event: ");
206 print_name_offset(m, dev->set_next_event);
207 SEQ_printf(m, "\n");
208
209 SEQ_printf(m, " set_mode: ");
210 print_name_offset(m, dev->set_mode);
211 SEQ_printf(m, "\n");
212
213 SEQ_printf(m, " event_handler: ");
214 print_name_offset(m, dev->event_handler);
215 SEQ_printf(m, "\n");
216}
217
218static void timer_list_show_tickdevices(struct seq_file *m)
219{
220 int cpu;
221
222#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
223 print_tickdevice(m, tick_get_broadcast_device());
224 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
225 tick_get_broadcast_mask()->bits[0]);
226#ifdef CONFIG_TICK_ONESHOT
227 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
228 tick_get_broadcast_oneshot_mask()->bits[0]);
229#endif
230 SEQ_printf(m, "\n");
231#endif
232 for_each_online_cpu(cpu)
233 print_tickdevice(m, tick_get_device(cpu));
234 SEQ_printf(m, "\n");
235}
236#else
237static void timer_list_show_tickdevices(struct seq_file *m) { }
238#endif
239
240static int timer_list_show(struct seq_file *m, void *v)
241{
242 u64 now = ktime_to_ns(ktime_get());
243 int cpu;
244
245 SEQ_printf(m, "Timer List Version: v0.3\n");
246 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
247 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
248
249 for_each_online_cpu(cpu)
250 print_cpu(m, cpu, now);
251
252 SEQ_printf(m, "\n");
253 timer_list_show_tickdevices(m);
254
255 return 0;
256}
257
258void sysrq_timer_list_show(void)
259{
260 timer_list_show(NULL, NULL);
261}
262
263static int timer_list_open(struct inode *inode, struct file *filp)
264{
265 return single_open(filp, timer_list_show, NULL);
266}
267
268static struct file_operations timer_list_fops = {
269 .open = timer_list_open,
270 .read = seq_read,
271 .llseek = seq_lseek,
272 .release = seq_release,
273};
274
275static int __init init_timer_list_procfs(void)
276{
277 struct proc_dir_entry *pe;
278
279 pe = create_proc_entry("timer_list", 0644, NULL);
280 if (!pe)
281 return -ENOMEM;
282
283 pe->proc_fops = &timer_list_fops;
284
285 return 0;
286}
287__initcall(init_timer_list_procfs);
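(Illustration only, not part of the patch.) A trivial user-space reader for the /proc/timer_list file registered above; functionally this is just "cat /proc/timer_list":

#include <stdio.h>

int main(void)
{
	char buf[4096];
	size_t n;
	FILE *f = fopen("/proc/timer_list", "r");

	if (!f) {
		perror("/proc/timer_list");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);
	fclose(f);
	return 0;
}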
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644
index 000000000000..1bc4882e28e0
--- /dev/null
+++ b/kernel/time/timer_stats.c
@@ -0,0 +1,411 @@
1/*
2 * kernel/time/timer_stats.c
3 *
4 * Collect timer usage statistics.
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * timer_stats is based on timer_top, similar functionality which was part of
10 * Con Kolivas' dyntick patch set. It was developed by Daniel Petrini at the
11 * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
12 * on dynamic allocation of the statistics entries and linear-search-based
13 * lookup combined with a global lock, rather than the static array, hash
14 * and per-CPU locking which is used by timer_stats. It was written for the
15 * pre-hrtimer kernel code and therefore did not take hrtimers into account.
16 * Nevertheless it provided the base for the timer_stats implementation and
17 * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
18 * for this effort.
19 *
20 * timer_top.c is
21 * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
22 * Written by Daniel Petrini <d.pensator@gmail.com>
23 * timer_top.c was released under the GNU General Public License version 2
24 *
25 * We export the addresses and counting of timer functions being called,
26 * the pid and cmdline from the owner process if applicable.
27 *
28 * Start/stop data collection:
29 * # echo [1|0] >/proc/timer_stats
30 *
31 * Display the information collected so far:
32 * # cat /proc/timer_stats
33 *
34 * This program is free software; you can redistribute it and/or modify
35 * it under the terms of the GNU General Public License version 2 as
36 * published by the Free Software Foundation.
37 */
38
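(Illustration only, not part of the patch.) A small user-space sketch of the control interface documented in the header above: start collection, sample for ten seconds, dump the statistics and stop again. Note that the write handler below insists on exactly two bytes, which is what "echo 1" produces:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void timer_stats_control(const char *cmd)   /* "1\n" starts, "0\n" stops */
{
	int fd = open("/proc/timer_stats", O_WRONLY);

	if (fd >= 0) {
		write(fd, cmd, 2);      /* the write handler expects exactly 2 bytes */
		close(fd);
	}
}

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	timer_stats_control("1\n");     /* start collection */
	sleep(10);                      /* sample period */

	fd = open("/proc/timer_stats", O_RDONLY);
	if (fd >= 0) {
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		close(fd);
	}

	timer_stats_control("0\n");     /* stop collection */
	return 0;
}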
39#include <linux/proc_fs.h>
40#include <linux/module.h>
41#include <linux/spinlock.h>
42#include <linux/sched.h>
43#include <linux/seq_file.h>
44#include <linux/kallsyms.h>
45
46#include <asm/uaccess.h>
47
48/*
49 * This is our basic unit of interest: a timer expiry event identified
50 * by the timer, its start/expire functions and the PID of the task that
51 * started the timer. We count the number of times an event happens:
52 */
53struct entry {
54 /*
55 * Hash list:
56 */
57 struct entry *next;
58
59 /*
60 * Hash keys:
61 */
62 void *timer;
63 void *start_func;
64 void *expire_func;
65 pid_t pid;
66
67 /*
68 * Number of timeout events:
69 */
70 unsigned long count;
71
72 /*
73 * We save the command-line string to preserve
74 * this information past task exit:
75 */
76 char comm[TASK_COMM_LEN + 1];
77
78} ____cacheline_aligned_in_smp;
79
80/*
81 * Spinlock protecting the tables - not taken during lookup:
82 */
83static DEFINE_SPINLOCK(table_lock);
84
85/*
86 * Per-CPU lookup locks for fast hash lookup:
87 */
88static DEFINE_PER_CPU(spinlock_t, lookup_lock);
89
90/*
91 * Mutex to serialize state changes with show-stats activities:
92 */
93static DEFINE_MUTEX(show_mutex);
94
95/*
96 * Collection status, active/inactive:
97 */
98static int __read_mostly active;
99
100/*
101 * Beginning/end timestamps of measurement:
102 */
103static ktime_t time_start, time_stop;
104
105/*
106 * tstat entry structs only get allocated while collection is
107 * active and never freed during that time - this simplifies
108 * things quite a bit.
109 *
110 * They only get reset and reused when a new collection period is started.
111 */
112#define MAX_ENTRIES_BITS 10
113#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
114
115static unsigned long nr_entries;
116static struct entry entries[MAX_ENTRIES];
117
118static atomic_t overflow_count;
119
120static void reset_entries(void)
121{
122 nr_entries = 0;
123 memset(entries, 0, sizeof(entries));
124 atomic_set(&overflow_count, 0);
125}
126
127static struct entry *alloc_entry(void)
128{
129 if (nr_entries >= MAX_ENTRIES)
130 return NULL;
131
132 return entries + nr_entries++;
133}
134
135/*
136 * The entries are in a hash-table, for fast lookup:
137 */
138#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
139#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
140#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
141
142#define __tstat_hashfn(entry) \
143 (((unsigned long)(entry)->timer ^ \
144 (unsigned long)(entry)->start_func ^ \
145 (unsigned long)(entry)->expire_func ^ \
146 (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
147
148#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
149
150static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
151
152static int match_entries(struct entry *entry1, struct entry *entry2)
153{
154 return entry1->timer == entry2->timer &&
155 entry1->start_func == entry2->start_func &&
156 entry1->expire_func == entry2->expire_func &&
157 entry1->pid == entry2->pid;
158}
159
160/*
161 * Look up whether an entry matching this item is present
162 * in the hash already. Must be called with irqs off and the
163 * lookup lock held:
164 */
165static struct entry *tstat_lookup(struct entry *entry, char *comm)
166{
167 struct entry **head, *curr, *prev;
168
169 head = tstat_hashentry(entry);
170 curr = *head;
171
172 /*
173 * The fastpath is when the entry is already hashed;
174 * we do this with the lookup lock held, but without
175 * taking the table lock:
176 */
177 while (curr) {
178 if (match_entries(curr, entry))
179 return curr;
180
181 curr = curr->next;
182 }
183 /*
184 * Slowpath: allocate, set up and link a new hash entry:
185 */
186 prev = NULL;
187 curr = *head;
188
189 spin_lock(&table_lock);
190 /*
191 * Make sure we have not raced with another CPU:
192 */
193 while (curr) {
194 if (match_entries(curr, entry))
195 goto out_unlock;
196
197 prev = curr;
198 curr = curr->next;
199 }
200
201 curr = alloc_entry();
202 if (curr) {
203 *curr = *entry;
204 curr->count = 0;
205 memcpy(curr->comm, comm, TASK_COMM_LEN);
206 if (prev)
207 prev->next = curr;
208 else
209 *head = curr;
210 curr->next = NULL;
211 }
212 out_unlock:
213 spin_unlock(&table_lock);
214
215 return curr;
216}
217
218/**
219 * timer_stats_update_stats - Update the statistics for a timer.
220 * @timer: pointer to either a timer_list or a hrtimer
221 * @pid: the pid of the task which set up the timer
222 * @startf: pointer to the function which did the timer setup
223 * @timerf: pointer to the timer callback function of the timer
224 * @comm: name of the process which set up the timer
225 *
226 * If the timer is already registered, the event counter is
227 * incremented. Otherwise the timer is registered in a free slot.
228 */
229void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
230 void *timerf, char * comm)
231{
232 /*
233 * It doesn't matter which lock we take:
234 */
235 spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
236 struct entry *entry, input;
237 unsigned long flags;
238
239 input.timer = timer;
240 input.start_func = startf;
241 input.expire_func = timerf;
242 input.pid = pid;
243
244 spin_lock_irqsave(lock, flags);
245 if (!active)
246 goto out_unlock;
247
248 entry = tstat_lookup(&input, comm);
249 if (likely(entry))
250 entry->count++;
251 else
252 atomic_inc(&overflow_count);
253
254 out_unlock:
255 spin_unlock_irqrestore(lock, flags);
256}
257
258static void print_name_offset(struct seq_file *m, unsigned long addr)
259{
260 char namebuf[KSYM_NAME_LEN+1];
261 unsigned long size, offset;
262 const char *sym_name;
263 char *modname;
264
265 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
266 if (sym_name)
267 seq_printf(m, "%s", sym_name);
268 else
269 seq_printf(m, "<%p>", (void *)addr);
270}
271
272static int tstats_show(struct seq_file *m, void *v)
273{
274 struct timespec period;
275 struct entry *entry;
276 unsigned long ms;
277 long events = 0;
278 ktime_t time;
279 int i;
280
281 mutex_lock(&show_mutex);
282 /*
283 * If still active then calculate up to now:
284 */
285 if (active)
286 time_stop = ktime_get();
287
288 time = ktime_sub(time_stop, time_start);
289
290 period = ktime_to_timespec(time);
291 ms = period.tv_nsec / 1000000;
292
293 seq_puts(m, "Timer Stats Version: v0.1\n");
294 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
295 if (atomic_read(&overflow_count))
296 seq_printf(m, "Overflow: %d entries\n",
297 atomic_read(&overflow_count));
298
299 for (i = 0; i < nr_entries; i++) {
300 entry = entries + i;
301 seq_printf(m, "%4lu, %5d %-16s ",
302 entry->count, entry->pid, entry->comm);
303
304 print_name_offset(m, (unsigned long)entry->start_func);
305 seq_puts(m, " (");
306 print_name_offset(m, (unsigned long)entry->expire_func);
307 seq_puts(m, ")\n");
308
309 events += entry->count;
310 }
311
312 ms += period.tv_sec * 1000;
313 if (!ms)
314 ms = 1;
315
316 if (events && period.tv_sec)
317 seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events,
318 events / period.tv_sec, events * 1000 / ms);
319 else
320 seq_printf(m, "%ld total events\n", events);
321
322 mutex_unlock(&show_mutex);
323
324 return 0;
325}
326
327/*
328 * After a state change, make sure all concurrent lookup/update
329 * activities have stopped:
330 */
331static void sync_access(void)
332{
333 unsigned long flags;
334 int cpu;
335
336 for_each_online_cpu(cpu) {
337 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
338 /* nothing */
339 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
340 }
341}
342
343static ssize_t tstats_write(struct file *file, const char __user *buf,
344 size_t count, loff_t *offs)
345{
346 char ctl[2];
347
348 if (count != 2 || *offs)
349 return -EINVAL;
350
351 if (copy_from_user(ctl, buf, count))
352 return -EFAULT;
353
354 mutex_lock(&show_mutex);
355 switch (ctl[0]) {
356 case '0':
357 if (active) {
358 active = 0;
359 time_stop = ktime_get();
360 sync_access();
361 }
362 break;
363 case '1':
364 if (!active) {
365 reset_entries();
366 time_start = ktime_get();
367 active = 1;
368 }
369 break;
370 default:
371 count = -EINVAL;
372 }
373 mutex_unlock(&show_mutex);
374
375 return count;
376}
377
378static int tstats_open(struct inode *inode, struct file *filp)
379{
380 return single_open(filp, tstats_show, NULL);
381}
382
383static struct file_operations tstats_fops = {
384 .open = tstats_open,
385 .read = seq_read,
386 .write = tstats_write,
387 .llseek = seq_lseek,
388 .release = seq_release,
389};
390
391void __init init_timer_stats(void)
392{
393 int cpu;
394
395 for_each_possible_cpu(cpu)
396 spin_lock_init(&per_cpu(lookup_lock, cpu));
397}
398
399static int __init init_tstats_procfs(void)
400{
401 struct proc_dir_entry *pe;
402
403 pe = create_proc_entry("timer_stats", 0644, NULL);
404 if (!pe)
405 return -ENOMEM;
406
407 pe->proc_fops = &tstats_fops;
408
409 return 0;
410}
411__initcall(init_tstats_procfs);
diff --git a/kernel/timer.c b/kernel/timer.c
index 8533c3796082..cb1b86a9c52f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -34,6 +34,8 @@
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/tick.h>
38#include <linux/kallsyms.h>
37 39
38#include <asm/uaccess.h> 40#include <asm/uaccess.h>
39#include <asm/unistd.h> 41#include <asm/unistd.h>
@@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
262 list_add_tail(&timer->entry, vec); 264 list_add_tail(&timer->entry, vec);
263} 265}
264 266
267#ifdef CONFIG_TIMER_STATS
268void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
269{
270 if (timer->start_site)
271 return;
272
273 timer->start_site = addr;
274 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
275 timer->start_pid = current->pid;
276}
277#endif
278
265/** 279/**
266 * init_timer - initialize a timer. 280 * init_timer - initialize a timer.
267 * @timer: the timer to be initialized 281 * @timer: the timer to be initialized
@@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer)
273{ 287{
274 timer->entry.next = NULL; 288 timer->entry.next = NULL;
275 timer->base = __raw_get_cpu_var(tvec_bases); 289 timer->base = __raw_get_cpu_var(tvec_bases);
290#ifdef CONFIG_TIMER_STATS
291 timer->start_site = NULL;
292 timer->start_pid = -1;
293 memset(timer->start_comm, 0, TASK_COMM_LEN);
294#endif
276} 295}
277EXPORT_SYMBOL(init_timer); 296EXPORT_SYMBOL(init_timer);
278 297
279static inline void detach_timer(struct timer_list *timer, 298static inline void detach_timer(struct timer_list *timer,
280 int clear_pending) 299 int clear_pending)
281{ 300{
282 struct list_head *entry = &timer->entry; 301 struct list_head *entry = &timer->entry;
283 302
@@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
324 unsigned long flags; 343 unsigned long flags;
325 int ret = 0; 344 int ret = 0;
326 345
346 timer_stats_timer_set_start_info(timer);
327 BUG_ON(!timer->function); 347 BUG_ON(!timer->function);
328 348
329 base = lock_timer_base(timer, &flags); 349 base = lock_timer_base(timer, &flags);
@@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
374 tvec_base_t *base = per_cpu(tvec_bases, cpu); 394 tvec_base_t *base = per_cpu(tvec_bases, cpu);
375 unsigned long flags; 395 unsigned long flags;
376 396
397 timer_stats_timer_set_start_info(timer);
377 BUG_ON(timer_pending(timer) || !timer->function); 398 BUG_ON(timer_pending(timer) || !timer->function);
378 spin_lock_irqsave(&base->lock, flags); 399 spin_lock_irqsave(&base->lock, flags);
379 timer->base = base; 400 timer->base = base;
@@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
406{ 427{
407 BUG_ON(!timer->function); 428 BUG_ON(!timer->function);
408 429
430 timer_stats_timer_set_start_info(timer);
409 /* 431 /*
410 * This is a common optimization triggered by the 432 * This is a common optimization triggered by the
411 * networking code - if the timer is re-modified 433 * networking code - if the timer is re-modified
@@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer)
436 unsigned long flags; 458 unsigned long flags;
437 int ret = 0; 459 int ret = 0;
438 460
461 timer_stats_timer_clear_start_info(timer);
439 if (timer_pending(timer)) { 462 if (timer_pending(timer)) {
440 base = lock_timer_base(timer, &flags); 463 base = lock_timer_base(timer, &flags);
441 if (timer_pending(timer)) { 464 if (timer_pending(timer)) {
@@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base)
569 fn = timer->function; 592 fn = timer->function;
570 data = timer->data; 593 data = timer->data;
571 594
595 timer_stats_account_timer(timer);
596
572 set_running_timer(base, timer); 597 set_running_timer(base, timer);
573 detach_timer(timer, 1); 598 detach_timer(timer, 1);
574 spin_unlock_irq(&base->lock); 599 spin_unlock_irq(&base->lock);
@@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base)
591 spin_unlock_irq(&base->lock); 616 spin_unlock_irq(&base->lock);
592} 617}
593 618
594#ifdef CONFIG_NO_IDLE_HZ 619#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
595/* 620/*
596 * Find out when the next timer event is due to happen. This 621 * Find out when the next timer event is due to happen. This
597 * is used on S/390 to stop all activity when a cpu is idle. 622 * is used on S/390 to stop all activity when a cpu is idle.
598 * This function needs to be called with interrupts disabled. 623 * This function needs to be called with interrupts disabled.
599 */ 624 */
600unsigned long next_timer_interrupt(void) 625static unsigned long __next_timer_interrupt(tvec_base_t *base)
601{ 626{
602 tvec_base_t *base; 627 unsigned long timer_jiffies = base->timer_jiffies;
603 struct list_head *list; 628 unsigned long expires = timer_jiffies + (LONG_MAX >> 1);
629 int index, slot, array, found = 0;
604 struct timer_list *nte; 630 struct timer_list *nte;
605 unsigned long expires;
606 unsigned long hr_expires = MAX_JIFFY_OFFSET;
607 ktime_t hr_delta;
608 tvec_t *varray[4]; 631 tvec_t *varray[4];
609 int i, j;
610
611 hr_delta = hrtimer_get_next_event();
612 if (hr_delta.tv64 != KTIME_MAX) {
613 struct timespec tsdelta;
614 tsdelta = ktime_to_timespec(hr_delta);
615 hr_expires = timespec_to_jiffies(&tsdelta);
616 if (hr_expires < 3)
617 return hr_expires + jiffies;
618 }
619 hr_expires += jiffies;
620
621 base = __get_cpu_var(tvec_bases);
622 spin_lock(&base->lock);
623 expires = base->timer_jiffies + (LONG_MAX >> 1);
624 list = NULL;
625 632
626 /* Look for timer events in tv1. */ 633 /* Look for timer events in tv1. */
627 j = base->timer_jiffies & TVR_MASK; 634 index = slot = timer_jiffies & TVR_MASK;
628 do { 635 do {
629 list_for_each_entry(nte, base->tv1.vec + j, entry) { 636 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
637 found = 1;
630 expires = nte->expires; 638 expires = nte->expires;
631 if (j < (base->timer_jiffies & TVR_MASK)) 639 /* Look at the cascade bucket(s)? */
632 list = base->tv2.vec + (INDEX(0)); 640 if (!index || slot < index)
633 goto found; 641 goto cascade;
642 return expires;
634 } 643 }
635 j = (j + 1) & TVR_MASK; 644 slot = (slot + 1) & TVR_MASK;
636 } while (j != (base->timer_jiffies & TVR_MASK)); 645 } while (slot != index);
646
647cascade:
648 /* Calculate the next cascade event */
649 if (index)
650 timer_jiffies += TVR_SIZE - index;
651 timer_jiffies >>= TVR_BITS;
637 652
638 /* Check tv2-tv5. */ 653 /* Check tv2-tv5. */
639 varray[0] = &base->tv2; 654 varray[0] = &base->tv2;
640 varray[1] = &base->tv3; 655 varray[1] = &base->tv3;
641 varray[2] = &base->tv4; 656 varray[2] = &base->tv4;
642 varray[3] = &base->tv5; 657 varray[3] = &base->tv5;
643 for (i = 0; i < 4; i++) { 658
644 j = INDEX(i); 659 for (array = 0; array < 4; array++) {
660 tvec_t *varp = varray[array];
661
662 index = slot = timer_jiffies & TVN_MASK;
645 do { 663 do {
646 if (list_empty(varray[i]->vec + j)) { 664 list_for_each_entry(nte, varp->vec + slot, entry) {
647 j = (j + 1) & TVN_MASK; 665 found = 1;
648 continue;
649 }
650 list_for_each_entry(nte, varray[i]->vec + j, entry)
651 if (time_before(nte->expires, expires)) 666 if (time_before(nte->expires, expires))
652 expires = nte->expires; 667 expires = nte->expires;
653 if (j < (INDEX(i)) && i < 3) 668 }
654 list = varray[i + 1]->vec + (INDEX(i + 1)); 669 /*
655 goto found; 670 * Do we still search for the first timer or are
656 } while (j != (INDEX(i))); 671 * we looking up the cascade buckets ?
657 } 672 */
658found: 673 if (found) {
659 if (list) { 674 /* Look at the cascade bucket(s)? */
660 /* 675 if (!index || slot < index)
661 * The search wrapped. We need to look at the next list 676 break;
662 * from next tv element that would cascade into tv element 677 return expires;
663 * where we found the timer element. 678 }
664 */ 679 slot = (slot + 1) & TVN_MASK;
665 list_for_each_entry(nte, list, entry) { 680 } while (slot != index);
666 if (time_before(nte->expires, expires)) 681
667 expires = nte->expires; 682 if (index)
668 } 683 timer_jiffies += TVN_SIZE - index;
684 timer_jiffies >>= TVN_BITS;
669 } 685 }
670 spin_unlock(&base->lock); 686 return expires;
687}
671 688
672 /* 689/*
673 * It can happen that other CPUs service timer IRQs and increment 690 * Check, if the next hrtimer event is before the next timer wheel
674 * jiffies, but we have not yet got a local timer tick to process 691 * event:
675 * the timer wheels. In that case, the expiry time can be before 692 */
676 * jiffies, but since the high-resolution timer here is relative to 693static unsigned long cmp_next_hrtimer_event(unsigned long now,
677 * jiffies, the default expression when high-resolution timers are 694 unsigned long expires)
678 * not active, 695{
679 * 696 ktime_t hr_delta = hrtimer_get_next_event();
680 * time_before(MAX_JIFFY_OFFSET + jiffies, expires) 697 struct timespec tsdelta;
681 * 698
682 * would falsely evaluate to true. If that is the case, just 699 if (hr_delta.tv64 == KTIME_MAX)
683 * return jiffies so that we can immediately fire the local timer 700 return expires;
684 */
685 if (time_before(expires, jiffies))
686 return jiffies;
687 701
688 if (time_before(hr_expires, expires)) 702 if (hr_delta.tv64 <= TICK_NSEC)
689 return hr_expires; 703 return now;
690 704
705 tsdelta = ktime_to_timespec(hr_delta);
706 now += timespec_to_jiffies(&tsdelta);
707 if (time_before(now, expires))
708 return now;
691 return expires; 709 return expires;
692} 710}
711
712/**
713 * get_next_timer_interrupt - return the jiffy of the next pending timer
714 */
715unsigned long get_next_timer_interrupt(unsigned long now)
716{
717 tvec_base_t *base = __get_cpu_var(tvec_bases);
718 unsigned long expires;
719
720 spin_lock(&base->lock);
721 expires = __next_timer_interrupt(base);
722 spin_unlock(&base->lock);
723
724 if (time_before_eq(expires, now))
725 return now;
726
727 return cmp_next_hrtimer_event(now, expires);
728}
729
730#ifdef CONFIG_NO_IDLE_HZ
731unsigned long next_timer_interrupt(void)
732{
733 return get_next_timer_interrupt(jiffies);
734}
735#endif
736
693#endif 737#endif
694 738
695/******************************************************************/ 739/******************************************************************/
@@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday);
832 * 876 *
833 * Accumulates current time interval and initializes new clocksource 877 * Accumulates current time interval and initializes new clocksource
834 */ 878 */
835static int change_clocksource(void) 879static void change_clocksource(void)
836{ 880{
837 struct clocksource *new; 881 struct clocksource *new;
838 cycle_t now; 882 cycle_t now;
839 u64 nsec; 883 u64 nsec;
884
840 new = clocksource_get_next(); 885 new = clocksource_get_next();
841 if (clock != new) { 886
842 now = clocksource_read(new); 887 if (clock == new)
843 nsec = __get_nsec_offset(); 888 return;
844 timespec_add_ns(&xtime, nsec); 889
845 890 now = clocksource_read(new);
846 clock = new; 891 nsec = __get_nsec_offset();
847 clock->cycle_last = now; 892 timespec_add_ns(&xtime, nsec);
848 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 893
849 clock->name); 894 clock = new;
850 return 1; 895 clock->cycle_last = now;
851 } else if (clock->update_callback) { 896
852 return clock->update_callback(); 897 clock->error = 0;
853 } 898 clock->xtime_nsec = 0;
854 return 0; 899 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
900
901 tick_clock_notify();
902
903 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
904 clock->name);
855} 905}
856#else 906#else
857static inline int change_clocksource(void) 907static inline void change_clocksource(void) { }
858{
859 return 0;
860}
861#endif 908#endif
862 909
863/** 910/**
@@ -871,33 +918,56 @@ int timekeeping_is_continuous(void)
871 do { 918 do {
872 seq = read_seqbegin(&xtime_lock); 919 seq = read_seqbegin(&xtime_lock);
873 920
874 ret = clock->is_continuous; 921 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
875 922
876 } while (read_seqretry(&xtime_lock, seq)); 923 } while (read_seqretry(&xtime_lock, seq));
877 924
878 return ret; 925 return ret;
879} 926}
880 927
928/**
929 * read_persistent_clock - Return time in seconds from the persistent clock.
930 *
931 * Weak dummy function for arches that do not yet support it.
932 * Returns seconds from epoch using the battery backed persistent clock.
933 * Returns zero if unsupported.
934 *
935 * XXX - Do be sure to remove it once all arches implement it.
936 */
937unsigned long __attribute__((weak)) read_persistent_clock(void)
938{
939 return 0;
940}
941
881/* 942/*
882 * timekeeping_init - Initializes the clocksource and common timekeeping values 943 * timekeeping_init - Initializes the clocksource and common timekeeping values
883 */ 944 */
884void __init timekeeping_init(void) 945void __init timekeeping_init(void)
885{ 946{
886 unsigned long flags; 947 unsigned long flags;
948 unsigned long sec = read_persistent_clock();
887 949
888 write_seqlock_irqsave(&xtime_lock, flags); 950 write_seqlock_irqsave(&xtime_lock, flags);
889 951
890 ntp_clear(); 952 ntp_clear();
891 953
892 clock = clocksource_get_next(); 954 clock = clocksource_get_next();
893 clocksource_calculate_interval(clock, tick_nsec); 955 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
894 clock->cycle_last = clocksource_read(clock); 956 clock->cycle_last = clocksource_read(clock);
895 957
958 xtime.tv_sec = sec;
959 xtime.tv_nsec = 0;
960 set_normalized_timespec(&wall_to_monotonic,
961 -xtime.tv_sec, -xtime.tv_nsec);
962
896 write_sequnlock_irqrestore(&xtime_lock, flags); 963 write_sequnlock_irqrestore(&xtime_lock, flags);
897} 964}
898 965
899 966/* flag for if timekeeping is suspended */
900static int timekeeping_suspended; 967static int timekeeping_suspended;
968/* time in seconds when suspend began */
969static unsigned long timekeeping_suspend_time;
970
901/** 971/**
902 * timekeeping_resume - Resumes the generic timekeeping subsystem. 972 * timekeeping_resume - Resumes the generic timekeeping subsystem.
903 * @dev: unused 973 * @dev: unused
@@ -909,13 +979,26 @@ static int timekeeping_suspended;
909static int timekeeping_resume(struct sys_device *dev) 979static int timekeeping_resume(struct sys_device *dev)
910{ 980{
911 unsigned long flags; 981 unsigned long flags;
982 unsigned long now = read_persistent_clock();
912 983
913 write_seqlock_irqsave(&xtime_lock, flags); 984 write_seqlock_irqsave(&xtime_lock, flags);
914 /* restart the last cycle value */ 985
986 if (now && (now > timekeeping_suspend_time)) {
987 unsigned long sleep_length = now - timekeeping_suspend_time;
988
989 xtime.tv_sec += sleep_length;
990 wall_to_monotonic.tv_sec -= sleep_length;
991 }
992 /* re-base the last cycle value */
915 clock->cycle_last = clocksource_read(clock); 993 clock->cycle_last = clocksource_read(clock);
916 clock->error = 0; 994 clock->error = 0;
917 timekeeping_suspended = 0; 995 timekeeping_suspended = 0;
918 write_sequnlock_irqrestore(&xtime_lock, flags); 996 write_sequnlock_irqrestore(&xtime_lock, flags);
997
998 touch_softlockup_watchdog();
999 /* Resume hrtimers */
1000 clock_was_set();
1001
919 return 0; 1002 return 0;
920} 1003}
921 1004
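(Illustration only, not kernel code; the numbers are hypothetical.) The resume path above advances the wall clock by the time spent in suspend and shifts wall_to_monotonic by the same amount in the opposite direction, so the monotonic clock does not jump across suspend. A minimal sketch of that bookkeeping, assuming monotonic = wall + wall_to_monotonic:

#include <stdio.h>

int main(void)
{
	long xtime_sec = 1000000;       /* wall clock seconds at suspend time    */
	long wall_to_mono = -999000;    /* offset so that mono = wall + offset   */
	long sleep_length = 30;         /* seconds, from read_persistent_clock() */

	long mono_before = xtime_sec + wall_to_mono;

	/* Same adjustment as in timekeeping_resume() above: */
	xtime_sec += sleep_length;
	wall_to_mono -= sleep_length;

	printf("wall advanced by %ld s, monotonic %ld -> %ld (unchanged)\n",
	       sleep_length, mono_before, xtime_sec + wall_to_mono);
	return 0;
}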
@@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
925 1008
926 write_seqlock_irqsave(&xtime_lock, flags); 1009 write_seqlock_irqsave(&xtime_lock, flags);
927 timekeeping_suspended = 1; 1010 timekeeping_suspended = 1;
1011 timekeeping_suspend_time = read_persistent_clock();
928 write_sequnlock_irqrestore(&xtime_lock, flags); 1012 write_sequnlock_irqrestore(&xtime_lock, flags);
929 return 0; 1013 return 0;
930} 1014}
@@ -1089,11 +1173,8 @@ static void update_wall_time(void)
1089 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 1173 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1090 1174
1091 /* check to see if there is a new clocksource to use */ 1175 /* check to see if there is a new clocksource to use */
1092 if (change_clocksource()) { 1176 change_clocksource();
1093 clock->error = 0; 1177 update_vsyscall(&xtime, clock);
1094 clock->xtime_nsec = 0;
1095 clocksource_calculate_interval(clock, tick_nsec);
1096 }
1097} 1178}
1098 1179
1099/* 1180/*
@@ -1162,11 +1243,9 @@ static inline void calc_load(unsigned long ticks)
1162 * This read-write spinlock protects us from races in SMP while 1243 * This read-write spinlock protects us from races in SMP while
1163 * playing with xtime and avenrun. 1244 * playing with xtime and avenrun.
1164 */ 1245 */
1165#ifndef ARCH_HAVE_XTIME_LOCK 1246__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1166__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1167 1247
1168EXPORT_SYMBOL(xtime_lock); 1248EXPORT_SYMBOL(xtime_lock);
1169#endif
1170 1249
1171/* 1250/*
1172 * This function runs timers and the timer-tq in bottom half context. 1251 * This function runs timers and the timer-tq in bottom half context.
@@ -1175,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h)
1175{ 1254{
1176 tvec_base_t *base = __get_cpu_var(tvec_bases); 1255 tvec_base_t *base = __get_cpu_var(tvec_bases);
1177 1256
1178 hrtimer_run_queues(); 1257 hrtimer_run_queues();
1258
1179 if (time_after_eq(jiffies, base->timer_jiffies)) 1259 if (time_after_eq(jiffies, base->timer_jiffies))
1180 __run_timers(base); 1260 __run_timers(base);
1181} 1261}
@@ -1621,6 +1701,8 @@ void __init init_timers(void)
1621 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1701 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1622 (void *)(long)smp_processor_id()); 1702 (void *)(long)smp_processor_id());
1623 1703
1704 init_timer_stats();
1705
1624 BUG_ON(err == NOTIFY_BAD); 1706 BUG_ON(err == NOTIFY_BAD);
1625 register_cpu_notifier(&timers_nb); 1707 register_cpu_notifier(&timers_nb);
1626 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1708 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index baacc3691415..658f638c402c 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -22,8 +22,6 @@
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24 24
25
26#define USEC_PER_TICK (USEC_PER_SEC/HZ)
27/* 25/*
28 * fill in basic accounting fields 26 * fill in basic accounting fields
29 */ 27 */
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
new file mode 100644
index 000000000000..f22b9dbd2a9c
--- /dev/null
+++ b/kernel/utsname_sysctl.c
@@ -0,0 +1,146 @@
1/*
2 * Copyright (C) 2007
3 *
4 * Author: Eric Biederman <ebiederm@xmision.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the
9 * License.
10 */
11
12#include <linux/module.h>
13#include <linux/uts.h>
14#include <linux/utsname.h>
15#include <linux/version.h>
16#include <linux/sysctl.h>
17
18static void *get_uts(ctl_table *table, int write)
19{
20 char *which = table->data;
21#ifdef CONFIG_UTS_NS
22 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
23 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
24#endif
25 if (!write)
26 down_read(&uts_sem);
27 else
28 down_write(&uts_sem);
29 return which;
30}
31
32static void put_uts(ctl_table *table, int write, void *which)
33{
34 if (!write)
35 up_read(&uts_sem);
36 else
37 up_write(&uts_sem);
38}
39
40#ifdef CONFIG_PROC_FS
41/*
42 * Special case of dostring for the UTS structure. This has locks
43 * that must be observed. Should this be in kernel/sys.c?
44 */
45static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
46 void __user *buffer, size_t *lenp, loff_t *ppos)
47{
48 struct ctl_table uts_table;
49 int r;
50 memcpy(&uts_table, table, sizeof(uts_table));
51 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table, write, filp, buffer, lenp, ppos);
53 put_uts(table, write, uts_table.data);
54 return r;
55}
56#else
57#define proc_do_uts_string NULL
58#endif
59
60
61#ifdef CONFIG_SYSCTL_SYSCALL
62/* The generic string strategy routine: */
63static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen)
66{
67 struct ctl_table uts_table;
68 int r, write;
69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, name, nlen,
73 oldval, oldlenp, newval, newlen);
74 put_uts(table, write, uts_table.data);
75 return r;
76}
77#else
78#define sysctl_uts_string NULL
79#endif
80
81static struct ctl_table uts_kern_table[] = {
82 {
83 .ctl_name = KERN_OSTYPE,
84 .procname = "ostype",
85 .data = init_uts_ns.name.sysname,
86 .maxlen = sizeof(init_uts_ns.name.sysname),
87 .mode = 0444,
88 .proc_handler = proc_do_uts_string,
89 .strategy = sysctl_uts_string,
90 },
91 {
92 .ctl_name = KERN_OSRELEASE,
93 .procname = "osrelease",
94 .data = init_uts_ns.name.release,
95 .maxlen = sizeof(init_uts_ns.name.release),
96 .mode = 0444,
97 .proc_handler = proc_do_uts_string,
98 .strategy = sysctl_uts_string,
99 },
100 {
101 .ctl_name = KERN_VERSION,
102 .procname = "version",
103 .data = init_uts_ns.name.version,
104 .maxlen = sizeof(init_uts_ns.name.version),
105 .mode = 0444,
106 .proc_handler = proc_do_uts_string,
107 .strategy = sysctl_uts_string,
108 },
109 {
110 .ctl_name = KERN_NODENAME,
111 .procname = "hostname",
112 .data = init_uts_ns.name.nodename,
113 .maxlen = sizeof(init_uts_ns.name.nodename),
114 .mode = 0644,
115 .proc_handler = proc_do_uts_string,
116 .strategy = sysctl_uts_string,
117 },
118 {
119 .ctl_name = KERN_DOMAINNAME,
120 .procname = "domainname",
121 .data = init_uts_ns.name.domainname,
122 .maxlen = sizeof(init_uts_ns.name.domainname),
123 .mode = 0644,
124 .proc_handler = proc_do_uts_string,
125 .strategy = sysctl_uts_string,
126 },
127 {}
128};
129
130static struct ctl_table uts_root_table[] = {
131 {
132 .ctl_name = CTL_KERN,
133 .procname = "kernel",
134 .mode = 0555,
135 .child = uts_kern_table,
136 },
137 {}
138};
139
140static int __init utsname_sysctl_init(void)
141{
142 register_sysctl_table(uts_root_table);
143 return 0;
144}
145
146__initcall(utsname_sysctl_init);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 020d1fff57dc..b6fa5e63085d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
218} 218}
219EXPORT_SYMBOL_GPL(queue_work); 219EXPORT_SYMBOL_GPL(queue_work);
220 220
221static void delayed_work_timer_fn(unsigned long __data) 221void delayed_work_timer_fn(unsigned long __data)
222{ 222{
223 struct delayed_work *dwork = (struct delayed_work *)__data; 223 struct delayed_work *dwork = (struct delayed_work *)__data;
224 struct workqueue_struct *wq = get_wq_data(&dwork->work); 224 struct workqueue_struct *wq = get_wq_data(&dwork->work);
@@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
245 struct timer_list *timer = &dwork->timer; 245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work; 246 struct work_struct *work = &dwork->work;
247 247
248 timer_stats_timer_set_start_info(timer);
248 if (delay == 0) 249 if (delay == 0)
249 return queue_work(wq, work); 250 return queue_work(wq, work);
250 251
@@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work);
593 * After waiting for a given time this puts a job in the kernel-global 594 * After waiting for a given time this puts a job in the kernel-global
594 * workqueue. 595 * workqueue.
595 */ 596 */
596int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) 597int fastcall schedule_delayed_work(struct delayed_work *dwork,
598 unsigned long delay)
597{ 599{
600 timer_stats_timer_set_start_info(&dwork->timer);
598 return queue_delayed_work(keventd_wq, dwork, delay); 601 return queue_delayed_work(keventd_wq, dwork, delay);
599} 602}
600EXPORT_SYMBOL(schedule_delayed_work); 603EXPORT_SYMBOL(schedule_delayed_work);