path: root/kernel/time
author    Thomas Gleixner <tglx@linutronix.de>   2014-06-22 06:06:40 -0400
committer Thomas Gleixner <tglx@linutronix.de>   2014-06-23 05:22:35 -0400
commit    5cee964597260237dd2cabb3ec22bba0da24b25d (patch)
tree      f548efb4181a4cffb026adf43178e65330533e87 /kernel/time
parent    58394271c610e9c65dd0165a1c1f6dec75dc5f3e (diff)
time/timers: Move all time(r) related files into kernel/time
Except for Kconfig.HZ. That needs a separate treatment.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel/time')
-rw-r--r--  kernel/time/Makefile              17
-rw-r--r--  kernel/time/hrtimer.c           1915
-rw-r--r--  kernel/time/itimer.c             301
-rw-r--r--  kernel/time/posix-cpu-timers.c  1490
-rw-r--r--  kernel/time/posix-timers.c      1121
-rw-r--r--  kernel/time/time.c               714
-rw-r--r--  kernel/time/timeconst.bc         108
-rw-r--r--  kernel/time/timer.c             1734
8 files changed, 7400 insertions(+), 0 deletions(-)
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 57a413fd0ebf..e59ce8b1b550 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,3 +1,4 @@
+obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
 obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
 obj-y += timeconv.o posix-clock.o alarmtimer.o
 
@@ -12,3 +13,19 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
 obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
 obj-$(CONFIG_TIMER_STATS) += timer_stats.o
 obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
+
+$(obj)/time.o: $(obj)/timeconst.h
+
+quiet_cmd_hzfile = HZFILE $@
+      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+	$(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+      cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
+targets += timeconst.h
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+	$(call if_changed,bc)
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
new file mode 100644
index 000000000000..3ab28993f6e0
--- /dev/null
+++ b/kernel/time/hrtimer.c
@@ -0,0 +1,1915 @@
1/*
2 * linux/kernel/hrtimer.c
3 *
4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
7 *
8 * High-resolution kernel timers
9 *
10 * In contrast to the low-resolution timeout API implemented in
11 * kernel/timer.c, hrtimers provide finer resolution and accuracy
12 * depending on system configuration and capabilities.
13 *
14 * These timers are currently used for:
15 * - itimers
16 * - POSIX timers
17 * - nanosleep
18 * - precise in-kernel timing
19 *
20 * Started by: Thomas Gleixner and Ingo Molnar
21 *
22 * Credits:
23 * based on kernel/timer.c
24 *
25 * Help, testing, suggestions, bugfixes, improvements were
26 * provided by:
27 *
28 * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
 29 * et al.
30 *
31 * For licencing details see kernel-base/COPYING
32 */
33
34#include <linux/cpu.h>
35#include <linux/export.h>
36#include <linux/percpu.h>
37#include <linux/hrtimer.h>
38#include <linux/notifier.h>
39#include <linux/syscalls.h>
40#include <linux/kallsyms.h>
41#include <linux/interrupt.h>
42#include <linux/tick.h>
43#include <linux/seq_file.h>
44#include <linux/err.h>
45#include <linux/debugobjects.h>
46#include <linux/sched.h>
47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h>
49#include <linux/sched/deadline.h>
50#include <linux/timer.h>
51#include <linux/freezer.h>
52
53#include <asm/uaccess.h>
54
55#include <trace/events/timer.h>
56
57/*
58 * The timer bases:
59 *
 60 * There are more clockids than hrtimer bases. Thus, we index
61 * into the timer bases by the hrtimer_base_type enum. When trying
62 * to reach a base using a clockid, hrtimer_clockid_to_base()
63 * is used to convert from clockid to the proper hrtimer_base_type.
64 */
65DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
66{
67
68 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
69 .clock_base =
70 {
71 {
72 .index = HRTIMER_BASE_MONOTONIC,
73 .clockid = CLOCK_MONOTONIC,
74 .get_time = &ktime_get,
75 .resolution = KTIME_LOW_RES,
76 },
77 {
78 .index = HRTIMER_BASE_REALTIME,
79 .clockid = CLOCK_REALTIME,
80 .get_time = &ktime_get_real,
81 .resolution = KTIME_LOW_RES,
82 },
83 {
84 .index = HRTIMER_BASE_BOOTTIME,
85 .clockid = CLOCK_BOOTTIME,
86 .get_time = &ktime_get_boottime,
87 .resolution = KTIME_LOW_RES,
88 },
89 {
90 .index = HRTIMER_BASE_TAI,
91 .clockid = CLOCK_TAI,
92 .get_time = &ktime_get_clocktai,
93 .resolution = KTIME_LOW_RES,
94 },
95 }
96};
97
98static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
99 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
100 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
101 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
102 [CLOCK_TAI] = HRTIMER_BASE_TAI,
103};
104
105static inline int hrtimer_clockid_to_base(clockid_t clock_id)
106{
107 return hrtimer_clock_to_base_table[clock_id];
108}
109
110
111/*
112 * Get the coarse grained time at the softirq based on xtime and
113 * wall_to_monotonic.
114 */
115static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
116{
117 ktime_t xtim, mono, boot;
118 struct timespec xts, tom, slp;
119 s32 tai_offset;
120
121 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
122 tai_offset = timekeeping_get_tai_offset();
123
124 xtim = timespec_to_ktime(xts);
125 mono = ktime_add(xtim, timespec_to_ktime(tom));
126 boot = ktime_add(mono, timespec_to_ktime(slp));
127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
129 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
130 base->clock_base[HRTIMER_BASE_TAI].softirq_time =
131 ktime_add(xtim, ktime_set(tai_offset, 0));
132}
133
134/*
135 * Functions and macros which are different for UP/SMP systems are kept in a
136 * single place
137 */
138#ifdef CONFIG_SMP
139
140/*
141 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
142 * means that all timers which are tied to this base via timer->base are
143 * locked, and the base itself is locked too.
144 *
145 * So __run_timers/migrate_timers can safely modify all timers which could
146 * be found on the lists/queues.
147 *
148 * When the timer's base is locked, and the timer removed from list, it is
149 * possible to set timer->base = NULL and drop the lock: the timer remains
150 * locked.
151 */
152static
153struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
154 unsigned long *flags)
155{
156 struct hrtimer_clock_base *base;
157
158 for (;;) {
159 base = timer->base;
160 if (likely(base != NULL)) {
161 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
162 if (likely(base == timer->base))
163 return base;
164 /* The timer has migrated to another CPU: */
165 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
166 }
167 cpu_relax();
168 }
169}
170
171/*
172 * With HIGHRES=y we do not migrate the timer when it is expiring
173 * before the next event on the target cpu because we cannot reprogram
174 * the target cpu hardware and we would cause it to fire late.
175 *
176 * Called with cpu_base->lock of target cpu held.
177 */
178static int
179hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
180{
181#ifdef CONFIG_HIGH_RES_TIMERS
182 ktime_t expires;
183
184 if (!new_base->cpu_base->hres_active)
185 return 0;
186
187 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
188 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
189#else
190 return 0;
191#endif
192}
193
194/*
195 * Switch the timer base to the current CPU when possible.
196 */
197static inline struct hrtimer_clock_base *
198switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
199 int pinned)
200{
201 struct hrtimer_clock_base *new_base;
202 struct hrtimer_cpu_base *new_cpu_base;
203 int this_cpu = smp_processor_id();
204 int cpu = get_nohz_timer_target(pinned);
205 int basenum = base->index;
206
207again:
208 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
209 new_base = &new_cpu_base->clock_base[basenum];
210
211 if (base != new_base) {
212 /*
213 * We are trying to move timer to new_base.
214 * However we can't change timer's base while it is running,
215 * so we keep it on the same CPU. No hassle vs. reprogramming
216 * the event source in the high resolution case. The softirq
217 * code will take care of this when the timer function has
218 * completed. There is no conflict as we hold the lock until
219 * the timer is enqueued.
220 */
221 if (unlikely(hrtimer_callback_running(timer)))
222 return base;
223
224 /* See the comment in lock_timer_base() */
225 timer->base = NULL;
226 raw_spin_unlock(&base->cpu_base->lock);
227 raw_spin_lock(&new_base->cpu_base->lock);
228
229 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
230 cpu = this_cpu;
231 raw_spin_unlock(&new_base->cpu_base->lock);
232 raw_spin_lock(&base->cpu_base->lock);
233 timer->base = base;
234 goto again;
235 }
236 timer->base = new_base;
237 } else {
238 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
239 cpu = this_cpu;
240 goto again;
241 }
242 }
243 return new_base;
244}
245
246#else /* CONFIG_SMP */
247
248static inline struct hrtimer_clock_base *
249lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
250{
251 struct hrtimer_clock_base *base = timer->base;
252
253 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
254
255 return base;
256}
257
258# define switch_hrtimer_base(t, b, p) (b)
259
260#endif /* !CONFIG_SMP */
261
262/*
263 * Functions for the union type storage format of ktime_t which are
264 * too large for inlining:
265 */
266#if BITS_PER_LONG < 64
267# ifndef CONFIG_KTIME_SCALAR
268/**
269 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
270 * @kt: addend
271 * @nsec: the scalar nsec value to add
272 *
273 * Returns the sum of kt and nsec in ktime_t format
274 */
275ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
276{
277 ktime_t tmp;
278
279 if (likely(nsec < NSEC_PER_SEC)) {
280 tmp.tv64 = nsec;
281 } else {
282 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
283
284 /* Make sure nsec fits into long */
285 if (unlikely(nsec > KTIME_SEC_MAX))
286 return (ktime_t){ .tv64 = KTIME_MAX };
287
288 tmp = ktime_set((long)nsec, rem);
289 }
290
291 return ktime_add(kt, tmp);
292}
293
294EXPORT_SYMBOL_GPL(ktime_add_ns);
295
296/**
297 * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
298 * @kt: minuend
299 * @nsec: the scalar nsec value to subtract
300 *
301 * Returns the subtraction of @nsec from @kt in ktime_t format
302 */
303ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
304{
305 ktime_t tmp;
306
307 if (likely(nsec < NSEC_PER_SEC)) {
308 tmp.tv64 = nsec;
309 } else {
310 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
311
312 tmp = ktime_set((long)nsec, rem);
313 }
314
315 return ktime_sub(kt, tmp);
316}
317
318EXPORT_SYMBOL_GPL(ktime_sub_ns);
319# endif /* !CONFIG_KTIME_SCALAR */
320
321/*
322 * Divide a ktime value by a nanosecond value
323 */
324u64 ktime_divns(const ktime_t kt, s64 div)
325{
326 u64 dclc;
327 int sft = 0;
328
329 dclc = ktime_to_ns(kt);
330 /* Make sure the divisor is less than 2^32: */
331 while (div >> 32) {
332 sft++;
333 div >>= 1;
334 }
335 dclc >>= sft;
336 do_div(dclc, (unsigned long) div);
337
338 return dclc;
339}
340#endif /* BITS_PER_LONG < 64 */
341
342/*
343 * Add two ktime values and do a safety check for overflow:
344 */
345ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
346{
347 ktime_t res = ktime_add(lhs, rhs);
348
349 /*
350 * We use KTIME_SEC_MAX here, the maximum timeout which we can
351 * return to user space in a timespec:
352 */
353 if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64)
354 res = ktime_set(KTIME_SEC_MAX, 0);
355
356 return res;
357}
358
359EXPORT_SYMBOL_GPL(ktime_add_safe);
360
361#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
362
363static struct debug_obj_descr hrtimer_debug_descr;
364
365static void *hrtimer_debug_hint(void *addr)
366{
367 return ((struct hrtimer *) addr)->function;
368}
369
370/*
371 * fixup_init is called when:
372 * - an active object is initialized
373 */
374static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
375{
376 struct hrtimer *timer = addr;
377
378 switch (state) {
379 case ODEBUG_STATE_ACTIVE:
380 hrtimer_cancel(timer);
381 debug_object_init(timer, &hrtimer_debug_descr);
382 return 1;
383 default:
384 return 0;
385 }
386}
387
388/*
389 * fixup_activate is called when:
390 * - an active object is activated
391 * - an unknown object is activated (might be a statically initialized object)
392 */
393static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
394{
395 switch (state) {
396
397 case ODEBUG_STATE_NOTAVAILABLE:
398 WARN_ON_ONCE(1);
399 return 0;
400
401 case ODEBUG_STATE_ACTIVE:
402 WARN_ON(1);
403
404 default:
405 return 0;
406 }
407}
408
409/*
410 * fixup_free is called when:
411 * - an active object is freed
412 */
413static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
414{
415 struct hrtimer *timer = addr;
416
417 switch (state) {
418 case ODEBUG_STATE_ACTIVE:
419 hrtimer_cancel(timer);
420 debug_object_free(timer, &hrtimer_debug_descr);
421 return 1;
422 default:
423 return 0;
424 }
425}
426
427static struct debug_obj_descr hrtimer_debug_descr = {
428 .name = "hrtimer",
429 .debug_hint = hrtimer_debug_hint,
430 .fixup_init = hrtimer_fixup_init,
431 .fixup_activate = hrtimer_fixup_activate,
432 .fixup_free = hrtimer_fixup_free,
433};
434
435static inline void debug_hrtimer_init(struct hrtimer *timer)
436{
437 debug_object_init(timer, &hrtimer_debug_descr);
438}
439
440static inline void debug_hrtimer_activate(struct hrtimer *timer)
441{
442 debug_object_activate(timer, &hrtimer_debug_descr);
443}
444
445static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
446{
447 debug_object_deactivate(timer, &hrtimer_debug_descr);
448}
449
450static inline void debug_hrtimer_free(struct hrtimer *timer)
451{
452 debug_object_free(timer, &hrtimer_debug_descr);
453}
454
455static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
456 enum hrtimer_mode mode);
457
458void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
459 enum hrtimer_mode mode)
460{
461 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
462 __hrtimer_init(timer, clock_id, mode);
463}
464EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
465
466void destroy_hrtimer_on_stack(struct hrtimer *timer)
467{
468 debug_object_free(timer, &hrtimer_debug_descr);
469}
470
471#else
472static inline void debug_hrtimer_init(struct hrtimer *timer) { }
473static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
474static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
475#endif
476
477static inline void
478debug_init(struct hrtimer *timer, clockid_t clockid,
479 enum hrtimer_mode mode)
480{
481 debug_hrtimer_init(timer);
482 trace_hrtimer_init(timer, clockid, mode);
483}
484
485static inline void debug_activate(struct hrtimer *timer)
486{
487 debug_hrtimer_activate(timer);
488 trace_hrtimer_start(timer);
489}
490
491static inline void debug_deactivate(struct hrtimer *timer)
492{
493 debug_hrtimer_deactivate(timer);
494 trace_hrtimer_cancel(timer);
495}
496
497/* High resolution timer related functions */
498#ifdef CONFIG_HIGH_RES_TIMERS
499
500/*
501 * High resolution timer enabled ?
502 */
503static int hrtimer_hres_enabled __read_mostly = 1;
504
505/*
506 * Enable / Disable high resolution mode
507 */
508static int __init setup_hrtimer_hres(char *str)
509{
510 if (!strcmp(str, "off"))
511 hrtimer_hres_enabled = 0;
512 else if (!strcmp(str, "on"))
513 hrtimer_hres_enabled = 1;
514 else
515 return 0;
516 return 1;
517}
518
519__setup("highres=", setup_hrtimer_hres);
520
521/*
 522 * hrtimer_is_hres_enabled - query whether the highres mode is enabled
523 */
524static inline int hrtimer_is_hres_enabled(void)
525{
526 return hrtimer_hres_enabled;
527}
528
529/*
530 * Is the high resolution mode active ?
531 */
532static inline int hrtimer_hres_active(void)
533{
534 return __this_cpu_read(hrtimer_bases.hres_active);
535}
536
537/*
538 * Reprogram the event source with checking both queues for the
539 * next event
540 * Called with interrupts disabled and base->lock held
541 */
542static void
543hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
544{
545 int i;
546 struct hrtimer_clock_base *base = cpu_base->clock_base;
547 ktime_t expires, expires_next;
548
549 expires_next.tv64 = KTIME_MAX;
550
551 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
552 struct hrtimer *timer;
553 struct timerqueue_node *next;
554
555 next = timerqueue_getnext(&base->active);
556 if (!next)
557 continue;
558 timer = container_of(next, struct hrtimer, node);
559
560 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
561 /*
562 * clock_was_set() has changed base->offset so the
563 * result might be negative. Fix it up to prevent a
564 * false positive in clockevents_program_event()
565 */
566 if (expires.tv64 < 0)
567 expires.tv64 = 0;
568 if (expires.tv64 < expires_next.tv64)
569 expires_next = expires;
570 }
571
572 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
573 return;
574
575 cpu_base->expires_next.tv64 = expires_next.tv64;
576
577 /*
578 * If a hang was detected in the last timer interrupt then we
579 * leave the hang delay active in the hardware. We want the
580 * system to make progress. That also prevents the following
581 * scenario:
582 * T1 expires 50ms from now
583 * T2 expires 5s from now
584 *
585 * T1 is removed, so this code is called and would reprogram
586 * the hardware to 5s from now. Any hrtimer_start after that
587 * will not reprogram the hardware due to hang_detected being
 588 * set. So we'd effectively block all timers until the T2 event
589 * fires.
590 */
591 if (cpu_base->hang_detected)
592 return;
593
594 if (cpu_base->expires_next.tv64 != KTIME_MAX)
595 tick_program_event(cpu_base->expires_next, 1);
596}
597
598/*
599 * Shared reprogramming for clock_realtime and clock_monotonic
600 *
601 * When a timer is enqueued and expires earlier than the already enqueued
602 * timers, we have to check, whether it expires earlier than the timer for
603 * which the clock event device was armed.
604 *
605 * Called with interrupts disabled and base->cpu_base.lock held
606 */
607static int hrtimer_reprogram(struct hrtimer *timer,
608 struct hrtimer_clock_base *base)
609{
610 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
611 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
612 int res;
613
614 WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
615
616 /*
617 * When the callback is running, we do not reprogram the clock event
618 * device. The timer callback is either running on a different CPU or
619 * the callback is executed in the hrtimer_interrupt context. The
620 * reprogramming is handled either by the softirq, which called the
621 * callback or at the end of the hrtimer_interrupt.
622 */
623 if (hrtimer_callback_running(timer))
624 return 0;
625
626 /*
627 * CLOCK_REALTIME timer might be requested with an absolute
628 * expiry time which is less than base->offset. Nothing wrong
 629 * about that, just avoid calling into the tick code, which
 630 * now objects to negative expiry values.
631 */
632 if (expires.tv64 < 0)
633 return -ETIME;
634
635 if (expires.tv64 >= cpu_base->expires_next.tv64)
636 return 0;
637
638 /*
639 * If a hang was detected in the last timer interrupt then we
640 * do not schedule a timer which is earlier than the expiry
641 * which we enforced in the hang detection. We want the system
642 * to make progress.
643 */
644 if (cpu_base->hang_detected)
645 return 0;
646
647 /*
648 * Clockevents returns -ETIME, when the event was in the past.
649 */
650 res = tick_program_event(expires, 0);
651 if (!IS_ERR_VALUE(res))
652 cpu_base->expires_next = expires;
653 return res;
654}
655
656/*
657 * Initialize the high resolution related parts of cpu_base
658 */
659static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
660{
661 base->expires_next.tv64 = KTIME_MAX;
662 base->hres_active = 0;
663}
664
665/*
666 * When High resolution timers are active, try to reprogram. Note, that in case
667 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
668 * check happens. The timer gets enqueued into the rbtree. The reprogramming
669 * and expiry check is done in the hrtimer_interrupt or in the softirq.
670 */
671static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
672 struct hrtimer_clock_base *base)
673{
674 return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
675}
676
677static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
678{
679 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
680 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
681 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
682
683 return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
684}
685
686/*
687 * Retrigger next event is called after clock was set
688 *
689 * Called with interrupts disabled via on_each_cpu()
690 */
691static void retrigger_next_event(void *arg)
692{
693 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
694
695 if (!hrtimer_hres_active())
696 return;
697
698 raw_spin_lock(&base->lock);
699 hrtimer_update_base(base);
700 hrtimer_force_reprogram(base, 0);
701 raw_spin_unlock(&base->lock);
702}
703
704/*
705 * Switch to high resolution mode
706 */
707static int hrtimer_switch_to_hres(void)
708{
709 int i, cpu = smp_processor_id();
710 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
711 unsigned long flags;
712
713 if (base->hres_active)
714 return 1;
715
716 local_irq_save(flags);
717
718 if (tick_init_highres()) {
719 local_irq_restore(flags);
720 printk(KERN_WARNING "Could not switch to high resolution "
721 "mode on CPU %d\n", cpu);
722 return 0;
723 }
724 base->hres_active = 1;
725 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
726 base->clock_base[i].resolution = KTIME_HIGH_RES;
727
728 tick_setup_sched_timer();
729 /* "Retrigger" the interrupt to get things going */
730 retrigger_next_event(NULL);
731 local_irq_restore(flags);
732 return 1;
733}
734
735static void clock_was_set_work(struct work_struct *work)
736{
737 clock_was_set();
738}
739
740static DECLARE_WORK(hrtimer_work, clock_was_set_work);
741
742/*
 743 * Called from timekeeping and resume code to reprogram the hrtimer
 744 * interrupt device on all CPUs.
745 */
746void clock_was_set_delayed(void)
747{
748 schedule_work(&hrtimer_work);
749}
750
751#else
752
753static inline int hrtimer_hres_active(void) { return 0; }
754static inline int hrtimer_is_hres_enabled(void) { return 0; }
755static inline int hrtimer_switch_to_hres(void) { return 0; }
756static inline void
757hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
758static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
759 struct hrtimer_clock_base *base)
760{
761 return 0;
762}
763static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
764static inline void retrigger_next_event(void *arg) { }
765
766#endif /* CONFIG_HIGH_RES_TIMERS */
767
768/*
769 * Clock realtime was set
770 *
771 * Change the offset of the realtime clock vs. the monotonic
772 * clock.
773 *
774 * We might have to reprogram the high resolution timer interrupt. On
775 * SMP we call the architecture specific code to retrigger _all_ high
776 * resolution timer interrupts. On UP we just disable interrupts and
777 * call the high resolution interrupt code.
778 */
779void clock_was_set(void)
780{
781#ifdef CONFIG_HIGH_RES_TIMERS
782 /* Retrigger the CPU local events everywhere */
783 on_each_cpu(retrigger_next_event, NULL, 1);
784#endif
785 timerfd_clock_was_set();
786}
787
788/*
789 * During resume we might have to reprogram the high resolution timer
790 * interrupt on all online CPUs. However, all other CPUs will be
 791 * stopped with interrupts disabled, so the clock_was_set() call
792 * must be deferred.
793 */
794void hrtimers_resume(void)
795{
796 WARN_ONCE(!irqs_disabled(),
797 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
798
799 /* Retrigger on the local CPU */
800 retrigger_next_event(NULL);
801 /* And schedule a retrigger for all others */
802 clock_was_set_delayed();
803}
804
805static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
806{
807#ifdef CONFIG_TIMER_STATS
808 if (timer->start_site)
809 return;
810 timer->start_site = __builtin_return_address(0);
811 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
812 timer->start_pid = current->pid;
813#endif
814}
815
816static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
817{
818#ifdef CONFIG_TIMER_STATS
819 timer->start_site = NULL;
820#endif
821}
822
823static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
824{
825#ifdef CONFIG_TIMER_STATS
826 if (likely(!timer_stats_active))
827 return;
828 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
829 timer->function, timer->start_comm, 0);
830#endif
831}
832
833/*
834 * Counterpart to lock_hrtimer_base above:
835 */
836static inline
837void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
838{
839 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
840}
841
842/**
843 * hrtimer_forward - forward the timer expiry
844 * @timer: hrtimer to forward
845 * @now: forward past this time
846 * @interval: the interval to forward
847 *
848 * Forward the timer expiry so it will expire in the future.
849 * Returns the number of overruns.
850 */
851u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
852{
853 u64 orun = 1;
854 ktime_t delta;
855
856 delta = ktime_sub(now, hrtimer_get_expires(timer));
857
858 if (delta.tv64 < 0)
859 return 0;
860
861 if (interval.tv64 < timer->base->resolution.tv64)
862 interval.tv64 = timer->base->resolution.tv64;
863
864 if (unlikely(delta.tv64 >= interval.tv64)) {
865 s64 incr = ktime_to_ns(interval);
866
867 orun = ktime_divns(delta, incr);
868 hrtimer_add_expires_ns(timer, incr * orun);
869 if (hrtimer_get_expires_tv64(timer) > now.tv64)
870 return orun;
871 /*
872 * This (and the ktime_add() below) is the
873 * correction for exact:
874 */
875 orun++;
876 }
877 hrtimer_add_expires(timer, interval);
878
879 return orun;
880}
881EXPORT_SYMBOL_GPL(hrtimer_forward);
882
883/*
884 * enqueue_hrtimer - internal function to (re)start a timer
885 *
886 * The timer is inserted in expiry order. Insertion into the
887 * red black tree is O(log(n)). Must hold the base lock.
888 *
889 * Returns 1 when the new timer is the leftmost timer in the tree.
890 */
891static int enqueue_hrtimer(struct hrtimer *timer,
892 struct hrtimer_clock_base *base)
893{
894 debug_activate(timer);
895
896 timerqueue_add(&base->active, &timer->node);
897 base->cpu_base->active_bases |= 1 << base->index;
898
899 /*
900 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
901 * state of a possibly running callback.
902 */
903 timer->state |= HRTIMER_STATE_ENQUEUED;
904
905 return (&timer->node == base->active.next);
906}
907
908/*
909 * __remove_hrtimer - internal function to remove a timer
910 *
911 * Caller must hold the base lock.
912 *
913 * High resolution timer mode reprograms the clock event device when the
914 * timer is the one which expires next. The caller can disable this by setting
915 * reprogram to zero. This is useful, when the context does a reprogramming
916 * anyway (e.g. timer interrupt)
917 */
918static void __remove_hrtimer(struct hrtimer *timer,
919 struct hrtimer_clock_base *base,
920 unsigned long newstate, int reprogram)
921{
922 struct timerqueue_node *next_timer;
923 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
924 goto out;
925
926 next_timer = timerqueue_getnext(&base->active);
927 timerqueue_del(&base->active, &timer->node);
928 if (&timer->node == next_timer) {
929#ifdef CONFIG_HIGH_RES_TIMERS
 930 /* Reprogram the clock event device, if enabled */
931 if (reprogram && hrtimer_hres_active()) {
932 ktime_t expires;
933
934 expires = ktime_sub(hrtimer_get_expires(timer),
935 base->offset);
936 if (base->cpu_base->expires_next.tv64 == expires.tv64)
937 hrtimer_force_reprogram(base->cpu_base, 1);
938 }
939#endif
940 }
941 if (!timerqueue_getnext(&base->active))
942 base->cpu_base->active_bases &= ~(1 << base->index);
943out:
944 timer->state = newstate;
945}
946
947/*
948 * remove hrtimer, called with base lock held
949 */
950static inline int
951remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
952{
953 if (hrtimer_is_queued(timer)) {
954 unsigned long state;
955 int reprogram;
956
957 /*
958 * Remove the timer and force reprogramming when high
959 * resolution mode is active and the timer is on the current
960 * CPU. If we remove a timer on another CPU, reprogramming is
961 * skipped. The interrupt event on this CPU is fired and
962 * reprogramming happens in the interrupt handler. This is a
963 * rare case and less expensive than a smp call.
964 */
965 debug_deactivate(timer);
966 timer_stats_hrtimer_clear_start_info(timer);
967 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
968 /*
969 * We must preserve the CALLBACK state flag here,
970 * otherwise we could move the timer base in
971 * switch_hrtimer_base.
972 */
973 state = timer->state & HRTIMER_STATE_CALLBACK;
974 __remove_hrtimer(timer, base, state, reprogram);
975 return 1;
976 }
977 return 0;
978}
979
980int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
981 unsigned long delta_ns, const enum hrtimer_mode mode,
982 int wakeup)
983{
984 struct hrtimer_clock_base *base, *new_base;
985 unsigned long flags;
986 int ret, leftmost;
987
988 base = lock_hrtimer_base(timer, &flags);
989
990 /* Remove an active timer from the queue: */
991 ret = remove_hrtimer(timer, base);
992
993 if (mode & HRTIMER_MODE_REL) {
994 tim = ktime_add_safe(tim, base->get_time());
995 /*
996 * CONFIG_TIME_LOW_RES is a temporary way for architectures
997 * to signal that they simply return xtime in
998 * do_gettimeoffset(). In this case we want to round up by
999 * resolution when starting a relative timer, to avoid short
1000 * timeouts. This will go away with the GTOD framework.
1001 */
1002#ifdef CONFIG_TIME_LOW_RES
1003 tim = ktime_add_safe(tim, base->resolution);
1004#endif
1005 }
1006
1007 hrtimer_set_expires_range_ns(timer, tim, delta_ns);
1008
1009 /* Switch the timer base, if necessary: */
1010 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
1011
1012 timer_stats_hrtimer_set_start_info(timer);
1013
1014 leftmost = enqueue_hrtimer(timer, new_base);
1015
1016 /*
1017 * Only allow reprogramming if the new base is on this CPU.
1018 * (it might still be on another CPU if the timer was pending)
1019 *
1020 * XXX send_remote_softirq() ?
1021 */
1022 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
1023 && hrtimer_enqueue_reprogram(timer, new_base)) {
1024 if (wakeup) {
1025 /*
1026 * We need to drop cpu_base->lock to avoid a
1027 * lock ordering issue vs. rq->lock.
1028 */
1029 raw_spin_unlock(&new_base->cpu_base->lock);
1030 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
1031 local_irq_restore(flags);
1032 return ret;
1033 } else {
1034 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
1035 }
1036 }
1037
1038 unlock_hrtimer_base(timer, &flags);
1039
1040 return ret;
1041}
1042EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
1043
1044/**
1045 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
1046 * @timer: the timer to be added
1047 * @tim: expiry time
1048 * @delta_ns: "slack" range for the timer
1049 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1050 * relative (HRTIMER_MODE_REL)
1051 *
1052 * Returns:
1053 * 0 on success
1054 * 1 when the timer was active
1055 */
1056int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1057 unsigned long delta_ns, const enum hrtimer_mode mode)
1058{
1059 return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
1060}
1061EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1062
1063/**
1064 * hrtimer_start - (re)start an hrtimer on the current CPU
1065 * @timer: the timer to be added
1066 * @tim: expiry time
1067 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1068 * relative (HRTIMER_MODE_REL)
1069 *
1070 * Returns:
1071 * 0 on success
1072 * 1 when the timer was active
1073 */
1074int
1075hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
1076{
1077 return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
1078}
1079EXPORT_SYMBOL_GPL(hrtimer_start);
1080
1081
1082/**
1083 * hrtimer_try_to_cancel - try to deactivate a timer
1084 * @timer: hrtimer to stop
1085 *
1086 * Returns:
1087 * 0 when the timer was not active
1088 * 1 when the timer was active
 1089 * -1 when the timer is currently executing the callback function and
1090 * cannot be stopped
1091 */
1092int hrtimer_try_to_cancel(struct hrtimer *timer)
1093{
1094 struct hrtimer_clock_base *base;
1095 unsigned long flags;
1096 int ret = -1;
1097
1098 base = lock_hrtimer_base(timer, &flags);
1099
1100 if (!hrtimer_callback_running(timer))
1101 ret = remove_hrtimer(timer, base);
1102
1103 unlock_hrtimer_base(timer, &flags);
1104
1105 return ret;
1106
1107}
1108EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
1109
1110/**
1111 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
1112 * @timer: the timer to be cancelled
1113 *
1114 * Returns:
1115 * 0 when the timer was not active
1116 * 1 when the timer was active
1117 */
1118int hrtimer_cancel(struct hrtimer *timer)
1119{
1120 for (;;) {
1121 int ret = hrtimer_try_to_cancel(timer);
1122
1123 if (ret >= 0)
1124 return ret;
1125 cpu_relax();
1126 }
1127}
1128EXPORT_SYMBOL_GPL(hrtimer_cancel);
1129
1130/**
1131 * hrtimer_get_remaining - get remaining time for the timer
1132 * @timer: the timer to read
1133 */
1134ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1135{
1136 unsigned long flags;
1137 ktime_t rem;
1138
1139 lock_hrtimer_base(timer, &flags);
1140 rem = hrtimer_expires_remaining(timer);
1141 unlock_hrtimer_base(timer, &flags);
1142
1143 return rem;
1144}
1145EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1146
1147#ifdef CONFIG_NO_HZ_COMMON
1148/**
1149 * hrtimer_get_next_event - get the time until next expiry event
1150 *
1151 * Returns the delta to the next expiry event or KTIME_MAX if no timer
1152 * is pending.
1153 */
1154ktime_t hrtimer_get_next_event(void)
1155{
1156 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1157 struct hrtimer_clock_base *base = cpu_base->clock_base;
1158 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
1159 unsigned long flags;
1160 int i;
1161
1162 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1163
1164 if (!hrtimer_hres_active()) {
1165 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1166 struct hrtimer *timer;
1167 struct timerqueue_node *next;
1168
1169 next = timerqueue_getnext(&base->active);
1170 if (!next)
1171 continue;
1172
1173 timer = container_of(next, struct hrtimer, node);
1174 delta.tv64 = hrtimer_get_expires_tv64(timer);
1175 delta = ktime_sub(delta, base->get_time());
1176 if (delta.tv64 < mindelta.tv64)
1177 mindelta.tv64 = delta.tv64;
1178 }
1179 }
1180
1181 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1182
1183 if (mindelta.tv64 < 0)
1184 mindelta.tv64 = 0;
1185 return mindelta;
1186}
1187#endif
1188
1189static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1190 enum hrtimer_mode mode)
1191{
1192 struct hrtimer_cpu_base *cpu_base;
1193 int base;
1194
1195 memset(timer, 0, sizeof(struct hrtimer));
1196
1197 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1198
1199 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1200 clock_id = CLOCK_MONOTONIC;
1201
1202 base = hrtimer_clockid_to_base(clock_id);
1203 timer->base = &cpu_base->clock_base[base];
1204 timerqueue_init(&timer->node);
1205
1206#ifdef CONFIG_TIMER_STATS
1207 timer->start_site = NULL;
1208 timer->start_pid = -1;
1209 memset(timer->start_comm, 0, TASK_COMM_LEN);
1210#endif
1211}
1212
1213/**
1214 * hrtimer_init - initialize a timer to the given clock
1215 * @timer: the timer to be initialized
1216 * @clock_id: the clock to be used
1217 * @mode: timer mode abs/rel
1218 */
1219void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1220 enum hrtimer_mode mode)
1221{
1222 debug_init(timer, clock_id, mode);
1223 __hrtimer_init(timer, clock_id, mode);
1224}
1225EXPORT_SYMBOL_GPL(hrtimer_init);
1226
1227/**
1228 * hrtimer_get_res - get the timer resolution for a clock
1229 * @which_clock: which clock to query
1230 * @tp: pointer to timespec variable to store the resolution
1231 *
1232 * Store the resolution of the clock selected by @which_clock in the
1233 * variable pointed to by @tp.
1234 */
1235int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1236{
1237 struct hrtimer_cpu_base *cpu_base;
1238 int base = hrtimer_clockid_to_base(which_clock);
1239
1240 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1241 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1242
1243 return 0;
1244}
1245EXPORT_SYMBOL_GPL(hrtimer_get_res);
1246
1247static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1248{
1249 struct hrtimer_clock_base *base = timer->base;
1250 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
1251 enum hrtimer_restart (*fn)(struct hrtimer *);
1252 int restart;
1253
1254 WARN_ON(!irqs_disabled());
1255
1256 debug_deactivate(timer);
1257 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1258 timer_stats_account_hrtimer(timer);
1259 fn = timer->function;
1260
1261 /*
1262 * Because we run timers from hardirq context, there is no chance
 1263 * they get migrated to another CPU, therefore it's safe to unlock
1264 * the timer base.
1265 */
1266 raw_spin_unlock(&cpu_base->lock);
1267 trace_hrtimer_expire_entry(timer, now);
1268 restart = fn(timer);
1269 trace_hrtimer_expire_exit(timer);
1270 raw_spin_lock(&cpu_base->lock);
1271
1272 /*
1273 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
 1274 * we do not reprogram the event hardware. Happens either in
1275 * hrtimer_start_range_ns() or in hrtimer_interrupt()
1276 */
1277 if (restart != HRTIMER_NORESTART) {
1278 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1279 enqueue_hrtimer(timer, base);
1280 }
1281
1282 WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
1283
1284 timer->state &= ~HRTIMER_STATE_CALLBACK;
1285}
1286
1287#ifdef CONFIG_HIGH_RES_TIMERS
1288
1289/*
1290 * High resolution timer interrupt
1291 * Called with interrupts disabled
1292 */
1293void hrtimer_interrupt(struct clock_event_device *dev)
1294{
1295 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1296 ktime_t expires_next, now, entry_time, delta;
1297 int i, retries = 0;
1298
1299 BUG_ON(!cpu_base->hres_active);
1300 cpu_base->nr_events++;
1301 dev->next_event.tv64 = KTIME_MAX;
1302
1303 raw_spin_lock(&cpu_base->lock);
1304 entry_time = now = hrtimer_update_base(cpu_base);
1305retry:
1306 expires_next.tv64 = KTIME_MAX;
1307 /*
1308 * We set expires_next to KTIME_MAX here with cpu_base->lock
1309 * held to prevent that a timer is enqueued in our queue via
1310 * the migration code. This does not affect enqueueing of
1311 * timers which run their callback and need to be requeued on
1312 * this CPU.
1313 */
1314 cpu_base->expires_next.tv64 = KTIME_MAX;
1315
1316 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1317 struct hrtimer_clock_base *base;
1318 struct timerqueue_node *node;
1319 ktime_t basenow;
1320
1321 if (!(cpu_base->active_bases & (1 << i)))
1322 continue;
1323
1324 base = cpu_base->clock_base + i;
1325 basenow = ktime_add(now, base->offset);
1326
1327 while ((node = timerqueue_getnext(&base->active))) {
1328 struct hrtimer *timer;
1329
1330 timer = container_of(node, struct hrtimer, node);
1331
1332 /*
1333 * The immediate goal for using the softexpires is
1334 * minimizing wakeups, not running timers at the
1335 * earliest interrupt after their soft expiration.
1336 * This allows us to avoid using a Priority Search
 1337 * Tree, which can answer a stabbing query for
1338 * overlapping intervals and instead use the simple
1339 * BST we already have.
1340 * We don't add extra wakeups by delaying timers that
1341 * are right-of a not yet expired timer, because that
1342 * timer will have to trigger a wakeup anyway.
1343 */
1344
1345 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
1346 ktime_t expires;
1347
1348 expires = ktime_sub(hrtimer_get_expires(timer),
1349 base->offset);
1350 if (expires.tv64 < 0)
1351 expires.tv64 = KTIME_MAX;
1352 if (expires.tv64 < expires_next.tv64)
1353 expires_next = expires;
1354 break;
1355 }
1356
1357 __run_hrtimer(timer, &basenow);
1358 }
1359 }
1360
1361 /*
1362 * Store the new expiry value so the migration code can verify
1363 * against it.
1364 */
1365 cpu_base->expires_next = expires_next;
1366 raw_spin_unlock(&cpu_base->lock);
1367
1368 /* Reprogramming necessary ? */
1369 if (expires_next.tv64 == KTIME_MAX ||
1370 !tick_program_event(expires_next, 0)) {
1371 cpu_base->hang_detected = 0;
1372 return;
1373 }
1374
1375 /*
1376 * The next timer was already expired due to:
1377 * - tracing
1378 * - long lasting callbacks
1379 * - being scheduled away when running in a VM
1380 *
1381 * We need to prevent that we loop forever in the hrtimer
1382 * interrupt routine. We give it 3 attempts to avoid
1383 * overreacting on some spurious event.
1384 *
1385 * Acquire base lock for updating the offsets and retrieving
1386 * the current time.
1387 */
1388 raw_spin_lock(&cpu_base->lock);
1389 now = hrtimer_update_base(cpu_base);
1390 cpu_base->nr_retries++;
1391 if (++retries < 3)
1392 goto retry;
1393 /*
 1394 * Give the system a chance to do something other than looping
1395 * here. We stored the entry time, so we know exactly how long
1396 * we spent here. We schedule the next event this amount of
1397 * time away.
1398 */
1399 cpu_base->nr_hangs++;
1400 cpu_base->hang_detected = 1;
1401 raw_spin_unlock(&cpu_base->lock);
1402 delta = ktime_sub(now, entry_time);
1403 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1404 cpu_base->max_hang_time = delta;
1405 /*
1406 * Limit it to a sensible value as we enforce a longer
1407 * delay. Give the CPU at least 100ms to catch up.
1408 */
1409 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1410 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1411 else
1412 expires_next = ktime_add(now, delta);
1413 tick_program_event(expires_next, 1);
1414 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1415 ktime_to_ns(delta));
1416}
1417
1418/*
1419 * local version of hrtimer_peek_ahead_timers() called with interrupts
1420 * disabled.
1421 */
1422static void __hrtimer_peek_ahead_timers(void)
1423{
1424 struct tick_device *td;
1425
1426 if (!hrtimer_hres_active())
1427 return;
1428
1429 td = &__get_cpu_var(tick_cpu_device);
1430 if (td && td->evtdev)
1431 hrtimer_interrupt(td->evtdev);
1432}
1433
1434/**
1435 * hrtimer_peek_ahead_timers -- run soft-expired timers now
1436 *
1437 * hrtimer_peek_ahead_timers will peek at the timer queue of
1438 * the current cpu and check if there are any timers for which
1439 * the soft expires time has passed. If any such timers exist,
1440 * they are run immediately and then removed from the timer queue.
1441 *
1442 */
1443void hrtimer_peek_ahead_timers(void)
1444{
1445 unsigned long flags;
1446
1447 local_irq_save(flags);
1448 __hrtimer_peek_ahead_timers();
1449 local_irq_restore(flags);
1450}
1451
1452static void run_hrtimer_softirq(struct softirq_action *h)
1453{
1454 hrtimer_peek_ahead_timers();
1455}
1456
1457#else /* CONFIG_HIGH_RES_TIMERS */
1458
1459static inline void __hrtimer_peek_ahead_timers(void) { }
1460
1461#endif /* !CONFIG_HIGH_RES_TIMERS */
1462
1463/*
1464 * Called from timer softirq every jiffy, expire hrtimers:
1465 *
 1466 * For HRT it's the fallback code to run the softirq in the timer
1467 * softirq context in case the hrtimer initialization failed or has
1468 * not been done yet.
1469 */
1470void hrtimer_run_pending(void)
1471{
1472 if (hrtimer_hres_active())
1473 return;
1474
1475 /*
1476 * This _is_ ugly: We have to check in the softirq context,
1477 * whether we can switch to highres and / or nohz mode. The
1478 * clocksource switch happens in the timer interrupt with
1479 * xtime_lock held. Notification from there only sets the
1480 * check bit in the tick_oneshot code, otherwise we might
1481 * deadlock vs. xtime_lock.
1482 */
1483 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1484 hrtimer_switch_to_hres();
1485}
1486
1487/*
1488 * Called from hardirq context every jiffy
1489 */
1490void hrtimer_run_queues(void)
1491{
1492 struct timerqueue_node *node;
1493 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1494 struct hrtimer_clock_base *base;
1495 int index, gettime = 1;
1496
1497 if (hrtimer_hres_active())
1498 return;
1499
1500 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1501 base = &cpu_base->clock_base[index];
1502 if (!timerqueue_getnext(&base->active))
1503 continue;
1504
1505 if (gettime) {
1506 hrtimer_get_softirq_time(cpu_base);
1507 gettime = 0;
1508 }
1509
1510 raw_spin_lock(&cpu_base->lock);
1511
1512 while ((node = timerqueue_getnext(&base->active))) {
1513 struct hrtimer *timer;
1514
1515 timer = container_of(node, struct hrtimer, node);
1516 if (base->softirq_time.tv64 <=
1517 hrtimer_get_expires_tv64(timer))
1518 break;
1519
1520 __run_hrtimer(timer, &base->softirq_time);
1521 }
1522 raw_spin_unlock(&cpu_base->lock);
1523 }
1524}
1525
1526/*
1527 * Sleep related functions:
1528 */
1529static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
1530{
1531 struct hrtimer_sleeper *t =
1532 container_of(timer, struct hrtimer_sleeper, timer);
1533 struct task_struct *task = t->task;
1534
1535 t->task = NULL;
1536 if (task)
1537 wake_up_process(task);
1538
1539 return HRTIMER_NORESTART;
1540}
1541
1542void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1543{
1544 sl->timer.function = hrtimer_wakeup;
1545 sl->task = task;
1546}
1547EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1548
1549static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
1550{
1551 hrtimer_init_sleeper(t, current);
1552
1553 do {
1554 set_current_state(TASK_INTERRUPTIBLE);
1555 hrtimer_start_expires(&t->timer, mode);
1556 if (!hrtimer_active(&t->timer))
1557 t->task = NULL;
1558
1559 if (likely(t->task))
1560 freezable_schedule();
1561
1562 hrtimer_cancel(&t->timer);
1563 mode = HRTIMER_MODE_ABS;
1564
1565 } while (t->task && !signal_pending(current));
1566
1567 __set_current_state(TASK_RUNNING);
1568
1569 return t->task == NULL;
1570}
1571
1572static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
1573{
1574 struct timespec rmt;
1575 ktime_t rem;
1576
1577 rem = hrtimer_expires_remaining(timer);
1578 if (rem.tv64 <= 0)
1579 return 0;
1580 rmt = ktime_to_timespec(rem);
1581
1582 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
1583 return -EFAULT;
1584
1585 return 1;
1586}
1587
1588long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1589{
1590 struct hrtimer_sleeper t;
1591 struct timespec __user *rmtp;
1592 int ret = 0;
1593
1594 hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
1595 HRTIMER_MODE_ABS);
1596 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1597
1598 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1599 goto out;
1600
1601 rmtp = restart->nanosleep.rmtp;
1602 if (rmtp) {
1603 ret = update_rmtp(&t.timer, rmtp);
1604 if (ret <= 0)
1605 goto out;
1606 }
1607
1608 /* The other values in restart are already filled in */
1609 ret = -ERESTART_RESTARTBLOCK;
1610out:
1611 destroy_hrtimer_on_stack(&t.timer);
1612 return ret;
1613}
1614
1615long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1616 const enum hrtimer_mode mode, const clockid_t clockid)
1617{
1618 struct restart_block *restart;
1619 struct hrtimer_sleeper t;
1620 int ret = 0;
1621 unsigned long slack;
1622
1623 slack = current->timer_slack_ns;
1624 if (dl_task(current) || rt_task(current))
1625 slack = 0;
1626
1627 hrtimer_init_on_stack(&t.timer, clockid, mode);
1628 hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
1629 if (do_nanosleep(&t, mode))
1630 goto out;
1631
1632 /* Absolute timers do not update the rmtp value and restart: */
1633 if (mode == HRTIMER_MODE_ABS) {
1634 ret = -ERESTARTNOHAND;
1635 goto out;
1636 }
1637
1638 if (rmtp) {
1639 ret = update_rmtp(&t.timer, rmtp);
1640 if (ret <= 0)
1641 goto out;
1642 }
1643
1644 restart = &current_thread_info()->restart_block;
1645 restart->fn = hrtimer_nanosleep_restart;
1646 restart->nanosleep.clockid = t.timer.base->clockid;
1647 restart->nanosleep.rmtp = rmtp;
1648 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1649
1650 ret = -ERESTART_RESTARTBLOCK;
1651out:
1652 destroy_hrtimer_on_stack(&t.timer);
1653 return ret;
1654}
1655
1656SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1657 struct timespec __user *, rmtp)
1658{
1659 struct timespec tu;
1660
1661 if (copy_from_user(&tu, rqtp, sizeof(tu)))
1662 return -EFAULT;
1663
1664 if (!timespec_valid(&tu))
1665 return -EINVAL;
1666
1667 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1668}
1669
1670/*
1671 * Functions related to boot-time initialization:
1672 */
1673static void init_hrtimers_cpu(int cpu)
1674{
1675 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1676 int i;
1677
1678 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1679 cpu_base->clock_base[i].cpu_base = cpu_base;
1680 timerqueue_init_head(&cpu_base->clock_base[i].active);
1681 }
1682
1683 hrtimer_init_hres(cpu_base);
1684}
1685
1686#ifdef CONFIG_HOTPLUG_CPU
1687
1688static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1689 struct hrtimer_clock_base *new_base)
1690{
1691 struct hrtimer *timer;
1692 struct timerqueue_node *node;
1693
1694 while ((node = timerqueue_getnext(&old_base->active))) {
1695 timer = container_of(node, struct hrtimer, node);
1696 BUG_ON(hrtimer_callback_running(timer));
1697 debug_deactivate(timer);
1698
1699 /*
1700 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1701 * timer could be seen as !active and just vanish away
1702 * under us on another CPU
1703 */
1704 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1705 timer->base = new_base;
1706 /*
1707 * Enqueue the timers on the new cpu. This does not
1708 * reprogram the event device in case the timer
1709 * expires before the earliest on this CPU, but we run
1710 * hrtimer_interrupt after we migrated everything to
1711 * sort out already expired timers and reprogram the
1712 * event device.
1713 */
1714 enqueue_hrtimer(timer, new_base);
1715
1716 /* Clear the migration state bit */
1717 timer->state &= ~HRTIMER_STATE_MIGRATE;
1718 }
1719}
1720
1721static void migrate_hrtimers(int scpu)
1722{
1723 struct hrtimer_cpu_base *old_base, *new_base;
1724 int i;
1725
1726 BUG_ON(cpu_online(scpu));
1727 tick_cancel_sched_timer(scpu);
1728
1729 local_irq_disable();
1730 old_base = &per_cpu(hrtimer_bases, scpu);
1731 new_base = &__get_cpu_var(hrtimer_bases);
1732 /*
1733 * The caller is globally serialized and nobody else
1734 * takes two locks at once, deadlock is not possible.
1735 */
1736 raw_spin_lock(&new_base->lock);
1737 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1738
1739 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1740 migrate_hrtimer_list(&old_base->clock_base[i],
1741 &new_base->clock_base[i]);
1742 }
1743
1744 raw_spin_unlock(&old_base->lock);
1745 raw_spin_unlock(&new_base->lock);
1746
1747 /* Check, if we got expired work to do */
1748 __hrtimer_peek_ahead_timers();
1749 local_irq_enable();
1750}
1751
1752#endif /* CONFIG_HOTPLUG_CPU */
1753
1754static int hrtimer_cpu_notify(struct notifier_block *self,
1755 unsigned long action, void *hcpu)
1756{
1757 int scpu = (long)hcpu;
1758
1759 switch (action) {
1760
1761 case CPU_UP_PREPARE:
1762 case CPU_UP_PREPARE_FROZEN:
1763 init_hrtimers_cpu(scpu);
1764 break;
1765
1766#ifdef CONFIG_HOTPLUG_CPU
1767 case CPU_DYING:
1768 case CPU_DYING_FROZEN:
1769 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
1770 break;
1771 case CPU_DEAD:
1772 case CPU_DEAD_FROZEN:
1773 {
1774 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1775 migrate_hrtimers(scpu);
1776 break;
1777 }
1778#endif
1779
1780 default:
1781 break;
1782 }
1783
1784 return NOTIFY_OK;
1785}
1786
1787static struct notifier_block hrtimers_nb = {
1788 .notifier_call = hrtimer_cpu_notify,
1789};
1790
1791void __init hrtimers_init(void)
1792{
1793 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1794 (void *)(long)smp_processor_id());
1795 register_cpu_notifier(&hrtimers_nb);
1796#ifdef CONFIG_HIGH_RES_TIMERS
1797 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1798#endif
1799}
1800
1801/**
1802 * schedule_hrtimeout_range_clock - sleep until timeout
1803 * @expires: timeout value (ktime_t)
1804 * @delta: slack in expires timeout (ktime_t)
1805 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1806 * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
1807 */
1808int __sched
1809schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1810 const enum hrtimer_mode mode, int clock)
1811{
1812 struct hrtimer_sleeper t;
1813
1814 /*
1815 * Optimize when a zero timeout value is given. It does not
1816 * matter whether this is an absolute or a relative time.
1817 */
1818 if (expires && !expires->tv64) {
1819 __set_current_state(TASK_RUNNING);
1820 return 0;
1821 }
1822
1823 /*
1824 * A NULL parameter means "infinite"
1825 */
1826 if (!expires) {
1827 schedule();
1828 __set_current_state(TASK_RUNNING);
1829 return -EINTR;
1830 }
1831
1832 hrtimer_init_on_stack(&t.timer, clock, mode);
1833 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1834
1835 hrtimer_init_sleeper(&t, current);
1836
1837 hrtimer_start_expires(&t.timer, mode);
1838 if (!hrtimer_active(&t.timer))
1839 t.task = NULL;
1840
1841 if (likely(t.task))
1842 schedule();
1843
1844 hrtimer_cancel(&t.timer);
1845 destroy_hrtimer_on_stack(&t.timer);
1846
1847 __set_current_state(TASK_RUNNING);
1848
1849 return !t.task ? 0 : -EINTR;
1850}
1851
1852/**
1853 * schedule_hrtimeout_range - sleep until timeout
1854 * @expires: timeout value (ktime_t)
1855 * @delta: slack in expires timeout (ktime_t)
1856 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1857 *
1858 * Make the current task sleep until the given expiry time has
1859 * elapsed. The routine will return immediately unless
1860 * the current task state has been set (see set_current_state()).
1861 *
1862 * The @delta argument gives the kernel the freedom to schedule the
1863 * actual wakeup to a time that is both power and performance friendly.
 1864 * The kernel gives the normal best effort behavior for "@expires+@delta",
 1865 * but may decide to fire the timer earlier, though no earlier than @expires.
1866 *
1867 * You can set the task state as follows -
1868 *
1869 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1870 * pass before the routine returns.
1871 *
1872 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1873 * delivered to the current task.
1874 *
1875 * The current task state is guaranteed to be TASK_RUNNING when this
1876 * routine returns.
1877 *
1878 * Returns 0 when the timer has expired otherwise -EINTR
1879 */
1880int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1881 const enum hrtimer_mode mode)
1882{
1883 return schedule_hrtimeout_range_clock(expires, delta, mode,
1884 CLOCK_MONOTONIC);
1885}
1886EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1887
1888/**
1889 * schedule_hrtimeout - sleep until timeout
1890 * @expires: timeout value (ktime_t)
1891 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1892 *
1893 * Make the current task sleep until the given expiry time has
1894 * elapsed. The routine will return immediately unless
1895 * the current task state has been set (see set_current_state()).
1896 *
1897 * You can set the task state as follows -
1898 *
1899 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1900 * pass before the routine returns.
1901 *
1902 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1903 * delivered to the current task.
1904 *
1905 * The current task state is guaranteed to be TASK_RUNNING when this
1906 * routine returns.
1907 *
1908 * Returns 0 when the timer has expired, otherwise -EINTR
1909 */
1910int __sched schedule_hrtimeout(ktime_t *expires,
1911 const enum hrtimer_mode mode)
1912{
1913 return schedule_hrtimeout_range(expires, 0, mode);
1914}
1915EXPORT_SYMBOL_GPL(schedule_hrtimeout);
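A complementary editorial sketch: the same idea with an absolute CLOCK_MONOTONIC expiry, the mode callers use when the wakeup time must not drift. The name and the 5 ms value are illustrative.

static int example_abs_sleep(void)
{
	/* Wake at "now + 5 ms" on CLOCK_MONOTONIC, independent of how long
	 * we spend before actually blocking. */
	ktime_t wake = ktime_add_ns(ktime_get(), 5 * NSEC_PER_MSEC);

	set_current_state(TASK_UNINTERRUPTIBLE);
	return schedule_hrtimeout(&wake, HRTIMER_MODE_ABS);
}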
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
new file mode 100644
index 000000000000..8d262b467573
--- /dev/null
+++ b/kernel/time/itimer.c
@@ -0,0 +1,301 @@
1/*
2 * linux/kernel/itimer.c
3 *
4 * Copyright (C) 1992 Darren Senn
5 */
6
7/* These are all the functions necessary to implement itimers */
8
9#include <linux/mm.h>
10#include <linux/interrupt.h>
11#include <linux/syscalls.h>
12#include <linux/time.h>
13#include <linux/posix-timers.h>
14#include <linux/hrtimer.h>
15#include <trace/events/timer.h>
16
17#include <asm/uaccess.h>
18
19/**
20 * itimer_get_remtime - get remaining time for the timer
21 *
22 * @timer: the timer to read
23 *
24 * Returns the delta between the expiry time and now, which can be
25 * less than zero or 1usec for a pending expired timer
26 */
27static struct timeval itimer_get_remtime(struct hrtimer *timer)
28{
29 ktime_t rem = hrtimer_get_remaining(timer);
30
31 /*
32 * Racy but safe: if the itimer expires after the above
33 * hrtimer_get_remaining() call but before this condition
34 * then we return 0 - which is correct.
35 */
36 if (hrtimer_active(timer)) {
37 if (rem.tv64 <= 0)
38 rem.tv64 = NSEC_PER_USEC;
39 } else
40 rem.tv64 = 0;
41
42 return ktime_to_timeval(rem);
43}
44
45static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
46 struct itimerval *const value)
47{
48 cputime_t cval, cinterval;
49 struct cpu_itimer *it = &tsk->signal->it[clock_id];
50
51 spin_lock_irq(&tsk->sighand->siglock);
52
53 cval = it->expires;
54 cinterval = it->incr;
55 if (cval) {
56 struct task_cputime cputime;
57 cputime_t t;
58
59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime.utime + cputime.stime;
62 else
63 /* CPUCLOCK_VIRT */
64 t = cputime.utime;
65
66 if (cval < t)
67 /* about to fire */
68 cval = cputime_one_jiffy;
69 else
70 cval = cval - t;
71 }
72
73 spin_unlock_irq(&tsk->sighand->siglock);
74
75 cputime_to_timeval(cval, &value->it_value);
76 cputime_to_timeval(cinterval, &value->it_interval);
77}
78
79int do_getitimer(int which, struct itimerval *value)
80{
81 struct task_struct *tsk = current;
82
83 switch (which) {
84 case ITIMER_REAL:
85 spin_lock_irq(&tsk->sighand->siglock);
86 value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
87 value->it_interval =
88 ktime_to_timeval(tsk->signal->it_real_incr);
89 spin_unlock_irq(&tsk->sighand->siglock);
90 break;
91 case ITIMER_VIRTUAL:
92 get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
93 break;
94 case ITIMER_PROF:
95 get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
96 break;
97 default:
98 return -EINVAL;
99 }
100 return 0;
101}
102
103SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
104{
105 int error = -EFAULT;
106 struct itimerval get_buffer;
107
108 if (value) {
109 error = do_getitimer(which, &get_buffer);
110 if (!error &&
111 copy_to_user(value, &get_buffer, sizeof(get_buffer)))
112 error = -EFAULT;
113 }
114 return error;
115}
116
117
118/*
119 * The timer is automagically restarted when interval != 0
120 */
121enum hrtimer_restart it_real_fn(struct hrtimer *timer)
122{
123 struct signal_struct *sig =
124 container_of(timer, struct signal_struct, real_timer);
125
126 trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
127 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
128
129 return HRTIMER_NORESTART;
130}
131
132static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
133{
134 struct timespec ts;
135 s64 cpu_ns;
136
137 cputime_to_timespec(ct, &ts);
138 cpu_ns = timespec_to_ns(&ts);
139
140 return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
141}
142
143static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
144 const struct itimerval *const value,
145 struct itimerval *const ovalue)
146{
147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval;
149 u32 error, incr_error;
150 struct cpu_itimer *it = &tsk->signal->it[clock_id];
151
152 nval = timeval_to_cputime(&value->it_value);
153 ns_nval = timeval_to_ns(&value->it_value);
154 ninterval = timeval_to_cputime(&value->it_interval);
155 ns_ninterval = timeval_to_ns(&value->it_interval);
156
157 error = cputime_sub_ns(nval, ns_nval);
158 incr_error = cputime_sub_ns(ninterval, ns_ninterval);
159
160 spin_lock_irq(&tsk->sighand->siglock);
161
162 cval = it->expires;
163 cinterval = it->incr;
164 if (cval || nval) {
165 if (nval > 0)
166 nval += cputime_one_jiffy;
167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
168 }
169 it->expires = nval;
170 it->incr = ninterval;
171 it->error = error;
172 it->incr_error = incr_error;
173 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
174 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
175
176 spin_unlock_irq(&tsk->sighand->siglock);
177
178 if (ovalue) {
179 cputime_to_timeval(cval, &ovalue->it_value);
180 cputime_to_timeval(cinterval, &ovalue->it_interval);
181 }
182}
183
184/*
185 * Returns true if the timeval is in canonical form
186 */
187#define timeval_valid(t) \
188 (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
189
190int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
191{
192 struct task_struct *tsk = current;
193 struct hrtimer *timer;
194 ktime_t expires;
195
196 /*
197 * Validate the timevals in value.
198 */
199 if (!timeval_valid(&value->it_value) ||
200 !timeval_valid(&value->it_interval))
201 return -EINVAL;
202
203 switch (which) {
204 case ITIMER_REAL:
205again:
206 spin_lock_irq(&tsk->sighand->siglock);
207 timer = &tsk->signal->real_timer;
208 if (ovalue) {
209 ovalue->it_value = itimer_get_remtime(timer);
210 ovalue->it_interval
211 = ktime_to_timeval(tsk->signal->it_real_incr);
212 }
213 /* We are sharing ->siglock with it_real_fn() */
214 if (hrtimer_try_to_cancel(timer) < 0) {
215 spin_unlock_irq(&tsk->sighand->siglock);
216 goto again;
217 }
218 expires = timeval_to_ktime(value->it_value);
219 if (expires.tv64 != 0) {
220 tsk->signal->it_real_incr =
221 timeval_to_ktime(value->it_interval);
222 hrtimer_start(timer, expires, HRTIMER_MODE_REL);
223 } else
224 tsk->signal->it_real_incr.tv64 = 0;
225
226 trace_itimer_state(ITIMER_REAL, value, 0);
227 spin_unlock_irq(&tsk->sighand->siglock);
228 break;
229 case ITIMER_VIRTUAL:
230 set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
231 break;
232 case ITIMER_PROF:
233 set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
234 break;
235 default:
236 return -EINVAL;
237 }
238 return 0;
239}
240
241/**
242 * alarm_setitimer - set alarm in seconds
243 *
244 * @seconds: number of seconds until alarm
245 * 0 disables the alarm
246 *
247 * Returns the remaining time in seconds of a pending timer or 0 when
248 * the timer is not active.
249 *
250 * On 32 bit machines the seconds value is limited to INT_MAX to avoid
251 * negative timeval settings which would cause immediate expiry.
252 */
253unsigned int alarm_setitimer(unsigned int seconds)
254{
255 struct itimerval it_new, it_old;
256
257#if BITS_PER_LONG < 64
258 if (seconds > INT_MAX)
259 seconds = INT_MAX;
260#endif
261 it_new.it_value.tv_sec = seconds;
262 it_new.it_value.tv_usec = 0;
263 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
264
265 do_setitimer(ITIMER_REAL, &it_new, &it_old);
266
267 /*
268 * We can't return 0 if we have an alarm pending ... and it's
269 * better to return too much than too little anyway.
270 */
271 if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
272 it_old.it_value.tv_usec >= 500000)
273 it_old.it_value.tv_sec++;
274
275 return it_old.it_value.tv_sec;
276}
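To illustrate the rounding above (editorial sketch, not part of the patch): from userspace, a cancelled alarm reports its remaining time rounded up to whole seconds, so it never claims less time than is actually left.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned int left;

	alarm(10);		/* arm a 10 second alarm */
	sleep(1);		/* roughly 9 seconds remain */
	left = alarm(0);	/* cancel it and read the remainder */
	printf("about %u seconds were left\n", left);	/* typically 9 */
	return 0;
}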
277
278SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
279 struct itimerval __user *, ovalue)
280{
281 struct itimerval set_buffer, get_buffer;
282 int error;
283
284 if (value) {
285 if (copy_from_user(&set_buffer, value, sizeof(set_buffer)))
286 return -EFAULT;
287 } else {
288 memset(&set_buffer, 0, sizeof(set_buffer));
289 printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
290 " Misfeature support will be removed\n",
291 current->comm);
292 }
293
294 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
295 if (error || !ovalue)
296 return error;
297
298 if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
299 return -EFAULT;
300 return 0;
301}
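A hedged userspace sketch of the syscall above (not part of the patch): arm ITIMER_REAL with both an initial value and an interval so SIGALRM arrives periodically. The 250 ms period is illustrative.

#include <signal.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

static void on_alarm(int sig)
{
	(void)sig;	/* nothing to do; pause() returns once the handler ran */
}

int main(void)
{
	struct itimerval tv = {
		.it_interval = { .tv_sec = 0, .tv_usec = 250000 },
		.it_value    = { .tv_sec = 0, .tv_usec = 250000 },
	};
	int i;

	signal(SIGALRM, on_alarm);
	setitimer(ITIMER_REAL, &tv, NULL);
	for (i = 0; i < 4; i++) {
		pause();		/* wait for the next SIGALRM */
		printf("tick %d\n", i);
	}
	return 0;
}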
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
new file mode 100644
index 000000000000..3b8946416a5f
--- /dev/null
+++ b/kernel/time/posix-cpu-timers.c
@@ -0,0 +1,1490 @@
1/*
2 * Implement CPU time clocks for the POSIX clock interface.
3 */
4
5#include <linux/sched.h>
6#include <linux/posix-timers.h>
7#include <linux/errno.h>
8#include <linux/math64.h>
9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h>
12#include <linux/random.h>
13#include <linux/tick.h>
14#include <linux/workqueue.h>
15
16/*
17 * Called after updating RLIMIT_CPU to run cpu timer and update
18 * tsk->signal->cputime_expires expiration cache if necessary. Needs
19 * siglock protection since other code may update expiration cache as
20 * well.
21 */
22void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
23{
24 cputime_t cputime = secs_to_cputime(rlim_new);
25
26 spin_lock_irq(&task->sighand->siglock);
27 set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
28 spin_unlock_irq(&task->sighand->siglock);
29}
30
31static int check_clock(const clockid_t which_clock)
32{
33 int error = 0;
34 struct task_struct *p;
35 const pid_t pid = CPUCLOCK_PID(which_clock);
36
37 if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
38 return -EINVAL;
39
40 if (pid == 0)
41 return 0;
42
43 rcu_read_lock();
44 p = find_task_by_vpid(pid);
45 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
46 same_thread_group(p, current) : has_group_leader_pid(p))) {
47 error = -EINVAL;
48 }
49 rcu_read_unlock();
50
51 return error;
52}
53
54static inline unsigned long long
55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
56{
57 unsigned long long ret;
58
59 ret = 0; /* high half always zero when .cpu used */
60 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
61 ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
62 } else {
63 ret = cputime_to_expires(timespec_to_cputime(tp));
64 }
65 return ret;
66}
67
68static void sample_to_timespec(const clockid_t which_clock,
69 unsigned long long expires,
70 struct timespec *tp)
71{
72 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
73 *tp = ns_to_timespec(expires);
74 else
75 cputime_to_timespec((__force cputime_t)expires, tp);
76}
77
78/*
79 * Update expiry time from increment, and increase overrun count,
80 * given the current clock sample.
81 */
82static void bump_cpu_timer(struct k_itimer *timer,
83 unsigned long long now)
84{
85 int i;
86 unsigned long long delta, incr;
87
88 if (timer->it.cpu.incr == 0)
89 return;
90
91 if (now < timer->it.cpu.expires)
92 return;
93
94 incr = timer->it.cpu.incr;
95 delta = now + incr - timer->it.cpu.expires;
96
97 /* Don't use (incr*2 < delta), incr*2 might overflow. */
98 for (i = 0; incr < delta - incr; i++)
99 incr = incr << 1;
100
101 for (; i >= 0; incr >>= 1, i--) {
102 if (delta < incr)
103 continue;
104
105 timer->it.cpu.expires += incr;
106 timer->it_overrun += 1 << i;
107 delta -= incr;
108 }
109}
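A worked illustration of the doubling above (editorial; bump_example and its values are hypothetical): with expires = 10, incr = 3 and now = 25, delta = 18, incr is doubled to 12 (i = 2), and the descending pass adds 12 and then 6, giving expires = 28 and an overrun of 4 + 2 = 6 elapsed periods, without ever forming incr * 2 directly.

static u64 bump_example(u64 expires, u64 incr, u64 now, u64 *overrun)
{
	u64 delta = now + incr - expires;
	int i;

	/* Double incr while it stays below half of delta; this avoids the
	 * possibly-overflowing incr * 2 comparison. */
	for (i = 0; incr < delta - incr; i++)
		incr <<= 1;

	/* Walk back down, consuming delta in power-of-two multiples of the
	 * original increment and counting the periods covered. */
	for (; i >= 0; incr >>= 1, i--) {
		if (delta < incr)
			continue;
		expires += incr;
		*overrun += 1ULL << i;
		delta -= incr;
	}
	return expires;	/* bump_example(10, 3, 25, &o) == 28, o == 6 */
}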
110
111/**
112 * task_cputime_zero - Check a task_cputime struct for all zero fields.
113 *
114 * @cputime: The struct to compare.
115 *
116 * Checks @cputime to see if all fields are zero. Returns true if all fields
117 * are zero, false if any field is nonzero.
118 */
119static inline int task_cputime_zero(const struct task_cputime *cputime)
120{
121 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
122 return 1;
123 return 0;
124}
125
126static inline unsigned long long prof_ticks(struct task_struct *p)
127{
128 cputime_t utime, stime;
129
130 task_cputime(p, &utime, &stime);
131
132 return cputime_to_expires(utime + stime);
133}
134static inline unsigned long long virt_ticks(struct task_struct *p)
135{
136 cputime_t utime;
137
138 task_cputime(p, &utime, NULL);
139
140 return cputime_to_expires(utime);
141}
142
143static int
144posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
145{
146 int error = check_clock(which_clock);
147 if (!error) {
148 tp->tv_sec = 0;
149 tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
150 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
151 /*
152 * If sched_clock is using a cycle counter, we
153 * have no way to export its true resolution,
154 * but it is much finer than 1s/HZ.
155 */
156 tp->tv_nsec = 1;
157 }
158 }
159 return error;
160}
161
162static int
163posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
164{
165 /*
166 * You can never reset a CPU clock, but we check for other errors
167 * in the call before failing with EPERM.
168 */
169 int error = check_clock(which_clock);
170 if (error == 0) {
171 error = -EPERM;
172 }
173 return error;
174}
175
176
177/*
178 * Sample a per-thread clock for the given task.
179 */
180static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
181 unsigned long long *sample)
182{
183 switch (CPUCLOCK_WHICH(which_clock)) {
184 default:
185 return -EINVAL;
186 case CPUCLOCK_PROF:
187 *sample = prof_ticks(p);
188 break;
189 case CPUCLOCK_VIRT:
190 *sample = virt_ticks(p);
191 break;
192 case CPUCLOCK_SCHED:
193 *sample = task_sched_runtime(p);
194 break;
195 }
196 return 0;
197}
198
199static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
200{
201 if (b->utime > a->utime)
202 a->utime = b->utime;
203
204 if (b->stime > a->stime)
205 a->stime = b->stime;
206
207 if (b->sum_exec_runtime > a->sum_exec_runtime)
208 a->sum_exec_runtime = b->sum_exec_runtime;
209}
210
211void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
212{
213 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
214 struct task_cputime sum;
215 unsigned long flags;
216
217 if (!cputimer->running) {
218 /*
219 * The POSIX timer interface allows for absolute time expiry
220 * values through the TIMER_ABSTIME flag, therefore we have
221 * to synchronize the timer to the clock every time we start
222 * it.
223 */
224 thread_group_cputime(tsk, &sum);
225 raw_spin_lock_irqsave(&cputimer->lock, flags);
226 cputimer->running = 1;
227 update_gt_cputime(&cputimer->cputime, &sum);
228 } else
229 raw_spin_lock_irqsave(&cputimer->lock, flags);
230 *times = cputimer->cputime;
231 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
232}
233
234/*
235 * Sample a process (thread group) clock for the given group_leader task.
236 * Must be called with task sighand lock held for safe while_each_thread()
237 * traversal.
238 */
239static int cpu_clock_sample_group(const clockid_t which_clock,
240 struct task_struct *p,
241 unsigned long long *sample)
242{
243 struct task_cputime cputime;
244
245 switch (CPUCLOCK_WHICH(which_clock)) {
246 default:
247 return -EINVAL;
248 case CPUCLOCK_PROF:
249 thread_group_cputime(p, &cputime);
250 *sample = cputime_to_expires(cputime.utime + cputime.stime);
251 break;
252 case CPUCLOCK_VIRT:
253 thread_group_cputime(p, &cputime);
254 *sample = cputime_to_expires(cputime.utime);
255 break;
256 case CPUCLOCK_SCHED:
257 thread_group_cputime(p, &cputime);
258 *sample = cputime.sum_exec_runtime;
259 break;
260 }
261 return 0;
262}
263
264static int posix_cpu_clock_get_task(struct task_struct *tsk,
265 const clockid_t which_clock,
266 struct timespec *tp)
267{
268 int err = -EINVAL;
269 unsigned long long rtn;
270
271 if (CPUCLOCK_PERTHREAD(which_clock)) {
272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 }
292
293 if (!err)
294 sample_to_timespec(which_clock, rtn, tp);
295
296 return err;
297}
298
299
300static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
301{
302 const pid_t pid = CPUCLOCK_PID(which_clock);
303 int err = -EINVAL;
304
305 if (pid == 0) {
306 /*
307 * Special case constant value for our own clocks.
308 * We don't have to do any lookup to find ourselves.
309 */
310 err = posix_cpu_clock_get_task(current, which_clock, tp);
311 } else {
312 /*
313 * Find the given PID, and validate that the caller
314 * should be able to see it.
315 */
316 struct task_struct *p;
317 rcu_read_lock();
318 p = find_task_by_vpid(pid);
319 if (p)
320 err = posix_cpu_clock_get_task(p, which_clock, tp);
321 rcu_read_unlock();
322 }
323
324 return err;
325}
326
327
328/*
329 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
330 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
331 * new timer already all-zeros initialized.
332 */
333static int posix_cpu_timer_create(struct k_itimer *new_timer)
334{
335 int ret = 0;
336 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
337 struct task_struct *p;
338
339 if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
340 return -EINVAL;
341
342 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
343
344 rcu_read_lock();
345 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
346 if (pid == 0) {
347 p = current;
348 } else {
349 p = find_task_by_vpid(pid);
350 if (p && !same_thread_group(p, current))
351 p = NULL;
352 }
353 } else {
354 if (pid == 0) {
355 p = current->group_leader;
356 } else {
357 p = find_task_by_vpid(pid);
358 if (p && !has_group_leader_pid(p))
359 p = NULL;
360 }
361 }
362 new_timer->it.cpu.task = p;
363 if (p) {
364 get_task_struct(p);
365 } else {
366 ret = -EINVAL;
367 }
368 rcu_read_unlock();
369
370 return ret;
371}
372
373/*
374 * Clean up a CPU-clock timer that is about to be destroyed.
375 * This is called from timer deletion with the timer already locked.
376 * If we return TIMER_RETRY, it's necessary to release the timer's lock
377 * and try again. (This happens when the timer is in the middle of firing.)
378 */
379static int posix_cpu_timer_del(struct k_itimer *timer)
380{
381 int ret = 0;
382 unsigned long flags;
383 struct sighand_struct *sighand;
384 struct task_struct *p = timer->it.cpu.task;
385
386 WARN_ON_ONCE(p == NULL);
387
388 /*
389 * Protect against sighand release/switch in exit/exec and against
390 * concurrent reads/writes of the process/thread timer list entries.
391 */
392 sighand = lock_task_sighand(p, &flags);
393 if (unlikely(sighand == NULL)) {
394 /*
395 * We raced with the reaping of the task.
396 * The deletion should have cleared us off the list.
397 */
398 WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
399 } else {
400 if (timer->it.cpu.firing)
401 ret = TIMER_RETRY;
402 else
403 list_del(&timer->it.cpu.entry);
404
405 unlock_task_sighand(p, &flags);
406 }
407
408 if (!ret)
409 put_task_struct(p);
410
411 return ret;
412}
413
414static void cleanup_timers_list(struct list_head *head)
415{
416 struct cpu_timer_list *timer, *next;
417
418 list_for_each_entry_safe(timer, next, head, entry)
419 list_del_init(&timer->entry);
420}
421
422/*
423 * Clean out CPU timers still ticking when a thread exited. The task
424 * pointer is cleared, and the expiry time is replaced with the residual
425 * time for later timer_gettime calls to return.
426 * This must be called with the siglock held.
427 */
428static void cleanup_timers(struct list_head *head)
429{
430 cleanup_timers_list(head);
431 cleanup_timers_list(++head);
432 cleanup_timers_list(++head);
433}
434
435/*
436 * These are both called with the siglock held, when the current thread
437 * is being reaped. When the final (leader) thread in the group is reaped,
438 * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
439 */
440void posix_cpu_timers_exit(struct task_struct *tsk)
441{
442 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
443 sizeof(unsigned long long));
444 cleanup_timers(tsk->cpu_timers);
445
446}
447void posix_cpu_timers_exit_group(struct task_struct *tsk)
448{
449 cleanup_timers(tsk->signal->cpu_timers);
450}
451
452static inline int expires_gt(cputime_t expires, cputime_t new_exp)
453{
454 return expires == 0 || expires > new_exp;
455}
456
457/*
458 * Insert the timer on the appropriate list before any timers that
459 * expire later. This must be called with the sighand lock held.
460 */
461static void arm_timer(struct k_itimer *timer)
462{
463 struct task_struct *p = timer->it.cpu.task;
464 struct list_head *head, *listpos;
465 struct task_cputime *cputime_expires;
466 struct cpu_timer_list *const nt = &timer->it.cpu;
467 struct cpu_timer_list *next;
468
469 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
470 head = p->cpu_timers;
471 cputime_expires = &p->cputime_expires;
472 } else {
473 head = p->signal->cpu_timers;
474 cputime_expires = &p->signal->cputime_expires;
475 }
476 head += CPUCLOCK_WHICH(timer->it_clock);
477
478 listpos = head;
479 list_for_each_entry(next, head, entry) {
480 if (nt->expires < next->expires)
481 break;
482 listpos = &next->entry;
483 }
484 list_add(&nt->entry, listpos);
485
486 if (listpos == head) {
487 unsigned long long exp = nt->expires;
488
489 /*
490 * We are the new earliest-expiring POSIX 1.b timer, hence
491 * need to update expiration cache. Take into account that
492 * for process timers we share expiration cache with itimers
493 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
494 */
495
496 switch (CPUCLOCK_WHICH(timer->it_clock)) {
497 case CPUCLOCK_PROF:
498 if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
499 cputime_expires->prof_exp = expires_to_cputime(exp);
500 break;
501 case CPUCLOCK_VIRT:
502 if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
503 cputime_expires->virt_exp = expires_to_cputime(exp);
504 break;
505 case CPUCLOCK_SCHED:
506 if (cputime_expires->sched_exp == 0 ||
507 cputime_expires->sched_exp > exp)
508 cputime_expires->sched_exp = exp;
509 break;
510 }
511 }
512}
513
514/*
515 * The timer is locked, fire it and arrange for its reload.
516 */
517static void cpu_timer_fire(struct k_itimer *timer)
518{
519 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
520 /*
521 * The user doesn't want any signal.
522 */
523 timer->it.cpu.expires = 0;
524 } else if (unlikely(timer->sigq == NULL)) {
525 /*
526 * This is a special case for clock_nanosleep,
527 * not a normal timer from sys_timer_create.
528 */
529 wake_up_process(timer->it_process);
530 timer->it.cpu.expires = 0;
531 } else if (timer->it.cpu.incr == 0) {
532 /*
533 * One-shot timer. Clear it as soon as it's fired.
534 */
535 posix_timer_event(timer, 0);
536 timer->it.cpu.expires = 0;
537 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
538 /*
539 * The signal did not get queued because the signal
540 * was ignored, so we won't get any callback to
541 * reload the timer. But we need to keep it
542 * ticking in case the signal is deliverable next time.
543 */
544 posix_cpu_timer_schedule(timer);
545 }
546}
547
548/*
549 * Sample a process (thread group) timer for the given group_leader task.
550 * Must be called with task sighand lock held for safe while_each_thread()
551 * traversal.
552 */
553static int cpu_timer_sample_group(const clockid_t which_clock,
554 struct task_struct *p,
555 unsigned long long *sample)
556{
557 struct task_cputime cputime;
558
559 thread_group_cputimer(p, &cputime);
560 switch (CPUCLOCK_WHICH(which_clock)) {
561 default:
562 return -EINVAL;
563 case CPUCLOCK_PROF:
564 *sample = cputime_to_expires(cputime.utime + cputime.stime);
565 break;
566 case CPUCLOCK_VIRT:
567 *sample = cputime_to_expires(cputime.utime);
568 break;
569 case CPUCLOCK_SCHED:
570 *sample = cputime.sum_exec_runtime + task_delta_exec(p);
571 break;
572 }
573 return 0;
574}
575
576#ifdef CONFIG_NO_HZ_FULL
577static void nohz_kick_work_fn(struct work_struct *work)
578{
579 tick_nohz_full_kick_all();
580}
581
582static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
583
584/*
585 * We need the IPIs to be sent from sane process context.
586 * The posix cpu timers are always set with irqs disabled.
587 */
588static void posix_cpu_timer_kick_nohz(void)
589{
590 if (context_tracking_is_enabled())
591 schedule_work(&nohz_kick_work);
592}
593
594bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
595{
596 if (!task_cputime_zero(&tsk->cputime_expires))
597 return false;
598
599 if (tsk->signal->cputimer.running)
600 return false;
601
602 return true;
603}
604#else
605static inline void posix_cpu_timer_kick_nohz(void) { }
606#endif
607
608/*
609 * Guts of sys_timer_settime for CPU timers.
610 * This is called with the timer locked and interrupts disabled.
611 * If we return TIMER_RETRY, it's necessary to release the timer's lock
612 * and try again. (This happens when the timer is in the middle of firing.)
613 */
614static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
615 struct itimerspec *new, struct itimerspec *old)
616{
617 unsigned long flags;
618 struct sighand_struct *sighand;
619 struct task_struct *p = timer->it.cpu.task;
620 unsigned long long old_expires, new_expires, old_incr, val;
621 int ret;
622
623 WARN_ON_ONCE(p == NULL);
624
625 new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
626
627 /*
628 * Protect against sighand release/switch in exit/exec and p->cpu_timers
629 * and p->signal->cpu_timers read/write in arm_timer()
630 */
631 sighand = lock_task_sighand(p, &flags);
632 /*
633 * If p has just been reaped, we can no
634 * longer get any information about it at all.
635 */
636 if (unlikely(sighand == NULL)) {
637 return -ESRCH;
638 }
639
640 /*
641 * Disarm any old timer after extracting its expiry time.
642 */
643 WARN_ON_ONCE(!irqs_disabled());
644
645 ret = 0;
646 old_incr = timer->it.cpu.incr;
647 old_expires = timer->it.cpu.expires;
648 if (unlikely(timer->it.cpu.firing)) {
649 timer->it.cpu.firing = -1;
650 ret = TIMER_RETRY;
651 } else
652 list_del_init(&timer->it.cpu.entry);
653
654 /*
655 * We need to sample the current value to convert the new
656 * value from relative to absolute, and to convert the
657 * old value from absolute to relative. To set a process
658 * timer, we need a sample to balance the thread expiry
659 * times (in arm_timer). With an absolute time, we must
660 * check if it's already passed. In short, we need a sample.
661 */
662 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
663 cpu_clock_sample(timer->it_clock, p, &val);
664 } else {
665 cpu_timer_sample_group(timer->it_clock, p, &val);
666 }
667
668 if (old) {
669 if (old_expires == 0) {
670 old->it_value.tv_sec = 0;
671 old->it_value.tv_nsec = 0;
672 } else {
673 /*
674 * Update the timer in case it has
675 * overrun already. If it has,
676 * we'll report it as having overrun
677 * and with the next reloaded timer
678 * already ticking, though we are
679 * swallowing that pending
680 * notification here to install the
681 * new setting.
682 */
683 bump_cpu_timer(timer, val);
684 if (val < timer->it.cpu.expires) {
685 old_expires = timer->it.cpu.expires - val;
686 sample_to_timespec(timer->it_clock,
687 old_expires,
688 &old->it_value);
689 } else {
690 old->it_value.tv_nsec = 1;
691 old->it_value.tv_sec = 0;
692 }
693 }
694 }
695
696 if (unlikely(ret)) {
697 /*
698 * We are colliding with the timer actually firing.
699 * Punt after filling in the timer's old value, and
700 * disable this firing since we are already reporting
701 * it as an overrun (thanks to bump_cpu_timer above).
702 */
703 unlock_task_sighand(p, &flags);
704 goto out;
705 }
706
707 if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
708 new_expires += val;
709 }
710
711 /*
712 * Install the new expiry time (or zero).
713 * For a timer with no notification action, we don't actually
714 * arm the timer (we'll just fake it for timer_gettime).
715 */
716 timer->it.cpu.expires = new_expires;
717 if (new_expires != 0 && val < new_expires) {
718 arm_timer(timer);
719 }
720
721 unlock_task_sighand(p, &flags);
722 /*
723 * Install the new reload setting, and
724 * set up the signal and overrun bookkeeping.
725 */
726 timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
727 &new->it_interval);
728
729 /*
730 * This acts as a modification timestamp for the timer,
731 * so any automatic reload attempt will punt on seeing
732 * that we have reset the timer manually.
733 */
734 timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
735 ~REQUEUE_PENDING;
736 timer->it_overrun_last = 0;
737 timer->it_overrun = -1;
738
739 if (new_expires != 0 && !(val < new_expires)) {
740 /*
741 * The designated time already passed, so we notify
742 * immediately, even if the thread never runs to
743 * accumulate more time on this clock.
744 */
745 cpu_timer_fire(timer);
746 }
747
748 ret = 0;
749 out:
750 if (old) {
751 sample_to_timespec(timer->it_clock,
752 old_incr, &old->it_interval);
753 }
754 if (!ret)
755 posix_cpu_timer_kick_nohz();
756 return ret;
757}
758
759static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
760{
761 unsigned long long now;
762 struct task_struct *p = timer->it.cpu.task;
763
764 WARN_ON_ONCE(p == NULL);
765
766 /*
767 * Easy part: convert the reload time.
768 */
769 sample_to_timespec(timer->it_clock,
770 timer->it.cpu.incr, &itp->it_interval);
771
772 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
773 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
774 return;
775 }
776
777 /*
778 * Sample the clock to take the difference with the expiry time.
779 */
780 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
781 cpu_clock_sample(timer->it_clock, p, &now);
782 } else {
783 struct sighand_struct *sighand;
784 unsigned long flags;
785
786 /*
787 * Protect against sighand release/switch in exit/exec and
788 * also make timer sampling safe if it ends up calling
789 * thread_group_cputime().
790 */
791 sighand = lock_task_sighand(p, &flags);
792 if (unlikely(sighand == NULL)) {
793 /*
794 * The process has been reaped.
795 * We can't even collect a sample any more.
796 * Call the timer disarmed, nothing else to do.
797 */
798 timer->it.cpu.expires = 0;
799 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
800 &itp->it_value);
801 } else {
802 cpu_timer_sample_group(timer->it_clock, p, &now);
803 unlock_task_sighand(p, &flags);
804 }
805 }
806
807 if (now < timer->it.cpu.expires) {
808 sample_to_timespec(timer->it_clock,
809 timer->it.cpu.expires - now,
810 &itp->it_value);
811 } else {
812 /*
813 * The timer should have expired already, but the firing
814 * hasn't taken place yet. Say it's just about to expire.
815 */
816 itp->it_value.tv_nsec = 1;
817 itp->it_value.tv_sec = 0;
818 }
819}
820
821static unsigned long long
822check_timers_list(struct list_head *timers,
823 struct list_head *firing,
824 unsigned long long curr)
825{
826 int maxfire = 20;
827
828 while (!list_empty(timers)) {
829 struct cpu_timer_list *t;
830
831 t = list_first_entry(timers, struct cpu_timer_list, entry);
832
833 if (!--maxfire || curr < t->expires)
834 return t->expires;
835
836 t->firing = 1;
837 list_move_tail(&t->entry, firing);
838 }
839
840 return 0;
841}
842
843/*
844 * Check for any per-thread CPU timers that have fired and move them off
845 * the tsk->cpu_timers[N] list onto the firing list. Here we update the
846 * tsk->it_*_expires values to reflect the remaining thread CPU timers.
847 */
848static void check_thread_timers(struct task_struct *tsk,
849 struct list_head *firing)
850{
851 struct list_head *timers = tsk->cpu_timers;
852 struct signal_struct *const sig = tsk->signal;
853 struct task_cputime *tsk_expires = &tsk->cputime_expires;
854 unsigned long long expires;
855 unsigned long soft;
856
857 expires = check_timers_list(timers, firing, prof_ticks(tsk));
858 tsk_expires->prof_exp = expires_to_cputime(expires);
859
860 expires = check_timers_list(++timers, firing, virt_ticks(tsk));
861 tsk_expires->virt_exp = expires_to_cputime(expires);
862
863 tsk_expires->sched_exp = check_timers_list(++timers, firing,
864 tsk->se.sum_exec_runtime);
865
866 /*
867 * Check for the special case thread timers.
868 */
869 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
870 if (soft != RLIM_INFINITY) {
871 unsigned long hard =
872 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
873
874 if (hard != RLIM_INFINITY &&
875 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
876 /*
877 * At the hard limit, we just die.
878 * No need to calculate anything else now.
879 */
880 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
881 return;
882 }
883 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
884 /*
885 * At the soft limit, send a SIGXCPU every second.
886 */
887 if (soft < hard) {
888 soft += USEC_PER_SEC;
889 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
890 }
891 printk(KERN_INFO
892 "RT Watchdog Timeout: %s[%d]\n",
893 tsk->comm, task_pid_nr(tsk));
894 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
895 }
896 }
897}
898
899static void stop_process_timers(struct signal_struct *sig)
900{
901 struct thread_group_cputimer *cputimer = &sig->cputimer;
902 unsigned long flags;
903
904 raw_spin_lock_irqsave(&cputimer->lock, flags);
905 cputimer->running = 0;
906 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
907}
908
909static u32 onecputick;
910
911static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
912 unsigned long long *expires,
913 unsigned long long cur_time, int signo)
914{
915 if (!it->expires)
916 return;
917
918 if (cur_time >= it->expires) {
919 if (it->incr) {
920 it->expires += it->incr;
921 it->error += it->incr_error;
922 if (it->error >= onecputick) {
923 it->expires -= cputime_one_jiffy;
924 it->error -= onecputick;
925 }
926 } else {
927 it->expires = 0;
928 }
929
930 trace_itimer_expire(signo == SIGPROF ?
931 ITIMER_PROF : ITIMER_VIRTUAL,
932 tsk->signal->leader_pid, cur_time);
933 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
934 }
935
936 if (it->expires && (!*expires || it->expires < *expires)) {
937 *expires = it->expires;
938 }
939}
940
941/*
942 * Check for any process-wide CPU timers that have fired and move them
943 * off the tsk->signal->cpu_timers[N] lists onto the firing list.
944 * Per-thread timers have already been taken off.
945 */
946static void check_process_timers(struct task_struct *tsk,
947 struct list_head *firing)
948{
949 struct signal_struct *const sig = tsk->signal;
950 unsigned long long utime, ptime, virt_expires, prof_expires;
951 unsigned long long sum_sched_runtime, sched_expires;
952 struct list_head *timers = sig->cpu_timers;
953 struct task_cputime cputime;
954 unsigned long soft;
955
956 /*
957 * Collect the current process totals.
958 */
959 thread_group_cputimer(tsk, &cputime);
960 utime = cputime_to_expires(cputime.utime);
961 ptime = utime + cputime_to_expires(cputime.stime);
962 sum_sched_runtime = cputime.sum_exec_runtime;
963
964 prof_expires = check_timers_list(timers, firing, ptime);
965 virt_expires = check_timers_list(++timers, firing, utime);
966 sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
967
968 /*
969 * Check for the special case process timers.
970 */
971 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
972 SIGPROF);
973 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
974 SIGVTALRM);
975 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
976 if (soft != RLIM_INFINITY) {
977 unsigned long psecs = cputime_to_secs(ptime);
978 unsigned long hard =
979 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
980 cputime_t x;
981 if (psecs >= hard) {
982 /*
983 * At the hard limit, we just die.
984 * No need to calculate anything else now.
985 */
986 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
987 return;
988 }
989 if (psecs >= soft) {
990 /*
991 * At the soft limit, send a SIGXCPU every second.
992 */
993 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
994 if (soft < hard) {
995 soft++;
996 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
997 }
998 }
999 x = secs_to_cputime(soft);
1000 if (!prof_expires || x < prof_expires) {
1001 prof_expires = x;
1002 }
1003 }
1004
1005 sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
1006 sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
1007 sig->cputime_expires.sched_exp = sched_expires;
1008 if (task_cputime_zero(&sig->cputime_expires))
1009 stop_process_timers(sig);
1010}
1011
1012/*
1013 * This is called from the signal code (via do_schedule_next_timer)
1014 * when the last timer signal was delivered and we have to reload the timer.
1015 */
1016void posix_cpu_timer_schedule(struct k_itimer *timer)
1017{
1018 struct sighand_struct *sighand;
1019 unsigned long flags;
1020 struct task_struct *p = timer->it.cpu.task;
1021 unsigned long long now;
1022
1023 WARN_ON_ONCE(p == NULL);
1024
1025 /*
1026 * Fetch the current sample and update the timer's expiry time.
1027 */
1028 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
1029 cpu_clock_sample(timer->it_clock, p, &now);
1030 bump_cpu_timer(timer, now);
1031 if (unlikely(p->exit_state))
1032 goto out;
1033
1034 /* Protect timer list r/w in arm_timer() */
1035 sighand = lock_task_sighand(p, &flags);
1036 if (!sighand)
1037 goto out;
1038 } else {
1039 /*
1040 * Protect arm_timer() and timer sampling in case of call to
1041 * thread_group_cputime().
1042 */
1043 sighand = lock_task_sighand(p, &flags);
1044 if (unlikely(sighand == NULL)) {
1045 /*
1046 * The process has been reaped.
1047 * We can't even collect a sample any more.
1048 */
1049 timer->it.cpu.expires = 0;
1050 goto out;
1051 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1052 unlock_task_sighand(p, &flags);
1053 /* Optimization: if the process is dying, no need to rearm */
1054 goto out;
1055 }
1056 cpu_timer_sample_group(timer->it_clock, p, &now);
1057 bump_cpu_timer(timer, now);
1058 /* Leave the sighand locked for the call below. */
1059 }
1060
1061 /*
1062 * Now re-arm for the new expiry time.
1063 */
1064 WARN_ON_ONCE(!irqs_disabled());
1065 arm_timer(timer);
1066 unlock_task_sighand(p, &flags);
1067
1068 /* Kick full dynticks CPUs in case they need to tick on the new timer */
1069 posix_cpu_timer_kick_nohz();
1070out:
1071 timer->it_overrun_last = timer->it_overrun;
1072 timer->it_overrun = -1;
1073 ++timer->it_requeue_pending;
1074}
1075
1076/**
1077 * task_cputime_expired - Compare two task_cputime entities.
1078 *
1079 * @sample: The task_cputime structure to be checked for expiration.
1080 * @expires: Expiration times, against which @sample will be checked.
1081 *
1082 * Checks @sample against @expires to see if any field of @sample has expired.
1083 * Returns true if any field of @sample is greater than or equal to the
1084 * corresponding nonzero field of @expires. Otherwise returns false.
1085 */
1086static inline int task_cputime_expired(const struct task_cputime *sample,
1087 const struct task_cputime *expires)
1088{
1089 if (expires->utime && sample->utime >= expires->utime)
1090 return 1;
1091 if (expires->stime && sample->utime + sample->stime >= expires->stime)
1092 return 1;
1093 if (expires->sum_exec_runtime != 0 &&
1094 sample->sum_exec_runtime >= expires->sum_exec_runtime)
1095 return 1;
1096 return 0;
1097}
1098
1099/**
1100 * fastpath_timer_check - POSIX CPU timers fast path.
1101 *
1102 * @tsk: The task (thread) being checked.
1103 *
1104 * Check the task and thread group timers. If both are zero (there are no
1105 * timers set) return false. Otherwise snapshot the task and thread group
1106 * timers and compare them with the corresponding expiration times. Return
1107 * true if a timer has expired, else return false.
1108 */
1109static inline int fastpath_timer_check(struct task_struct *tsk)
1110{
1111 struct signal_struct *sig;
1112 cputime_t utime, stime;
1113
1114 task_cputime(tsk, &utime, &stime);
1115
1116 if (!task_cputime_zero(&tsk->cputime_expires)) {
1117 struct task_cputime task_sample = {
1118 .utime = utime,
1119 .stime = stime,
1120 .sum_exec_runtime = tsk->se.sum_exec_runtime
1121 };
1122
1123 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1124 return 1;
1125 }
1126
1127 sig = tsk->signal;
1128 if (sig->cputimer.running) {
1129 struct task_cputime group_sample;
1130
1131 raw_spin_lock(&sig->cputimer.lock);
1132 group_sample = sig->cputimer.cputime;
1133 raw_spin_unlock(&sig->cputimer.lock);
1134
1135 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1136 return 1;
1137 }
1138
1139 return 0;
1140}
1141
1142/*
1143 * This is called from the timer interrupt handler. The irq handler has
1144 * already updated our counts. We need to check if any timers fire now.
1145 * Interrupts are disabled.
1146 */
1147void run_posix_cpu_timers(struct task_struct *tsk)
1148{
1149 LIST_HEAD(firing);
1150 struct k_itimer *timer, *next;
1151 unsigned long flags;
1152
1153 WARN_ON_ONCE(!irqs_disabled());
1154
1155 /*
1156 * The fast path checks that there are no expired thread or thread
1157 * group timers. If that's so, just return.
1158 */
1159 if (!fastpath_timer_check(tsk))
1160 return;
1161
1162 if (!lock_task_sighand(tsk, &flags))
1163 return;
1164 /*
1165 * Here we take all the timers that are firing off the
1166 * tsk->signal->cpu_timers[N] and tsk->cpu_timers[N] lists
1167 * and put them on the firing list.
1168 */
1169 check_thread_timers(tsk, &firing);
1170 /*
1171 * If there are any active process wide timers (POSIX 1.b, itimers,
1172 * RLIMIT_CPU) cputimer must be running.
1173 */
1174 if (tsk->signal->cputimer.running)
1175 check_process_timers(tsk, &firing);
1176
1177 /*
1178 * We must release these locks before taking any timer's lock.
1179 * There is a potential race with timer deletion here, as the
1180 * siglock now protects our private firing list. We have set
1181 * the firing flag in each timer, so that a deletion attempt
1182 * that gets the timer lock before we do will give it up and
1183 * spin until we've taken care of that timer below.
1184 */
1185 unlock_task_sighand(tsk, &flags);
1186
1187 /*
1188 * Now that all the timers on our list have the firing flag,
1189 * no one will touch their list entries but us. We'll take
1190 * each timer's lock before clearing its firing flag, so no
1191 * timer call will interfere.
1192 */
1193 list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
1194 int cpu_firing;
1195
1196 spin_lock(&timer->it_lock);
1197 list_del_init(&timer->it.cpu.entry);
1198 cpu_firing = timer->it.cpu.firing;
1199 timer->it.cpu.firing = 0;
1200 /*
1201 * The firing flag is -1 if we collided with a reset
1202 * of the timer, which already reported this
1203 * almost-firing as an overrun. So don't generate an event.
1204 */
1205 if (likely(cpu_firing >= 0))
1206 cpu_timer_fire(timer);
1207 spin_unlock(&timer->it_lock);
1208 }
1209}
1210
1211/*
1212 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1213 * The tsk->sighand->siglock must be held by the caller.
1214 */
1215void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1216 cputime_t *newval, cputime_t *oldval)
1217{
1218 unsigned long long now;
1219
1220 WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
1221 cpu_timer_sample_group(clock_idx, tsk, &now);
1222
1223 if (oldval) {
1224 /*
1225 * We are setting an itimer. The *oldval is absolute and we update
1226 * it to be relative; the *newval argument is relative and we update
1227 * it to be absolute.
1228 */
1229 if (*oldval) {
1230 if (*oldval <= now) {
1231 /* Just about to fire. */
1232 *oldval = cputime_one_jiffy;
1233 } else {
1234 *oldval -= now;
1235 }
1236 }
1237
1238 if (!*newval)
1239 goto out;
1240 *newval += now;
1241 }
1242
1243 /*
1244 * Update the expiration cache if we are the earliest timer, or if the
1245 * RLIMIT_CPU limit is possibly earlier than the prof_exp cpu timer expiry.
1246 */
1247 switch (clock_idx) {
1248 case CPUCLOCK_PROF:
1249 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1250 tsk->signal->cputime_expires.prof_exp = *newval;
1251 break;
1252 case CPUCLOCK_VIRT:
1253 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1254 tsk->signal->cputime_expires.virt_exp = *newval;
1255 break;
1256 }
1257out:
1258 posix_cpu_timer_kick_nohz();
1259}
1260
1261static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1262 struct timespec *rqtp, struct itimerspec *it)
1263{
1264 struct k_itimer timer;
1265 int error;
1266
1267 /*
1268 * Set up a temporary timer and then wait for it to go off.
1269 */
1270 memset(&timer, 0, sizeof timer);
1271 spin_lock_init(&timer.it_lock);
1272 timer.it_clock = which_clock;
1273 timer.it_overrun = -1;
1274 error = posix_cpu_timer_create(&timer);
1275 timer.it_process = current;
1276 if (!error) {
1277 static struct itimerspec zero_it;
1278
1279 memset(it, 0, sizeof *it);
1280 it->it_value = *rqtp;
1281
1282 spin_lock_irq(&timer.it_lock);
1283 error = posix_cpu_timer_set(&timer, flags, it, NULL);
1284 if (error) {
1285 spin_unlock_irq(&timer.it_lock);
1286 return error;
1287 }
1288
1289 while (!signal_pending(current)) {
1290 if (timer.it.cpu.expires == 0) {
1291 /*
1292 * Our timer fired and was reset; the
1293 * deletion below cannot fail.
1294 */
1295 posix_cpu_timer_del(&timer);
1296 spin_unlock_irq(&timer.it_lock);
1297 return 0;
1298 }
1299
1300 /*
1301 * Block until cpu_timer_fire (or a signal) wakes us.
1302 */
1303 __set_current_state(TASK_INTERRUPTIBLE);
1304 spin_unlock_irq(&timer.it_lock);
1305 schedule();
1306 spin_lock_irq(&timer.it_lock);
1307 }
1308
1309 /*
1310 * We were interrupted by a signal.
1311 */
1312 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1313 error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
1314 if (!error) {
1315 /*
1316 * The timer is now unarmed; deletion cannot fail.
1317 */
1318 posix_cpu_timer_del(&timer);
1319 }
1320 spin_unlock_irq(&timer.it_lock);
1321
1322 while (error == TIMER_RETRY) {
1323 /*
1324 * We need to handle the case when the timer was or is in the
1325 * middle of firing. In other cases we have already freed the
1326 * resources.
1327 */
1328 spin_lock_irq(&timer.it_lock);
1329 error = posix_cpu_timer_del(&timer);
1330 spin_unlock_irq(&timer.it_lock);
1331 }
1332
1333 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1334 /*
1335 * It actually did fire already.
1336 */
1337 return 0;
1338 }
1339
1340 error = -ERESTART_RESTARTBLOCK;
1341 }
1342
1343 return error;
1344}
1345
1346static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1347
1348static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1349 struct timespec *rqtp, struct timespec __user *rmtp)
1350{
1351 struct restart_block *restart_block =
1352 &current_thread_info()->restart_block;
1353 struct itimerspec it;
1354 int error;
1355
1356 /*
1357 * Diagnose required errors first.
1358 */
1359 if (CPUCLOCK_PERTHREAD(which_clock) &&
1360 (CPUCLOCK_PID(which_clock) == 0 ||
1361 CPUCLOCK_PID(which_clock) == current->pid))
1362 return -EINVAL;
1363
1364 error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
1365
1366 if (error == -ERESTART_RESTARTBLOCK) {
1367
1368 if (flags & TIMER_ABSTIME)
1369 return -ERESTARTNOHAND;
1370 /*
1371 * Report back to the user the time still remaining.
1372 */
1373 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1374 return -EFAULT;
1375
1376 restart_block->fn = posix_cpu_nsleep_restart;
1377 restart_block->nanosleep.clockid = which_clock;
1378 restart_block->nanosleep.rmtp = rmtp;
1379 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1380 }
1381 return error;
1382}
1383
1384static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1385{
1386 clockid_t which_clock = restart_block->nanosleep.clockid;
1387 struct timespec t;
1388 struct itimerspec it;
1389 int error;
1390
1391 t = ns_to_timespec(restart_block->nanosleep.expires);
1392
1393 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1394
1395 if (error == -ERESTART_RESTARTBLOCK) {
1396 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1397 /*
1398 * Report back to the user the time still remaining.
1399 */
1400 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1401 return -EFAULT;
1402
1403 restart_block->nanosleep.expires = timespec_to_ns(&t);
1404 }
1405 return error;
1406
1407}
1408
1409#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1410#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1411
1412static int process_cpu_clock_getres(const clockid_t which_clock,
1413 struct timespec *tp)
1414{
1415 return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1416}
1417static int process_cpu_clock_get(const clockid_t which_clock,
1418 struct timespec *tp)
1419{
1420 return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1421}
1422static int process_cpu_timer_create(struct k_itimer *timer)
1423{
1424 timer->it_clock = PROCESS_CLOCK;
1425 return posix_cpu_timer_create(timer);
1426}
1427static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1428 struct timespec *rqtp,
1429 struct timespec __user *rmtp)
1430{
1431 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
1432}
1433static long process_cpu_nsleep_restart(struct restart_block *restart_block)
1434{
1435 return -EINVAL;
1436}
1437static int thread_cpu_clock_getres(const clockid_t which_clock,
1438 struct timespec *tp)
1439{
1440 return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1441}
1442static int thread_cpu_clock_get(const clockid_t which_clock,
1443 struct timespec *tp)
1444{
1445 return posix_cpu_clock_get(THREAD_CLOCK, tp);
1446}
1447static int thread_cpu_timer_create(struct k_itimer *timer)
1448{
1449 timer->it_clock = THREAD_CLOCK;
1450 return posix_cpu_timer_create(timer);
1451}
1452
1453struct k_clock clock_posix_cpu = {
1454 .clock_getres = posix_cpu_clock_getres,
1455 .clock_set = posix_cpu_clock_set,
1456 .clock_get = posix_cpu_clock_get,
1457 .timer_create = posix_cpu_timer_create,
1458 .nsleep = posix_cpu_nsleep,
1459 .nsleep_restart = posix_cpu_nsleep_restart,
1460 .timer_set = posix_cpu_timer_set,
1461 .timer_del = posix_cpu_timer_del,
1462 .timer_get = posix_cpu_timer_get,
1463};
1464
1465static __init int init_posix_cpu_timers(void)
1466{
1467 struct k_clock process = {
1468 .clock_getres = process_cpu_clock_getres,
1469 .clock_get = process_cpu_clock_get,
1470 .timer_create = process_cpu_timer_create,
1471 .nsleep = process_cpu_nsleep,
1472 .nsleep_restart = process_cpu_nsleep_restart,
1473 };
1474 struct k_clock thread = {
1475 .clock_getres = thread_cpu_clock_getres,
1476 .clock_get = thread_cpu_clock_get,
1477 .timer_create = thread_cpu_timer_create,
1478 };
1479 struct timespec ts;
1480
1481 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1482 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1483
1484 cputime_to_timespec(cputime_one_jiffy, &ts);
1485 onecputick = ts.tv_nsec;
1486 WARN_ON(ts.tv_sec != 0);
1487
1488 return 0;
1489}
1490__initcall(init_posix_cpu_timers);
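For illustration (editorial, not part of the patch): the two clocks registered above are what userspace reaches via clock_gettime(). This hypothetical snippet may need -lrt on older C libraries.

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec proc, thr;

	/* CPU time consumed by the whole process vs. just this thread. */
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &proc);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &thr);
	printf("process %ld.%09ld s, thread %ld.%09ld s\n",
	       (long)proc.tv_sec, proc.tv_nsec,
	       (long)thr.tv_sec, thr.tv_nsec);
	return 0;
}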
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
new file mode 100644
index 000000000000..424c2d4265c9
--- /dev/null
+++ b/kernel/time/posix-timers.c
@@ -0,0 +1,1121 @@
1/*
2 * linux/kernel/posix-timers.c
3 *
4 *
5 * 2002-10-15 Posix Clocks & timers
6 * by George Anzinger george@mvista.com
7 *
8 * Copyright (C) 2002 2003 by MontaVista Software.
9 *
10 * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
11 * Copyright (C) 2004 Boris Hu
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or (at
16 * your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful, but
19 * WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA
28 */
29
30/* These are all the functions necessary to implement
31 * POSIX clocks & timers
32 */
33#include <linux/mm.h>
34#include <linux/interrupt.h>
35#include <linux/slab.h>
36#include <linux/time.h>
37#include <linux/mutex.h>
38
39#include <asm/uaccess.h>
40#include <linux/list.h>
41#include <linux/init.h>
42#include <linux/compiler.h>
43#include <linux/hash.h>
44#include <linux/posix-clock.h>
45#include <linux/posix-timers.h>
46#include <linux/syscalls.h>
47#include <linux/wait.h>
48#include <linux/workqueue.h>
49#include <linux/export.h>
50#include <linux/hashtable.h>
51
52/*
53 * Management arrays for POSIX timers. Timers are now kept in a static hash
54 * table with 512 entries.
55 * Timer ids are allocated by a local routine, which selects the proper hash
56 * head by a key constructed from the current->signal address and a per
57 * signal_struct counter. This keeps timer ids unique per process, but they
58 * can now overlap between processes.
59 */
60
61/*
62 * Let's keep our timers in a slab cache :-)
63 */
64static struct kmem_cache *posix_timers_cache;
65
66static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
67static DEFINE_SPINLOCK(hash_lock);
68
69/*
70 * We assume that the new SIGEV_THREAD_ID shares no bits with the other
71 * SIGEV values. Emit a build error if this assumption fails.
72 */
73#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
74 ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
75#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
76#endif
77
78/*
79 * parisc wants ENOTSUP instead of EOPNOTSUPP
80 */
81#ifndef ENOTSUP
82# define ENANOSLEEP_NOTSUP EOPNOTSUPP
83#else
84# define ENANOSLEEP_NOTSUP ENOTSUP
85#endif
86
87/*
88 * The timer ID is turned into a timer address by posix_timer_by_id().
89 * Verifying a valid ID consists of:
90 *
91 * a) checking that posix_timer_by_id() returns other than NULL,
92 * b) checking that the timer id matches the one in the timer itself, and
93 * c) that the timer owner is in the caller's thread group.
94 */
95
96/*
97 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
98 * to implement others. This structure defines the various
99 * clocks.
100 *
101 * RESOLUTION: Clock resolution is used to round up timer and interval
102 * times, NOT to report clock times, which are reported with as
103 * much resolution as the system can muster. In some cases this
104 * resolution may depend on the underlying clock hardware and
105 * may not be quantifiable until run time, and only then is the
106 * may not be quantifiable until run time, and only then if the
107 * something about this issue in the documentation...
108 *
109 * FUNCTIONS: The CLOCKs structure defines possible functions to
110 * handle various clock functions.
111 *
112 * The standard POSIX timer management code assumes the
113 * following: 1.) The k_itimer struct (sched.h) is used for
114 * the timer. 2.) The list, it_lock, it_clock, it_id and
115 * it_pid fields are not modified by timer code.
116 *
117 * Permissions: It is assumed that the clock_settime() function defined
118 * for each clock will take care of permission checks. Some
119 * clocks may be settable by any user (i.e. local process
120 * clocks), others not. Currently the only settable clock we
121 * have is CLOCK_REALTIME and its high-res counterpart, both of
122 * which we beg off on and pass to do_sys_settimeofday().
123 */
124
125static struct k_clock posix_clocks[MAX_CLOCKS];
126
127/*
128 * These ones are defined below.
129 */
130static int common_nsleep(const clockid_t, int flags, struct timespec *t,
131 struct timespec __user *rmtp);
132static int common_timer_create(struct k_itimer *new_timer);
133static void common_timer_get(struct k_itimer *, struct itimerspec *);
134static int common_timer_set(struct k_itimer *, int,
135 struct itimerspec *, struct itimerspec *);
136static int common_timer_del(struct k_itimer *timer);
137
138static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
139
140static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
141
142#define lock_timer(tid, flags) \
143({ struct k_itimer *__timr; \
144 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
145 __timr; \
146})
147
148static int hash(struct signal_struct *sig, unsigned int nr)
149{
150 return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
151}
152
153static struct k_itimer *__posix_timers_find(struct hlist_head *head,
154 struct signal_struct *sig,
155 timer_t id)
156{
157 struct k_itimer *timer;
158
159 hlist_for_each_entry_rcu(timer, head, t_hash) {
160 if ((timer->it_signal == sig) && (timer->it_id == id))
161 return timer;
162 }
163 return NULL;
164}
165
166static struct k_itimer *posix_timer_by_id(timer_t id)
167{
168 struct signal_struct *sig = current->signal;
169 struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
170
171 return __posix_timers_find(head, sig, id);
172}
173
174static int posix_timer_add(struct k_itimer *timer)
175{
176 struct signal_struct *sig = current->signal;
177 int first_free_id = sig->posix_timer_id;
178 struct hlist_head *head;
179 int ret = -ENOENT;
180
181 do {
182 spin_lock(&hash_lock);
183 head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
184 if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
185 hlist_add_head_rcu(&timer->t_hash, head);
186 ret = sig->posix_timer_id;
187 }
188 if (++sig->posix_timer_id < 0)
189 sig->posix_timer_id = 0;
190 if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
191 /* Loop over all possible ids completed */
192 ret = -EAGAIN;
193 spin_unlock(&hash_lock);
194 } while (ret == -ENOENT);
195 return ret;
196}
197
198static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
199{
200 spin_unlock_irqrestore(&timr->it_lock, flags);
201}
202
203/* Get clock_realtime */
204static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
205{
206 ktime_get_real_ts(tp);
207 return 0;
208}
209
210/* Set clock_realtime */
211static int posix_clock_realtime_set(const clockid_t which_clock,
212 const struct timespec *tp)
213{
214 return do_sys_settimeofday(tp, NULL);
215}
216
217static int posix_clock_realtime_adj(const clockid_t which_clock,
218 struct timex *t)
219{
220 return do_adjtimex(t);
221}
222
223/*
224 * Get monotonic time for posix timers
225 */
226static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
227{
228 ktime_get_ts(tp);
229 return 0;
230}
231
232/*
233 * Get monotonic-raw time for posix timers
234 */
235static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
236{
237 getrawmonotonic(tp);
238 return 0;
239}
240
241
242static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
243{
244 *tp = current_kernel_time();
245 return 0;
246}
247
248static int posix_get_monotonic_coarse(clockid_t which_clock,
249 struct timespec *tp)
250{
251 *tp = get_monotonic_coarse();
252 return 0;
253}
254
255static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
256{
257 *tp = ktime_to_timespec(KTIME_LOW_RES);
258 return 0;
259}
260
261static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
262{
263 get_monotonic_boottime(tp);
264 return 0;
265}
266
267static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
268{
269 timekeeping_clocktai(tp);
270 return 0;
271}
272
273/*
274 * Initialize everything, well, just everything in POSIX clocks/timers ;)
275 */
276static __init int init_posix_timers(void)
277{
278 struct k_clock clock_realtime = {
279 .clock_getres = hrtimer_get_res,
280 .clock_get = posix_clock_realtime_get,
281 .clock_set = posix_clock_realtime_set,
282 .clock_adj = posix_clock_realtime_adj,
283 .nsleep = common_nsleep,
284 .nsleep_restart = hrtimer_nanosleep_restart,
285 .timer_create = common_timer_create,
286 .timer_set = common_timer_set,
287 .timer_get = common_timer_get,
288 .timer_del = common_timer_del,
289 };
290 struct k_clock clock_monotonic = {
291 .clock_getres = hrtimer_get_res,
292 .clock_get = posix_ktime_get_ts,
293 .nsleep = common_nsleep,
294 .nsleep_restart = hrtimer_nanosleep_restart,
295 .timer_create = common_timer_create,
296 .timer_set = common_timer_set,
297 .timer_get = common_timer_get,
298 .timer_del = common_timer_del,
299 };
300 struct k_clock clock_monotonic_raw = {
301 .clock_getres = hrtimer_get_res,
302 .clock_get = posix_get_monotonic_raw,
303 };
304 struct k_clock clock_realtime_coarse = {
305 .clock_getres = posix_get_coarse_res,
306 .clock_get = posix_get_realtime_coarse,
307 };
308 struct k_clock clock_monotonic_coarse = {
309 .clock_getres = posix_get_coarse_res,
310 .clock_get = posix_get_monotonic_coarse,
311 };
312 struct k_clock clock_tai = {
313 .clock_getres = hrtimer_get_res,
314 .clock_get = posix_get_tai,
315 .nsleep = common_nsleep,
316 .nsleep_restart = hrtimer_nanosleep_restart,
317 .timer_create = common_timer_create,
318 .timer_set = common_timer_set,
319 .timer_get = common_timer_get,
320 .timer_del = common_timer_del,
321 };
322 struct k_clock clock_boottime = {
323 .clock_getres = hrtimer_get_res,
324 .clock_get = posix_get_boottime,
325 .nsleep = common_nsleep,
326 .nsleep_restart = hrtimer_nanosleep_restart,
327 .timer_create = common_timer_create,
328 .timer_set = common_timer_set,
329 .timer_get = common_timer_get,
330 .timer_del = common_timer_del,
331 };
332
333 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
334 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
335 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
336 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
337 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
338 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
339 posix_timers_register_clock(CLOCK_TAI, &clock_tai);
340
341 posix_timers_cache = kmem_cache_create("posix_timers_cache",
342 sizeof (struct k_itimer), 0, SLAB_PANIC,
343 NULL);
344 return 0;
345}
346
347__initcall(init_posix_timers);
348
349static void schedule_next_timer(struct k_itimer *timr)
350{
351 struct hrtimer *timer = &timr->it.real.timer;
352
353 if (timr->it.real.interval.tv64 == 0)
354 return;
355
356 timr->it_overrun += (unsigned int) hrtimer_forward(timer,
357 timer->base->get_time(),
358 timr->it.real.interval);
359
360 timr->it_overrun_last = timr->it_overrun;
361 timr->it_overrun = -1;
362 ++timr->it_requeue_pending;
363 hrtimer_restart(timer);
364}
365
366/*
367 * This function is exported for use by the signal delivery code. It is
368 * called just prior to the info block being released and passes that
369 * block to us. Its function is to update the overrun entry AND to
370 * restart the timer. It should only be called if the timer is to be
371 * restarted (i.e. we have flagged this in the sys_private entry of the
372 * info block).
373 *
374 * To protect against the timer going away while the interrupt is queued,
375 * we require that the it_requeue_pending flag be set.
376 */
377void do_schedule_next_timer(struct siginfo *info)
378{
379 struct k_itimer *timr;
380 unsigned long flags;
381
382 timr = lock_timer(info->si_tid, &flags);
383
384 if (timr && timr->it_requeue_pending == info->si_sys_private) {
385 if (timr->it_clock < 0)
386 posix_cpu_timer_schedule(timr);
387 else
388 schedule_next_timer(timr);
389
390 info->si_overrun += timr->it_overrun_last;
391 }
392
393 if (timr)
394 unlock_timer(timr, flags);
395}
396
397int posix_timer_event(struct k_itimer *timr, int si_private)
398{
399 struct task_struct *task;
400 int shared, ret = -1;
401 /*
402 * FIXME: if ->sigq is queued we can race with
403 * dequeue_signal()->do_schedule_next_timer().
404 *
405 * If dequeue_signal() sees the "right" value of
406 * si_sys_private it calls do_schedule_next_timer().
407 * We re-queue ->sigq and drop ->it_lock().
408 * do_schedule_next_timer() locks the timer
409 * and re-schedules it while ->sigq is pending.
410 * Not really bad, but not what we want.
411 */
412 timr->sigq->info.si_sys_private = si_private;
413
414 rcu_read_lock();
415 task = pid_task(timr->it_pid, PIDTYPE_PID);
416 if (task) {
417 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
418 ret = send_sigqueue(timr->sigq, task, shared);
419 }
420 rcu_read_unlock();
421 /* If we failed to send the signal the timer stops. */
422 return ret > 0;
423}
424EXPORT_SYMBOL_GPL(posix_timer_event);
425
426/*
427 * This function gets called when a POSIX.1b interval timer expires. It
428 * is used as a callback from the kernel internal timer. The
429 * run_timer_list code ALWAYS calls with interrupts on.
430 *
431 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
432 */
433static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
434{
435 struct k_itimer *timr;
436 unsigned long flags;
437 int si_private = 0;
438 enum hrtimer_restart ret = HRTIMER_NORESTART;
439
440 timr = container_of(timer, struct k_itimer, it.real.timer);
441 spin_lock_irqsave(&timr->it_lock, flags);
442
443 if (timr->it.real.interval.tv64 != 0)
444 si_private = ++timr->it_requeue_pending;
445
446 if (posix_timer_event(timr, si_private)) {
447 /*
448 * The signal was not sent because it is ignored (SIG_IGN):
449 * we will not get a callback to restart it AND
450 * it should be restarted.
451 */
452 if (timr->it.real.interval.tv64 != 0) {
453 ktime_t now = hrtimer_cb_get_time(timer);
454
455 /*
456 * FIXME: What we really want, is to stop this
457 * timer completely and restart it in case the
458 * SIG_IGN is removed. This is a non trivial
459 * change which involves sighand locking
460 * (sigh !), which we don't want to do late in
461 * the release cycle.
462 *
463 * For now we just let timers with an interval
465 * less than a jiffy expire every jiffy to
465 * avoid softirq starvation in case of SIG_IGN
466 * and a very small interval, which would put
467 * the timer right back on the softirq pending
468 * list. By moving now ahead of time we trick
469 * hrtimer_forward() to expire the timer
470 * later, while we still maintain the overrun
471 * accuracy, but have some inconsistency in
472 * the timer_gettime() case. This is at least
473 * better than a starved softirq. A more
474 * complex fix, which also solves another related
475 * inconsistency, is already in the pipeline.
476 */
477#ifdef CONFIG_HIGH_RES_TIMERS
478 {
479 ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ);
480
481 if (timr->it.real.interval.tv64 < kj.tv64)
482 now = ktime_add(now, kj);
483 }
484#endif
485 timr->it_overrun += (unsigned int)
486 hrtimer_forward(timer, now,
487 timr->it.real.interval);
488 ret = HRTIMER_RESTART;
489 ++timr->it_requeue_pending;
490 }
491 }
492
493 unlock_timer(timr, flags);
494 return ret;
495}
496
497static struct pid *good_sigevent(sigevent_t * event)
498{
499 struct task_struct *rtn = current->group_leader;
500
501 if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
502 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
503 !same_thread_group(rtn, current) ||
504 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
505 return NULL;
506
507 if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
508 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
509 return NULL;
510
511 return task_pid(rtn);
512}
513
514void posix_timers_register_clock(const clockid_t clock_id,
515 struct k_clock *new_clock)
516{
517 if ((unsigned) clock_id >= MAX_CLOCKS) {
518 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
519 clock_id);
520 return;
521 }
522
523 if (!new_clock->clock_get) {
524 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
525 clock_id);
526 return;
527 }
528 if (!new_clock->clock_getres) {
529 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
530 clock_id);
531 return;
532 }
533
534 posix_clocks[clock_id] = *new_clock;
535}
536EXPORT_SYMBOL_GPL(posix_timers_register_clock);
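
As an illustration of this hook, here is a minimal sketch of how another in-kernel user could register its own read-only clock; the clock id slot, the example_* names and the choice of includes are assumptions for the example, not something done in this file:

#include <linux/hrtimer.h>
#include <linux/posix-timers.h>

/* Hypothetical example: register a read-only clock in a (assumed free) slot. */
static int example_clock_get(clockid_t which_clock, struct timespec *tp)
{
	ktime_get_ts(tp);			/* simply report monotonic time */
	return 0;
}

static int example_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
	tp->tv_sec = 0;
	tp->tv_nsec = NSEC_PER_SEC / HZ;	/* claim tick resolution */
	return 0;
}

static __init int example_clock_init(void)
{
	struct k_clock example_clock = {
		.clock_getres	= example_clock_getres,
		.clock_get	= example_clock_get,
	};

	/* MAX_CLOCKS - 1 is assumed to be unused here. */
	posix_timers_register_clock(MAX_CLOCKS - 1, &example_clock);
	return 0;
}
__initcall(example_clock_init);
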
537
538static struct k_itimer * alloc_posix_timer(void)
539{
540 struct k_itimer *tmr;
541 tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
542 if (!tmr)
543 return tmr;
544 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
545 kmem_cache_free(posix_timers_cache, tmr);
546 return NULL;
547 }
548 memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
549 return tmr;
550}
551
552static void k_itimer_rcu_free(struct rcu_head *head)
553{
554 struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
555
556 kmem_cache_free(posix_timers_cache, tmr);
557}
558
559#define IT_ID_SET 1
560#define IT_ID_NOT_SET 0
561static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
562{
563 if (it_id_set) {
564 unsigned long flags;
565 spin_lock_irqsave(&hash_lock, flags);
566 hlist_del_rcu(&tmr->t_hash);
567 spin_unlock_irqrestore(&hash_lock, flags);
568 }
569 put_pid(tmr->it_pid);
570 sigqueue_free(tmr->sigq);
571 call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
572}
573
574static struct k_clock *clockid_to_kclock(const clockid_t id)
575{
576 if (id < 0)
577 return (id & CLOCKFD_MASK) == CLOCKFD ?
578 &clock_posix_dynamic : &clock_posix_cpu;
579
580 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
581 return NULL;
582 return &posix_clocks[id];
583}
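
The negative-id branch above is how dynamic posix clocks (character-device backed, e.g. PTP hardware clocks) and CPU clocks are reached. A userspace sketch of the fd-based encoding follows; FD_TO_CLOCKID is restated here as an assumption (it normally comes from the kernel headers) and /dev/ptp0 is a hypothetical device node:

#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Mirrors the kernel's encoding: the low three bits select CLOCKFD (== 3). */
#define FD_TO_CLOCKID(fd)	((~(clockid_t)(fd) << 3) | 3)

int main(void)
{
	struct timespec ts;
	int fd = open("/dev/ptp0", O_RDWR);	/* hypothetical PHC device */

	if (fd < 0)
		return 1;
	if (!clock_gettime(FD_TO_CLOCKID(fd), &ts))
		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	close(fd);
	return 0;
}
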
584
585static int common_timer_create(struct k_itimer *new_timer)
586{
587 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
588 return 0;
589}
590
591/* Create a POSIX.1b interval timer. */
592
593SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
594 struct sigevent __user *, timer_event_spec,
595 timer_t __user *, created_timer_id)
596{
597 struct k_clock *kc = clockid_to_kclock(which_clock);
598 struct k_itimer *new_timer;
599 int error, new_timer_id;
600 sigevent_t event;
601 int it_id_set = IT_ID_NOT_SET;
602
603 if (!kc)
604 return -EINVAL;
605 if (!kc->timer_create)
606 return -EOPNOTSUPP;
607
608 new_timer = alloc_posix_timer();
609 if (unlikely(!new_timer))
610 return -EAGAIN;
611
612 spin_lock_init(&new_timer->it_lock);
613 new_timer_id = posix_timer_add(new_timer);
614 if (new_timer_id < 0) {
615 error = new_timer_id;
616 goto out;
617 }
618
619 it_id_set = IT_ID_SET;
620 new_timer->it_id = (timer_t) new_timer_id;
621 new_timer->it_clock = which_clock;
622 new_timer->it_overrun = -1;
623
624 if (timer_event_spec) {
625 if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
626 error = -EFAULT;
627 goto out;
628 }
629 rcu_read_lock();
630 new_timer->it_pid = get_pid(good_sigevent(&event));
631 rcu_read_unlock();
632 if (!new_timer->it_pid) {
633 error = -EINVAL;
634 goto out;
635 }
636 } else {
637 event.sigev_notify = SIGEV_SIGNAL;
638 event.sigev_signo = SIGALRM;
639 event.sigev_value.sival_int = new_timer->it_id;
640 new_timer->it_pid = get_pid(task_tgid(current));
641 }
642
643 new_timer->it_sigev_notify = event.sigev_notify;
644 new_timer->sigq->info.si_signo = event.sigev_signo;
645 new_timer->sigq->info.si_value = event.sigev_value;
646 new_timer->sigq->info.si_tid = new_timer->it_id;
647 new_timer->sigq->info.si_code = SI_TIMER;
648
649 if (copy_to_user(created_timer_id,
650 &new_timer_id, sizeof (new_timer_id))) {
651 error = -EFAULT;
652 goto out;
653 }
654
655 error = kc->timer_create(new_timer);
656 if (error)
657 goto out;
658
659 spin_lock_irq(&current->sighand->siglock);
660 new_timer->it_signal = current->signal;
661 list_add(&new_timer->list, &current->signal->posix_timers);
662 spin_unlock_irq(&current->sighand->siglock);
663
664 return 0;
665 /*
666 * In the case of the timer belonging to another task, after
667 * the task is unlocked, the timer is owned by the other task
668 * and may cease to exist at any time. Don't use or modify
669 * new_timer after the unlock call.
670 */
671out:
672 release_posix_timer(new_timer, it_id_set);
673 return error;
674}
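
From userspace, the default SIGEV_SIGNAL path above looks roughly like this minimal sketch (assumes glibc; link with -lrt on older systems):

#include <signal.h>
#include <stdio.h>
#include <time.h>

static timer_t tid;

int main(void)
{
	struct sigevent sev = {
		.sigev_notify		= SIGEV_SIGNAL,
		.sigev_signo		= SIGRTMIN,
		.sigev_value.sival_ptr	= &tid,	/* handed back in siginfo */
	};

	if (timer_create(CLOCK_MONOTONIC, &sev, &tid) == -1) {
		perror("timer_create");
		return 1;
	}
	/* ... arm it with timer_settime(), see the later sketch ... */
	timer_delete(tid);
	return 0;
}
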
675
676/*
677 * Locking issues: We need to protect the result of the ID lookup until
678 * we get the timer locked down so it is not deleted under us. The
679 * removal is done under the hash_lock spinlock, so we use RCU here to
680 * bridge the find to the timer lock. To avoid a deadlock, the timer ID
681 * MUST be released without holding the timer lock.
682 */
683static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
684{
685 struct k_itimer *timr;
686
687 /*
688 * timer_t could be any type >= int and we want to make sure any
689 * @timer_id outside positive int range fails lookup.
690 */
691 if ((unsigned long long)timer_id > INT_MAX)
692 return NULL;
693
694 rcu_read_lock();
695 timr = posix_timer_by_id(timer_id);
696 if (timr) {
697 spin_lock_irqsave(&timr->it_lock, *flags);
698 if (timr->it_signal == current->signal) {
699 rcu_read_unlock();
700 return timr;
701 }
702 spin_unlock_irqrestore(&timr->it_lock, *flags);
703 }
704 rcu_read_unlock();
705
706 return NULL;
707}
708
709/*
710 * Get the time remaining on a POSIX.1b interval timer. This function
711 * is ALWAYS called with spin_lock_irq on the timer, thus it must not
712 * mess with irq.
713 *
714 * We have a couple of messes to clean up here. First there is the case
715 * of a timer that has a requeue pending. These timers should appear to
716 * be in the timer list with an expiry as if we were to requeue them
717 * now.
718 *
719 * The second issue is the SIGEV_NONE timer which may be active but is
720 * not really ever put in the timer list (to save system resources).
721 * This timer may have expired, and if so, we handle that here. Otherwise
722 * it is the same as a requeue-pending timer with respect to what we
723 * should report.
724 */
725static void
726common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
727{
728 ktime_t now, remaining, iv;
729 struct hrtimer *timer = &timr->it.real.timer;
730
731 memset(cur_setting, 0, sizeof(struct itimerspec));
732
733 iv = timr->it.real.interval;
734
735 /* interval timer ? */
736 if (iv.tv64)
737 cur_setting->it_interval = ktime_to_timespec(iv);
738 else if (!hrtimer_active(timer) &&
739 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
740 return;
741
742 now = timer->base->get_time();
743
744 /*
745 * When a requeue is pending or this is a SIGEV_NONE
746 * timer move the expiry time forward by intervals, so
747 * expiry is > now.
748 */
749 if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
750 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
751 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
752
753 remaining = ktime_sub(hrtimer_get_expires(timer), now);
754 /* Return 0 only when the timer is expired and not pending */
755 if (remaining.tv64 <= 0) {
756 /*
757 * A single-shot SIGEV_NONE timer must return 0 when
758 * it has expired!
759 */
760 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
761 cur_setting->it_value.tv_nsec = 1;
762 } else
763 cur_setting->it_value = ktime_to_timespec(remaining);
764}
765
766/* Get the time remaining on a POSIX.1b interval timer. */
767SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
768 struct itimerspec __user *, setting)
769{
770 struct itimerspec cur_setting;
771 struct k_itimer *timr;
772 struct k_clock *kc;
773 unsigned long flags;
774 int ret = 0;
775
776 timr = lock_timer(timer_id, &flags);
777 if (!timr)
778 return -EINVAL;
779
780 kc = clockid_to_kclock(timr->it_clock);
781 if (WARN_ON_ONCE(!kc || !kc->timer_get))
782 ret = -EINVAL;
783 else
784 kc->timer_get(timr, &cur_setting);
785
786 unlock_timer(timr, flags);
787
788 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
789 return -EFAULT;
790
791 return ret;
792}
793
794/*
795 * Get the number of overruns of a POSIX.1b interval timer. This is to
796 * be the overrun of the timer last delivered. At the same time we are
797 * accumulating overruns on the next timer. The overrun is frozen when
798 * the signal is delivered, either at the notify time (if the info block
799 * is not queued) or at the actual delivery time (as we are informed by
800 * the callback to do_schedule_next_timer()). So all we need to do is
801 * to pick up the frozen overrun.
802 */
803SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
804{
805 struct k_itimer *timr;
806 int overrun;
807 unsigned long flags;
808
809 timr = lock_timer(timer_id, &flags);
810 if (!timr)
811 return -EINVAL;
812
813 overrun = timr->it_overrun_last;
814 unlock_timer(timr, flags);
815
816 return overrun;
817}
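
A userspace sketch of picking up the frozen overrun count from a signal handler; it assumes the creating code stored its timer_t in sigev_value.sival_ptr, as in the sketch after timer_create() above:

#include <signal.h>
#include <time.h>

static volatile sig_atomic_t missed_expiries;

static void on_expiry(int sig, siginfo_t *si, void *uc)
{
	timer_t *tidp = si->si_value.sival_ptr;

	/* timer_getoverrun() is async-signal-safe and returns the
	 * overrun count frozen for this delivery. */
	missed_expiries += timer_getoverrun(*tidp);
}
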
818
819/* Set a POSIX.1b interval timer. */
820/* timr->it_lock is taken. */
821static int
822common_timer_set(struct k_itimer *timr, int flags,
823 struct itimerspec *new_setting, struct itimerspec *old_setting)
824{
825 struct hrtimer *timer = &timr->it.real.timer;
826 enum hrtimer_mode mode;
827
828 if (old_setting)
829 common_timer_get(timr, old_setting);
830
831 /* disable the timer */
832 timr->it.real.interval.tv64 = 0;
833 /*
834 * Careful here. On SMP we could be in the "fire" routine, which will
835 * be spinning as we hold the lock. But this is ONLY an SMP issue.
836 */
837 if (hrtimer_try_to_cancel(timer) < 0)
838 return TIMER_RETRY;
839
840 timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
841 ~REQUEUE_PENDING;
842 timr->it_overrun_last = 0;
843
844 /* switch off the timer when it_value is zero */
845 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
846 return 0;
847
848 mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
849 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
850 timr->it.real.timer.function = posix_timer_fn;
851
852 hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
853
854 /* Convert interval */
855 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
856
857 /* SIGEV_NONE timers are not queued ! See common_timer_get */
858 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
859 /* Setup correct expiry time for relative timers */
860 if (mode == HRTIMER_MODE_REL) {
861 hrtimer_add_expires(timer, timer->base->get_time());
862 }
863 return 0;
864 }
865
866 hrtimer_start_expires(timer, mode);
867 return 0;
868}
869
870/* Set a POSIX.1b interval timer */
871SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
872 const struct itimerspec __user *, new_setting,
873 struct itimerspec __user *, old_setting)
874{
875 struct k_itimer *timr;
876 struct itimerspec new_spec, old_spec;
877 int error = 0;
878 unsigned long flag;
879 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
880 struct k_clock *kc;
881
882 if (!new_setting)
883 return -EINVAL;
884
885 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
886 return -EFAULT;
887
888 if (!timespec_valid(&new_spec.it_interval) ||
889 !timespec_valid(&new_spec.it_value))
890 return -EINVAL;
891retry:
892 timr = lock_timer(timer_id, &flag);
893 if (!timr)
894 return -EINVAL;
895
896 kc = clockid_to_kclock(timr->it_clock);
897 if (WARN_ON_ONCE(!kc || !kc->timer_set))
898 error = -EINVAL;
899 else
900 error = kc->timer_set(timr, flags, &new_spec, rtn);
901
902 unlock_timer(timr, flag);
903 if (error == TIMER_RETRY) {
904 rtn = NULL; // We already got the old time...
905 goto retry;
906 }
907
908 if (old_setting && !error &&
909 copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
910 error = -EFAULT;
911
912 return error;
913}
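
Arming the timer from the earlier sketch as a 10 ms periodic timer in relative mode (pass TIMER_ABSTIME in the flags argument for an absolute first expiry); the 10 ms period is just an example value:

#include <time.h>

static int arm_periodic_10ms(timer_t tid)
{
	struct itimerspec its = {
		.it_value	= { .tv_sec = 0, .tv_nsec = 10 * 1000 * 1000 },
		.it_interval	= { .tv_sec = 0, .tv_nsec = 10 * 1000 * 1000 },
	};

	return timer_settime(tid, 0, &its, NULL);
}
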
914
915static int common_timer_del(struct k_itimer *timer)
916{
917 timer->it.real.interval.tv64 = 0;
918
919 if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
920 return TIMER_RETRY;
921 return 0;
922}
923
924static inline int timer_delete_hook(struct k_itimer *timer)
925{
926 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
927
928 if (WARN_ON_ONCE(!kc || !kc->timer_del))
929 return -EINVAL;
930 return kc->timer_del(timer);
931}
932
933/* Delete a POSIX.1b interval timer. */
934SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
935{
936 struct k_itimer *timer;
937 unsigned long flags;
938
939retry_delete:
940 timer = lock_timer(timer_id, &flags);
941 if (!timer)
942 return -EINVAL;
943
944 if (timer_delete_hook(timer) == TIMER_RETRY) {
945 unlock_timer(timer, flags);
946 goto retry_delete;
947 }
948
949 spin_lock(&current->sighand->siglock);
950 list_del(&timer->list);
951 spin_unlock(&current->sighand->siglock);
952 /*
953 * This keeps any tasks waiting on the spin lock from thinking
954 * they got something (see the lock code above).
955 */
956 timer->it_signal = NULL;
957
958 unlock_timer(timer, flags);
959 release_posix_timer(timer, IT_ID_SET);
960 return 0;
961}
962
963/*
964 * Delete a timer owned by the process; used by exit_itimers().
965 */
966static void itimer_delete(struct k_itimer *timer)
967{
968 unsigned long flags;
969
970retry_delete:
971 spin_lock_irqsave(&timer->it_lock, flags);
972
973 if (timer_delete_hook(timer) == TIMER_RETRY) {
974 unlock_timer(timer, flags);
975 goto retry_delete;
976 }
977 list_del(&timer->list);
978 /*
979 * This keeps any tasks waiting on the spin lock from thinking
980 * they got something (see the lock code above).
981 */
982 timer->it_signal = NULL;
983
984 unlock_timer(timer, flags);
985 release_posix_timer(timer, IT_ID_SET);
986}
987
988/*
989 * This is called by do_exit or de_thread, only when there are no more
990 * references to the shared signal_struct.
991 */
992void exit_itimers(struct signal_struct *sig)
993{
994 struct k_itimer *tmr;
995
996 while (!list_empty(&sig->posix_timers)) {
997 tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
998 itimer_delete(tmr);
999 }
1000}
1001
1002SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
1003 const struct timespec __user *, tp)
1004{
1005 struct k_clock *kc = clockid_to_kclock(which_clock);
1006 struct timespec new_tp;
1007
1008 if (!kc || !kc->clock_set)
1009 return -EINVAL;
1010
1011 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
1012 return -EFAULT;
1013
1014 return kc->clock_set(which_clock, &new_tp);
1015}
1016
1017SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
1018 struct timespec __user *,tp)
1019{
1020 struct k_clock *kc = clockid_to_kclock(which_clock);
1021 struct timespec kernel_tp;
1022 int error;
1023
1024 if (!kc)
1025 return -EINVAL;
1026
1027 error = kc->clock_get(which_clock, &kernel_tp);
1028
1029 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
1030 error = -EFAULT;
1031
1032 return error;
1033}
1034
1035SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
1036 struct timex __user *, utx)
1037{
1038 struct k_clock *kc = clockid_to_kclock(which_clock);
1039 struct timex ktx;
1040 int err;
1041
1042 if (!kc)
1043 return -EINVAL;
1044 if (!kc->clock_adj)
1045 return -EOPNOTSUPP;
1046
1047 if (copy_from_user(&ktx, utx, sizeof(ktx)))
1048 return -EFAULT;
1049
1050 err = kc->clock_adj(which_clock, &ktx);
1051
1052 if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
1053 return -EFAULT;
1054
1055 return err;
1056}
1057
1058SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
1059 struct timespec __user *, tp)
1060{
1061 struct k_clock *kc = clockid_to_kclock(which_clock);
1062 struct timespec rtn_tp;
1063 int error;
1064
1065 if (!kc)
1066 return -EINVAL;
1067
1068 error = kc->clock_getres(which_clock, &rtn_tp);
1069
1070 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
1071 error = -EFAULT;
1072
1073 return error;
1074}
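
A small userspace sketch comparing the resolution reported for a coarse clock (the tick-based KTIME_LOW_RES value above) with the hrtimer-backed one:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec res;

	if (!clock_getres(CLOCK_MONOTONIC, &res))
		printf("CLOCK_MONOTONIC:        %ld ns\n", res.tv_nsec);
	if (!clock_getres(CLOCK_MONOTONIC_COARSE, &res))
		printf("CLOCK_MONOTONIC_COARSE: %ld ns\n", res.tv_nsec);
	return 0;
}
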
1075
1076/*
1077 * nanosleep for monotonic and realtime clocks
1078 */
1079static int common_nsleep(const clockid_t which_clock, int flags,
1080 struct timespec *tsave, struct timespec __user *rmtp)
1081{
1082 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
1083 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
1084 which_clock);
1085}
1086
1087SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1088 const struct timespec __user *, rqtp,
1089 struct timespec __user *, rmtp)
1090{
1091 struct k_clock *kc = clockid_to_kclock(which_clock);
1092 struct timespec t;
1093
1094 if (!kc)
1095 return -EINVAL;
1096 if (!kc->nsleep)
1097 return -ENANOSLEEP_NOTSUP;
1098
1099 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1100 return -EFAULT;
1101
1102 if (!timespec_valid(&t))
1103 return -EINVAL;
1104
1105 return kc->nsleep(which_clock, flags, &t, rmtp);
1106}
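
The TIMER_ABSTIME handling in common_nsleep() is what makes drift-free periodic loops possible from userspace; a minimal sketch with an example 100 ms period:

#include <time.h>

static void periodic_100ms_loop(void)
{
	struct timespec next;

	clock_gettime(CLOCK_MONOTONIC, &next);
	for (;;) {
		next.tv_nsec += 100 * 1000 * 1000;
		if (next.tv_nsec >= 1000000000L) {
			next.tv_nsec -= 1000000000L;
			next.tv_sec++;
		}
		clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &next, NULL);
		/* do the periodic work here */
	}
}
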
1107
1108/*
1109 * This will restart clock_nanosleep. This is required only by
1110 * compat_clock_nanosleep_restart for now.
1111 */
1112long clock_nanosleep_restart(struct restart_block *restart_block)
1113{
1114 clockid_t which_clock = restart_block->nanosleep.clockid;
1115 struct k_clock *kc = clockid_to_kclock(which_clock);
1116
1117 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1118 return -EINVAL;
1119
1120 return kc->nsleep_restart(restart_block);
1121}
diff --git a/kernel/time/time.c b/kernel/time/time.c
new file mode 100644
index 000000000000..7c7964c33ae7
--- /dev/null
+++ b/kernel/time/time.c
@@ -0,0 +1,714 @@
1/*
2 * linux/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * This file contains the interface functions for the various
7 * time related system calls: time, stime, gettimeofday, settimeofday,
8 * adjtime
9 */
10/*
11 * Modification history kernel/time.c
12 *
13 * 1993-09-02 Philip Gladstone
14 * Created file with time related functions from sched/core.c and adjtimex()
15 * 1993-10-08 Torsten Duwe
16 * adjtime interface update and CMOS clock write code
17 * 1995-08-13 Torsten Duwe
18 * kernel PLL updated to 1994-12-13 specs (rfc-1589)
19 * 1999-01-16 Ulrich Windl
20 * Introduced error checking for many cases in adjtimex().
21 * Updated NTP code according to technical memorandum Jan '96
22 * "A Kernel Model for Precision Timekeeping" by Dave Mills
23 * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
24 * (Even though the technical memorandum forbids it)
25 * 2004-07-14 Christoph Lameter
26 * Added getnstimeofday to allow the posix timer functions to return
27 * with nanosecond accuracy
28 */
29
30#include <linux/export.h>
31#include <linux/timex.h>
32#include <linux/capability.h>
33#include <linux/timekeeper_internal.h>
34#include <linux/errno.h>
35#include <linux/syscalls.h>
36#include <linux/security.h>
37#include <linux/fs.h>
38#include <linux/math64.h>
39#include <linux/ptrace.h>
40
41#include <asm/uaccess.h>
42#include <asm/unistd.h>
43
44#include "timeconst.h"
45
46/*
47 * The timezone where the local system is located. Used as a default by some
48 * programs that obtain this value by using gettimeofday.
49 */
50struct timezone sys_tz;
51
52EXPORT_SYMBOL(sys_tz);
53
54#ifdef __ARCH_WANT_SYS_TIME
55
56/*
57 * sys_time() can be implemented in user-level using
58 * sys_gettimeofday(). Is this for backwards compatibility? If so,
59 * why not move it into the appropriate arch directory (for those
60 * architectures that need it).
61 */
62SYSCALL_DEFINE1(time, time_t __user *, tloc)
63{
64 time_t i = get_seconds();
65
66 if (tloc) {
67 if (put_user(i,tloc))
68 return -EFAULT;
69 }
70 force_successful_syscall_return();
71 return i;
72}
73
74/*
75 * sys_stime() can be implemented in user-level using
76 * sys_settimeofday(). Is this for backwards compatibility? If so,
77 * why not move it into the appropriate arch directory (for those
78 * architectures that need it).
79 */
80
81SYSCALL_DEFINE1(stime, time_t __user *, tptr)
82{
83 struct timespec tv;
84 int err;
85
86 if (get_user(tv.tv_sec, tptr))
87 return -EFAULT;
88
89 tv.tv_nsec = 0;
90
91 err = security_settime(&tv, NULL);
92 if (err)
93 return err;
94
95 do_settimeofday(&tv);
96 return 0;
97}
98
99#endif /* __ARCH_WANT_SYS_TIME */
100
101SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
102 struct timezone __user *, tz)
103{
104 if (likely(tv != NULL)) {
105 struct timeval ktv;
106 do_gettimeofday(&ktv);
107 if (copy_to_user(tv, &ktv, sizeof(ktv)))
108 return -EFAULT;
109 }
110 if (unlikely(tz != NULL)) {
111 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
112 return -EFAULT;
113 }
114 return 0;
115}
116
117/*
118 * Indicates if there is an offset between the system clock and the hardware
119 * clock/persistent clock/rtc.
120 */
121int persistent_clock_is_local;
122
123/*
124 * Adjust the time obtained from the CMOS to be UTC time instead of
125 * local time.
126 *
127 * This is ugly, but preferable to the alternatives. Otherwise we
128 * would either need to write a program to do it in /etc/rc (and risk
129 * confusion if the program gets run more than once; it would also be
130 * hard to make the program warp the clock precisely n hours) or
131 * compile in the timezone information into the kernel. Bad, bad....
132 *
133 * - TYT, 1992-01-01
134 *
135 * The best thing to do is to keep the CMOS clock in universal time (UTC)
136 * as real UNIX machines always do it. This avoids all headaches about
137 * daylight saving times and warping kernel clocks.
138 */
139static inline void warp_clock(void)
140{
141 if (sys_tz.tz_minuteswest != 0) {
142 struct timespec adjust;
143
144 persistent_clock_is_local = 1;
145 adjust.tv_sec = sys_tz.tz_minuteswest * 60;
146 adjust.tv_nsec = 0;
147 timekeeping_inject_offset(&adjust);
148 }
149}
150
151/*
152 * If for some reason the CMOS clock has not already been running
153 * in UTC, but in some local time: The first time we set the timezone,
154 * we will warp the clock so that it is ticking UTC time instead of
155 * local time. Presumably, if someone is setting the timezone then we
156 * are running in an environment where the programs understand about
157 * timezones. This should be done at boot time in the /etc/rc script,
158 * as soon as possible, so that the clock can be set right. Otherwise,
159 * various programs will get confused when the clock gets warped.
160 */
161
162int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
163{
164 static int firsttime = 1;
165 int error = 0;
166
167 if (tv && !timespec_valid(tv))
168 return -EINVAL;
169
170 error = security_settime(tv, tz);
171 if (error)
172 return error;
173
174 if (tz) {
175 sys_tz = *tz;
176 update_vsyscall_tz();
177 if (firsttime) {
178 firsttime = 0;
179 if (!tv)
180 warp_clock();
181 }
182 }
183 if (tv)
184 return do_settimeofday(tv);
185 return 0;
186}
187
188SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
189 struct timezone __user *, tz)
190{
191 struct timeval user_tv;
192 struct timespec new_ts;
193 struct timezone new_tz;
194
195 if (tv) {
196 if (copy_from_user(&user_tv, tv, sizeof(*tv)))
197 return -EFAULT;
198 new_ts.tv_sec = user_tv.tv_sec;
199 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
200 }
201 if (tz) {
202 if (copy_from_user(&new_tz, tz, sizeof(*tz)))
203 return -EFAULT;
204 }
205
206 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
207}
208
209SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
210{
211 struct timex txc; /* Local copy of parameter */
212 int ret;
213
214 /* Copy the data from user space into the kernel copy of the
215 * structure. But bear in mind that the structure layout
216 * may change.
217 */
218 if (copy_from_user(&txc, txc_p, sizeof(struct timex)))
219 return -EFAULT;
220 ret = do_adjtimex(&txc);
221 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
222}
223
224/**
225 * current_fs_time - Return FS time
226 * @sb: Superblock.
227 *
228 * Return the current time truncated to the time granularity supported by
229 * the fs.
230 */
231struct timespec current_fs_time(struct super_block *sb)
232{
233 struct timespec now = current_kernel_time();
234 return timespec_trunc(now, sb->s_time_gran);
235}
236EXPORT_SYMBOL(current_fs_time);
237
238/*
239 * Convert jiffies to milliseconds and back.
240 *
241 * Avoid unnecessary multiplications/divisions in the
242 * two most common HZ cases:
243 */
244unsigned int jiffies_to_msecs(const unsigned long j)
245{
246#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
247 return (MSEC_PER_SEC / HZ) * j;
248#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
249 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
250#else
251# if BITS_PER_LONG == 32
252 return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
253# else
254 return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
255# endif
256#endif
257}
258EXPORT_SYMBOL(jiffies_to_msecs);
259
260unsigned int jiffies_to_usecs(const unsigned long j)
261{
262#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
263 return (USEC_PER_SEC / HZ) * j;
264#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
265 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
266#else
267# if BITS_PER_LONG == 32
268 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
269# else
270 return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
271# endif
272#endif
273}
274EXPORT_SYMBOL(jiffies_to_usecs);
275
276/**
277 * timespec_trunc - Truncate timespec to a granularity
278 * @t: Timespec
279 * @gran: Granularity in ns.
280 *
281 * Truncate a timespec to a granularity. gran must be smaller than a second.
282 * Always rounds down.
283 *
284 * This function should only be used for timestamps returned by
285 * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
286 * it doesn't handle the better resolution of the latter.
287 */
288struct timespec timespec_trunc(struct timespec t, unsigned gran)
289{
290 /*
291 * Division is pretty slow so avoid it for common cases.
292 * Currently current_kernel_time() never returns better than
293 * jiffies resolution. Exploit that.
294 */
295 if (gran <= jiffies_to_usecs(1) * 1000) {
296 /* nothing */
297 } else if (gran == 1000000000) {
298 t.tv_nsec = 0;
299 } else {
300 t.tv_nsec -= t.tv_nsec % gran;
301 }
302 return t;
303}
304EXPORT_SYMBOL(timespec_trunc);
305
306/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
307 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
308 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
309 *
310 * [For the Julian calendar (which was used in Russia before 1917,
311 * Britain & colonies before 1752, anywhere else before 1582,
312 * and is still in use by some communities) leave out the
313 * -year/100+year/400 terms, and add 10.]
314 *
315 * This algorithm was first published by Gauss (I think).
316 *
317 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
318 * machines where long is 32-bit! (However, as time_t is signed, we
319 * will already get problems at other places on 2038-01-19 03:14:08)
320 */
321unsigned long
322mktime(const unsigned int year0, const unsigned int mon0,
323 const unsigned int day, const unsigned int hour,
324 const unsigned int min, const unsigned int sec)
325{
326 unsigned int mon = mon0, year = year0;
327
328 /* 1..12 -> 11,12,1..10 */
329 if (0 >= (int) (mon -= 2)) {
330 mon += 12; /* Puts Feb last since it has leap day */
331 year -= 1;
332 }
333
334 return ((((unsigned long)
335 (year/4 - year/100 + year/400 + 367*mon/12 + day) +
336 year*365 - 719499
337 )*24 + hour /* now have hours */
338 )*60 + min /* now have minutes */
339 )*60 + sec; /* finally seconds */
340}
341
342EXPORT_SYMBOL(mktime);
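
A couple of hand-worked values for the formula above (the kernel's mktime(), not the C library's), useful as a sanity check:

/*
 * mktime(1970, 1, 1, 0, 0, 0) == 0
 * mktime(2000, 1, 1, 0, 0, 0) == 946684800   (10957 days * 86400 s)
 */
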
343
344/**
345 * set_normalized_timespec - set timespec sec and nsec parts and normalize
346 *
347 * @ts: pointer to timespec variable to be set
348 * @sec: seconds to set
349 * @nsec: nanoseconds to set
350 *
351 * Set seconds and nanoseconds field of a timespec variable and
352 * normalize to the timespec storage format
353 *
354 * Note: The tv_nsec part is always in the range of
355 * 0 <= tv_nsec < NSEC_PER_SEC
356 * For negative values only the tv_sec field is negative !
357 */
358void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
359{
360 while (nsec >= NSEC_PER_SEC) {
361 /*
362 * The following asm() prevents the compiler from
363 * optimising this loop into a modulo operation. See
364 * also __iter_div_u64_rem() in include/linux/time.h
365 */
366 asm("" : "+rm"(nsec));
367 nsec -= NSEC_PER_SEC;
368 ++sec;
369 }
370 while (nsec < 0) {
371 asm("" : "+rm"(nsec));
372 nsec += NSEC_PER_SEC;
373 --sec;
374 }
375 ts->tv_sec = sec;
376 ts->tv_nsec = nsec;
377}
378EXPORT_SYMBOL(set_normalized_timespec);
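
Two hand-worked examples of the normalization:

/*
 * set_normalized_timespec(&ts, 1, 1500000000);  ->  { 2, 500000000 }
 * set_normalized_timespec(&ts, 1, -300000000);  ->  { 0, 700000000 }
 */
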
379
380/**
381 * ns_to_timespec - Convert nanoseconds to timespec
382 * @nsec: the nanoseconds value to be converted
383 *
384 * Returns the timespec representation of the nsec parameter.
385 */
386struct timespec ns_to_timespec(const s64 nsec)
387{
388 struct timespec ts;
389 s32 rem;
390
391 if (!nsec)
392 return (struct timespec) {0, 0};
393
394 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
395 if (unlikely(rem < 0)) {
396 ts.tv_sec--;
397 rem += NSEC_PER_SEC;
398 }
399 ts.tv_nsec = rem;
400
401 return ts;
402}
403EXPORT_SYMBOL(ns_to_timespec);
404
405/**
406 * ns_to_timeval - Convert nanoseconds to timeval
407 * @nsec: the nanoseconds value to be converted
408 *
409 * Returns the timeval representation of the nsec parameter.
410 */
411struct timeval ns_to_timeval(const s64 nsec)
412{
413 struct timespec ts = ns_to_timespec(nsec);
414 struct timeval tv;
415
416 tv.tv_sec = ts.tv_sec;
417 tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000;
418
419 return tv;
420}
421EXPORT_SYMBOL(ns_to_timeval);
422
423/*
424 * When we convert to jiffies then we interpret incoming values
425 * the following way:
426 *
427 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
428 *
429 * - 'too large' values [that would result in larger than
430 * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
431 *
432 * - all other values are converted to jiffies by either multiplying
433 * the input value by a factor or dividing it with a factor
434 *
435 * We must also be careful about 32-bit overflows.
436 */
437unsigned long msecs_to_jiffies(const unsigned int m)
438{
439 /*
440 * Negative value, means infinite timeout:
441 */
442 if ((int)m < 0)
443 return MAX_JIFFY_OFFSET;
444
445#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
446 /*
447 * HZ is equal to or smaller than 1000, and 1000 is a nice
448 * round multiple of HZ, divide with the factor between them,
449 * but round upwards:
450 */
451 return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
452#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
453 /*
454 * HZ is larger than 1000, and HZ is a nice round multiple of
455 * 1000 - simply multiply with the factor between them.
456 *
457 * But first make sure the multiplication result cannot
458 * overflow:
459 */
460 if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
461 return MAX_JIFFY_OFFSET;
462
463 return m * (HZ / MSEC_PER_SEC);
464#else
465 /*
466 * Generic case - multiply, round and divide. But first
467 * check that if we are doing a net multiplication, that
468 * we wouldn't overflow:
469 */
470 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
471 return MAX_JIFFY_OFFSET;
472
473 return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
474 >> MSEC_TO_HZ_SHR32;
475#endif
476}
477EXPORT_SYMBOL(msecs_to_jiffies);
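
For example, with HZ == 250 the first branch applies (MSEC_PER_SEC / HZ == 4) and the division rounds up, so the conversion never shortens a requested timeout; hand-worked values:

/*
 * HZ == 250, MSEC_PER_SEC / HZ == 4:
 *
 *   msecs_to_jiffies(1)  == (1  + 3) / 4 == 1    (4 ms worth of ticks)
 *   msecs_to_jiffies(10) == (10 + 3) / 4 == 3    (12 ms worth of ticks)
 *   jiffies_to_msecs(3)  == 4 * 3        == 12
 */
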
478
479unsigned long usecs_to_jiffies(const unsigned int u)
480{
481 if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
482 return MAX_JIFFY_OFFSET;
483#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
484 return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
485#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
486 return u * (HZ / USEC_PER_SEC);
487#else
488 return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
489 >> USEC_TO_HZ_SHR32;
490#endif
491}
492EXPORT_SYMBOL(usecs_to_jiffies);
493
494/*
495 * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
496 * that a remainder subtract here would not do the right thing as the
497 * resolution values don't fall on second boundaries. I.e. the line:
498 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
499 *
500 * Rather, we just shift the bits off the right.
501 *
502 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
503 * value to a scaled second value.
504 */
505unsigned long
506timespec_to_jiffies(const struct timespec *value)
507{
508 unsigned long sec = value->tv_sec;
509 long nsec = value->tv_nsec + TICK_NSEC - 1;
510
511 if (sec >= MAX_SEC_IN_JIFFIES){
512 sec = MAX_SEC_IN_JIFFIES;
513 nsec = 0;
514 }
515 return (((u64)sec * SEC_CONVERSION) +
516 (((u64)nsec * NSEC_CONVERSION) >>
517 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
518
519}
520EXPORT_SYMBOL(timespec_to_jiffies);
521
522void
523jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
524{
525 /*
526 * Convert jiffies to nanoseconds and separate with
527 * one divide.
528 */
529 u32 rem;
530 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
531 NSEC_PER_SEC, &rem);
532 value->tv_nsec = rem;
533}
534EXPORT_SYMBOL(jiffies_to_timespec);
535
536/* Same for "timeval"
537 *
538 * Well, almost. The problem here is that the real system resolution is
539 * in nanoseconds and the value being converted is in microseconds.
540 * Also for some machines (those that use HZ = 1024, in particular),
541 * there is a LARGE error in the tick size in microseconds.
542 *
543 * The solution we use is to do the rounding AFTER we convert the
544 * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
545 * Instruction-wise, this should cost only an additional add-with-carry
546 * instruction over the timespec conversion above.
547 */
548unsigned long
549timeval_to_jiffies(const struct timeval *value)
550{
551 unsigned long sec = value->tv_sec;
552 long usec = value->tv_usec;
553
554 if (sec >= MAX_SEC_IN_JIFFIES){
555 sec = MAX_SEC_IN_JIFFIES;
556 usec = 0;
557 }
558 return (((u64)sec * SEC_CONVERSION) +
559 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
560 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
561}
562EXPORT_SYMBOL(timeval_to_jiffies);
563
564void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
565{
566 /*
567 * Convert jiffies to nanoseconds and separate with
568 * one divide.
569 */
570 u32 rem;
571
572 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
573 NSEC_PER_SEC, &rem);
574 value->tv_usec = rem / NSEC_PER_USEC;
575}
576EXPORT_SYMBOL(jiffies_to_timeval);
577
578/*
579 * Convert jiffies/jiffies_64 to clock_t and back.
580 */
581clock_t jiffies_to_clock_t(unsigned long x)
582{
583#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
584# if HZ < USER_HZ
585 return x * (USER_HZ / HZ);
586# else
587 return x / (HZ / USER_HZ);
588# endif
589#else
590 return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
591#endif
592}
593EXPORT_SYMBOL(jiffies_to_clock_t);
594
595unsigned long clock_t_to_jiffies(unsigned long x)
596{
597#if (HZ % USER_HZ)==0
598 if (x >= ~0UL / (HZ / USER_HZ))
599 return ~0UL;
600 return x * (HZ / USER_HZ);
601#else
602 /* Don't worry about loss of precision here .. */
603 if (x >= ~0UL / HZ * USER_HZ)
604 return ~0UL;
605
606 /* .. but do try to contain it here */
607 return div_u64((u64)x * HZ, USER_HZ);
608#endif
609}
610EXPORT_SYMBOL(clock_t_to_jiffies);
611
612u64 jiffies_64_to_clock_t(u64 x)
613{
614#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
615# if HZ < USER_HZ
616 x = div_u64(x * USER_HZ, HZ);
617# elif HZ > USER_HZ
618 x = div_u64(x, HZ / USER_HZ);
619# else
620 /* Nothing to do */
621# endif
622#else
623 /*
624 * There are better ways that don't overflow early,
625 * but even this doesn't overflow in hundreds of years
626 * in 64 bits, so..
627 */
628 x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
629#endif
630 return x;
631}
632EXPORT_SYMBOL(jiffies_64_to_clock_t);
633
634u64 nsec_to_clock_t(u64 x)
635{
636#if (NSEC_PER_SEC % USER_HZ) == 0
637 return div_u64(x, NSEC_PER_SEC / USER_HZ);
638#elif (USER_HZ % 512) == 0
639 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
640#else
641 /*
642 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
643 * overflow after 64.99 years.
644 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
645 */
646 return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
647#endif
648}
649
650/**
651 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
652 *
653 * @n: nsecs in u64
654 *
655 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
656 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
657 * for the scheduler, not for use in device drivers to calculate timeout values.
658 *
659 * note:
660 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
661 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
662 */
663u64 nsecs_to_jiffies64(u64 n)
664{
665#if (NSEC_PER_SEC % HZ) == 0
666 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
667 return div_u64(n, NSEC_PER_SEC / HZ);
668#elif (HZ % 512) == 0
669 /* overflow after 292 years if HZ = 1024 */
670 return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
671#else
672 /*
673 * Generic case - optimized for cases where HZ is a multiple of 3.
674 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
675 */
676 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
677#endif
678}
679
680/**
681 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
682 *
683 * @n: nsecs in u64
684 *
685 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
686 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
687 * for the scheduler, not for use in device drivers to calculate timeout values.
688 *
689 * note:
690 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
691 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
692 */
693unsigned long nsecs_to_jiffies(u64 n)
694{
695 return (unsigned long)nsecs_to_jiffies64(n);
696}
697
698/*
699 * Add two timespec values and do a safety check for overflow.
700 * It's assumed that both values are valid (>= 0)
701 */
702struct timespec timespec_add_safe(const struct timespec lhs,
703 const struct timespec rhs)
704{
705 struct timespec res;
706
707 set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
708 lhs.tv_nsec + rhs.tv_nsec);
709
710 if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
711 res.tv_sec = TIME_T_MAX;
712
713 return res;
714}
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
new file mode 100644
index 000000000000..511bdf2cafda
--- /dev/null
+++ b/kernel/time/timeconst.bc
@@ -0,0 +1,108 @@
1scale=0
2
3define gcd(a,b) {
4 auto t;
5 while (b) {
6 t = b;
7 b = a % b;
8 a = t;
9 }
10 return a;
11}
12
13/* Division by reciprocal multiplication. */
14define fmul(b,n,d) {
15 return (2^b*n+d-1)/d;
16}
17
18/* Adjustment factor when a ceiling value is used. Use as:
19 (imul * n) + ((fmulxx * n + fadjxx) >> xx) */
20define fadj(b,n,d) {
21 auto v;
22 d = d/gcd(n,d);
23 v = 2^b*(d-1)/d;
24 return v;
25}
26
27/* Compute the appropriate mul/adj values as well as a shift count,
28 which brings the mul value into the range 2^(b-1) <= x < 2^b. Such
29 a shift value will be correct in the signed integer range and off
30 by at most one in the upper half of the unsigned range. */
31define fmuls(b,n,d) {
32 auto s, m;
33 for (s = 0; 1; s++) {
34 m = fmul(s,n,d);
35 if (m >= 2^(b-1))
36 return s;
37 }
38 return 0;
39}
40
41define timeconst(hz) {
42 print "/* Automatically generated by kernel/timeconst.bc */\n"
43 print "/* Time conversion constants for HZ == ", hz, " */\n"
44 print "\n"
45
46 print "#ifndef KERNEL_TIMECONST_H\n"
47 print "#define KERNEL_TIMECONST_H\n\n"
48
49 print "#include <linux/param.h>\n"
50 print "#include <linux/types.h>\n\n"
51
52 print "#if HZ != ", hz, "\n"
53 print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
54 print "#endif\n\n"
55
56 if (hz < 2) {
57 print "#error Totally bogus HZ value!\n"
58 } else {
59 s=fmuls(32,1000,hz)
60 obase=16
61 print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
62 print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
63 obase=10
64 print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
65
66 s=fmuls(32,hz,1000)
67 obase=16
68 print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
69 print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
70 obase=10
71 print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
72
73 obase=10
74 cd=gcd(hz,1000)
75 print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
76 print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
77 print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
78 print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
79 print "\n"
80
81 s=fmuls(32,1000000,hz)
82 obase=16
83 print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
84 print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
85 obase=10
86 print "#define HZ_TO_USEC_SHR32\t", s, "\n"
87
88 s=fmuls(32,hz,1000000)
89 obase=16
90 print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
91 print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
92 obase=10
93 print "#define USEC_TO_HZ_SHR32\t", s, "\n"
94
95 obase=10
96 cd=gcd(hz,1000000)
97 print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
98 print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
99 print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
100 print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
101 print "\n"
102
103 print "#endif /* KERNEL_TIMECONST_H */\n"
104 }
105 halt
106}
107
108timeconst(hz)
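
The constants this script emits implement division by reciprocal multiplication. A standalone C sketch of the same trick (illustrative only; the kernel uses the generated timeconst.h values, and HZ == 300 plus the loop bound are just example choices):

#include <assert.h>
#include <stdint.h>

/* Divide by 'd' via multiply-and-shift, like the *_MUL32/*_SHR32 pairs. */
static uint32_t div_by_mul_shift(uint64_t n, uint32_t d)
{
	const unsigned int shift = 32;
	const uint64_t mul = ((UINT64_C(1) << shift) + d - 1) / d;  /* ceil(2^32/d) */

	return (uint32_t)((n * mul) >> shift);
}

int main(void)
{
	/* jiffies_to_msecs()-style scaling for HZ == 300 (1000/300 not exact). */
	for (uint32_t j = 0; j < 100000; j++)
		assert(div_by_mul_shift((uint64_t)j * 1000, 300) ==
		       (uint32_t)(((uint64_t)j * 1000) / 300));
	return 0;
}
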
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
new file mode 100644
index 000000000000..3bb01a323b2a
--- /dev/null
+++ b/kernel/time/timer.c
@@ -0,0 +1,1734 @@
1/*
2 * linux/kernel/timer.c
3 *
4 * Kernel internal timers
5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds
7 *
8 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
9 *
10 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11 * "A Kernel Model for Precision Timekeeping" by Dave Mills
12 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
13 * serialize accesses to xtime/lost_ticks).
14 * Copyright (C) 1998 Andrea Arcangeli
15 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
16 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
17 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
18 * Copyright (C) 2000, 2001, 2002 Ingo Molnar
19 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
20 */
21
22#include <linux/kernel_stat.h>
23#include <linux/export.h>
24#include <linux/interrupt.h>
25#include <linux/percpu.h>
26#include <linux/init.h>
27#include <linux/mm.h>
28#include <linux/swap.h>
29#include <linux/pid_namespace.h>
30#include <linux/notifier.h>
31#include <linux/thread_info.h>
32#include <linux/time.h>
33#include <linux/jiffies.h>
34#include <linux/posix-timers.h>
35#include <linux/cpu.h>
36#include <linux/syscalls.h>
37#include <linux/delay.h>
38#include <linux/tick.h>
39#include <linux/kallsyms.h>
40#include <linux/irq_work.h>
41#include <linux/sched.h>
42#include <linux/sched/sysctl.h>
43#include <linux/slab.h>
44#include <linux/compat.h>
45
46#include <asm/uaccess.h>
47#include <asm/unistd.h>
48#include <asm/div64.h>
49#include <asm/timex.h>
50#include <asm/io.h>
51
52#define CREATE_TRACE_POINTS
53#include <trace/events/timer.h>
54
55__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
56
57EXPORT_SYMBOL(jiffies_64);
58
59/*
60 * per-CPU timer vector definitions:
61 */
62#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
63#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
64#define TVN_SIZE (1 << TVN_BITS)
65#define TVR_SIZE (1 << TVR_BITS)
66#define TVN_MASK (TVN_SIZE - 1)
67#define TVR_MASK (TVR_SIZE - 1)
68#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
69
70struct tvec {
71 struct list_head vec[TVN_SIZE];
72};
73
74struct tvec_root {
75 struct list_head vec[TVR_SIZE];
76};
77
78struct tvec_base {
79 spinlock_t lock;
80 struct timer_list *running_timer;
81 unsigned long timer_jiffies;
82 unsigned long next_timer;
83 unsigned long active_timers;
84 unsigned long all_timers;
85 struct tvec_root tv1;
86 struct tvec tv2;
87 struct tvec tv3;
88 struct tvec tv4;
89 struct tvec tv5;
90} ____cacheline_aligned;
91
92struct tvec_base boot_tvec_bases;
93EXPORT_SYMBOL(boot_tvec_bases);
94static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
95
96/* Functions below help us manage 'deferrable' flag */
97static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
98{
99 return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
100}
101
102static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
103{
104 return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
105}
106
107static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
108{
109 return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
110}
111
112static inline void
113timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
114{
115 unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
116
117 timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
118}
119
120static unsigned long round_jiffies_common(unsigned long j, int cpu,
121 bool force_up)
122{
123 int rem;
124 unsigned long original = j;
125
126 /*
127 * We don't want all cpus firing their timers at once hitting the
128 * same lock or cachelines, so we skew each extra cpu with an extra
129	 * 3 jiffies. This 3 jiffies value originally came from the mm/ code,
130	 * which already did this.
131	 * The skew is done by adding 3*cpunr, then rounding, then subtracting
132	 * this extra offset again.
133 */
134 j += cpu * 3;
135
136 rem = j % HZ;
137
138 /*
139	 * If the target jiffy is just after a whole second (which can happen
140	 * due to delays of the timer irq, long irq off times, etc.) then
141	 * we should round down to the whole second, not up. Use 1/4th of a
142	 * second as an extreme upper bound for this rounding cutoff.
143 * But never round down if @force_up is set.
144 */
145 if (rem < HZ/4 && !force_up) /* round down */
146 j = j - rem;
147 else /* round up */
148 j = j - rem + HZ;
149
150 /* now that we have rounded, subtract the extra skew again */
151 j -= cpu * 3;
152
153 /*
154 * Make sure j is still in the future. Otherwise return the
155 * unmodified value.
156 */
157 return time_is_after_jiffies(j) ? j : original;
158}
159
160/**
161 * __round_jiffies - function to round jiffies to a full second
162 * @j: the time in (absolute) jiffies that should be rounded
163 * @cpu: the processor number on which the timeout will happen
164 *
165 * __round_jiffies() rounds an absolute time in the future (in jiffies)
166 * up or down to (approximately) full seconds. This is useful for timers
167 * for which the exact time they fire does not matter too much, as long as
168 * they fire approximately every X seconds.
169 *
170 * By rounding these timers to whole seconds, all such timers will fire
171 * at the same time, rather than at various times spread out. The goal
172 * of this is to have the CPU wake up less, which saves power.
173 *
174 * The exact rounding is skewed for each processor to avoid all
175 * processors firing at the exact same time, which could lead
176 * to lock contention or spurious cache line bouncing.
177 *
178 * The return value is the rounded version of the @j parameter.
179 */
180unsigned long __round_jiffies(unsigned long j, int cpu)
181{
182 return round_jiffies_common(j, cpu, false);
183}
184EXPORT_SYMBOL_GPL(__round_jiffies);
185
186/**
187 * __round_jiffies_relative - function to round jiffies to a full second
188 * @j: the time in (relative) jiffies that should be rounded
189 * @cpu: the processor number on which the timeout will happen
190 *
191 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
192 * up or down to (approximately) full seconds. This is useful for timers
193 * for which the exact time they fire does not matter too much, as long as
194 * they fire approximately every X seconds.
195 *
196 * By rounding these timers to whole seconds, all such timers will fire
197 * at the same time, rather than at various times spread out. The goal
198 * of this is to have the CPU wake up less, which saves power.
199 *
200 * The exact rounding is skewed for each processor to avoid all
201 * processors firing at the exact same time, which could lead
202 * to lock contention or spurious cache line bouncing.
203 *
204 * The return value is the rounded version of the @j parameter.
205 */
206unsigned long __round_jiffies_relative(unsigned long j, int cpu)
207{
208 unsigned long j0 = jiffies;
209
210 /* Use j0 because jiffies might change while we run */
211 return round_jiffies_common(j + j0, cpu, false) - j0;
212}
213EXPORT_SYMBOL_GPL(__round_jiffies_relative);
214
215/**
216 * round_jiffies - function to round jiffies to a full second
217 * @j: the time in (absolute) jiffies that should be rounded
218 *
219 * round_jiffies() rounds an absolute time in the future (in jiffies)
220 * up or down to (approximately) full seconds. This is useful for timers
221 * for which the exact time they fire does not matter too much, as long as
222 * they fire approximately every X seconds.
223 *
224 * By rounding these timers to whole seconds, all such timers will fire
225 * at the same time, rather than at various times spread out. The goal
226 * of this is to have the CPU wake up less, which saves power.
227 *
228 * The return value is the rounded version of the @j parameter.
229 */
230unsigned long round_jiffies(unsigned long j)
231{
232 return round_jiffies_common(j, raw_smp_processor_id(), false);
233}
234EXPORT_SYMBOL_GPL(round_jiffies);
235
236/**
237 * round_jiffies_relative - function to round jiffies to a full second
238 * @j: the time in (relative) jiffies that should be rounded
239 *
240 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
241 * up or down to (approximately) full seconds. This is useful for timers
242 * for which the exact time they fire does not matter too much, as long as
243 * they fire approximately every X seconds.
244 *
245 * By rounding these timers to whole seconds, all such timers will fire
246 * at the same time, rather than at various times spread out. The goal
247 * of this is to have the CPU wake up less, which saves power.
248 *
249 * The return value is the rounded version of the @j parameter.
250 */
251unsigned long round_jiffies_relative(unsigned long j)
252{
253 return __round_jiffies_relative(j, raw_smp_processor_id());
254}
255EXPORT_SYMBOL_GPL(round_jiffies_relative);
256
257/**
258 * __round_jiffies_up - function to round jiffies up to a full second
259 * @j: the time in (absolute) jiffies that should be rounded
260 * @cpu: the processor number on which the timeout will happen
261 *
262 * This is the same as __round_jiffies() except that it will never
263 * round down. This is useful for timeouts for which the exact time
264 * of firing does not matter too much, as long as they don't fire too
265 * early.
266 */
267unsigned long __round_jiffies_up(unsigned long j, int cpu)
268{
269 return round_jiffies_common(j, cpu, true);
270}
271EXPORT_SYMBOL_GPL(__round_jiffies_up);
272
273/**
274 * __round_jiffies_up_relative - function to round jiffies up to a full second
275 * @j: the time in (relative) jiffies that should be rounded
276 * @cpu: the processor number on which the timeout will happen
277 *
278 * This is the same as __round_jiffies_relative() except that it will never
279 * round down. This is useful for timeouts for which the exact time
280 * of firing does not matter too much, as long as they don't fire too
281 * early.
282 */
283unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
284{
285 unsigned long j0 = jiffies;
286
287 /* Use j0 because jiffies might change while we run */
288 return round_jiffies_common(j + j0, cpu, true) - j0;
289}
290EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
291
292/**
293 * round_jiffies_up - function to round jiffies up to a full second
294 * @j: the time in (absolute) jiffies that should be rounded
295 *
296 * This is the same as round_jiffies() except that it will never
297 * round down. This is useful for timeouts for which the exact time
298 * of firing does not matter too much, as long as they don't fire too
299 * early.
300 */
301unsigned long round_jiffies_up(unsigned long j)
302{
303 return round_jiffies_common(j, raw_smp_processor_id(), true);
304}
305EXPORT_SYMBOL_GPL(round_jiffies_up);
306
307/**
308 * round_jiffies_up_relative - function to round jiffies up to a full second
309 * @j: the time in (relative) jiffies that should be rounded
310 *
311 * This is the same as round_jiffies_relative() except that it will never
312 * round down. This is useful for timeouts for which the exact time
313 * of firing does not matter too much, as long as they don't fire too
314 * early.
315 */
316unsigned long round_jiffies_up_relative(unsigned long j)
317{
318 return __round_jiffies_up_relative(j, raw_smp_processor_id());
319}
320EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
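/*
 * Usage sketch (illustrative; the example_* names are hypothetical): a
 * housekeeping timer that only needs to run "about once a second" can
 * round its expiry so that it batches up with other per-second timers
 * and lets the CPU stay idle longer:
 *
 *	static void example_housekeeping(unsigned long data)
 *	{
 *		do_example_work();
 *		mod_timer(&example_timer, jiffies + round_jiffies_relative(HZ));
 *	}
 *
 * round_jiffies_up_relative() would be used instead when firing early is
 * not acceptable.
 */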
321
322/**
323 * set_timer_slack - set the allowed slack for a timer
324 * @timer: the timer to be modified
325 * @slack_hz: the amount of time (in jiffies) allowed for rounding
326 *
327 * Set the amount of slack, in jiffies, that a certain timer is
328 * allowed. By setting this value, the timer subsystem
329 * will schedule the actual timer somewhere between
330 * the time mod_timer() asks for, and that time plus the slack.
331 *
332 * By setting the slack to -1, a percentage of the delay is used
333 * instead.
334 */
335void set_timer_slack(struct timer_list *timer, int slack_hz)
336{
337 timer->slack = slack_hz;
338}
339EXPORT_SYMBOL_GPL(set_timer_slack);
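/*
 * Usage sketch (illustrative; the example_* names are hypothetical): a
 * timer that polls roughly every 100ms but can tolerate an extra ~10ms of
 * lateness can announce that tolerance, so the wheel may coalesce it with
 * neighbouring timers:
 *
 *	set_timer_slack(&example_timer, msecs_to_jiffies(10));
 *	mod_timer(&example_timer, jiffies + msecs_to_jiffies(100));
 */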
340
341/*
342 * If the list is empty, catch up ->timer_jiffies to the current time.
343 * The caller must hold the tvec_base lock. Returns true if the list
344 * was empty and therefore ->timer_jiffies was updated.
345 */
346static bool catchup_timer_jiffies(struct tvec_base *base)
347{
348 if (!base->all_timers) {
349 base->timer_jiffies = jiffies;
350 return true;
351 }
352 return false;
353}
354
355static void
356__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
357{
358 unsigned long expires = timer->expires;
359 unsigned long idx = expires - base->timer_jiffies;
360 struct list_head *vec;
361
362 if (idx < TVR_SIZE) {
363 int i = expires & TVR_MASK;
364 vec = base->tv1.vec + i;
365 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
366 int i = (expires >> TVR_BITS) & TVN_MASK;
367 vec = base->tv2.vec + i;
368 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
369 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
370 vec = base->tv3.vec + i;
371 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
372 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
373 vec = base->tv4.vec + i;
374 } else if ((signed long) idx < 0) {
375 /*
376 * Can happen if you add a timer with expires == jiffies,
377 * or you set a timer to go off in the past
378 */
379 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
380 } else {
381 int i;
382 /* If the timeout is larger than MAX_TVAL (on 64-bit
383 * architectures or with CONFIG_BASE_SMALL=1) then we
384 * use the maximum timeout.
385 */
386 if (idx > MAX_TVAL) {
387 idx = MAX_TVAL;
388 expires = idx + base->timer_jiffies;
389 }
390 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
391 vec = base->tv5.vec + i;
392 }
393 /*
394 * Timers are FIFO:
395 */
396 list_add_tail(&timer->entry, vec);
397}
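/*
 * Worked example (assuming TVR_BITS=8, TVN_BITS=6): with
 * base->timer_jiffies == 1000,
 *
 *	expires == 1100  -> idx = 100, idx < 256         -> tv1, slot 1100 & 255 = 76
 *	expires == 20000 -> idx = 19000, 2^14 <= idx < 2^20 -> tv3, slot (20000 >> 14) & 63 = 1
 *
 * Note that the level is chosen from the distance to base->timer_jiffies,
 * while the slot within the level is computed from the absolute expiry.
 */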
398
399static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
400{
401 (void)catchup_timer_jiffies(base);
402 __internal_add_timer(base, timer);
403 /*
404 * Update base->active_timers and base->next_timer
405 */
406 if (!tbase_get_deferrable(timer->base)) {
407 if (!base->active_timers++ ||
408 time_before(timer->expires, base->next_timer))
409 base->next_timer = timer->expires;
410 }
411 base->all_timers++;
412}
413
414#ifdef CONFIG_TIMER_STATS
415void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
416{
417 if (timer->start_site)
418 return;
419
420 timer->start_site = addr;
421 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
422 timer->start_pid = current->pid;
423}
424
425static void timer_stats_account_timer(struct timer_list *timer)
426{
427 unsigned int flag = 0;
428
429 if (likely(!timer->start_site))
430 return;
431 if (unlikely(tbase_get_deferrable(timer->base)))
432 flag |= TIMER_STATS_FLAG_DEFERRABLE;
433
434 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
435 timer->function, timer->start_comm, flag);
436}
437
438#else
439static void timer_stats_account_timer(struct timer_list *timer) {}
440#endif
441
442#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
443
444static struct debug_obj_descr timer_debug_descr;
445
446static void *timer_debug_hint(void *addr)
447{
448 return ((struct timer_list *) addr)->function;
449}
450
451/*
452 * fixup_init is called when:
453 * - an active object is initialized
454 */
455static int timer_fixup_init(void *addr, enum debug_obj_state state)
456{
457 struct timer_list *timer = addr;
458
459 switch (state) {
460 case ODEBUG_STATE_ACTIVE:
461 del_timer_sync(timer);
462 debug_object_init(timer, &timer_debug_descr);
463 return 1;
464 default:
465 return 0;
466 }
467}
468
469/* Stub timer callback for improperly used timers. */
470static void stub_timer(unsigned long data)
471{
472 WARN_ON(1);
473}
474
475/*
476 * fixup_activate is called when:
477 * - an active object is activated
478 * - an unknown object is activated (might be a statically initialized object)
479 */
480static int timer_fixup_activate(void *addr, enum debug_obj_state state)
481{
482 struct timer_list *timer = addr;
483
484 switch (state) {
485
486 case ODEBUG_STATE_NOTAVAILABLE:
487 /*
488 * This is not really a fixup. The timer was
489 * statically initialized. We just make sure that it
490 * is tracked in the object tracker.
491 */
492 if (timer->entry.next == NULL &&
493 timer->entry.prev == TIMER_ENTRY_STATIC) {
494 debug_object_init(timer, &timer_debug_descr);
495 debug_object_activate(timer, &timer_debug_descr);
496 return 0;
497 } else {
498 setup_timer(timer, stub_timer, 0);
499 return 1;
500 }
501 return 0;
502
503 case ODEBUG_STATE_ACTIVE:
504 WARN_ON(1);
505
506 default:
507 return 0;
508 }
509}
510
511/*
512 * fixup_free is called when:
513 * - an active object is freed
514 */
515static int timer_fixup_free(void *addr, enum debug_obj_state state)
516{
517 struct timer_list *timer = addr;
518
519 switch (state) {
520 case ODEBUG_STATE_ACTIVE:
521 del_timer_sync(timer);
522 debug_object_free(timer, &timer_debug_descr);
523 return 1;
524 default:
525 return 0;
526 }
527}
528
529/*
530 * fixup_assert_init is called when:
531 * - an untracked/uninit-ed object is found
532 */
533static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
534{
535 struct timer_list *timer = addr;
536
537 switch (state) {
538 case ODEBUG_STATE_NOTAVAILABLE:
539 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
540 /*
541 * This is not really a fixup. The timer was
542 * statically initialized. We just make sure that it
543 * is tracked in the object tracker.
544 */
545 debug_object_init(timer, &timer_debug_descr);
546 return 0;
547 } else {
548 setup_timer(timer, stub_timer, 0);
549 return 1;
550 }
551 default:
552 return 0;
553 }
554}
555
556static struct debug_obj_descr timer_debug_descr = {
557 .name = "timer_list",
558 .debug_hint = timer_debug_hint,
559 .fixup_init = timer_fixup_init,
560 .fixup_activate = timer_fixup_activate,
561 .fixup_free = timer_fixup_free,
562 .fixup_assert_init = timer_fixup_assert_init,
563};
564
565static inline void debug_timer_init(struct timer_list *timer)
566{
567 debug_object_init(timer, &timer_debug_descr);
568}
569
570static inline void debug_timer_activate(struct timer_list *timer)
571{
572 debug_object_activate(timer, &timer_debug_descr);
573}
574
575static inline void debug_timer_deactivate(struct timer_list *timer)
576{
577 debug_object_deactivate(timer, &timer_debug_descr);
578}
579
580static inline void debug_timer_free(struct timer_list *timer)
581{
582 debug_object_free(timer, &timer_debug_descr);
583}
584
585static inline void debug_timer_assert_init(struct timer_list *timer)
586{
587 debug_object_assert_init(timer, &timer_debug_descr);
588}
589
590static void do_init_timer(struct timer_list *timer, unsigned int flags,
591 const char *name, struct lock_class_key *key);
592
593void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
594 const char *name, struct lock_class_key *key)
595{
596 debug_object_init_on_stack(timer, &timer_debug_descr);
597 do_init_timer(timer, flags, name, key);
598}
599EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
600
601void destroy_timer_on_stack(struct timer_list *timer)
602{
603 debug_object_free(timer, &timer_debug_descr);
604}
605EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
606
607#else
608static inline void debug_timer_init(struct timer_list *timer) { }
609static inline void debug_timer_activate(struct timer_list *timer) { }
610static inline void debug_timer_deactivate(struct timer_list *timer) { }
611static inline void debug_timer_assert_init(struct timer_list *timer) { }
612#endif
613
614static inline void debug_init(struct timer_list *timer)
615{
616 debug_timer_init(timer);
617 trace_timer_init(timer);
618}
619
620static inline void
621debug_activate(struct timer_list *timer, unsigned long expires)
622{
623 debug_timer_activate(timer);
624 trace_timer_start(timer, expires);
625}
626
627static inline void debug_deactivate(struct timer_list *timer)
628{
629 debug_timer_deactivate(timer);
630 trace_timer_cancel(timer);
631}
632
633static inline void debug_assert_init(struct timer_list *timer)
634{
635 debug_timer_assert_init(timer);
636}
637
638static void do_init_timer(struct timer_list *timer, unsigned int flags,
639 const char *name, struct lock_class_key *key)
640{
641 struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
642
643 timer->entry.next = NULL;
644 timer->base = (void *)((unsigned long)base | flags);
645 timer->slack = -1;
646#ifdef CONFIG_TIMER_STATS
647 timer->start_site = NULL;
648 timer->start_pid = -1;
649 memset(timer->start_comm, 0, TASK_COMM_LEN);
650#endif
651 lockdep_init_map(&timer->lockdep_map, name, key, 0);
652}
653
654/**
655 * init_timer_key - initialize a timer
656 * @timer: the timer to be initialized
657 * @flags: timer flags
658 * @name: name of the timer
659 * @key: lockdep class key of the fake lock used for tracking timer
660 * sync lock dependencies
661 *
662 * init_timer_key() must be done to a timer prior to calling *any* of the
663 * other timer functions.
664 */
665void init_timer_key(struct timer_list *timer, unsigned int flags,
666 const char *name, struct lock_class_key *key)
667{
668 debug_init(timer);
669 do_init_timer(timer, flags, name, key);
670}
671EXPORT_SYMBOL(init_timer_key);
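/*
 * Usage sketch (illustrative; the example_* names are hypothetical):
 * callers normally go through the setup_timer()/init_timer() wrappers,
 * which expand to init_timer_key(), and then arm the timer with
 * mod_timer():
 *
 *	static void example_timeout(unsigned long data)
 *	{
 *		struct example_dev *dev = (struct example_dev *)data;
 *		...
 *	}
 *
 *	setup_timer(&dev->timer, example_timeout, (unsigned long)dev);
 *	mod_timer(&dev->timer, jiffies + HZ);
 */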
672
673static inline void detach_timer(struct timer_list *timer, bool clear_pending)
674{
675 struct list_head *entry = &timer->entry;
676
677 debug_deactivate(timer);
678
679 __list_del(entry->prev, entry->next);
680 if (clear_pending)
681 entry->next = NULL;
682 entry->prev = LIST_POISON2;
683}
684
685static inline void
686detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
687{
688 detach_timer(timer, true);
689 if (!tbase_get_deferrable(timer->base))
690 base->active_timers--;
691 base->all_timers--;
692 (void)catchup_timer_jiffies(base);
693}
694
695static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
696 bool clear_pending)
697{
698 if (!timer_pending(timer))
699 return 0;
700
701 detach_timer(timer, clear_pending);
702 if (!tbase_get_deferrable(timer->base)) {
703 base->active_timers--;
704 if (timer->expires == base->next_timer)
705 base->next_timer = base->timer_jiffies;
706 }
707 base->all_timers--;
708 (void)catchup_timer_jiffies(base);
709 return 1;
710}
711
712/*
713 * We are using hashed locking: holding per_cpu(tvec_bases).lock
714 * means that all timers which are tied to this base via timer->base are
715 * locked, and the base itself is locked too.
716 *
717 * So __run_timers/migrate_timers can safely modify all timers which could
718 * be found on ->tvX lists.
719 *
720 * When the timer's base is locked, and the timer removed from list, it is
721 * possible to set timer->base = NULL and drop the lock: the timer remains
722 * locked.
723 */
724static struct tvec_base *lock_timer_base(struct timer_list *timer,
725 unsigned long *flags)
726 __acquires(timer->base->lock)
727{
728 struct tvec_base *base;
729
730 for (;;) {
731 struct tvec_base *prelock_base = timer->base;
732 base = tbase_get_base(prelock_base);
733 if (likely(base != NULL)) {
734 spin_lock_irqsave(&base->lock, *flags);
735 if (likely(prelock_base == timer->base))
736 return base;
737 /* The timer has migrated to another CPU */
738 spin_unlock_irqrestore(&base->lock, *flags);
739 }
740 cpu_relax();
741 }
742}
743
744static inline int
745__mod_timer(struct timer_list *timer, unsigned long expires,
746 bool pending_only, int pinned)
747{
748 struct tvec_base *base, *new_base;
749 unsigned long flags;
750	int ret = 0, cpu;
751
752 timer_stats_timer_set_start_info(timer);
753 BUG_ON(!timer->function);
754
755 base = lock_timer_base(timer, &flags);
756
757 ret = detach_if_pending(timer, base, false);
758 if (!ret && pending_only)
759 goto out_unlock;
760
761 debug_activate(timer, expires);
762
763 cpu = get_nohz_timer_target(pinned);
764 new_base = per_cpu(tvec_bases, cpu);
765
766 if (base != new_base) {
767 /*
768		 * However, we can't change the timer's base while it is running,
769		 * otherwise del_timer_sync() can't detect that the timer's
770		 * handler has not yet finished. This also guarantees that
771 * handler yet has not finished. This also guarantees that
772 * the timer is serialized wrt itself.
773 */
774 if (likely(base->running_timer != timer)) {
775 /* See the comment in lock_timer_base() */
776 timer_set_base(timer, NULL);
777 spin_unlock(&base->lock);
778 base = new_base;
779 spin_lock(&base->lock);
780 timer_set_base(timer, base);
781 }
782 }
783
784 timer->expires = expires;
785 internal_add_timer(base, timer);
786
787out_unlock:
788 spin_unlock_irqrestore(&base->lock, flags);
789
790 return ret;
791}
792
793/**
794 * mod_timer_pending - modify a pending timer's timeout
795 * @timer: the pending timer to be modified
796 * @expires: new timeout in jiffies
797 *
798 * mod_timer_pending() is the same for pending timers as mod_timer(),
799 * but will not re-activate and modify already deleted timers.
800 *
801 * It is useful for unserialized use of timers.
802 */
803int mod_timer_pending(struct timer_list *timer, unsigned long expires)
804{
805 return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
806}
807EXPORT_SYMBOL(mod_timer_pending);
808
809/*
810 * Decide where to put the timer while taking the slack into account
811 *
812 * Algorithm:
813 * 1) calculate the maximum (absolute) time
814 * 2) calculate the highest bit where the expires and new max are different
815 * 3) use this bit to make a mask
816 * 4) use the bitmask to round down the maximum time, so that all last
817 * bits are zeros
818 */
819static inline
820unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
821{
822 unsigned long expires_limit, mask;
823 int bit;
824
825 if (timer->slack >= 0) {
826 expires_limit = expires + timer->slack;
827 } else {
828 long delta = expires - jiffies;
829
830 if (delta < 256)
831 return expires;
832
833 expires_limit = expires + delta / 256;
834 }
835 mask = expires ^ expires_limit;
836 if (mask == 0)
837 return expires;
838
839 bit = find_last_bit(&mask, BITS_PER_LONG);
840
841 mask = (1UL << bit) - 1;
842
843 expires_limit = expires_limit & ~(mask);
844
845 return expires_limit;
846}
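/*
 * Worked example: with timer->slack == 100 and expires == 10000,
 * expires_limit is 10100, mask = 10000 ^ 10100 = 0x64, the highest
 * differing bit is bit 6, so the low six bits are cleared and the
 * timer is placed at 10048, somewhere between the requested expiry
 * and the requested expiry plus the allowed slack.
 */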
847
848/**
849 * mod_timer - modify a timer's timeout
850 * @timer: the timer to be modified
851 * @expires: new timeout in jiffies
852 *
853 * mod_timer() is a more efficient way to update the expire field of an
854 * active timer (if the timer is inactive it will be activated)
855 *
856 * mod_timer(timer, expires) is equivalent to:
857 *
858 * del_timer(timer); timer->expires = expires; add_timer(timer);
859 *
860 * Note that if there are multiple unserialized concurrent users of the
861 * same timer, then mod_timer() is the only safe way to modify the timeout,
862 * since add_timer() cannot modify an already running timer.
863 *
864 * The function returns whether it has modified a pending timer or not.
865 * (i.e. mod_timer() of an inactive timer returns 0, mod_timer() of an
866 * active timer returns 1.)
867 */
868int mod_timer(struct timer_list *timer, unsigned long expires)
869{
870 expires = apply_slack(timer, expires);
871
872 /*
873 * This is a common optimization triggered by the
874 * networking code - if the timer is re-modified
875 * to be the same thing then just return:
876 */
877 if (timer_pending(timer) && timer->expires == expires)
878 return 1;
879
880 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
881}
882EXPORT_SYMBOL(mod_timer);
883
884/**
885 * mod_timer_pinned - modify a timer's timeout
886 * @timer: the timer to be modified
887 * @expires: new timeout in jiffies
888 *
889 * mod_timer_pinned() is a way to update the expire field of an
890 * active timer (if the timer is inactive it will be activated)
891 * and to ensure that the timer is scheduled on the current CPU.
892 *
893 * Note that this does not prevent the timer from being migrated
894 * when the current CPU goes offline. If this is a problem for
895 * you, use CPU-hotplug notifiers to handle it correctly, for
896 * example, cancelling the timer when the corresponding CPU goes
897 * offline.
898 *
899 * mod_timer_pinned(timer, expires) is equivalent to:
900 *
901 * del_timer(timer); timer->expires = expires; add_timer(timer);
902 */
903int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
904{
905 if (timer->expires == expires && timer_pending(timer))
906 return 1;
907
908 return __mod_timer(timer, expires, false, TIMER_PINNED);
909}
910EXPORT_SYMBOL(mod_timer_pinned);
911
912/**
913 * add_timer - start a timer
914 * @timer: the timer to be added
915 *
916 * The kernel will do a ->function(->data) callback from the
917 * timer interrupt at the ->expires point in the future. The
918 * current time is 'jiffies'.
919 *
920 * The timer's ->expires, ->function (and if the handler uses it, ->data)
921 * fields must be set prior to calling this function.
922 *
923 * Timers with an ->expires field in the past will be executed in the next
924 * timer tick.
925 */
926void add_timer(struct timer_list *timer)
927{
928 BUG_ON(timer_pending(timer));
929 mod_timer(timer, timer->expires);
930}
931EXPORT_SYMBOL(add_timer);
932
933/**
934 * add_timer_on - start a timer on a particular CPU
935 * @timer: the timer to be added
936 * @cpu: the CPU to start it on
937 *
938 * This is not very scalable on SMP. Double adds are not possible.
939 */
940void add_timer_on(struct timer_list *timer, int cpu)
941{
942 struct tvec_base *base = per_cpu(tvec_bases, cpu);
943 unsigned long flags;
944
945 timer_stats_timer_set_start_info(timer);
946 BUG_ON(timer_pending(timer) || !timer->function);
947 spin_lock_irqsave(&base->lock, flags);
948 timer_set_base(timer, base);
949 debug_activate(timer, timer->expires);
950 internal_add_timer(base, timer);
951 /*
952 * Check whether the other CPU is in dynticks mode and needs
953 * to be triggered to reevaluate the timer wheel.
954 * We are protected against the other CPU fiddling
955 * with the timer by holding the timer base lock. This also
956 * makes sure that a CPU on the way to stop its tick can not
957 * evaluate the timer wheel.
958 *
959 * Spare the IPI for deferrable timers on idle targets though.
960 * The next busy ticks will take care of it. Except full dynticks
961	 * require special care against races with idle_cpu(); let's deal
962 * with that later.
963 */
964 if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu))
965 wake_up_nohz_cpu(cpu);
966
967 spin_unlock_irqrestore(&base->lock, flags);
968}
969EXPORT_SYMBOL_GPL(add_timer_on);
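/*
 * Usage sketch (illustrative; the example_* names are hypothetical):
 * per-CPU watchdog-style code typically arms one (already initialized,
 * not pending) timer per CPU on that CPU:
 *
 *	struct timer_list *t = &per_cpu(example_timer, cpu);
 *
 *	t->expires = jiffies + example_period;
 *	add_timer_on(t, cpu);
 */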
970
971/**
972 * del_timer - deactivate a timer.
973 * @timer: the timer to be deactivated
974 *
975 * del_timer() deactivates a timer - this works on both active and inactive
976 * timers.
977 *
978 * The function returns whether it has deactivated a pending timer or not.
979 * (i.e. del_timer() of an inactive timer returns 0, del_timer() of an
980 * active timer returns 1.)
981 */
982int del_timer(struct timer_list *timer)
983{
984 struct tvec_base *base;
985 unsigned long flags;
986 int ret = 0;
987
988 debug_assert_init(timer);
989
990 timer_stats_timer_clear_start_info(timer);
991 if (timer_pending(timer)) {
992 base = lock_timer_base(timer, &flags);
993 ret = detach_if_pending(timer, base, true);
994 spin_unlock_irqrestore(&base->lock, flags);
995 }
996
997 return ret;
998}
999EXPORT_SYMBOL(del_timer);
1000
1001/**
1002 * try_to_del_timer_sync - Try to deactivate a timer
1003 * @timer: timer to deactivate
1004 *
1005 * This function tries to deactivate a timer. Upon successful (ret >= 0)
1006 * exit the timer is not queued and the handler is not running on any CPU.
1007 */
1008int try_to_del_timer_sync(struct timer_list *timer)
1009{
1010 struct tvec_base *base;
1011 unsigned long flags;
1012 int ret = -1;
1013
1014 debug_assert_init(timer);
1015
1016 base = lock_timer_base(timer, &flags);
1017
1018 if (base->running_timer != timer) {
1019 timer_stats_timer_clear_start_info(timer);
1020 ret = detach_if_pending(timer, base, true);
1021 }
1022 spin_unlock_irqrestore(&base->lock, flags);
1023
1024 return ret;
1025}
1026EXPORT_SYMBOL(try_to_del_timer_sync);
1027
1028#ifdef CONFIG_SMP
1029/**
1030 * del_timer_sync - deactivate a timer and wait for the handler to finish.
1031 * @timer: the timer to be deactivated
1032 *
1033 * This function only differs from del_timer() on SMP: besides deactivating
1034 * the timer it also makes sure the handler has finished executing on other
1035 * CPUs.
1036 *
1037 * Synchronization rules: Callers must prevent restarting of the timer,
1038 * otherwise this function is meaningless. It must not be called from
1039 * interrupt contexts unless the timer is an irqsafe one. The caller must
1040 * not hold locks which would prevent completion of the timer's
1041 * handler. The timer's handler must not call add_timer_on(). Upon exit the
1042 * timer is not queued and the handler is not running on any CPU.
1043 *
1044 * Note: For !irqsafe timers, you must not hold locks that are held in
1045 * interrupt context while calling this function. Even if the lock has
1046 * nothing to do with the timer in question. Here's why:
1047 *
1048 * CPU0 CPU1
1049 * ---- ----
1050 * <SOFTIRQ>
1051 * call_timer_fn();
1052 * base->running_timer = mytimer;
1053 * spin_lock_irq(somelock);
1054 * <IRQ>
1055 * spin_lock(somelock);
1056 * del_timer_sync(mytimer);
1057 * while (base->running_timer == mytimer);
1058 *
1059 * Now del_timer_sync() will never return and never release somelock.
1060 * The interrupt on the other CPU is waiting to grab somelock but
1061 * it has interrupted the softirq that CPU0 is waiting to finish.
1062 *
1063 * The function returns whether it has deactivated a pending timer or not.
1064 */
1065int del_timer_sync(struct timer_list *timer)
1066{
1067#ifdef CONFIG_LOCKDEP
1068 unsigned long flags;
1069
1070 /*
1071 * If lockdep gives a backtrace here, please reference
1072 * the synchronization rules above.
1073 */
1074 local_irq_save(flags);
1075 lock_map_acquire(&timer->lockdep_map);
1076 lock_map_release(&timer->lockdep_map);
1077 local_irq_restore(flags);
1078#endif
1079 /*
1080 * don't use it in hardirq context, because it
1081 * could lead to deadlock.
1082 */
1083 WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
1084 for (;;) {
1085 int ret = try_to_del_timer_sync(timer);
1086 if (ret >= 0)
1087 return ret;
1088 cpu_relax();
1089 }
1090}
1091EXPORT_SYMBOL(del_timer_sync);
1092#endif
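/*
 * Usage sketch (illustrative; "dev" stands for a hypothetical structure
 * embedding the timer): the usual teardown order is to stop the timer
 * synchronously before freeing the object, without holding any lock that
 * the timer handler itself takes:
 *
 *	del_timer_sync(&dev->timer);
 *	kfree(dev);
 *
 * After del_timer_sync() returns, the handler is not running anywhere and
 * the timer is not queued, so the object can be freed safely - provided
 * nothing can re-arm the timer concurrently.
 */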
1093
1094static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1095{
1096 /* cascade all the timers from tv up one level */
1097 struct timer_list *timer, *tmp;
1098 struct list_head tv_list;
1099
1100 list_replace_init(tv->vec + index, &tv_list);
1101
1102 /*
1103 * We are removing _all_ timers from the list, so we
1104 * don't have to detach them individually.
1105 */
1106 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
1107 BUG_ON(tbase_get_base(timer->base) != base);
1108 /* No accounting, while moving them */
1109 __internal_add_timer(base, timer);
1110 }
1111
1112 return index;
1113}
1114
1115static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1116 unsigned long data)
1117{
1118 int count = preempt_count();
1119
1120#ifdef CONFIG_LOCKDEP
1121 /*
1122 * It is permissible to free the timer from inside the
1123	 * function that is called from it; we need to take this into
1124	 * account for lockdep too. To avoid bogus "held lock freed"
1125 * warnings as well as problems when looking into
1126 * timer->lockdep_map, make a copy and use that here.
1127 */
1128 struct lockdep_map lockdep_map;
1129
1130 lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
1131#endif
1132 /*
1133 * Couple the lock chain with the lock chain at
1134 * del_timer_sync() by acquiring the lock_map around the fn()
1135 * call here and in del_timer_sync().
1136 */
1137 lock_map_acquire(&lockdep_map);
1138
1139 trace_timer_expire_entry(timer);
1140 fn(data);
1141 trace_timer_expire_exit(timer);
1142
1143 lock_map_release(&lockdep_map);
1144
1145 if (count != preempt_count()) {
1146 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1147 fn, count, preempt_count());
1148 /*
1149 * Restore the preempt count. That gives us a decent
1150 * chance to survive and extract information. If the
1151 * callback kept a lock held, bad luck, but not worse
1152 * than the BUG() we had.
1153 */
1154 preempt_count_set(count);
1155 }
1156}
1157
1158#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
1159
1160/**
1161 * __run_timers - run all expired timers (if any) on this CPU.
1162 * @base: the timer vector to be processed.
1163 *
1164 * This function cascades all vectors and executes all expired timer
1165 * vectors.
1166 */
1167static inline void __run_timers(struct tvec_base *base)
1168{
1169 struct timer_list *timer;
1170
1171 spin_lock_irq(&base->lock);
1172 if (catchup_timer_jiffies(base)) {
1173 spin_unlock_irq(&base->lock);
1174 return;
1175 }
1176 while (time_after_eq(jiffies, base->timer_jiffies)) {
1177 struct list_head work_list;
1178 struct list_head *head = &work_list;
1179 int index = base->timer_jiffies & TVR_MASK;
1180
1181 /*
1182 * Cascade timers:
1183 */
1184 if (!index &&
1185 (!cascade(base, &base->tv2, INDEX(0))) &&
1186 (!cascade(base, &base->tv3, INDEX(1))) &&
1187 !cascade(base, &base->tv4, INDEX(2)))
1188 cascade(base, &base->tv5, INDEX(3));
1189 ++base->timer_jiffies;
1190 list_replace_init(base->tv1.vec + index, head);
1191 while (!list_empty(head)) {
1192 void (*fn)(unsigned long);
1193 unsigned long data;
1194 bool irqsafe;
1195
1196			timer = list_first_entry(head, struct timer_list, entry);
1197 fn = timer->function;
1198 data = timer->data;
1199 irqsafe = tbase_get_irqsafe(timer->base);
1200
1201 timer_stats_account_timer(timer);
1202
1203 base->running_timer = timer;
1204 detach_expired_timer(timer, base);
1205
1206 if (irqsafe) {
1207 spin_unlock(&base->lock);
1208 call_timer_fn(timer, fn, data);
1209 spin_lock(&base->lock);
1210 } else {
1211 spin_unlock_irq(&base->lock);
1212 call_timer_fn(timer, fn, data);
1213 spin_lock_irq(&base->lock);
1214 }
1215 }
1216 }
1217 base->running_timer = NULL;
1218 spin_unlock_irq(&base->lock);
1219}
1220
1221#ifdef CONFIG_NO_HZ_COMMON
1222/*
1223 * Find out when the next timer event is due to happen. This
1224 * is used on S/390 to stop all activity when a CPU is idle.
1225 * This function needs to be called with interrupts disabled.
1226 */
1227static unsigned long __next_timer_interrupt(struct tvec_base *base)
1228{
1229 unsigned long timer_jiffies = base->timer_jiffies;
1230 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
1231 int index, slot, array, found = 0;
1232 struct timer_list *nte;
1233 struct tvec *varray[4];
1234
1235 /* Look for timer events in tv1. */
1236 index = slot = timer_jiffies & TVR_MASK;
1237 do {
1238 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
1239 if (tbase_get_deferrable(nte->base))
1240 continue;
1241
1242 found = 1;
1243 expires = nte->expires;
1244 /* Look at the cascade bucket(s)? */
1245 if (!index || slot < index)
1246 goto cascade;
1247 return expires;
1248 }
1249 slot = (slot + 1) & TVR_MASK;
1250 } while (slot != index);
1251
1252cascade:
1253 /* Calculate the next cascade event */
1254 if (index)
1255 timer_jiffies += TVR_SIZE - index;
1256 timer_jiffies >>= TVR_BITS;
1257
1258 /* Check tv2-tv5. */
1259 varray[0] = &base->tv2;
1260 varray[1] = &base->tv3;
1261 varray[2] = &base->tv4;
1262 varray[3] = &base->tv5;
1263
1264 for (array = 0; array < 4; array++) {
1265 struct tvec *varp = varray[array];
1266
1267 index = slot = timer_jiffies & TVN_MASK;
1268 do {
1269 list_for_each_entry(nte, varp->vec + slot, entry) {
1270 if (tbase_get_deferrable(nte->base))
1271 continue;
1272
1273 found = 1;
1274 if (time_before(nte->expires, expires))
1275 expires = nte->expires;
1276 }
1277 /*
1278			 * Are we still searching for the first timer, or are
1279			 * we looking up the cascade buckets?
1280 */
1281 if (found) {
1282 /* Look at the cascade bucket(s)? */
1283 if (!index || slot < index)
1284 break;
1285 return expires;
1286 }
1287 slot = (slot + 1) & TVN_MASK;
1288 } while (slot != index);
1289
1290 if (index)
1291 timer_jiffies += TVN_SIZE - index;
1292 timer_jiffies >>= TVN_BITS;
1293 }
1294 return expires;
1295}
1296
1297/*
1298 * Check, if the next hrtimer event is before the next timer wheel
1299 * event:
1300 */
1301static unsigned long cmp_next_hrtimer_event(unsigned long now,
1302 unsigned long expires)
1303{
1304 ktime_t hr_delta = hrtimer_get_next_event();
1305 struct timespec tsdelta;
1306 unsigned long delta;
1307
1308 if (hr_delta.tv64 == KTIME_MAX)
1309 return expires;
1310
1311 /*
1312 * Expired timer available, let it expire in the next tick
1313 */
1314 if (hr_delta.tv64 <= 0)
1315 return now + 1;
1316
1317 tsdelta = ktime_to_timespec(hr_delta);
1318 delta = timespec_to_jiffies(&tsdelta);
1319
1320 /*
1321 * Limit the delta to the max value, which is checked in
1322 * tick_nohz_stop_sched_tick():
1323 */
1324 if (delta > NEXT_TIMER_MAX_DELTA)
1325 delta = NEXT_TIMER_MAX_DELTA;
1326
1327 /*
1328	 * Take rounding errors into account and make sure that it
1329	 * expires in the next tick. Otherwise we go into an endless
1330	 * ping pong due to tick_nohz_stop_sched_tick() retriggering
1331	 * the timer softirq.
1332 */
1333 if (delta < 1)
1334 delta = 1;
1335 now += delta;
1336 if (time_before(now, expires))
1337 return now;
1338 return expires;
1339}
1340
1341/**
1342 * get_next_timer_interrupt - return the jiffy of the next pending timer
1343 * @now: current time (in jiffies)
1344 */
1345unsigned long get_next_timer_interrupt(unsigned long now)
1346{
1347 struct tvec_base *base = __this_cpu_read(tvec_bases);
1348 unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
1349
1350 /*
1351 * Pretend that there is no timer pending if the cpu is offline.
1352 * Possible pending timers will be migrated later to an active cpu.
1353 */
1354 if (cpu_is_offline(smp_processor_id()))
1355 return expires;
1356
1357 spin_lock(&base->lock);
1358 if (base->active_timers) {
1359 if (time_before_eq(base->next_timer, base->timer_jiffies))
1360 base->next_timer = __next_timer_interrupt(base);
1361 expires = base->next_timer;
1362 }
1363 spin_unlock(&base->lock);
1364
1365 if (time_before_eq(expires, now))
1366 return now;
1367
1368 return cmp_next_hrtimer_event(now, expires);
1369}
1370#endif
1371
1372/*
1373 * Called from the timer interrupt handler to charge one tick to the current
1374 * process. user_tick is 1 if the tick is user time, 0 for system.
1375 */
1376void update_process_times(int user_tick)
1377{
1378 struct task_struct *p = current;
1379 int cpu = smp_processor_id();
1380
1381 /* Note: this timer irq context must be accounted for as well. */
1382 account_process_tick(p, user_tick);
1383 run_local_timers();
1384 rcu_check_callbacks(cpu, user_tick);
1385#ifdef CONFIG_IRQ_WORK
1386 if (in_irq())
1387 irq_work_run();
1388#endif
1389 scheduler_tick();
1390 run_posix_cpu_timers(p);
1391}
1392
1393/*
1394 * This function runs timers and the timer-tq in bottom half context.
1395 */
1396static void run_timer_softirq(struct softirq_action *h)
1397{
1398 struct tvec_base *base = __this_cpu_read(tvec_bases);
1399
1400 hrtimer_run_pending();
1401
1402 if (time_after_eq(jiffies, base->timer_jiffies))
1403 __run_timers(base);
1404}
1405
1406/*
1407 * Called by the local, per-CPU timer interrupt on SMP.
1408 */
1409void run_local_timers(void)
1410{
1411 hrtimer_run_queues();
1412 raise_softirq(TIMER_SOFTIRQ);
1413}
1414
1415#ifdef __ARCH_WANT_SYS_ALARM
1416
1417/*
1418 * For backwards compatibility? This can be done in libc so Alpha
1419 * and all newer ports shouldn't need it.
1420 */
1421SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1422{
1423 return alarm_setitimer(seconds);
1424}
1425
1426#endif
1427
1428static void process_timeout(unsigned long __data)
1429{
1430 wake_up_process((struct task_struct *)__data);
1431}
1432
1433/**
1434 * schedule_timeout - sleep until timeout
1435 * @timeout: timeout value in jiffies
1436 *
1437 * Make the current task sleep until @timeout jiffies have
1438 * elapsed. The routine will return immediately unless
1439 * the current task state has been set (see set_current_state()).
1440 *
1441 * You can set the task state as follows -
1442 *
1443 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
1444 * pass before the routine returns. The routine will return 0
1445 * pass before the routine returns. The routine will return 0.
1446 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1447 * delivered to the current task. In this case the remaining time
1448 * in jiffies will be returned, or 0 if the timer expired in time.
1449 *
1450 * The current task state is guaranteed to be TASK_RUNNING when this
1451 * routine returns.
1452 *
1453 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
1454 * the CPU away without a bound on the timeout. In this case the return
1455 * value will be %MAX_SCHEDULE_TIMEOUT.
1456 *
1457 * In all cases the return value is guaranteed to be non-negative.
1458 */
1459signed long __sched schedule_timeout(signed long timeout)
1460{
1461 struct timer_list timer;
1462 unsigned long expire;
1463
1464 switch (timeout)
1465 {
1466 case MAX_SCHEDULE_TIMEOUT:
1467 /*
1468		 * These two special cases are useful for the caller's
1469		 * convenience. Nothing more. We could take
1470		 * MAX_SCHEDULE_TIMEOUT from one of the negative values,
1471		 * but I'd like to return a valid offset (>=0) to allow
1472		 * the caller to do everything it wants with the retval.
1473 */
1474 schedule();
1475 goto out;
1476 default:
1477		 * Another bit of paranoia. Note that the retval will be
1478		 * 0, since no piece of the kernel is supposed to check
1479		 * for a negative retval of schedule_timeout() (it
1480		 * should never happen anyway). You just have the printk()
1481		 * that will tell you if something has gone wrong, and where.
1482 * that will tell you if something is gone wrong and where.
1483 */
1484 if (timeout < 0) {
1485 printk(KERN_ERR "schedule_timeout: wrong timeout "
1486 "value %lx\n", timeout);
1487 dump_stack();
1488 current->state = TASK_RUNNING;
1489 goto out;
1490 }
1491 }
1492
1493 expire = timeout + jiffies;
1494
1495 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1496 __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
1497 schedule();
1498 del_singleshot_timer_sync(&timer);
1499
1500 /* Remove the timer from the object tracker */
1501 destroy_timer_on_stack(&timer);
1502
1503 timeout = expire - jiffies;
1504
1505 out:
1506 return timeout < 0 ? 0 : timeout;
1507}
1508EXPORT_SYMBOL(schedule_timeout);
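/*
 * Usage sketch (illustrative): the task state must be set before calling
 * schedule_timeout(), e.g. to wait up to one second while remaining
 * signal-aware:
 *
 *	set_current_state(TASK_INTERRUPTIBLE);
 *	remaining = schedule_timeout(HZ);
 *
 * A return value of 0 means the full second elapsed; a non-zero return is
 * the time, in jiffies, that was left when the task was woken early.
 */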
1509
1510/*
1511 * We can use __set_current_state() here because schedule_timeout() calls
1512 * schedule() unconditionally.
1513 */
1514signed long __sched schedule_timeout_interruptible(signed long timeout)
1515{
1516 __set_current_state(TASK_INTERRUPTIBLE);
1517 return schedule_timeout(timeout);
1518}
1519EXPORT_SYMBOL(schedule_timeout_interruptible);
1520
1521signed long __sched schedule_timeout_killable(signed long timeout)
1522{
1523 __set_current_state(TASK_KILLABLE);
1524 return schedule_timeout(timeout);
1525}
1526EXPORT_SYMBOL(schedule_timeout_killable);
1527
1528signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1529{
1530 __set_current_state(TASK_UNINTERRUPTIBLE);
1531 return schedule_timeout(timeout);
1532}
1533EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1534
1535static int init_timers_cpu(int cpu)
1536{
1537 int j;
1538 struct tvec_base *base;
1539 static char tvec_base_done[NR_CPUS];
1540
1541 if (!tvec_base_done[cpu]) {
1542 static char boot_done;
1543
1544 if (boot_done) {
1545 /*
1546 * The APs use this path later in boot
1547 */
1548 base = kzalloc_node(sizeof(*base), GFP_KERNEL,
1549 cpu_to_node(cpu));
1550 if (!base)
1551 return -ENOMEM;
1552
1553 /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
1554 if (WARN_ON(base != tbase_get_base(base))) {
1555 kfree(base);
1556 return -ENOMEM;
1557 }
1558 per_cpu(tvec_bases, cpu) = base;
1559 } else {
1560 /*
1561 * This is for the boot CPU - we use compile-time
1562 * static initialisation because per-cpu memory isn't
1563 * ready yet and because the memory allocators are not
1564 * initialised either.
1565 */
1566 boot_done = 1;
1567 base = &boot_tvec_bases;
1568 }
1569 spin_lock_init(&base->lock);
1570 tvec_base_done[cpu] = 1;
1571 } else {
1572 base = per_cpu(tvec_bases, cpu);
1573 }
1574
1575
1576 for (j = 0; j < TVN_SIZE; j++) {
1577 INIT_LIST_HEAD(base->tv5.vec + j);
1578 INIT_LIST_HEAD(base->tv4.vec + j);
1579 INIT_LIST_HEAD(base->tv3.vec + j);
1580 INIT_LIST_HEAD(base->tv2.vec + j);
1581 }
1582 for (j = 0; j < TVR_SIZE; j++)
1583 INIT_LIST_HEAD(base->tv1.vec + j);
1584
1585 base->timer_jiffies = jiffies;
1586 base->next_timer = base->timer_jiffies;
1587 base->active_timers = 0;
1588 base->all_timers = 0;
1589 return 0;
1590}
1591
1592#ifdef CONFIG_HOTPLUG_CPU
1593static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
1594{
1595 struct timer_list *timer;
1596
1597 while (!list_empty(head)) {
1598 timer = list_first_entry(head, struct timer_list, entry);
1599 /* We ignore the accounting on the dying cpu */
1600 detach_timer(timer, false);
1601 timer_set_base(timer, new_base);
1602 internal_add_timer(new_base, timer);
1603 }
1604}
1605
1606static void migrate_timers(int cpu)
1607{
1608 struct tvec_base *old_base;
1609 struct tvec_base *new_base;
1610 int i;
1611
1612 BUG_ON(cpu_online(cpu));
1613 old_base = per_cpu(tvec_bases, cpu);
1614 new_base = get_cpu_var(tvec_bases);
1615 /*
1616 * The caller is globally serialized and nobody else
1617	 * takes two locks at once, so deadlock is not possible.
1618 */
1619 spin_lock_irq(&new_base->lock);
1620 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1621
1622 BUG_ON(old_base->running_timer);
1623
1624 for (i = 0; i < TVR_SIZE; i++)
1625 migrate_timer_list(new_base, old_base->tv1.vec + i);
1626 for (i = 0; i < TVN_SIZE; i++) {
1627 migrate_timer_list(new_base, old_base->tv2.vec + i);
1628 migrate_timer_list(new_base, old_base->tv3.vec + i);
1629 migrate_timer_list(new_base, old_base->tv4.vec + i);
1630 migrate_timer_list(new_base, old_base->tv5.vec + i);
1631 }
1632
1633 spin_unlock(&old_base->lock);
1634 spin_unlock_irq(&new_base->lock);
1635 put_cpu_var(tvec_bases);
1636}
1637#endif /* CONFIG_HOTPLUG_CPU */
1638
1639static int timer_cpu_notify(struct notifier_block *self,
1640 unsigned long action, void *hcpu)
1641{
1642 long cpu = (long)hcpu;
1643 int err;
1644
1645 switch(action) {
1646 case CPU_UP_PREPARE:
1647 case CPU_UP_PREPARE_FROZEN:
1648 err = init_timers_cpu(cpu);
1649 if (err < 0)
1650 return notifier_from_errno(err);
1651 break;
1652#ifdef CONFIG_HOTPLUG_CPU
1653 case CPU_DEAD:
1654 case CPU_DEAD_FROZEN:
1655 migrate_timers(cpu);
1656 break;
1657#endif
1658 default:
1659 break;
1660 }
1661 return NOTIFY_OK;
1662}
1663
1664static struct notifier_block timers_nb = {
1665 .notifier_call = timer_cpu_notify,
1666};
1667
1668
1669void __init init_timers(void)
1670{
1671 int err;
1672
1673 /* ensure there are enough low bits for flags in timer->base pointer */
1674 BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
1675
1676 err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1677 (void *)(long)smp_processor_id());
1678 BUG_ON(err != NOTIFY_OK);
1679
1680 init_timer_stats();
1681 register_cpu_notifier(&timers_nb);
1682 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1683}
1684
1685/**
1686 * msleep - sleep safely even with waitqueue interruptions
1687 * @msecs: Time in milliseconds to sleep for
1688 */
1689void msleep(unsigned int msecs)
1690{
1691 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1692
1693 while (timeout)
1694 timeout = schedule_timeout_uninterruptible(timeout);
1695}
1696
1697EXPORT_SYMBOL(msleep);
1698
1699/**
1700 * msleep_interruptible - sleep waiting for signals
1701 * @msecs: Time in milliseconds to sleep for
1702 */
1703unsigned long msleep_interruptible(unsigned int msecs)
1704{
1705 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1706
1707 while (timeout && !signal_pending(current))
1708 timeout = schedule_timeout_interruptible(timeout);
1709 return jiffies_to_msecs(timeout);
1710}
1711
1712EXPORT_SYMBOL(msleep_interruptible);
1713
1714static int __sched do_usleep_range(unsigned long min, unsigned long max)
1715{
1716 ktime_t kmin;
1717 unsigned long delta;
1718
1719 kmin = ktime_set(0, min * NSEC_PER_USEC);
1720 delta = (max - min) * NSEC_PER_USEC;
1721 return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
1722}
1723
1724/**
1725 * usleep_range - Drop in replacement for udelay where wakeup is flexible
1726 * @min: Minimum time in usecs to sleep
1727 * @max: Maximum time in usecs to sleep
1728 */
1729void usleep_range(unsigned long min, unsigned long max)
1730{
1731 __set_current_state(TASK_UNINTERRUPTIBLE);
1732 do_usleep_range(min, max);
1733}
1734EXPORT_SYMBOL(usleep_range);
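/*
 * Selection sketch (illustrative guidance only; exact cutoffs depend on
 * HZ): very short delays busy-wait with udelay(); sleeps in the
 * microseconds-to-low-milliseconds range are better served by
 * usleep_range(), which is hrtimer based and lets wakeups coalesce within
 * [min, max]; sleeps of roughly 10ms and more can use msleep() or
 * msleep_interruptible(), which are jiffy based:
 *
 *	usleep_range(100, 200);		(sleep 100-200us)
 *	msleep(20);			(sleep at least 20ms)
 */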