author     Linus Torvalds <torvalds@linux-foundation.org>   2014-08-05 20:46:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-08-05 20:46:42 -0400
commit     e7fda6c4c3c1a7d6996dd75fd84670fa0b5d448f
tree       daa51c16462c318b890acf7f01fba5827275dd74 /kernel/time
parent     08d69a25714429850cf9ef71f22d8cdc9189d93f
parent     953dec21aed4038464fec02f96a2f1b8701a5bce
Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer and time updates from Thomas Gleixner:
 "A rather large update of timers, timekeeping & co

   - Core timekeeping code is year-2038 safe now for 32bit machines.
     Now we just need to fix all in kernel users and the gazillion of
     user space interfaces which rely on timespec/timeval :)

   - Better cache layout for the timekeeping internal data structures.

   - Proper nanosecond based interfaces for in kernel users.

   - Tree wide cleanup of code which wants nanoseconds but does hoops
     and loops to convert back and forth from timespecs.  Some of it
     definitely belongs into the ugly code museum.

   - Consolidation of the timekeeping interface zoo.

   - A fast NMI safe accessor to clock monotonic for tracing.  This is
     a long standing request to support correlated user/kernel space
     traces.  With proper NTP frequency correction it's also suitable
     for correlation of traces across separate machines.

   - Checkpoint/restart support for timerfd.

   - A few NOHZ[_FULL] improvements in the [hr]timer code.

   - Code move from kernel to kernel/time of all time* related code.

   - New clocksource/event drivers from the ARM universe.  I'm really
     impressed that despite an architected timer in the newer chips SoC
     manufacturers insist on inventing new and differently broken SoC
     specific timers.

     [ Ed. "Impressed"?  I don't think that word means what you think
       it means ]

   - Another round of code move from arch to drivers.  Looks like most
     of the legacy mess in ARM regarding timers is sorted out except
     for a few obnoxious strongholds.

   - The usual updates and fixlets all over the place"

* 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (114 commits)
  timekeeping: Fixup typo in update_vsyscall_old definition
  clocksource: document some basic timekeeping concepts
  timekeeping: Use cached ntp_tick_length when accumulating error
  timekeeping: Rework frequency adjustments to work better w/ nohz
  timekeeping: Minor fixup for timespec64->timespec assignment
  ftrace: Provide trace clocks monotonic
  timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC
  seqcount: Add raw_write_seqcount_latch()
  seqcount: Provide raw_read_seqcount()
  timekeeping: Use tk_read_base as argument for timekeeping_get_ns()
  timekeeping: Create struct tk_read_base and use it in struct timekeeper
  timekeeping: Restructure the timekeeper some more
  clocksource: Get rid of cycle_last
  clocksource: Move cycle_last validation to core code
  clocksource: Make delta calculation a function
  wireless: ath9k: Get rid of timespec conversions
  drm: vmwgfx: Use nsec based interfaces
  drm: i915: Use nsec based interfaces
  timekeeping: Provide ktime_get_raw()
  hangcheck-timer: Use ktime_get_ns()
  ...
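[Editor's note: the "proper nanosecond based interfaces for in kernel users" mentioned above are the accessors the shortlog entries "hangcheck-timer: Use ktime_get_ns()" and "timekeeping: Provide ktime_get_raw()" refer to. Below is a minimal usage sketch, not part of this merge; some_operation() is a hypothetical workload and the include of <linux/timekeeping.h> assumes the post-merge header layout.]

#include <linux/ktime.h>
#include <linux/timekeeping.h>

extern void some_operation(void);	/* hypothetical workload */

/*
 * Sketch: time an operation in plain nanoseconds, with no
 * timespec/timeval round trips.
 */
static u64 time_some_operation(void)
{
	u64 start = ktime_get_ns();	/* CLOCK_MONOTONIC, in ns */

	some_operation();

	return ktime_get_ns() - start;	/* elapsed nanoseconds */
}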
Diffstat (limited to 'kernel/time')
-rw-r--r--  kernel/time/Kconfig                     9
-rw-r--r--  kernel/time/Makefile                   19
-rw-r--r--  kernel/time/clocksource.c              12
-rw-r--r--  kernel/time/hrtimer.c                1866
-rw-r--r--  kernel/time/itimer.c                  301
-rw-r--r--  kernel/time/ntp.c                      15
-rw-r--r--  kernel/time/ntp_internal.h              2
-rw-r--r--  kernel/time/posix-cpu-timers.c       1490
-rw-r--r--  kernel/time/posix-timers.c           1123
-rw-r--r--  kernel/time/tick-internal.h             2
-rw-r--r--  kernel/time/time.c                    778
-rw-r--r--  kernel/time/timeconst.bc              108
-rw-r--r--  kernel/time/timekeeping.c            1147
-rw-r--r--  kernel/time/timekeeping.h              20
-rw-r--r--  kernel/time/timekeeping_debug.c         2
-rw-r--r--  kernel/time/timekeeping_internal.h     17
-rw-r--r--  kernel/time/timer.c                  1736
-rw-r--r--  kernel/time/udelay_test.c             168
18 files changed, 8260 insertions, 555 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f448513a45ed..d626dc98e8df 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
 config ARCH_CLOCKSOURCE_DATA
 	bool
 
+# Clocksources require validation of the clocksource against the last
+# cycle update - x86/TSC misfeature
+config CLOCKSOURCE_VALIDATE_LAST_CYCLE
+	bool
+
 # Timekeeping vsyscall support
 config GENERIC_TIME_VSYSCALL
 	bool
@@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL
 config GENERIC_TIME_VSYSCALL_OLD
 	bool
 
-# ktime_t scalar 64bit nsec representation
-config KTIME_SCALAR
-	bool
-
 # Old style timekeeping
 config ARCH_USES_GETTIMEOFFSET
 	bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 57a413fd0ebf..7347426fa68d 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,3 +1,4 @@
+obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
 obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
 obj-y += timeconv.o posix-clock.o alarmtimer.o
 
@@ -12,3 +13,21 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
 obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
 obj-$(CONFIG_TIMER_STATS) += timer_stats.o
 obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
+obj-$(CONFIG_TEST_UDELAY) += udelay_test.o
+
+$(obj)/time.o: $(obj)/timeconst.h
+
+quiet_cmd_hzfile = HZFILE $@
+      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+	$(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+      cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
+targets += timeconst.h
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+	$(call if_changed,bc)
+
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ba3e502c955a..2e949cc9c9f1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -32,6 +32,7 @@
 #include <linux/kthread.h>
 
 #include "tick-internal.h"
+#include "timekeeping_internal.h"
 
 void timecounter_init(struct timecounter *tc,
 		      const struct cyclecounter *cc,
@@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
 static void clocksource_watchdog(unsigned long data)
 {
 	struct clocksource *cs;
-	cycle_t csnow, wdnow;
+	cycle_t csnow, wdnow, delta;
 	int64_t wd_nsec, cs_nsec;
 	int next_cpu, reset_pending;
 
@@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data)
 			continue;
 		}
 
-		wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
-					     watchdog->mult, watchdog->shift);
+		delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
+		wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
+					     watchdog->shift);
 
-		cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
-					     cs->mask, cs->mult, cs->shift);
+		delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
+		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
 		cs->cs_last = csnow;
 		cs->wd_last = wdnow;
 
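[Editor's note: the hunk above replaces the open coded "(now - last) & mask" arithmetic with clocksource_delta(), which the diffstat shows being added to kernel/time/timekeeping_internal.h (not reproduced in this excerpt). The sketch below illustrates the idea; the guard name comes from the Kconfig hunk earlier, and the body should be treated as illustrative rather than a verbatim copy of the merged helper.]

#include <linux/clocksource.h>	/* cycle_t */

#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
{
	cycle_t ret = (now - last) & mask;

	/*
	 * Clamp an apparently negative delta (e.g. slightly
	 * unsynchronized TSCs) to zero instead of returning a huge
	 * wrapped value.
	 */
	return (s64) ret > 0 ? ret : 0;
}
#else
static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
{
	return (now - last) & mask;
}
#endif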
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
new file mode 100644
index 000000000000..1c2fe7de2842
--- /dev/null
+++ b/kernel/time/hrtimer.c
@@ -0,0 +1,1866 @@
1/*
2 * linux/kernel/hrtimer.c
3 *
4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
7 *
8 * High-resolution kernel timers
9 *
10 * In contrast to the low-resolution timeout API implemented in
11 * kernel/timer.c, hrtimers provide finer resolution and accuracy
12 * depending on system configuration and capabilities.
13 *
14 * These timers are currently used for:
15 * - itimers
16 * - POSIX timers
17 * - nanosleep
18 * - precise in-kernel timing
19 *
20 * Started by: Thomas Gleixner and Ingo Molnar
21 *
22 * Credits:
23 * based on kernel/timer.c
24 *
25 * Help, testing, suggestions, bugfixes, improvements were
26 * provided by:
27 *
28 * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
29 * et. al.
30 *
31 * For licencing details see kernel-base/COPYING
32 */
33
34#include <linux/cpu.h>
35#include <linux/export.h>
36#include <linux/percpu.h>
37#include <linux/hrtimer.h>
38#include <linux/notifier.h>
39#include <linux/syscalls.h>
40#include <linux/kallsyms.h>
41#include <linux/interrupt.h>
42#include <linux/tick.h>
43#include <linux/seq_file.h>
44#include <linux/err.h>
45#include <linux/debugobjects.h>
46#include <linux/sched.h>
47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h>
49#include <linux/sched/deadline.h>
50#include <linux/timer.h>
51#include <linux/freezer.h>
52
53#include <asm/uaccess.h>
54
55#include <trace/events/timer.h>
56
57#include "timekeeping.h"
58
59/*
60 * The timer bases:
61 *
62 * There are more clockids than hrtimer bases. Thus, we index
63 * into the timer bases by the hrtimer_base_type enum. When trying
64 * to reach a base using a clockid, hrtimer_clockid_to_base()
65 * is used to convert from clockid to the proper hrtimer_base_type.
66 */
67DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
68{
69
70 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
71 .clock_base =
72 {
73 {
74 .index = HRTIMER_BASE_MONOTONIC,
75 .clockid = CLOCK_MONOTONIC,
76 .get_time = &ktime_get,
77 .resolution = KTIME_LOW_RES,
78 },
79 {
80 .index = HRTIMER_BASE_REALTIME,
81 .clockid = CLOCK_REALTIME,
82 .get_time = &ktime_get_real,
83 .resolution = KTIME_LOW_RES,
84 },
85 {
86 .index = HRTIMER_BASE_BOOTTIME,
87 .clockid = CLOCK_BOOTTIME,
88 .get_time = &ktime_get_boottime,
89 .resolution = KTIME_LOW_RES,
90 },
91 {
92 .index = HRTIMER_BASE_TAI,
93 .clockid = CLOCK_TAI,
94 .get_time = &ktime_get_clocktai,
95 .resolution = KTIME_LOW_RES,
96 },
97 }
98};
99
100static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
101 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
102 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
103 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
104 [CLOCK_TAI] = HRTIMER_BASE_TAI,
105};
106
107static inline int hrtimer_clockid_to_base(clockid_t clock_id)
108{
109 return hrtimer_clock_to_base_table[clock_id];
110}
111
112
113/*
114 * Get the coarse grained time at the softirq based on xtime and
115 * wall_to_monotonic.
116 */
117static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
118{
119 ktime_t xtim, mono, boot, tai;
120 ktime_t off_real, off_boot, off_tai;
121
122 mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
123 boot = ktime_add(mono, off_boot);
124 xtim = ktime_add(mono, off_real);
125 tai = ktime_add(xtim, off_tai);
126
127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
129 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
130 base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
131}
132
133/*
134 * Functions and macros which are different for UP/SMP systems are kept in a
135 * single place
136 */
137#ifdef CONFIG_SMP
138
139/*
140 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
141 * means that all timers which are tied to this base via timer->base are
142 * locked, and the base itself is locked too.
143 *
144 * So __run_timers/migrate_timers can safely modify all timers which could
145 * be found on the lists/queues.
146 *
147 * When the timer's base is locked, and the timer removed from list, it is
148 * possible to set timer->base = NULL and drop the lock: the timer remains
149 * locked.
150 */
151static
152struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
153 unsigned long *flags)
154{
155 struct hrtimer_clock_base *base;
156
157 for (;;) {
158 base = timer->base;
159 if (likely(base != NULL)) {
160 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
161 if (likely(base == timer->base))
162 return base;
163 /* The timer has migrated to another CPU: */
164 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
165 }
166 cpu_relax();
167 }
168}
169
170/*
171 * With HIGHRES=y we do not migrate the timer when it is expiring
172 * before the next event on the target cpu because we cannot reprogram
173 * the target cpu hardware and we would cause it to fire late.
174 *
175 * Called with cpu_base->lock of target cpu held.
176 */
177static int
178hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
179{
180#ifdef CONFIG_HIGH_RES_TIMERS
181 ktime_t expires;
182
183 if (!new_base->cpu_base->hres_active)
184 return 0;
185
186 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
187 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
188#else
189 return 0;
190#endif
191}
192
193/*
194 * Switch the timer base to the current CPU when possible.
195 */
196static inline struct hrtimer_clock_base *
197switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
198 int pinned)
199{
200 struct hrtimer_clock_base *new_base;
201 struct hrtimer_cpu_base *new_cpu_base;
202 int this_cpu = smp_processor_id();
203 int cpu = get_nohz_timer_target(pinned);
204 int basenum = base->index;
205
206again:
207 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
208 new_base = &new_cpu_base->clock_base[basenum];
209
210 if (base != new_base) {
211 /*
212 * We are trying to move timer to new_base.
213 * However we can't change timer's base while it is running,
214 * so we keep it on the same CPU. No hassle vs. reprogramming
215 * the event source in the high resolution case. The softirq
216 * code will take care of this when the timer function has
217 * completed. There is no conflict as we hold the lock until
218 * the timer is enqueued.
219 */
220 if (unlikely(hrtimer_callback_running(timer)))
221 return base;
222
223 /* See the comment in lock_timer_base() */
224 timer->base = NULL;
225 raw_spin_unlock(&base->cpu_base->lock);
226 raw_spin_lock(&new_base->cpu_base->lock);
227
228 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
229 cpu = this_cpu;
230 raw_spin_unlock(&new_base->cpu_base->lock);
231 raw_spin_lock(&base->cpu_base->lock);
232 timer->base = base;
233 goto again;
234 }
235 timer->base = new_base;
236 } else {
237 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
238 cpu = this_cpu;
239 goto again;
240 }
241 }
242 return new_base;
243}
244
245#else /* CONFIG_SMP */
246
247static inline struct hrtimer_clock_base *
248lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
249{
250 struct hrtimer_clock_base *base = timer->base;
251
252 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
253
254 return base;
255}
256
257# define switch_hrtimer_base(t, b, p) (b)
258
259#endif /* !CONFIG_SMP */
260
261/*
262 * Functions for the union type storage format of ktime_t which are
263 * too large for inlining:
264 */
265#if BITS_PER_LONG < 64
266/*
267 * Divide a ktime value by a nanosecond value
268 */
269u64 ktime_divns(const ktime_t kt, s64 div)
270{
271 u64 dclc;
272 int sft = 0;
273
274 dclc = ktime_to_ns(kt);
275 /* Make sure the divisor is less than 2^32: */
276 while (div >> 32) {
277 sft++;
278 div >>= 1;
279 }
280 dclc >>= sft;
281 do_div(dclc, (unsigned long) div);
282
283 return dclc;
284}
285EXPORT_SYMBOL_GPL(ktime_divns);
286#endif /* BITS_PER_LONG < 64 */
287
288/*
289 * Add two ktime values and do a safety check for overflow:
290 */
291ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
292{
293 ktime_t res = ktime_add(lhs, rhs);
294
295 /*
296 * We use KTIME_SEC_MAX here, the maximum timeout which we can
297 * return to user space in a timespec:
298 */
299 if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64)
300 res = ktime_set(KTIME_SEC_MAX, 0);
301
302 return res;
303}
304
305EXPORT_SYMBOL_GPL(ktime_add_safe);
306
307#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
308
309static struct debug_obj_descr hrtimer_debug_descr;
310
311static void *hrtimer_debug_hint(void *addr)
312{
313 return ((struct hrtimer *) addr)->function;
314}
315
316/*
317 * fixup_init is called when:
318 * - an active object is initialized
319 */
320static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
321{
322 struct hrtimer *timer = addr;
323
324 switch (state) {
325 case ODEBUG_STATE_ACTIVE:
326 hrtimer_cancel(timer);
327 debug_object_init(timer, &hrtimer_debug_descr);
328 return 1;
329 default:
330 return 0;
331 }
332}
333
334/*
335 * fixup_activate is called when:
336 * - an active object is activated
337 * - an unknown object is activated (might be a statically initialized object)
338 */
339static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
340{
341 switch (state) {
342
343 case ODEBUG_STATE_NOTAVAILABLE:
344 WARN_ON_ONCE(1);
345 return 0;
346
347 case ODEBUG_STATE_ACTIVE:
348 WARN_ON(1);
349
350 default:
351 return 0;
352 }
353}
354
355/*
356 * fixup_free is called when:
357 * - an active object is freed
358 */
359static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
360{
361 struct hrtimer *timer = addr;
362
363 switch (state) {
364 case ODEBUG_STATE_ACTIVE:
365 hrtimer_cancel(timer);
366 debug_object_free(timer, &hrtimer_debug_descr);
367 return 1;
368 default:
369 return 0;
370 }
371}
372
373static struct debug_obj_descr hrtimer_debug_descr = {
374 .name = "hrtimer",
375 .debug_hint = hrtimer_debug_hint,
376 .fixup_init = hrtimer_fixup_init,
377 .fixup_activate = hrtimer_fixup_activate,
378 .fixup_free = hrtimer_fixup_free,
379};
380
381static inline void debug_hrtimer_init(struct hrtimer *timer)
382{
383 debug_object_init(timer, &hrtimer_debug_descr);
384}
385
386static inline void debug_hrtimer_activate(struct hrtimer *timer)
387{
388 debug_object_activate(timer, &hrtimer_debug_descr);
389}
390
391static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
392{
393 debug_object_deactivate(timer, &hrtimer_debug_descr);
394}
395
396static inline void debug_hrtimer_free(struct hrtimer *timer)
397{
398 debug_object_free(timer, &hrtimer_debug_descr);
399}
400
401static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
402 enum hrtimer_mode mode);
403
404void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
405 enum hrtimer_mode mode)
406{
407 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
408 __hrtimer_init(timer, clock_id, mode);
409}
410EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
411
412void destroy_hrtimer_on_stack(struct hrtimer *timer)
413{
414 debug_object_free(timer, &hrtimer_debug_descr);
415}
416
417#else
418static inline void debug_hrtimer_init(struct hrtimer *timer) { }
419static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
420static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
421#endif
422
423static inline void
424debug_init(struct hrtimer *timer, clockid_t clockid,
425 enum hrtimer_mode mode)
426{
427 debug_hrtimer_init(timer);
428 trace_hrtimer_init(timer, clockid, mode);
429}
430
431static inline void debug_activate(struct hrtimer *timer)
432{
433 debug_hrtimer_activate(timer);
434 trace_hrtimer_start(timer);
435}
436
437static inline void debug_deactivate(struct hrtimer *timer)
438{
439 debug_hrtimer_deactivate(timer);
440 trace_hrtimer_cancel(timer);
441}
442
443/* High resolution timer related functions */
444#ifdef CONFIG_HIGH_RES_TIMERS
445
446/*
447 * High resolution timer enabled ?
448 */
449static int hrtimer_hres_enabled __read_mostly = 1;
450
451/*
452 * Enable / Disable high resolution mode
453 */
454static int __init setup_hrtimer_hres(char *str)
455{
456 if (!strcmp(str, "off"))
457 hrtimer_hres_enabled = 0;
458 else if (!strcmp(str, "on"))
459 hrtimer_hres_enabled = 1;
460 else
461 return 0;
462 return 1;
463}
464
465__setup("highres=", setup_hrtimer_hres);
466
467/*
468 * hrtimer_is_hres_enabled - query, if the highres mode is enabled
469 */
470static inline int hrtimer_is_hres_enabled(void)
471{
472 return hrtimer_hres_enabled;
473}
474
475/*
476 * Is the high resolution mode active ?
477 */
478static inline int hrtimer_hres_active(void)
479{
480 return __this_cpu_read(hrtimer_bases.hres_active);
481}
482
483/*
484 * Reprogram the event source with checking both queues for the
485 * next event
486 * Called with interrupts disabled and base->lock held
487 */
488static void
489hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
490{
491 int i;
492 struct hrtimer_clock_base *base = cpu_base->clock_base;
493 ktime_t expires, expires_next;
494
495 expires_next.tv64 = KTIME_MAX;
496
497 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
498 struct hrtimer *timer;
499 struct timerqueue_node *next;
500
501 next = timerqueue_getnext(&base->active);
502 if (!next)
503 continue;
504 timer = container_of(next, struct hrtimer, node);
505
506 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
507 /*
508 * clock_was_set() has changed base->offset so the
509 * result might be negative. Fix it up to prevent a
510 * false positive in clockevents_program_event()
511 */
512 if (expires.tv64 < 0)
513 expires.tv64 = 0;
514 if (expires.tv64 < expires_next.tv64)
515 expires_next = expires;
516 }
517
518 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
519 return;
520
521 cpu_base->expires_next.tv64 = expires_next.tv64;
522
523 /*
524 * If a hang was detected in the last timer interrupt then we
525 * leave the hang delay active in the hardware. We want the
526 * system to make progress. That also prevents the following
527 * scenario:
528 * T1 expires 50ms from now
529 * T2 expires 5s from now
530 *
531 * T1 is removed, so this code is called and would reprogram
532 * the hardware to 5s from now. Any hrtimer_start after that
533 * will not reprogram the hardware due to hang_detected being
534 * set. So we'd effectively block all timers until the T2 event
535 * fires.
536 */
537 if (cpu_base->hang_detected)
538 return;
539
540 if (cpu_base->expires_next.tv64 != KTIME_MAX)
541 tick_program_event(cpu_base->expires_next, 1);
542}
543
544/*
545 * Shared reprogramming for clock_realtime and clock_monotonic
546 *
547 * When a timer is enqueued and expires earlier than the already enqueued
548 * timers, we have to check, whether it expires earlier than the timer for
549 * which the clock event device was armed.
550 *
551 * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
552 * and no expiry check happens. The timer gets enqueued into the rbtree. The
553 * reprogramming and expiry check is done in the hrtimer_interrupt or in the
554 * softirq.
555 *
556 * Called with interrupts disabled and base->cpu_base.lock held
557 */
558static int hrtimer_reprogram(struct hrtimer *timer,
559 struct hrtimer_clock_base *base)
560{
561 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
562 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
563 int res;
564
565 WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
566
567 /*
568 * When the callback is running, we do not reprogram the clock event
569 * device. The timer callback is either running on a different CPU or
570 * the callback is executed in the hrtimer_interrupt context. The
571 * reprogramming is handled either by the softirq, which called the
572 * callback or at the end of the hrtimer_interrupt.
573 */
574 if (hrtimer_callback_running(timer))
575 return 0;
576
577 /*
578 * CLOCK_REALTIME timer might be requested with an absolute
579 * expiry time which is less than base->offset. Nothing wrong
580 * about that, just avoid to call into the tick code, which
581 * has now objections against negative expiry values.
582 */
583 if (expires.tv64 < 0)
584 return -ETIME;
585
586 if (expires.tv64 >= cpu_base->expires_next.tv64)
587 return 0;
588
589 /*
590 * If a hang was detected in the last timer interrupt then we
591 * do not schedule a timer which is earlier than the expiry
592 * which we enforced in the hang detection. We want the system
593 * to make progress.
594 */
595 if (cpu_base->hang_detected)
596 return 0;
597
598 /*
599 * Clockevents returns -ETIME, when the event was in the past.
600 */
601 res = tick_program_event(expires, 0);
602 if (!IS_ERR_VALUE(res))
603 cpu_base->expires_next = expires;
604 return res;
605}
606
607/*
608 * Initialize the high resolution related parts of cpu_base
609 */
610static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
611{
612 base->expires_next.tv64 = KTIME_MAX;
613 base->hres_active = 0;
614}
615
616static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
617{
618 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
619 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
620 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
621
622 return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
623}
624
625/*
626 * Retrigger next event is called after clock was set
627 *
628 * Called with interrupts disabled via on_each_cpu()
629 */
630static void retrigger_next_event(void *arg)
631{
632 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
633
634 if (!hrtimer_hres_active())
635 return;
636
637 raw_spin_lock(&base->lock);
638 hrtimer_update_base(base);
639 hrtimer_force_reprogram(base, 0);
640 raw_spin_unlock(&base->lock);
641}
642
643/*
644 * Switch to high resolution mode
645 */
646static int hrtimer_switch_to_hres(void)
647{
648 int i, cpu = smp_processor_id();
649 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
650 unsigned long flags;
651
652 if (base->hres_active)
653 return 1;
654
655 local_irq_save(flags);
656
657 if (tick_init_highres()) {
658 local_irq_restore(flags);
659 printk(KERN_WARNING "Could not switch to high resolution "
660 "mode on CPU %d\n", cpu);
661 return 0;
662 }
663 base->hres_active = 1;
664 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
665 base->clock_base[i].resolution = KTIME_HIGH_RES;
666
667 tick_setup_sched_timer();
668 /* "Retrigger" the interrupt to get things going */
669 retrigger_next_event(NULL);
670 local_irq_restore(flags);
671 return 1;
672}
673
674static void clock_was_set_work(struct work_struct *work)
675{
676 clock_was_set();
677}
678
679static DECLARE_WORK(hrtimer_work, clock_was_set_work);
680
681/*
682 * Called from timekeeping and resume code to reprogram the hrtimer
683 * interrupt device on all cpus.
684 */
685void clock_was_set_delayed(void)
686{
687 schedule_work(&hrtimer_work);
688}
689
690#else
691
692static inline int hrtimer_hres_active(void) { return 0; }
693static inline int hrtimer_is_hres_enabled(void) { return 0; }
694static inline int hrtimer_switch_to_hres(void) { return 0; }
695static inline void
696hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
697static inline int hrtimer_reprogram(struct hrtimer *timer,
698 struct hrtimer_clock_base *base)
699{
700 return 0;
701}
702static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
703static inline void retrigger_next_event(void *arg) { }
704
705#endif /* CONFIG_HIGH_RES_TIMERS */
706
707/*
708 * Clock realtime was set
709 *
710 * Change the offset of the realtime clock vs. the monotonic
711 * clock.
712 *
713 * We might have to reprogram the high resolution timer interrupt. On
714 * SMP we call the architecture specific code to retrigger _all_ high
715 * resolution timer interrupts. On UP we just disable interrupts and
716 * call the high resolution interrupt code.
717 */
718void clock_was_set(void)
719{
720#ifdef CONFIG_HIGH_RES_TIMERS
721 /* Retrigger the CPU local events everywhere */
722 on_each_cpu(retrigger_next_event, NULL, 1);
723#endif
724 timerfd_clock_was_set();
725}
726
727/*
728 * During resume we might have to reprogram the high resolution timer
729 * interrupt on all online CPUs. However, all other CPUs will be
730 * stopped with interrupts disabled so the clock_was_set() call
731 * must be deferred.
732 */
733void hrtimers_resume(void)
734{
735 WARN_ONCE(!irqs_disabled(),
736 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
737
738 /* Retrigger on the local CPU */
739 retrigger_next_event(NULL);
740 /* And schedule a retrigger for all others */
741 clock_was_set_delayed();
742}
743
744static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
745{
746#ifdef CONFIG_TIMER_STATS
747 if (timer->start_site)
748 return;
749 timer->start_site = __builtin_return_address(0);
750 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
751 timer->start_pid = current->pid;
752#endif
753}
754
755static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
756{
757#ifdef CONFIG_TIMER_STATS
758 timer->start_site = NULL;
759#endif
760}
761
762static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
763{
764#ifdef CONFIG_TIMER_STATS
765 if (likely(!timer_stats_active))
766 return;
767 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
768 timer->function, timer->start_comm, 0);
769#endif
770}
771
772/*
773 * Counterpart to lock_hrtimer_base above:
774 */
775static inline
776void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
777{
778 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
779}
780
781/**
782 * hrtimer_forward - forward the timer expiry
783 * @timer: hrtimer to forward
784 * @now: forward past this time
785 * @interval: the interval to forward
786 *
787 * Forward the timer expiry so it will expire in the future.
788 * Returns the number of overruns.
789 */
790u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
791{
792 u64 orun = 1;
793 ktime_t delta;
794
795 delta = ktime_sub(now, hrtimer_get_expires(timer));
796
797 if (delta.tv64 < 0)
798 return 0;
799
800 if (interval.tv64 < timer->base->resolution.tv64)
801 interval.tv64 = timer->base->resolution.tv64;
802
803 if (unlikely(delta.tv64 >= interval.tv64)) {
804 s64 incr = ktime_to_ns(interval);
805
806 orun = ktime_divns(delta, incr);
807 hrtimer_add_expires_ns(timer, incr * orun);
808 if (hrtimer_get_expires_tv64(timer) > now.tv64)
809 return orun;
810 /*
811 * This (and the ktime_add() below) is the
812 * correction for exact:
813 */
814 orun++;
815 }
816 hrtimer_add_expires(timer, interval);
817
818 return orun;
819}
820EXPORT_SYMBOL_GPL(hrtimer_forward);
821
822/*
823 * enqueue_hrtimer - internal function to (re)start a timer
824 *
825 * The timer is inserted in expiry order. Insertion into the
826 * red black tree is O(log(n)). Must hold the base lock.
827 *
828 * Returns 1 when the new timer is the leftmost timer in the tree.
829 */
830static int enqueue_hrtimer(struct hrtimer *timer,
831 struct hrtimer_clock_base *base)
832{
833 debug_activate(timer);
834
835 timerqueue_add(&base->active, &timer->node);
836 base->cpu_base->active_bases |= 1 << base->index;
837
838 /*
839 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
840 * state of a possibly running callback.
841 */
842 timer->state |= HRTIMER_STATE_ENQUEUED;
843
844 return (&timer->node == base->active.next);
845}
846
847/*
848 * __remove_hrtimer - internal function to remove a timer
849 *
850 * Caller must hold the base lock.
851 *
852 * High resolution timer mode reprograms the clock event device when the
853 * timer is the one which expires next. The caller can disable this by setting
854 * reprogram to zero. This is useful, when the context does a reprogramming
855 * anyway (e.g. timer interrupt)
856 */
857static void __remove_hrtimer(struct hrtimer *timer,
858 struct hrtimer_clock_base *base,
859 unsigned long newstate, int reprogram)
860{
861 struct timerqueue_node *next_timer;
862 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
863 goto out;
864
865 next_timer = timerqueue_getnext(&base->active);
866 timerqueue_del(&base->active, &timer->node);
867 if (&timer->node == next_timer) {
868#ifdef CONFIG_HIGH_RES_TIMERS
869 /* Reprogram the clock event device, if enabled */
870 if (reprogram && hrtimer_hres_active()) {
871 ktime_t expires;
872
873 expires = ktime_sub(hrtimer_get_expires(timer),
874 base->offset);
875 if (base->cpu_base->expires_next.tv64 == expires.tv64)
876 hrtimer_force_reprogram(base->cpu_base, 1);
877 }
878#endif
879 }
880 if (!timerqueue_getnext(&base->active))
881 base->cpu_base->active_bases &= ~(1 << base->index);
882out:
883 timer->state = newstate;
884}
885
886/*
887 * remove hrtimer, called with base lock held
888 */
889static inline int
890remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
891{
892 if (hrtimer_is_queued(timer)) {
893 unsigned long state;
894 int reprogram;
895
896 /*
897 * Remove the timer and force reprogramming when high
898 * resolution mode is active and the timer is on the current
899 * CPU. If we remove a timer on another CPU, reprogramming is
900 * skipped. The interrupt event on this CPU is fired and
901 * reprogramming happens in the interrupt handler. This is a
902 * rare case and less expensive than a smp call.
903 */
904 debug_deactivate(timer);
905 timer_stats_hrtimer_clear_start_info(timer);
906 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
907 /*
908 * We must preserve the CALLBACK state flag here,
909 * otherwise we could move the timer base in
910 * switch_hrtimer_base.
911 */
912 state = timer->state & HRTIMER_STATE_CALLBACK;
913 __remove_hrtimer(timer, base, state, reprogram);
914 return 1;
915 }
916 return 0;
917}
918
919int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
920 unsigned long delta_ns, const enum hrtimer_mode mode,
921 int wakeup)
922{
923 struct hrtimer_clock_base *base, *new_base;
924 unsigned long flags;
925 int ret, leftmost;
926
927 base = lock_hrtimer_base(timer, &flags);
928
929 /* Remove an active timer from the queue: */
930 ret = remove_hrtimer(timer, base);
931
932 if (mode & HRTIMER_MODE_REL) {
933 tim = ktime_add_safe(tim, base->get_time());
934 /*
935 * CONFIG_TIME_LOW_RES is a temporary way for architectures
936 * to signal that they simply return xtime in
937 * do_gettimeoffset(). In this case we want to round up by
938 * resolution when starting a relative timer, to avoid short
939 * timeouts. This will go away with the GTOD framework.
940 */
941#ifdef CONFIG_TIME_LOW_RES
942 tim = ktime_add_safe(tim, base->resolution);
943#endif
944 }
945
946 hrtimer_set_expires_range_ns(timer, tim, delta_ns);
947
948 /* Switch the timer base, if necessary: */
949 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
950
951 timer_stats_hrtimer_set_start_info(timer);
952
953 leftmost = enqueue_hrtimer(timer, new_base);
954
955 if (!leftmost) {
956 unlock_hrtimer_base(timer, &flags);
957 return ret;
958 }
959
960 if (!hrtimer_is_hres_active(timer)) {
961 /*
962 * Kick to reschedule the next tick to handle the new timer
963 * on dynticks target.
964 */
965 wake_up_nohz_cpu(new_base->cpu_base->cpu);
966 } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) &&
967 hrtimer_reprogram(timer, new_base)) {
968 /*
969 * Only allow reprogramming if the new base is on this CPU.
970 * (it might still be on another CPU if the timer was pending)
971 *
972 * XXX send_remote_softirq() ?
973 */
974 if (wakeup) {
975 /*
976 * We need to drop cpu_base->lock to avoid a
977 * lock ordering issue vs. rq->lock.
978 */
979 raw_spin_unlock(&new_base->cpu_base->lock);
980 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
981 local_irq_restore(flags);
982 return ret;
983 } else {
984 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
985 }
986 }
987
988 unlock_hrtimer_base(timer, &flags);
989
990 return ret;
991}
992EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
993
994/**
995 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
996 * @timer: the timer to be added
997 * @tim: expiry time
998 * @delta_ns: "slack" range for the timer
999 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1000 * relative (HRTIMER_MODE_REL)
1001 *
1002 * Returns:
1003 * 0 on success
1004 * 1 when the timer was active
1005 */
1006int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1007 unsigned long delta_ns, const enum hrtimer_mode mode)
1008{
1009 return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
1010}
1011EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1012
1013/**
1014 * hrtimer_start - (re)start an hrtimer on the current CPU
1015 * @timer: the timer to be added
1016 * @tim: expiry time
1017 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1018 * relative (HRTIMER_MODE_REL)
1019 *
1020 * Returns:
1021 * 0 on success
1022 * 1 when the timer was active
1023 */
1024int
1025hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
1026{
1027 return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
1028}
1029EXPORT_SYMBOL_GPL(hrtimer_start);
1030
1031
1032/**
1033 * hrtimer_try_to_cancel - try to deactivate a timer
1034 * @timer: hrtimer to stop
1035 *
1036 * Returns:
1037 * 0 when the timer was not active
1038 * 1 when the timer was active
1039 * -1 when the timer is currently executing the callback function and
1040 * cannot be stopped
1041 */
1042int hrtimer_try_to_cancel(struct hrtimer *timer)
1043{
1044 struct hrtimer_clock_base *base;
1045 unsigned long flags;
1046 int ret = -1;
1047
1048 base = lock_hrtimer_base(timer, &flags);
1049
1050 if (!hrtimer_callback_running(timer))
1051 ret = remove_hrtimer(timer, base);
1052
1053 unlock_hrtimer_base(timer, &flags);
1054
1055 return ret;
1056
1057}
1058EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
1059
1060/**
1061 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
1062 * @timer: the timer to be cancelled
1063 *
1064 * Returns:
1065 * 0 when the timer was not active
1066 * 1 when the timer was active
1067 */
1068int hrtimer_cancel(struct hrtimer *timer)
1069{
1070 for (;;) {
1071 int ret = hrtimer_try_to_cancel(timer);
1072
1073 if (ret >= 0)
1074 return ret;
1075 cpu_relax();
1076 }
1077}
1078EXPORT_SYMBOL_GPL(hrtimer_cancel);
1079
1080/**
1081 * hrtimer_get_remaining - get remaining time for the timer
1082 * @timer: the timer to read
1083 */
1084ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1085{
1086 unsigned long flags;
1087 ktime_t rem;
1088
1089 lock_hrtimer_base(timer, &flags);
1090 rem = hrtimer_expires_remaining(timer);
1091 unlock_hrtimer_base(timer, &flags);
1092
1093 return rem;
1094}
1095EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1096
1097#ifdef CONFIG_NO_HZ_COMMON
1098/**
1099 * hrtimer_get_next_event - get the time until next expiry event
1100 *
1101 * Returns the delta to the next expiry event or KTIME_MAX if no timer
1102 * is pending.
1103 */
1104ktime_t hrtimer_get_next_event(void)
1105{
1106 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1107 struct hrtimer_clock_base *base = cpu_base->clock_base;
1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
1109 unsigned long flags;
1110 int i;
1111
1112 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1113
1114 if (!hrtimer_hres_active()) {
1115 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
1116 struct hrtimer *timer;
1117 struct timerqueue_node *next;
1118
1119 next = timerqueue_getnext(&base->active);
1120 if (!next)
1121 continue;
1122
1123 timer = container_of(next, struct hrtimer, node);
1124 delta.tv64 = hrtimer_get_expires_tv64(timer);
1125 delta = ktime_sub(delta, base->get_time());
1126 if (delta.tv64 < mindelta.tv64)
1127 mindelta.tv64 = delta.tv64;
1128 }
1129 }
1130
1131 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1132
1133 if (mindelta.tv64 < 0)
1134 mindelta.tv64 = 0;
1135 return mindelta;
1136}
1137#endif
1138
1139static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1140 enum hrtimer_mode mode)
1141{
1142 struct hrtimer_cpu_base *cpu_base;
1143 int base;
1144
1145 memset(timer, 0, sizeof(struct hrtimer));
1146
1147 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1148
1149 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1150 clock_id = CLOCK_MONOTONIC;
1151
1152 base = hrtimer_clockid_to_base(clock_id);
1153 timer->base = &cpu_base->clock_base[base];
1154 timerqueue_init(&timer->node);
1155
1156#ifdef CONFIG_TIMER_STATS
1157 timer->start_site = NULL;
1158 timer->start_pid = -1;
1159 memset(timer->start_comm, 0, TASK_COMM_LEN);
1160#endif
1161}
1162
1163/**
1164 * hrtimer_init - initialize a timer to the given clock
1165 * @timer: the timer to be initialized
1166 * @clock_id: the clock to be used
1167 * @mode: timer mode abs/rel
1168 */
1169void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1170 enum hrtimer_mode mode)
1171{
1172 debug_init(timer, clock_id, mode);
1173 __hrtimer_init(timer, clock_id, mode);
1174}
1175EXPORT_SYMBOL_GPL(hrtimer_init);
1176
1177/**
1178 * hrtimer_get_res - get the timer resolution for a clock
1179 * @which_clock: which clock to query
1180 * @tp: pointer to timespec variable to store the resolution
1181 *
1182 * Store the resolution of the clock selected by @which_clock in the
1183 * variable pointed to by @tp.
1184 */
1185int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1186{
1187 struct hrtimer_cpu_base *cpu_base;
1188 int base = hrtimer_clockid_to_base(which_clock);
1189
1190 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1191 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1192
1193 return 0;
1194}
1195EXPORT_SYMBOL_GPL(hrtimer_get_res);
1196
1197static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1198{
1199 struct hrtimer_clock_base *base = timer->base;
1200 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
1201 enum hrtimer_restart (*fn)(struct hrtimer *);
1202 int restart;
1203
1204 WARN_ON(!irqs_disabled());
1205
1206 debug_deactivate(timer);
1207 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1208 timer_stats_account_hrtimer(timer);
1209 fn = timer->function;
1210
1211 /*
1212 * Because we run timers from hardirq context, there is no chance
1213 * they get migrated to another cpu, therefore it's safe to unlock
1214 * the timer base.
1215 */
1216 raw_spin_unlock(&cpu_base->lock);
1217 trace_hrtimer_expire_entry(timer, now);
1218 restart = fn(timer);
1219 trace_hrtimer_expire_exit(timer);
1220 raw_spin_lock(&cpu_base->lock);
1221
1222 /*
1223 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
1224 * we do not reprogram the event hardware. Happens either in
1225 * hrtimer_start_range_ns() or in hrtimer_interrupt()
1226 */
1227 if (restart != HRTIMER_NORESTART) {
1228 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1229 enqueue_hrtimer(timer, base);
1230 }
1231
1232 WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
1233
1234 timer->state &= ~HRTIMER_STATE_CALLBACK;
1235}
1236
1237#ifdef CONFIG_HIGH_RES_TIMERS
1238
1239/*
1240 * High resolution timer interrupt
1241 * Called with interrupts disabled
1242 */
1243void hrtimer_interrupt(struct clock_event_device *dev)
1244{
1245 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1246 ktime_t expires_next, now, entry_time, delta;
1247 int i, retries = 0;
1248
1249 BUG_ON(!cpu_base->hres_active);
1250 cpu_base->nr_events++;
1251 dev->next_event.tv64 = KTIME_MAX;
1252
1253 raw_spin_lock(&cpu_base->lock);
1254 entry_time = now = hrtimer_update_base(cpu_base);
1255retry:
1256 expires_next.tv64 = KTIME_MAX;
1257 /*
1258 * We set expires_next to KTIME_MAX here with cpu_base->lock
1259 * held to prevent that a timer is enqueued in our queue via
1260 * the migration code. This does not affect enqueueing of
1261 * timers which run their callback and need to be requeued on
1262 * this CPU.
1263 */
1264 cpu_base->expires_next.tv64 = KTIME_MAX;
1265
1266 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1267 struct hrtimer_clock_base *base;
1268 struct timerqueue_node *node;
1269 ktime_t basenow;
1270
1271 if (!(cpu_base->active_bases & (1 << i)))
1272 continue;
1273
1274 base = cpu_base->clock_base + i;
1275 basenow = ktime_add(now, base->offset);
1276
1277 while ((node = timerqueue_getnext(&base->active))) {
1278 struct hrtimer *timer;
1279
1280 timer = container_of(node, struct hrtimer, node);
1281
1282 /*
1283 * The immediate goal for using the softexpires is
1284 * minimizing wakeups, not running timers at the
1285 * earliest interrupt after their soft expiration.
1286 * This allows us to avoid using a Priority Search
1287 * Tree, which can answer a stabbing query for
1288 * overlapping intervals and instead use the simple
1289 * BST we already have.
1290 * We don't add extra wakeups by delaying timers that
1291 * are right-of a not yet expired timer, because that
1292 * timer will have to trigger a wakeup anyway.
1293 */
1294
1295 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
1296 ktime_t expires;
1297
1298 expires = ktime_sub(hrtimer_get_expires(timer),
1299 base->offset);
1300 if (expires.tv64 < 0)
1301 expires.tv64 = KTIME_MAX;
1302 if (expires.tv64 < expires_next.tv64)
1303 expires_next = expires;
1304 break;
1305 }
1306
1307 __run_hrtimer(timer, &basenow);
1308 }
1309 }
1310
1311 /*
1312 * Store the new expiry value so the migration code can verify
1313 * against it.
1314 */
1315 cpu_base->expires_next = expires_next;
1316 raw_spin_unlock(&cpu_base->lock);
1317
1318 /* Reprogramming necessary ? */
1319 if (expires_next.tv64 == KTIME_MAX ||
1320 !tick_program_event(expires_next, 0)) {
1321 cpu_base->hang_detected = 0;
1322 return;
1323 }
1324
1325 /*
1326 * The next timer was already expired due to:
1327 * - tracing
1328 * - long lasting callbacks
1329 * - being scheduled away when running in a VM
1330 *
1331 * We need to prevent that we loop forever in the hrtimer
1332 * interrupt routine. We give it 3 attempts to avoid
1333 * overreacting on some spurious event.
1334 *
1335 * Acquire base lock for updating the offsets and retrieving
1336 * the current time.
1337 */
1338 raw_spin_lock(&cpu_base->lock);
1339 now = hrtimer_update_base(cpu_base);
1340 cpu_base->nr_retries++;
1341 if (++retries < 3)
1342 goto retry;
1343 /*
1344 * Give the system a chance to do something else than looping
1345 * here. We stored the entry time, so we know exactly how long
1346 * we spent here. We schedule the next event this amount of
1347 * time away.
1348 */
1349 cpu_base->nr_hangs++;
1350 cpu_base->hang_detected = 1;
1351 raw_spin_unlock(&cpu_base->lock);
1352 delta = ktime_sub(now, entry_time);
1353 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1354 cpu_base->max_hang_time = delta;
1355 /*
1356 * Limit it to a sensible value as we enforce a longer
1357 * delay. Give the CPU at least 100ms to catch up.
1358 */
1359 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1360 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1361 else
1362 expires_next = ktime_add(now, delta);
1363 tick_program_event(expires_next, 1);
1364 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1365 ktime_to_ns(delta));
1366}
1367
1368/*
1369 * local version of hrtimer_peek_ahead_timers() called with interrupts
1370 * disabled.
1371 */
1372static void __hrtimer_peek_ahead_timers(void)
1373{
1374 struct tick_device *td;
1375
1376 if (!hrtimer_hres_active())
1377 return;
1378
1379 td = &__get_cpu_var(tick_cpu_device);
1380 if (td && td->evtdev)
1381 hrtimer_interrupt(td->evtdev);
1382}
1383
1384/**
1385 * hrtimer_peek_ahead_timers -- run soft-expired timers now
1386 *
1387 * hrtimer_peek_ahead_timers will peek at the timer queue of
1388 * the current cpu and check if there are any timers for which
1389 * the soft expires time has passed. If any such timers exist,
1390 * they are run immediately and then removed from the timer queue.
1391 *
1392 */
1393void hrtimer_peek_ahead_timers(void)
1394{
1395 unsigned long flags;
1396
1397 local_irq_save(flags);
1398 __hrtimer_peek_ahead_timers();
1399 local_irq_restore(flags);
1400}
1401
1402static void run_hrtimer_softirq(struct softirq_action *h)
1403{
1404 hrtimer_peek_ahead_timers();
1405}
1406
1407#else /* CONFIG_HIGH_RES_TIMERS */
1408
1409static inline void __hrtimer_peek_ahead_timers(void) { }
1410
1411#endif /* !CONFIG_HIGH_RES_TIMERS */
1412
1413/*
1414 * Called from timer softirq every jiffy, expire hrtimers:
1415 *
1416 * For HRT it's the fallback code to run the softirq in the timer
1417 * softirq context in case the hrtimer initialization failed or has
1418 * not been done yet.
1419 */
1420void hrtimer_run_pending(void)
1421{
1422 if (hrtimer_hres_active())
1423 return;
1424
1425 /*
1426 * This _is_ ugly: We have to check in the softirq context,
1427 * whether we can switch to highres and / or nohz mode. The
1428 * clocksource switch happens in the timer interrupt with
1429 * xtime_lock held. Notification from there only sets the
1430 * check bit in the tick_oneshot code, otherwise we might
1431 * deadlock vs. xtime_lock.
1432 */
1433 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1434 hrtimer_switch_to_hres();
1435}
1436
1437/*
1438 * Called from hardirq context every jiffy
1439 */
1440void hrtimer_run_queues(void)
1441{
1442 struct timerqueue_node *node;
1443 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1444 struct hrtimer_clock_base *base;
1445 int index, gettime = 1;
1446
1447 if (hrtimer_hres_active())
1448 return;
1449
1450 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1451 base = &cpu_base->clock_base[index];
1452 if (!timerqueue_getnext(&base->active))
1453 continue;
1454
1455 if (gettime) {
1456 hrtimer_get_softirq_time(cpu_base);
1457 gettime = 0;
1458 }
1459
1460 raw_spin_lock(&cpu_base->lock);
1461
1462 while ((node = timerqueue_getnext(&base->active))) {
1463 struct hrtimer *timer;
1464
1465 timer = container_of(node, struct hrtimer, node);
1466 if (base->softirq_time.tv64 <=
1467 hrtimer_get_expires_tv64(timer))
1468 break;
1469
1470 __run_hrtimer(timer, &base->softirq_time);
1471 }
1472 raw_spin_unlock(&cpu_base->lock);
1473 }
1474}
1475
1476/*
1477 * Sleep related functions:
1478 */
1479static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
1480{
1481 struct hrtimer_sleeper *t =
1482 container_of(timer, struct hrtimer_sleeper, timer);
1483 struct task_struct *task = t->task;
1484
1485 t->task = NULL;
1486 if (task)
1487 wake_up_process(task);
1488
1489 return HRTIMER_NORESTART;
1490}
1491
1492void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1493{
1494 sl->timer.function = hrtimer_wakeup;
1495 sl->task = task;
1496}
1497EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1498
1499static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
1500{
1501 hrtimer_init_sleeper(t, current);
1502
1503 do {
1504 set_current_state(TASK_INTERRUPTIBLE);
1505 hrtimer_start_expires(&t->timer, mode);
1506 if (!hrtimer_active(&t->timer))
1507 t->task = NULL;
1508
1509 if (likely(t->task))
1510 freezable_schedule();
1511
1512 hrtimer_cancel(&t->timer);
1513 mode = HRTIMER_MODE_ABS;
1514
1515 } while (t->task && !signal_pending(current));
1516
1517 __set_current_state(TASK_RUNNING);
1518
1519 return t->task == NULL;
1520}
1521
1522static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
1523{
1524 struct timespec rmt;
1525 ktime_t rem;
1526
1527 rem = hrtimer_expires_remaining(timer);
1528 if (rem.tv64 <= 0)
1529 return 0;
1530 rmt = ktime_to_timespec(rem);
1531
1532 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
1533 return -EFAULT;
1534
1535 return 1;
1536}
1537
1538long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1539{
1540 struct hrtimer_sleeper t;
1541 struct timespec __user *rmtp;
1542 int ret = 0;
1543
1544 hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
1545 HRTIMER_MODE_ABS);
1546 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1547
1548 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1549 goto out;
1550
1551 rmtp = restart->nanosleep.rmtp;
1552 if (rmtp) {
1553 ret = update_rmtp(&t.timer, rmtp);
1554 if (ret <= 0)
1555 goto out;
1556 }
1557
1558 /* The other values in restart are already filled in */
1559 ret = -ERESTART_RESTARTBLOCK;
1560out:
1561 destroy_hrtimer_on_stack(&t.timer);
1562 return ret;
1563}
1564
1565long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1566 const enum hrtimer_mode mode, const clockid_t clockid)
1567{
1568 struct restart_block *restart;
1569 struct hrtimer_sleeper t;
1570 int ret = 0;
1571 unsigned long slack;
1572
1573 slack = current->timer_slack_ns;
1574 if (dl_task(current) || rt_task(current))
1575 slack = 0;
1576
1577 hrtimer_init_on_stack(&t.timer, clockid, mode);
1578 hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
1579 if (do_nanosleep(&t, mode))
1580 goto out;
1581
1582 /* Absolute timers do not update the rmtp value and restart: */
1583 if (mode == HRTIMER_MODE_ABS) {
1584 ret = -ERESTARTNOHAND;
1585 goto out;
1586 }
1587
1588 if (rmtp) {
1589 ret = update_rmtp(&t.timer, rmtp);
1590 if (ret <= 0)
1591 goto out;
1592 }
1593
1594 restart = &current_thread_info()->restart_block;
1595 restart->fn = hrtimer_nanosleep_restart;
1596 restart->nanosleep.clockid = t.timer.base->clockid;
1597 restart->nanosleep.rmtp = rmtp;
1598 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1599
1600 ret = -ERESTART_RESTARTBLOCK;
1601out:
1602 destroy_hrtimer_on_stack(&t.timer);
1603 return ret;
1604}
1605
1606SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1607 struct timespec __user *, rmtp)
1608{
1609 struct timespec tu;
1610
1611 if (copy_from_user(&tu, rqtp, sizeof(tu)))
1612 return -EFAULT;
1613
1614 if (!timespec_valid(&tu))
1615 return -EINVAL;
1616
1617 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1618}
1619
1620/*
1621 * Functions related to boot-time initialization:
1622 */
1623static void init_hrtimers_cpu(int cpu)
1624{
1625 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1626 int i;
1627
1628 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1629 cpu_base->clock_base[i].cpu_base = cpu_base;
1630 timerqueue_init_head(&cpu_base->clock_base[i].active);
1631 }
1632
1633 cpu_base->cpu = cpu;
1634 hrtimer_init_hres(cpu_base);
1635}
1636
1637#ifdef CONFIG_HOTPLUG_CPU
1638
1639static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1640 struct hrtimer_clock_base *new_base)
1641{
1642 struct hrtimer *timer;
1643 struct timerqueue_node *node;
1644
1645 while ((node = timerqueue_getnext(&old_base->active))) {
1646 timer = container_of(node, struct hrtimer, node);
1647 BUG_ON(hrtimer_callback_running(timer));
1648 debug_deactivate(timer);
1649
1650 /*
1651 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1652 * timer could be seen as !active and just vanish away
1653 * under us on another CPU
1654 */
1655 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1656 timer->base = new_base;
1657 /*
1658 * Enqueue the timers on the new cpu. This does not
1659 * reprogram the event device in case the timer
1660 * expires before the earliest on this CPU, but we run
1661 * hrtimer_interrupt after we migrated everything to
1662 * sort out already expired timers and reprogram the
1663 * event device.
1664 */
1665 enqueue_hrtimer(timer, new_base);
1666
1667 /* Clear the migration state bit */
1668 timer->state &= ~HRTIMER_STATE_MIGRATE;
1669 }
1670}
1671
1672static void migrate_hrtimers(int scpu)
1673{
1674 struct hrtimer_cpu_base *old_base, *new_base;
1675 int i;
1676
1677 BUG_ON(cpu_online(scpu));
1678 tick_cancel_sched_timer(scpu);
1679
1680 local_irq_disable();
1681 old_base = &per_cpu(hrtimer_bases, scpu);
1682 new_base = &__get_cpu_var(hrtimer_bases);
1683 /*
1684 * The caller is globally serialized and nobody else
1685 * takes two locks at once, deadlock is not possible.
1686 */
1687 raw_spin_lock(&new_base->lock);
1688 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1689
1690 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1691 migrate_hrtimer_list(&old_base->clock_base[i],
1692 &new_base->clock_base[i]);
1693 }
1694
1695 raw_spin_unlock(&old_base->lock);
1696 raw_spin_unlock(&new_base->lock);
1697
1698 /* Check, if we got expired work to do */
1699 __hrtimer_peek_ahead_timers();
1700 local_irq_enable();
1701}
1702
1703#endif /* CONFIG_HOTPLUG_CPU */
1704
1705static int hrtimer_cpu_notify(struct notifier_block *self,
1706 unsigned long action, void *hcpu)
1707{
1708 int scpu = (long)hcpu;
1709
1710 switch (action) {
1711
1712 case CPU_UP_PREPARE:
1713 case CPU_UP_PREPARE_FROZEN:
1714 init_hrtimers_cpu(scpu);
1715 break;
1716
1717#ifdef CONFIG_HOTPLUG_CPU
1718 case CPU_DYING:
1719 case CPU_DYING_FROZEN:
1720 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
1721 break;
1722 case CPU_DEAD:
1723 case CPU_DEAD_FROZEN:
1724 {
1725 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1726 migrate_hrtimers(scpu);
1727 break;
1728 }
1729#endif
1730
1731 default:
1732 break;
1733 }
1734
1735 return NOTIFY_OK;
1736}
1737
1738static struct notifier_block hrtimers_nb = {
1739 .notifier_call = hrtimer_cpu_notify,
1740};
1741
1742void __init hrtimers_init(void)
1743{
1744 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1745 (void *)(long)smp_processor_id());
1746 register_cpu_notifier(&hrtimers_nb);
1747#ifdef CONFIG_HIGH_RES_TIMERS
1748 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1749#endif
1750}
1751
1752/**
1753 * schedule_hrtimeout_range_clock - sleep until timeout
1754 * @expires: timeout value (ktime_t)
1755 * @delta: slack in expires timeout (ktime_t)
1756 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1757 * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
1758 */
1759int __sched
1760schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1761 const enum hrtimer_mode mode, int clock)
1762{
1763 struct hrtimer_sleeper t;
1764
1765 /*
1766 * Optimize when a zero timeout value is given. It does not
1767 * matter whether this is an absolute or a relative time.
1768 */
1769 if (expires && !expires->tv64) {
1770 __set_current_state(TASK_RUNNING);
1771 return 0;
1772 }
1773
1774 /*
1775 * A NULL parameter means "infinite"
1776 */
1777 if (!expires) {
1778 schedule();
1779 __set_current_state(TASK_RUNNING);
1780 return -EINTR;
1781 }
1782
1783 hrtimer_init_on_stack(&t.timer, clock, mode);
1784 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1785
1786 hrtimer_init_sleeper(&t, current);
1787
1788 hrtimer_start_expires(&t.timer, mode);
1789 if (!hrtimer_active(&t.timer))
1790 t.task = NULL;
1791
1792 if (likely(t.task))
1793 schedule();
1794
1795 hrtimer_cancel(&t.timer);
1796 destroy_hrtimer_on_stack(&t.timer);
1797
1798 __set_current_state(TASK_RUNNING);
1799
1800 return !t.task ? 0 : -EINTR;
1801}
1802
1803/**
1804 * schedule_hrtimeout_range - sleep until timeout
1805 * @expires: timeout value (ktime_t)
1806 * @delta: slack in expires timeout (ktime_t)
1807 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1808 *
1809 * Make the current task sleep until the given expiry time has
1810 * elapsed. The routine will return immediately unless
1811 * the current task state has been set (see set_current_state()).
1812 *
1813 * The @delta argument gives the kernel the freedom to schedule the
1814 * actual wakeup to a time that is both power and performance friendly.
1815 * The kernel gives the normal best effort behavior for "@expires+@delta",
1816 * but may decide to fire the timer earlier, though never earlier than @expires.
1817 *
1818 * You can set the task state as follows -
1819 *
1820 * %TASK_UNINTERRUPTIBLE - at least @expires time is guaranteed to
1821 * pass before the routine returns.
1822 *
1823 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1824 * delivered to the current task.
1825 *
1826 * The current task state is guaranteed to be TASK_RUNNING when this
1827 * routine returns.
1828 *
1829 * Returns 0 when the timer has expired otherwise -EINTR
1830 */
1831int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1832 const enum hrtimer_mode mode)
1833{
1834 return schedule_hrtimeout_range_clock(expires, delta, mode,
1835 CLOCK_MONOTONIC);
1836}
1837EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1838
1839/**
1840 * schedule_hrtimeout - sleep until timeout
1841 * @expires: timeout value (ktime_t)
1842 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1843 *
1844 * Make the current task sleep until the given expiry time has
1845 * elapsed. The routine will return immediately unless
1846 * the current task state has been set (see set_current_state()).
1847 *
1848 * You can set the task state as follows -
1849 *
1850 * %TASK_UNINTERRUPTIBLE - at least @expires time is guaranteed to
1851 * pass before the routine returns.
1852 *
1853 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1854 * delivered to the current task.
1855 *
1856 * The current task state is guaranteed to be TASK_RUNNING when this
1857 * routine returns.
1858 *
1859 * Returns 0 when the timer has expired otherwise -EINTR
1860 */
1861int __sched schedule_hrtimeout(ktime_t *expires,
1862 const enum hrtimer_mode mode)
1863{
1864 return schedule_hrtimeout_range(expires, 0, mode);
1865}
1866EXPORT_SYMBOL_GPL(schedule_hrtimeout);
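/*
 * Editor's illustration (not part of the kernel source above): typical
 * in-kernel use of schedule_hrtimeout_range(). The task state must be
 * set before calling, and @delta gives the timer framework room to
 * coalesce the wakeup anywhere in [expires, expires + delta]. The
 * function name and constants below are illustrative only.
 */
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>

/* Wait up to 500us for an event, allowing 100us of wakeup slack. */
static int wait_for_event_example(void)
{
	ktime_t timeout = ktime_set(0, 500 * NSEC_PER_USEC);

	set_current_state(TASK_INTERRUPTIBLE);
	if (!schedule_hrtimeout_range(&timeout, 100 * NSEC_PER_USEC,
				      HRTIMER_MODE_REL))
		return 0;	/* timer expired */
	return -EINTR;		/* woken before the timeout elapsed */
}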
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
new file mode 100644
index 000000000000..8d262b467573
--- /dev/null
+++ b/kernel/time/itimer.c
@@ -0,0 +1,301 @@
1/*
2 * linux/kernel/itimer.c
3 *
4 * Copyright (C) 1992 Darren Senn
5 */
6
7/* These are all the functions necessary to implement itimers */
8
9#include <linux/mm.h>
10#include <linux/interrupt.h>
11#include <linux/syscalls.h>
12#include <linux/time.h>
13#include <linux/posix-timers.h>
14#include <linux/hrtimer.h>
15#include <trace/events/timer.h>
16
17#include <asm/uaccess.h>
18
19/**
20 * itimer_get_remtime - get remaining time for the timer
21 *
22 * @timer: the timer to read
23 *
24 * Returns the delta between the expiry time and now, which can be
 25 * less than zero, or 1usec for a pending timer that has already expired
26 */
27static struct timeval itimer_get_remtime(struct hrtimer *timer)
28{
29 ktime_t rem = hrtimer_get_remaining(timer);
30
31 /*
32 * Racy but safe: if the itimer expires after the above
 33 * hrtimer_get_remaining() call but before this condition
34 * then we return 0 - which is correct.
35 */
36 if (hrtimer_active(timer)) {
37 if (rem.tv64 <= 0)
38 rem.tv64 = NSEC_PER_USEC;
39 } else
40 rem.tv64 = 0;
41
42 return ktime_to_timeval(rem);
43}
44
45static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
46 struct itimerval *const value)
47{
48 cputime_t cval, cinterval;
49 struct cpu_itimer *it = &tsk->signal->it[clock_id];
50
51 spin_lock_irq(&tsk->sighand->siglock);
52
53 cval = it->expires;
54 cinterval = it->incr;
55 if (cval) {
56 struct task_cputime cputime;
57 cputime_t t;
58
59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime.utime + cputime.stime;
62 else
63 /* CPUCLOCK_VIRT */
64 t = cputime.utime;
65
66 if (cval < t)
67 /* about to fire */
68 cval = cputime_one_jiffy;
69 else
70 cval = cval - t;
71 }
72
73 spin_unlock_irq(&tsk->sighand->siglock);
74
75 cputime_to_timeval(cval, &value->it_value);
76 cputime_to_timeval(cinterval, &value->it_interval);
77}
78
79int do_getitimer(int which, struct itimerval *value)
80{
81 struct task_struct *tsk = current;
82
83 switch (which) {
84 case ITIMER_REAL:
85 spin_lock_irq(&tsk->sighand->siglock);
86 value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
87 value->it_interval =
88 ktime_to_timeval(tsk->signal->it_real_incr);
89 spin_unlock_irq(&tsk->sighand->siglock);
90 break;
91 case ITIMER_VIRTUAL:
92 get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
93 break;
94 case ITIMER_PROF:
95 get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
96 break;
97 default:
 98 return -EINVAL;
99 }
100 return 0;
101}
102
103SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
104{
105 int error = -EFAULT;
106 struct itimerval get_buffer;
107
108 if (value) {
109 error = do_getitimer(which, &get_buffer);
110 if (!error &&
111 copy_to_user(value, &get_buffer, sizeof(get_buffer)))
112 error = -EFAULT;
113 }
114 return error;
115}
116
117
118/*
119 * The timer is automagically restarted when interval != 0
120 */
121enum hrtimer_restart it_real_fn(struct hrtimer *timer)
122{
123 struct signal_struct *sig =
124 container_of(timer, struct signal_struct, real_timer);
125
126 trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
127 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
128
129 return HRTIMER_NORESTART;
130}
131
132static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
133{
134 struct timespec ts;
135 s64 cpu_ns;
136
137 cputime_to_timespec(ct, &ts);
138 cpu_ns = timespec_to_ns(&ts);
139
140 return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
141}
142
143static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
144 const struct itimerval *const value,
145 struct itimerval *const ovalue)
146{
147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval;
149 u32 error, incr_error;
150 struct cpu_itimer *it = &tsk->signal->it[clock_id];
151
152 nval = timeval_to_cputime(&value->it_value);
153 ns_nval = timeval_to_ns(&value->it_value);
154 ninterval = timeval_to_cputime(&value->it_interval);
155 ns_ninterval = timeval_to_ns(&value->it_interval);
156
157 error = cputime_sub_ns(nval, ns_nval);
158 incr_error = cputime_sub_ns(ninterval, ns_ninterval);
159
160 spin_lock_irq(&tsk->sighand->siglock);
161
162 cval = it->expires;
163 cinterval = it->incr;
164 if (cval || nval) {
165 if (nval > 0)
166 nval += cputime_one_jiffy;
167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
168 }
169 it->expires = nval;
170 it->incr = ninterval;
171 it->error = error;
172 it->incr_error = incr_error;
173 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
174 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
175
176 spin_unlock_irq(&tsk->sighand->siglock);
177
178 if (ovalue) {
179 cputime_to_timeval(cval, &ovalue->it_value);
180 cputime_to_timeval(cinterval, &ovalue->it_interval);
181 }
182}
183
184/*
185 * Returns true if the timeval is in canonical form
186 */
187#define timeval_valid(t) \
188 (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
189
190int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
191{
192 struct task_struct *tsk = current;
193 struct hrtimer *timer;
194 ktime_t expires;
195
196 /*
197 * Validate the timevals in value.
198 */
199 if (!timeval_valid(&value->it_value) ||
200 !timeval_valid(&value->it_interval))
201 return -EINVAL;
202
203 switch (which) {
204 case ITIMER_REAL:
205again:
206 spin_lock_irq(&tsk->sighand->siglock);
207 timer = &tsk->signal->real_timer;
208 if (ovalue) {
209 ovalue->it_value = itimer_get_remtime(timer);
210 ovalue->it_interval
211 = ktime_to_timeval(tsk->signal->it_real_incr);
212 }
213 /* We are sharing ->siglock with it_real_fn() */
214 if (hrtimer_try_to_cancel(timer) < 0) {
215 spin_unlock_irq(&tsk->sighand->siglock);
216 goto again;
217 }
218 expires = timeval_to_ktime(value->it_value);
219 if (expires.tv64 != 0) {
220 tsk->signal->it_real_incr =
221 timeval_to_ktime(value->it_interval);
222 hrtimer_start(timer, expires, HRTIMER_MODE_REL);
223 } else
224 tsk->signal->it_real_incr.tv64 = 0;
225
226 trace_itimer_state(ITIMER_REAL, value, 0);
227 spin_unlock_irq(&tsk->sighand->siglock);
228 break;
229 case ITIMER_VIRTUAL:
230 set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
231 break;
232 case ITIMER_PROF:
233 set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
234 break;
235 default:
236 return -EINVAL;
237 }
238 return 0;
239}
240
241/**
242 * alarm_setitimer - set alarm in seconds
243 *
244 * @seconds: number of seconds until alarm
245 * 0 disables the alarm
246 *
247 * Returns the remaining time in seconds of a pending timer or 0 when
248 * the timer is not active.
249 *
 250 * On 32 bit machines the seconds value is limited to INT_MAX to avoid
251 * negative timeval settings which would cause immediate expiry.
252 */
253unsigned int alarm_setitimer(unsigned int seconds)
254{
255 struct itimerval it_new, it_old;
256
257#if BITS_PER_LONG < 64
258 if (seconds > INT_MAX)
259 seconds = INT_MAX;
260#endif
261 it_new.it_value.tv_sec = seconds;
262 it_new.it_value.tv_usec = 0;
263 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
264
265 do_setitimer(ITIMER_REAL, &it_new, &it_old);
266
267 /*
268 * We can't return 0 if we have an alarm pending ... And we'd
269 * better return too much than too little anyway
270 */
271 if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
272 it_old.it_value.tv_usec >= 500000)
273 it_old.it_value.tv_sec++;
274
275 return it_old.it_value.tv_sec;
276}
277
278SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
279 struct itimerval __user *, ovalue)
280{
281 struct itimerval set_buffer, get_buffer;
282 int error;
283
284 if (value) {
 285 if (copy_from_user(&set_buffer, value, sizeof(set_buffer)))
286 return -EFAULT;
287 } else {
288 memset(&set_buffer, 0, sizeof(set_buffer));
289 printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
290 " Misfeature support will be removed\n",
291 current->comm);
292 }
293
294 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
295 if (error || !ovalue)
296 return error;
297
298 if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
299 return -EFAULT;
300 return 0;
301}
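/*
 * Editor's illustration (not part of the kernel source above): the
 * user-space view of the setitimer() path implemented in this file.
 * ITIMER_REAL counts wall-clock time and delivers SIGALRM; the interval
 * field re-arms the timer automatically, as it_real_fn()/do_setitimer()
 * above implement. Names below are illustrative only.
 */
#include <signal.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

static volatile sig_atomic_t ticks;

static void on_alarm(int sig)
{
	(void)sig;
	ticks++;			/* async-signal-safe bookkeeping only */
}

int main(void)
{
	struct itimerval it = {
		.it_value    = { .tv_sec = 0, .tv_usec = 250000 },	/* first expiry after 250ms */
		.it_interval = { .tv_sec = 1, .tv_usec = 0 },		/* then every second */
	};

	signal(SIGALRM, on_alarm);
	if (setitimer(ITIMER_REAL, &it, NULL))
		perror("setitimer");

	while (ticks < 5)
		pause();		/* each SIGALRM interrupts pause() */
	return 0;
}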
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 33db43a39515..87a346fd6d61 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -466,7 +466,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
 
 static void sync_cmos_clock(struct work_struct *work)
 {
-	struct timespec now, next;
+	struct timespec64 now;
+	struct timespec next;
 	int fail = 1;
 
 	/*
@@ -485,9 +486,9 @@ static void sync_cmos_clock(struct work_struct *work)
 		return;
 	}
 
-	getnstimeofday(&now);
+	getnstimeofday64(&now);
 	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
-		struct timespec adjust = now;
+		struct timespec adjust = timespec64_to_timespec(now);
 
 		fail = -ENODEV;
 		if (persistent_clock_is_local)
@@ -531,7 +532,7 @@ void ntp_notify_cmos_timer(void) { }
 /*
  * Propagate a new txc->status value into the NTP state:
  */
-static inline void process_adj_status(struct timex *txc, struct timespec *ts)
+static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
 {
 	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
 		time_state = TIME_OK;
@@ -554,7 +555,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
 
 
 static inline void process_adjtimex_modes(struct timex *txc,
-					  struct timespec *ts,
+					  struct timespec64 *ts,
 					  s32 *time_tai)
 {
 	if (txc->modes & ADJ_STATUS)
@@ -640,7 +641,7 @@ int ntp_validate_timex(struct timex *txc)
  * adjtimex mainly allows reading (and writing, if superuser) of
  * kernel time-keeping variables. used by xntpd.
  */
-int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
+int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
 {
 	int result;
 
@@ -684,7 +685,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
 	/* fill PPS status fields */
 	pps_fill_timex(txc);
 
-	txc->time.tv_sec = ts->tv_sec;
+	txc->time.tv_sec = (time_t)ts->tv_sec;
 	txc->time.tv_usec = ts->tv_nsec;
 	if (!(time_status & STA_NANO))
 		txc->time.tv_usec /= NSEC_PER_USEC;
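/*
 * Editor's note (illustration, not part of the patch): the hunks above are
 * part of the year-2038 conversion mentioned in the merge message. On
 * 32-bit builds struct timespec carries a 32-bit tv_sec that overflows in
 * 2038, while struct timespec64 always uses a 64-bit seconds field; code
 * that still needs the legacy layout converts explicitly, as the CMOS sync
 * path does with timespec64_to_timespec(). A minimal sketch (header
 * placement assumed for this illustration):
 */
#include <linux/time64.h>

static struct timespec legacy_timespec_view(struct timespec64 now64)
{
	/* Truncates tv_sec on 32-bit after 2038; fine for the CMOS path above. */
	return timespec64_to_timespec(now64);
}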
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 1950cb4ca2a4..bbd102ad9df7 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -7,6 +7,6 @@ extern void ntp_clear(void);
 extern u64 ntp_tick_length(void);
 extern int second_overflow(unsigned long secs);
 extern int ntp_validate_timex(struct timex *);
-extern int __do_adjtimex(struct timex *, struct timespec *, s32 *);
+extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
 extern void __hardpps(const struct timespec *, const struct timespec *);
 #endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
new file mode 100644
index 000000000000..3b8946416a5f
--- /dev/null
+++ b/kernel/time/posix-cpu-timers.c
@@ -0,0 +1,1490 @@
1/*
2 * Implement CPU time clocks for the POSIX clock interface.
3 */
4
5#include <linux/sched.h>
6#include <linux/posix-timers.h>
7#include <linux/errno.h>
8#include <linux/math64.h>
9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h>
12#include <linux/random.h>
13#include <linux/tick.h>
14#include <linux/workqueue.h>
15
16/*
17 * Called after updating RLIMIT_CPU to run cpu timer and update
18 * tsk->signal->cputime_expires expiration cache if necessary. Needs
19 * siglock protection since other code may update expiration cache as
20 * well.
21 */
22void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
23{
24 cputime_t cputime = secs_to_cputime(rlim_new);
25
26 spin_lock_irq(&task->sighand->siglock);
27 set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
28 spin_unlock_irq(&task->sighand->siglock);
29}
30
31static int check_clock(const clockid_t which_clock)
32{
33 int error = 0;
34 struct task_struct *p;
35 const pid_t pid = CPUCLOCK_PID(which_clock);
36
37 if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
38 return -EINVAL;
39
40 if (pid == 0)
41 return 0;
42
43 rcu_read_lock();
44 p = find_task_by_vpid(pid);
45 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
46 same_thread_group(p, current) : has_group_leader_pid(p))) {
47 error = -EINVAL;
48 }
49 rcu_read_unlock();
50
51 return error;
52}
53
54static inline unsigned long long
55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
56{
57 unsigned long long ret;
58
59 ret = 0; /* high half always zero when .cpu used */
60 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
61 ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
62 } else {
63 ret = cputime_to_expires(timespec_to_cputime(tp));
64 }
65 return ret;
66}
67
68static void sample_to_timespec(const clockid_t which_clock,
69 unsigned long long expires,
70 struct timespec *tp)
71{
72 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
73 *tp = ns_to_timespec(expires);
74 else
75 cputime_to_timespec((__force cputime_t)expires, tp);
76}
77
78/*
79 * Update expiry time from increment, and increase overrun count,
80 * given the current clock sample.
81 */
82static void bump_cpu_timer(struct k_itimer *timer,
83 unsigned long long now)
84{
85 int i;
86 unsigned long long delta, incr;
87
88 if (timer->it.cpu.incr == 0)
89 return;
90
91 if (now < timer->it.cpu.expires)
92 return;
93
94 incr = timer->it.cpu.incr;
95 delta = now + incr - timer->it.cpu.expires;
96
97 /* Don't use (incr*2 < delta), incr*2 might overflow. */
98 for (i = 0; incr < delta - incr; i++)
99 incr = incr << 1;
100
101 for (; i >= 0; incr >>= 1, i--) {
102 if (delta < incr)
103 continue;
104
105 timer->it.cpu.expires += incr;
106 timer->it_overrun += 1 << i;
107 delta -= incr;
108 }
109}
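/*
 * Editor's worked example (not part of the kernel source above): with
 * expires = 10, incr = 4 and now = 23, bump_cpu_timer() computes
 * delta = now + incr - expires = 17. The first loop doubles incr while
 * incr < delta - incr, stopping with incr = 16 at i = 2. The second loop
 * then peels off descending powers of two: 16 fits (expires = 26,
 * it_overrun += 4, delta = 1), while 8 and 4 do not. The result,
 * expires = 26 with 4 overruns, matches the missed expirations at
 * 10, 14, 18 and 22, and the doubling form never computes incr * 2
 * in a way that could overflow.
 */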
110
111/**
112 * task_cputime_zero - Check a task_cputime struct for all zero fields.
113 *
114 * @cputime: The struct to compare.
115 *
116 * Checks @cputime to see if all fields are zero. Returns true if all fields
117 * are zero, false if any field is nonzero.
118 */
119static inline int task_cputime_zero(const struct task_cputime *cputime)
120{
121 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
122 return 1;
123 return 0;
124}
125
126static inline unsigned long long prof_ticks(struct task_struct *p)
127{
128 cputime_t utime, stime;
129
130 task_cputime(p, &utime, &stime);
131
132 return cputime_to_expires(utime + stime);
133}
134static inline unsigned long long virt_ticks(struct task_struct *p)
135{
136 cputime_t utime;
137
138 task_cputime(p, &utime, NULL);
139
140 return cputime_to_expires(utime);
141}
142
143static int
144posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
145{
146 int error = check_clock(which_clock);
147 if (!error) {
148 tp->tv_sec = 0;
149 tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
150 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
151 /*
152 * If sched_clock is using a cycle counter, its
153 * true resolution is not exported anywhere, but
154 * it is certainly much finer than 1s/HZ.
155 */
156 tp->tv_nsec = 1;
157 }
158 }
159 return error;
160}
161
162static int
163posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
164{
165 /*
166 * You can never reset a CPU clock, but we check for other errors
167 * in the call before failing with EPERM.
168 */
169 int error = check_clock(which_clock);
170 if (error == 0) {
171 error = -EPERM;
172 }
173 return error;
174}
175
176
177/*
178 * Sample a per-thread clock for the given task.
179 */
180static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
181 unsigned long long *sample)
182{
183 switch (CPUCLOCK_WHICH(which_clock)) {
184 default:
185 return -EINVAL;
186 case CPUCLOCK_PROF:
187 *sample = prof_ticks(p);
188 break;
189 case CPUCLOCK_VIRT:
190 *sample = virt_ticks(p);
191 break;
192 case CPUCLOCK_SCHED:
193 *sample = task_sched_runtime(p);
194 break;
195 }
196 return 0;
197}
198
199static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
200{
201 if (b->utime > a->utime)
202 a->utime = b->utime;
203
204 if (b->stime > a->stime)
205 a->stime = b->stime;
206
207 if (b->sum_exec_runtime > a->sum_exec_runtime)
208 a->sum_exec_runtime = b->sum_exec_runtime;
209}
210
211void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
212{
213 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
214 struct task_cputime sum;
215 unsigned long flags;
216
217 if (!cputimer->running) {
218 /*
219 * The POSIX timer interface allows for absolute time expiry
220 * values through the TIMER_ABSTIME flag, therefore we have
221 * to synchronize the timer to the clock every time we start
222 * it.
223 */
224 thread_group_cputime(tsk, &sum);
225 raw_spin_lock_irqsave(&cputimer->lock, flags);
226 cputimer->running = 1;
227 update_gt_cputime(&cputimer->cputime, &sum);
228 } else
229 raw_spin_lock_irqsave(&cputimer->lock, flags);
230 *times = cputimer->cputime;
231 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
232}
233
234/*
235 * Sample a process (thread group) clock for the given group_leader task.
236 * Must be called with task sighand lock held for safe while_each_thread()
237 * traversal.
238 */
239static int cpu_clock_sample_group(const clockid_t which_clock,
240 struct task_struct *p,
241 unsigned long long *sample)
242{
243 struct task_cputime cputime;
244
245 switch (CPUCLOCK_WHICH(which_clock)) {
246 default:
247 return -EINVAL;
248 case CPUCLOCK_PROF:
249 thread_group_cputime(p, &cputime);
250 *sample = cputime_to_expires(cputime.utime + cputime.stime);
251 break;
252 case CPUCLOCK_VIRT:
253 thread_group_cputime(p, &cputime);
254 *sample = cputime_to_expires(cputime.utime);
255 break;
256 case CPUCLOCK_SCHED:
257 thread_group_cputime(p, &cputime);
258 *sample = cputime.sum_exec_runtime;
259 break;
260 }
261 return 0;
262}
263
264static int posix_cpu_clock_get_task(struct task_struct *tsk,
265 const clockid_t which_clock,
266 struct timespec *tp)
267{
268 int err = -EINVAL;
269 unsigned long long rtn;
270
271 if (CPUCLOCK_PERTHREAD(which_clock)) {
272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 }
292
293 if (!err)
294 sample_to_timespec(which_clock, rtn, tp);
295
296 return err;
297}
298
299
300static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
301{
302 const pid_t pid = CPUCLOCK_PID(which_clock);
303 int err = -EINVAL;
304
305 if (pid == 0) {
306 /*
307 * Special case constant value for our own clocks.
308 * We don't have to do any lookup to find ourselves.
309 */
310 err = posix_cpu_clock_get_task(current, which_clock, tp);
311 } else {
312 /*
313 * Find the given PID, and validate that the caller
314 * should be able to see it.
315 */
316 struct task_struct *p;
317 rcu_read_lock();
318 p = find_task_by_vpid(pid);
319 if (p)
320 err = posix_cpu_clock_get_task(p, which_clock, tp);
321 rcu_read_unlock();
322 }
323
324 return err;
325}
326
327
328/*
329 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
330 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
331 * new timer already all-zeros initialized.
332 */
333static int posix_cpu_timer_create(struct k_itimer *new_timer)
334{
335 int ret = 0;
336 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
337 struct task_struct *p;
338
339 if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
340 return -EINVAL;
341
342 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
343
344 rcu_read_lock();
345 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
346 if (pid == 0) {
347 p = current;
348 } else {
349 p = find_task_by_vpid(pid);
350 if (p && !same_thread_group(p, current))
351 p = NULL;
352 }
353 } else {
354 if (pid == 0) {
355 p = current->group_leader;
356 } else {
357 p = find_task_by_vpid(pid);
358 if (p && !has_group_leader_pid(p))
359 p = NULL;
360 }
361 }
362 new_timer->it.cpu.task = p;
363 if (p) {
364 get_task_struct(p);
365 } else {
366 ret = -EINVAL;
367 }
368 rcu_read_unlock();
369
370 return ret;
371}
372
373/*
374 * Clean up a CPU-clock timer that is about to be destroyed.
375 * This is called from timer deletion with the timer already locked.
376 * If we return TIMER_RETRY, it's necessary to release the timer's lock
377 * and try again. (This happens when the timer is in the middle of firing.)
378 */
379static int posix_cpu_timer_del(struct k_itimer *timer)
380{
381 int ret = 0;
382 unsigned long flags;
383 struct sighand_struct *sighand;
384 struct task_struct *p = timer->it.cpu.task;
385
386 WARN_ON_ONCE(p == NULL);
387
388 /*
389 * Protect against sighand release/switch in exit/exec and process/
390 * thread timer list entry concurrent read/writes.
391 */
392 sighand = lock_task_sighand(p, &flags);
393 if (unlikely(sighand == NULL)) {
394 /*
395 * We raced with the reaping of the task.
396 * The deletion should have cleared us off the list.
397 */
398 WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
399 } else {
400 if (timer->it.cpu.firing)
401 ret = TIMER_RETRY;
402 else
403 list_del(&timer->it.cpu.entry);
404
405 unlock_task_sighand(p, &flags);
406 }
407
408 if (!ret)
409 put_task_struct(p);
410
411 return ret;
412}
413
414static void cleanup_timers_list(struct list_head *head)
415{
416 struct cpu_timer_list *timer, *next;
417
418 list_for_each_entry_safe(timer, next, head, entry)
419 list_del_init(&timer->entry);
420}
421
422/*
423 * Clean out CPU timers still ticking when a thread exits: the timers
424 * are simply unlinked from the per-thread timer lists, so nothing is
425 * left pointing at the task once it is reaped.
426 * This must be called with the siglock held.
427 */
428static void cleanup_timers(struct list_head *head)
429{
430 cleanup_timers_list(head);
431 cleanup_timers_list(++head);
432 cleanup_timers_list(++head);
433}
434
435/*
436 * These are both called with the siglock held, when the current thread
437 * is being reaped. When the final (leader) thread in the group is reaped,
438 * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
439 */
440void posix_cpu_timers_exit(struct task_struct *tsk)
441{
442 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
443 sizeof(unsigned long long));
444 cleanup_timers(tsk->cpu_timers);
445
446}
447void posix_cpu_timers_exit_group(struct task_struct *tsk)
448{
449 cleanup_timers(tsk->signal->cpu_timers);
450}
451
452static inline int expires_gt(cputime_t expires, cputime_t new_exp)
453{
454 return expires == 0 || expires > new_exp;
455}
456
457/*
458 * Insert the timer on the appropriate list before any timers that
459 * expire later. This must be called with the sighand lock held.
460 */
461static void arm_timer(struct k_itimer *timer)
462{
463 struct task_struct *p = timer->it.cpu.task;
464 struct list_head *head, *listpos;
465 struct task_cputime *cputime_expires;
466 struct cpu_timer_list *const nt = &timer->it.cpu;
467 struct cpu_timer_list *next;
468
469 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
470 head = p->cpu_timers;
471 cputime_expires = &p->cputime_expires;
472 } else {
473 head = p->signal->cpu_timers;
474 cputime_expires = &p->signal->cputime_expires;
475 }
476 head += CPUCLOCK_WHICH(timer->it_clock);
477
478 listpos = head;
479 list_for_each_entry(next, head, entry) {
480 if (nt->expires < next->expires)
481 break;
482 listpos = &next->entry;
483 }
484 list_add(&nt->entry, listpos);
485
486 if (listpos == head) {
487 unsigned long long exp = nt->expires;
488
489 /*
490 * We are the new earliest-expiring POSIX 1.b timer, hence
491 * we need to update the expiration cache. Take into account that
492 * for process timers we share expiration cache with itimers
493 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
494 */
495
496 switch (CPUCLOCK_WHICH(timer->it_clock)) {
497 case CPUCLOCK_PROF:
498 if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
499 cputime_expires->prof_exp = expires_to_cputime(exp);
500 break;
501 case CPUCLOCK_VIRT:
502 if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
503 cputime_expires->virt_exp = expires_to_cputime(exp);
504 break;
505 case CPUCLOCK_SCHED:
506 if (cputime_expires->sched_exp == 0 ||
507 cputime_expires->sched_exp > exp)
508 cputime_expires->sched_exp = exp;
509 break;
510 }
511 }
512}
513
514/*
515 * The timer is locked, fire it and arrange for its reload.
516 */
517static void cpu_timer_fire(struct k_itimer *timer)
518{
519 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
520 /*
521 * The user doesn't want any signal.
522 */
523 timer->it.cpu.expires = 0;
524 } else if (unlikely(timer->sigq == NULL)) {
525 /*
526 * This is a special case for clock_nanosleep,
527 * not a normal timer from sys_timer_create.
528 */
529 wake_up_process(timer->it_process);
530 timer->it.cpu.expires = 0;
531 } else if (timer->it.cpu.incr == 0) {
532 /*
533 * One-shot timer. Clear it as soon as it's fired.
534 */
535 posix_timer_event(timer, 0);
536 timer->it.cpu.expires = 0;
537 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
538 /*
539 * The signal did not get queued because the signal
540 * was ignored, so we won't get any callback to
541 * reload the timer. But we need to keep it
542 * ticking in case the signal is deliverable next time.
543 */
544 posix_cpu_timer_schedule(timer);
545 }
546}
547
548/*
549 * Sample a process (thread group) timer for the given group_leader task.
550 * Must be called with task sighand lock held for safe while_each_thread()
551 * traversal.
552 */
553static int cpu_timer_sample_group(const clockid_t which_clock,
554 struct task_struct *p,
555 unsigned long long *sample)
556{
557 struct task_cputime cputime;
558
559 thread_group_cputimer(p, &cputime);
560 switch (CPUCLOCK_WHICH(which_clock)) {
561 default:
562 return -EINVAL;
563 case CPUCLOCK_PROF:
564 *sample = cputime_to_expires(cputime.utime + cputime.stime);
565 break;
566 case CPUCLOCK_VIRT:
567 *sample = cputime_to_expires(cputime.utime);
568 break;
569 case CPUCLOCK_SCHED:
570 *sample = cputime.sum_exec_runtime + task_delta_exec(p);
571 break;
572 }
573 return 0;
574}
575
576#ifdef CONFIG_NO_HZ_FULL
577static void nohz_kick_work_fn(struct work_struct *work)
578{
579 tick_nohz_full_kick_all();
580}
581
582static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
583
584/*
585 * We need the IPIs to be sent from sane process context.
586 * The posix cpu timers are always set with irqs disabled.
587 */
588static void posix_cpu_timer_kick_nohz(void)
589{
590 if (context_tracking_is_enabled())
591 schedule_work(&nohz_kick_work);
592}
593
594bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
595{
596 if (!task_cputime_zero(&tsk->cputime_expires))
597 return false;
598
599 if (tsk->signal->cputimer.running)
600 return false;
601
602 return true;
603}
604#else
605static inline void posix_cpu_timer_kick_nohz(void) { }
606#endif
607
608/*
609 * Guts of sys_timer_settime for CPU timers.
610 * This is called with the timer locked and interrupts disabled.
611 * If we return TIMER_RETRY, it's necessary to release the timer's lock
612 * and try again. (This happens when the timer is in the middle of firing.)
613 */
614static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
615 struct itimerspec *new, struct itimerspec *old)
616{
617 unsigned long flags;
618 struct sighand_struct *sighand;
619 struct task_struct *p = timer->it.cpu.task;
620 unsigned long long old_expires, new_expires, old_incr, val;
621 int ret;
622
623 WARN_ON_ONCE(p == NULL);
624
625 new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
626
627 /*
628 * Protect against sighand release/switch in exit/exec and p->cpu_timers
629 * and p->signal->cpu_timers read/write in arm_timer()
630 */
631 sighand = lock_task_sighand(p, &flags);
632 /*
633 * If p has just been reaped, we can no
634 * longer get any information about it at all.
635 */
636 if (unlikely(sighand == NULL)) {
637 return -ESRCH;
638 }
639
640 /*
641 * Disarm any old timer after extracting its expiry time.
642 */
643 WARN_ON_ONCE(!irqs_disabled());
644
645 ret = 0;
646 old_incr = timer->it.cpu.incr;
647 old_expires = timer->it.cpu.expires;
648 if (unlikely(timer->it.cpu.firing)) {
649 timer->it.cpu.firing = -1;
650 ret = TIMER_RETRY;
651 } else
652 list_del_init(&timer->it.cpu.entry);
653
654 /*
655 * We need to sample the current value to convert the new
656 * value from relative to absolute, and to convert the
657 * old value from absolute to relative. To set a process
658 * timer, we need a sample to balance the thread expiry
659 * times (in arm_timer). With an absolute time, we must
660 * check if it's already passed. In short, we need a sample.
661 */
662 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
663 cpu_clock_sample(timer->it_clock, p, &val);
664 } else {
665 cpu_timer_sample_group(timer->it_clock, p, &val);
666 }
667
668 if (old) {
669 if (old_expires == 0) {
670 old->it_value.tv_sec = 0;
671 old->it_value.tv_nsec = 0;
672 } else {
673 /*
674 * Update the timer in case it has
675 * overrun already. If it has,
676 * we'll report it as having overrun
677 * and with the next reloaded timer
678 * already ticking, though we are
679 * swallowing that pending
680 * notification here to install the
681 * new setting.
682 */
683 bump_cpu_timer(timer, val);
684 if (val < timer->it.cpu.expires) {
685 old_expires = timer->it.cpu.expires - val;
686 sample_to_timespec(timer->it_clock,
687 old_expires,
688 &old->it_value);
689 } else {
690 old->it_value.tv_nsec = 1;
691 old->it_value.tv_sec = 0;
692 }
693 }
694 }
695
696 if (unlikely(ret)) {
697 /*
698 * We are colliding with the timer actually firing.
699 * Punt after filling in the timer's old value, and
700 * disable this firing since we are already reporting
701 * it as an overrun (thanks to bump_cpu_timer above).
702 */
703 unlock_task_sighand(p, &flags);
704 goto out;
705 }
706
707 if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
708 new_expires += val;
709 }
710
711 /*
712 * Install the new expiry time (or zero).
713 * For a timer with no notification action, we don't actually
714 * arm the timer (we'll just fake it for timer_gettime).
715 */
716 timer->it.cpu.expires = new_expires;
717 if (new_expires != 0 && val < new_expires) {
718 arm_timer(timer);
719 }
720
721 unlock_task_sighand(p, &flags);
722 /*
723 * Install the new reload setting, and
724 * set up the signal and overrun bookkeeping.
725 */
726 timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
727 &new->it_interval);
728
729 /*
730 * This acts as a modification timestamp for the timer,
731 * so any automatic reload attempt will punt on seeing
732 * that we have reset the timer manually.
733 */
734 timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
735 ~REQUEUE_PENDING;
736 timer->it_overrun_last = 0;
737 timer->it_overrun = -1;
738
739 if (new_expires != 0 && !(val < new_expires)) {
740 /*
741 * The designated time already passed, so we notify
742 * immediately, even if the thread never runs to
743 * accumulate more time on this clock.
744 */
745 cpu_timer_fire(timer);
746 }
747
748 ret = 0;
749 out:
750 if (old) {
751 sample_to_timespec(timer->it_clock,
752 old_incr, &old->it_interval);
753 }
754 if (!ret)
755 posix_cpu_timer_kick_nohz();
756 return ret;
757}
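/*
 * Editor's illustration (not part of the kernel source above): the
 * user-space path that ends up in posix_cpu_timer_create() and
 * posix_cpu_timer_set(). A process-wide CPU-time timer delivers SIGPROF
 * after the thread group consumes two more seconds of CPU time, then
 * every further 500ms of CPU. Names below are illustrative only; link
 * with -lrt on older glibc.
 */
#include <signal.h>
#include <time.h>

static timer_t arm_cpu_profiling_timer(void)
{
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGPROF,
	};
	struct itimerspec its = {
		.it_value    = { .tv_sec = 2, .tv_nsec = 0 },
		.it_interval = { .tv_sec = 0, .tv_nsec = 500 * 1000 * 1000 },
	};
	timer_t id;

	timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &id);
	timer_settime(id, 0, &its, NULL);	/* flags == 0: relative expiry */
	return id;
}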
758
759static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
760{
761 unsigned long long now;
762 struct task_struct *p = timer->it.cpu.task;
763
764 WARN_ON_ONCE(p == NULL);
765
766 /*
767 * Easy part: convert the reload time.
768 */
769 sample_to_timespec(timer->it_clock,
770 timer->it.cpu.incr, &itp->it_interval);
771
772 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
773 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
774 return;
775 }
776
777 /*
778 * Sample the clock to take the difference with the expiry time.
779 */
780 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
781 cpu_clock_sample(timer->it_clock, p, &now);
782 } else {
783 struct sighand_struct *sighand;
784 unsigned long flags;
785
786 /*
787 * Protect against sighand release/switch in exit/exec and
788 * also make timer sampling safe if it ends up calling
789 * thread_group_cputime().
790 */
791 sighand = lock_task_sighand(p, &flags);
792 if (unlikely(sighand == NULL)) {
793 /*
794 * The process has been reaped.
795 * We can't even collect a sample any more.
796 * Call the timer disarmed, nothing else to do.
797 */
798 timer->it.cpu.expires = 0;
799 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
800 &itp->it_value);
801 } else {
802 cpu_timer_sample_group(timer->it_clock, p, &now);
803 unlock_task_sighand(p, &flags);
804 }
805 }
806
807 if (now < timer->it.cpu.expires) {
808 sample_to_timespec(timer->it_clock,
809 timer->it.cpu.expires - now,
810 &itp->it_value);
811 } else {
812 /*
813 * The timer should have expired already, but the firing
814 * hasn't taken place yet. Say it's just about to expire.
815 */
816 itp->it_value.tv_nsec = 1;
817 itp->it_value.tv_sec = 0;
818 }
819}
820
821static unsigned long long
822check_timers_list(struct list_head *timers,
823 struct list_head *firing,
824 unsigned long long curr)
825{
826 int maxfire = 20;
827
828 while (!list_empty(timers)) {
829 struct cpu_timer_list *t;
830
831 t = list_first_entry(timers, struct cpu_timer_list, entry);
832
833 if (!--maxfire || curr < t->expires)
834 return t->expires;
835
836 t->firing = 1;
837 list_move_tail(&t->entry, firing);
838 }
839
840 return 0;
841}
842
843/*
844 * Check for any per-thread CPU timers that have fired and move them off
845 * the tsk->cpu_timers[N] list onto the firing list. Here we update the
846 * tsk->it_*_expires values to reflect the remaining thread CPU timers.
847 */
848static void check_thread_timers(struct task_struct *tsk,
849 struct list_head *firing)
850{
851 struct list_head *timers = tsk->cpu_timers;
852 struct signal_struct *const sig = tsk->signal;
853 struct task_cputime *tsk_expires = &tsk->cputime_expires;
854 unsigned long long expires;
855 unsigned long soft;
856
857 expires = check_timers_list(timers, firing, prof_ticks(tsk));
858 tsk_expires->prof_exp = expires_to_cputime(expires);
859
860 expires = check_timers_list(++timers, firing, virt_ticks(tsk));
861 tsk_expires->virt_exp = expires_to_cputime(expires);
862
863 tsk_expires->sched_exp = check_timers_list(++timers, firing,
864 tsk->se.sum_exec_runtime);
865
866 /*
867 * Check for the special case thread timers.
868 */
869 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
870 if (soft != RLIM_INFINITY) {
871 unsigned long hard =
872 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
873
874 if (hard != RLIM_INFINITY &&
875 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
876 /*
877 * At the hard limit, we just die.
878 * No need to calculate anything else now.
879 */
880 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
881 return;
882 }
883 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
884 /*
885 * At the soft limit, send a SIGXCPU every second.
886 */
887 if (soft < hard) {
888 soft += USEC_PER_SEC;
889 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
890 }
891 printk(KERN_INFO
892 "RT Watchdog Timeout: %s[%d]\n",
893 tsk->comm, task_pid_nr(tsk));
894 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
895 }
896 }
897}
898
899static void stop_process_timers(struct signal_struct *sig)
900{
901 struct thread_group_cputimer *cputimer = &sig->cputimer;
902 unsigned long flags;
903
904 raw_spin_lock_irqsave(&cputimer->lock, flags);
905 cputimer->running = 0;
906 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
907}
908
909static u32 onecputick;
910
911static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
912 unsigned long long *expires,
913 unsigned long long cur_time, int signo)
914{
915 if (!it->expires)
916 return;
917
918 if (cur_time >= it->expires) {
919 if (it->incr) {
920 it->expires += it->incr;
921 it->error += it->incr_error;
922 if (it->error >= onecputick) {
923 it->expires -= cputime_one_jiffy;
924 it->error -= onecputick;
925 }
926 } else {
927 it->expires = 0;
928 }
929
930 trace_itimer_expire(signo == SIGPROF ?
931 ITIMER_PROF : ITIMER_VIRTUAL,
932 tsk->signal->leader_pid, cur_time);
933 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
934 }
935
936 if (it->expires && (!*expires || it->expires < *expires)) {
937 *expires = it->expires;
938 }
939}
940
941/*
942 * Check for any process-wide (thread group) CPU timers that have fired
943 * and move them off the tsk->signal->cpu_timers[N] lists onto the firing
944 * list. Per-thread timers have already been taken off.
945 */
946static void check_process_timers(struct task_struct *tsk,
947 struct list_head *firing)
948{
949 struct signal_struct *const sig = tsk->signal;
950 unsigned long long utime, ptime, virt_expires, prof_expires;
951 unsigned long long sum_sched_runtime, sched_expires;
952 struct list_head *timers = sig->cpu_timers;
953 struct task_cputime cputime;
954 unsigned long soft;
955
956 /*
957 * Collect the current process totals.
958 */
959 thread_group_cputimer(tsk, &cputime);
960 utime = cputime_to_expires(cputime.utime);
961 ptime = utime + cputime_to_expires(cputime.stime);
962 sum_sched_runtime = cputime.sum_exec_runtime;
963
964 prof_expires = check_timers_list(timers, firing, ptime);
965 virt_expires = check_timers_list(++timers, firing, utime);
966 sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
967
968 /*
969 * Check for the special case process timers.
970 */
971 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
972 SIGPROF);
973 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
974 SIGVTALRM);
975 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
976 if (soft != RLIM_INFINITY) {
977 unsigned long psecs = cputime_to_secs(ptime);
978 unsigned long hard =
979 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
980 cputime_t x;
981 if (psecs >= hard) {
982 /*
983 * At the hard limit, we just die.
984 * No need to calculate anything else now.
985 */
986 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
987 return;
988 }
989 if (psecs >= soft) {
990 /*
991 * At the soft limit, send a SIGXCPU every second.
992 */
993 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
994 if (soft < hard) {
995 soft++;
996 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
997 }
998 }
999 x = secs_to_cputime(soft);
1000 if (!prof_expires || x < prof_expires) {
1001 prof_expires = x;
1002 }
1003 }
1004
1005 sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
1006 sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
1007 sig->cputime_expires.sched_exp = sched_expires;
1008 if (task_cputime_zero(&sig->cputime_expires))
1009 stop_process_timers(sig);
1010}
1011
1012/*
1013 * This is called from the signal code (via do_schedule_next_timer)
1014 * when the last timer signal was delivered and we have to reload the timer.
1015 */
1016void posix_cpu_timer_schedule(struct k_itimer *timer)
1017{
1018 struct sighand_struct *sighand;
1019 unsigned long flags;
1020 struct task_struct *p = timer->it.cpu.task;
1021 unsigned long long now;
1022
1023 WARN_ON_ONCE(p == NULL);
1024
1025 /*
1026 * Fetch the current sample and update the timer's expiry time.
1027 */
1028 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
1029 cpu_clock_sample(timer->it_clock, p, &now);
1030 bump_cpu_timer(timer, now);
1031 if (unlikely(p->exit_state))
1032 goto out;
1033
1034 /* Protect timer list r/w in arm_timer() */
1035 sighand = lock_task_sighand(p, &flags);
1036 if (!sighand)
1037 goto out;
1038 } else {
1039 /*
1040 * Protect arm_timer() and timer sampling in case of call to
1041 * thread_group_cputime().
1042 */
1043 sighand = lock_task_sighand(p, &flags);
1044 if (unlikely(sighand == NULL)) {
1045 /*
1046 * The process has been reaped.
1047 * We can't even collect a sample any more.
1048 */
1049 timer->it.cpu.expires = 0;
1050 goto out;
1051 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1052 unlock_task_sighand(p, &flags);
1053 /* Optimization: if the process is dying, no need to rearm */
1054 goto out;
1055 }
1056 cpu_timer_sample_group(timer->it_clock, p, &now);
1057 bump_cpu_timer(timer, now);
1058 /* Leave the sighand locked for the call below. */
1059 }
1060
1061 /*
1062 * Now re-arm for the new expiry time.
1063 */
1064 WARN_ON_ONCE(!irqs_disabled());
1065 arm_timer(timer);
1066 unlock_task_sighand(p, &flags);
1067
1068 /* Kick full dynticks CPUs in case they need to tick on the new timer */
1069 posix_cpu_timer_kick_nohz();
1070out:
1071 timer->it_overrun_last = timer->it_overrun;
1072 timer->it_overrun = -1;
1073 ++timer->it_requeue_pending;
1074}
1075
1076/**
1077 * task_cputime_expired - Compare two task_cputime entities.
1078 *
1079 * @sample: The task_cputime structure to be checked for expiration.
1080 * @expires: Expiration times, against which @sample will be checked.
1081 *
1082 * Checks @sample against @expires to see if any field of @sample has expired.
1083 * Returns true if any field of the former is greater than the corresponding
1084 * field of the latter if the latter field is set. Otherwise returns false.
1085 */
1086static inline int task_cputime_expired(const struct task_cputime *sample,
1087 const struct task_cputime *expires)
1088{
1089 if (expires->utime && sample->utime >= expires->utime)
1090 return 1;
1091 if (expires->stime && sample->utime + sample->stime >= expires->stime)
1092 return 1;
1093 if (expires->sum_exec_runtime != 0 &&
1094 sample->sum_exec_runtime >= expires->sum_exec_runtime)
1095 return 1;
1096 return 0;
1097}
1098
1099/**
1100 * fastpath_timer_check - POSIX CPU timers fast path.
1101 *
1102 * @tsk: The task (thread) being checked.
1103 *
1104 * Check the task and thread group timers. If both are zero (there are no
1105 * timers set) return false. Otherwise snapshot the task and thread group
1106 * timers and compare them with the corresponding expiration times. Return
1107 * true if a timer has expired, else return false.
1108 */
1109static inline int fastpath_timer_check(struct task_struct *tsk)
1110{
1111 struct signal_struct *sig;
1112 cputime_t utime, stime;
1113
1114 task_cputime(tsk, &utime, &stime);
1115
1116 if (!task_cputime_zero(&tsk->cputime_expires)) {
1117 struct task_cputime task_sample = {
1118 .utime = utime,
1119 .stime = stime,
1120 .sum_exec_runtime = tsk->se.sum_exec_runtime
1121 };
1122
1123 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1124 return 1;
1125 }
1126
1127 sig = tsk->signal;
1128 if (sig->cputimer.running) {
1129 struct task_cputime group_sample;
1130
1131 raw_spin_lock(&sig->cputimer.lock);
1132 group_sample = sig->cputimer.cputime;
1133 raw_spin_unlock(&sig->cputimer.lock);
1134
1135 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1136 return 1;
1137 }
1138
1139 return 0;
1140}
1141
1142/*
1143 * This is called from the timer interrupt handler. The irq handler has
1144 * already updated our counts. We need to check if any timers fire now.
1145 * Interrupts are disabled.
1146 */
1147void run_posix_cpu_timers(struct task_struct *tsk)
1148{
1149 LIST_HEAD(firing);
1150 struct k_itimer *timer, *next;
1151 unsigned long flags;
1152
1153 WARN_ON_ONCE(!irqs_disabled());
1154
1155 /*
1156 * The fast path checks that there are no expired thread or thread
1157 * group timers. If that's so, just return.
1158 */
1159 if (!fastpath_timer_check(tsk))
1160 return;
1161
1162 if (!lock_task_sighand(tsk, &flags))
1163 return;
1164 /*
1165 * Here we take all the timers that are firing off the
1166 * tsk->signal->cpu_timers[N] and tsk->cpu_timers[N] lists
1167 * and put them on the firing list.
1168 */
1169 check_thread_timers(tsk, &firing);
1170 /*
1171 * If there are any active process wide timers (POSIX 1.b, itimers,
1172 * RLIMIT_CPU) cputimer must be running.
1173 */
1174 if (tsk->signal->cputimer.running)
1175 check_process_timers(tsk, &firing);
1176
1177 /*
1178 * We must release these locks before taking any timer's lock.
1179 * There is a potential race with timer deletion here, as the
1180 * siglock now protects our private firing list. We have set
1181 * the firing flag in each timer, so that a deletion attempt
1182 * that gets the timer lock before we do will give it up and
1183 * spin until we've taken care of that timer below.
1184 */
1185 unlock_task_sighand(tsk, &flags);
1186
1187 /*
1188 * Now that all the timers on our list have the firing flag,
1189 * no one will touch their list entries but us. We'll take
1190 * each timer's lock before clearing its firing flag, so no
1191 * timer call will interfere.
1192 */
1193 list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
1194 int cpu_firing;
1195
1196 spin_lock(&timer->it_lock);
1197 list_del_init(&timer->it.cpu.entry);
1198 cpu_firing = timer->it.cpu.firing;
1199 timer->it.cpu.firing = 0;
1200 /*
1201 * The firing flag is -1 if we collided with a reset
1202 * of the timer, which already reported this
1203 * almost-firing as an overrun. So don't generate an event.
1204 */
1205 if (likely(cpu_firing >= 0))
1206 cpu_timer_fire(timer);
1207 spin_unlock(&timer->it_lock);
1208 }
1209}
1210
1211/*
1212 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1213 * The tsk->sighand->siglock must be held by the caller.
1214 */
1215void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1216 cputime_t *newval, cputime_t *oldval)
1217{
1218 unsigned long long now;
1219
1220 WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
1221 cpu_timer_sample_group(clock_idx, tsk, &now);
1222
1223 if (oldval) {
1224 /*
1225 * We are setting an itimer. The *oldval argument is absolute and we
1226 * update it to be relative; the *newval argument is relative and we update
1227 * it to be absolute.
1228 */
1229 if (*oldval) {
1230 if (*oldval <= now) {
1231 /* Just about to fire. */
1232 *oldval = cputime_one_jiffy;
1233 } else {
1234 *oldval -= now;
1235 }
1236 }
1237
1238 if (!*newval)
1239 goto out;
1240 *newval += now;
1241 }
1242
1243 /*
1244 * Update the expiration cache if we are the earliest timer, or if the
1245 * RLIMIT_CPU limit is earlier than the prof_exp cpu timer expiry.
1246 */
1247 switch (clock_idx) {
1248 case CPUCLOCK_PROF:
1249 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1250 tsk->signal->cputime_expires.prof_exp = *newval;
1251 break;
1252 case CPUCLOCK_VIRT:
1253 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1254 tsk->signal->cputime_expires.virt_exp = *newval;
1255 break;
1256 }
1257out:
1258 posix_cpu_timer_kick_nohz();
1259}
1260
1261static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1262 struct timespec *rqtp, struct itimerspec *it)
1263{
1264 struct k_itimer timer;
1265 int error;
1266
1267 /*
1268 * Set up a temporary timer and then wait for it to go off.
1269 */
1270 memset(&timer, 0, sizeof timer);
1271 spin_lock_init(&timer.it_lock);
1272 timer.it_clock = which_clock;
1273 timer.it_overrun = -1;
1274 error = posix_cpu_timer_create(&timer);
1275 timer.it_process = current;
1276 if (!error) {
1277 static struct itimerspec zero_it;
1278
1279 memset(it, 0, sizeof *it);
1280 it->it_value = *rqtp;
1281
1282 spin_lock_irq(&timer.it_lock);
1283 error = posix_cpu_timer_set(&timer, flags, it, NULL);
1284 if (error) {
1285 spin_unlock_irq(&timer.it_lock);
1286 return error;
1287 }
1288
1289 while (!signal_pending(current)) {
1290 if (timer.it.cpu.expires == 0) {
1291 /*
1292 * Our timer fired and was reset; the
1293 * deletion below cannot fail.
1294 */
1295 posix_cpu_timer_del(&timer);
1296 spin_unlock_irq(&timer.it_lock);
1297 return 0;
1298 }
1299
1300 /*
1301 * Block until cpu_timer_fire (or a signal) wakes us.
1302 */
1303 __set_current_state(TASK_INTERRUPTIBLE);
1304 spin_unlock_irq(&timer.it_lock);
1305 schedule();
1306 spin_lock_irq(&timer.it_lock);
1307 }
1308
1309 /*
1310 * We were interrupted by a signal.
1311 */
1312 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1313 error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
1314 if (!error) {
1315 /*
1316 * The timer is now unarmed; deletion cannot fail.
1317 */
1318 posix_cpu_timer_del(&timer);
1319 }
1320 spin_unlock_irq(&timer.it_lock);
1321
1322 while (error == TIMER_RETRY) {
1323 /*
1324 * We need to handle the case where the timer was or is in the
1325 * middle of firing. In all other cases the resources have
1326 * already been freed.
1327 */
1328 spin_lock_irq(&timer.it_lock);
1329 error = posix_cpu_timer_del(&timer);
1330 spin_unlock_irq(&timer.it_lock);
1331 }
1332
1333 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1334 /*
1335 * It actually did fire already.
1336 */
1337 return 0;
1338 }
1339
1340 error = -ERESTART_RESTARTBLOCK;
1341 }
1342
1343 return error;
1344}
1345
1346static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1347
1348static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1349 struct timespec *rqtp, struct timespec __user *rmtp)
1350{
1351 struct restart_block *restart_block =
1352 &current_thread_info()->restart_block;
1353 struct itimerspec it;
1354 int error;
1355
1356 /*
1357 * Diagnose required errors first.
1358 */
1359 if (CPUCLOCK_PERTHREAD(which_clock) &&
1360 (CPUCLOCK_PID(which_clock) == 0 ||
1361 CPUCLOCK_PID(which_clock) == current->pid))
1362 return -EINVAL;
1363
1364 error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
1365
1366 if (error == -ERESTART_RESTARTBLOCK) {
1367
1368 if (flags & TIMER_ABSTIME)
1369 return -ERESTARTNOHAND;
1370 /*
1371 * Report back to the user the time still remaining.
1372 */
1373 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1374 return -EFAULT;
1375
1376 restart_block->fn = posix_cpu_nsleep_restart;
1377 restart_block->nanosleep.clockid = which_clock;
1378 restart_block->nanosleep.rmtp = rmtp;
1379 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1380 }
1381 return error;
1382}
1383
1384static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1385{
1386 clockid_t which_clock = restart_block->nanosleep.clockid;
1387 struct timespec t;
1388 struct itimerspec it;
1389 int error;
1390
1391 t = ns_to_timespec(restart_block->nanosleep.expires);
1392
1393 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1394
1395 if (error == -ERESTART_RESTARTBLOCK) {
1396 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1397 /*
1398 * Report back to the user the time still remaining.
1399 */
1400 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1401 return -EFAULT;
1402
1403 restart_block->nanosleep.expires = timespec_to_ns(&t);
1404 }
1405 return error;
1406
1407}
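/*
 * For illustration only (not part of this file): a user-space sketch of
 * the path above -- a relative clock_nanosleep() on the process CPU
 * clock. A second thread has to burn CPU, otherwise the process clock
 * never advances and the sleep never ends. Assumes the librt/pthread
 * wrappers (link with -lrt -lpthread).
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static void *spin(void *unused)
{
	(void)unused;
	for (;;)
		;	/* consume CPU so CLOCK_PROCESS_CPUTIME_ID advances */
}

int main(void)
{
	struct timespec req = { .tv_sec = 1, .tv_nsec = 0 }, rem;
	pthread_t worker;
	int err;

	pthread_create(&worker, NULL, spin, NULL);

	/* Returns once the whole process has consumed one more CPU second. */
	while ((err = clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, 0,
				      &req, &rem)) == EINTR)
		req = rem;	/* interrupted: retry with the remainder */

	printf("done, err=%d\n", err);
	return err ? 1 : 0;
}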
1408
1409#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1410#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1411
1412static int process_cpu_clock_getres(const clockid_t which_clock,
1413 struct timespec *tp)
1414{
1415 return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1416}
1417static int process_cpu_clock_get(const clockid_t which_clock,
1418 struct timespec *tp)
1419{
1420 return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1421}
1422static int process_cpu_timer_create(struct k_itimer *timer)
1423{
1424 timer->it_clock = PROCESS_CLOCK;
1425 return posix_cpu_timer_create(timer);
1426}
1427static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1428 struct timespec *rqtp,
1429 struct timespec __user *rmtp)
1430{
1431 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
1432}
1433static long process_cpu_nsleep_restart(struct restart_block *restart_block)
1434{
1435 return -EINVAL;
1436}
1437static int thread_cpu_clock_getres(const clockid_t which_clock,
1438 struct timespec *tp)
1439{
1440 return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1441}
1442static int thread_cpu_clock_get(const clockid_t which_clock,
1443 struct timespec *tp)
1444{
1445 return posix_cpu_clock_get(THREAD_CLOCK, tp);
1446}
1447static int thread_cpu_timer_create(struct k_itimer *timer)
1448{
1449 timer->it_clock = THREAD_CLOCK;
1450 return posix_cpu_timer_create(timer);
1451}
1452
1453struct k_clock clock_posix_cpu = {
1454 .clock_getres = posix_cpu_clock_getres,
1455 .clock_set = posix_cpu_clock_set,
1456 .clock_get = posix_cpu_clock_get,
1457 .timer_create = posix_cpu_timer_create,
1458 .nsleep = posix_cpu_nsleep,
1459 .nsleep_restart = posix_cpu_nsleep_restart,
1460 .timer_set = posix_cpu_timer_set,
1461 .timer_del = posix_cpu_timer_del,
1462 .timer_get = posix_cpu_timer_get,
1463};
1464
1465static __init int init_posix_cpu_timers(void)
1466{
1467 struct k_clock process = {
1468 .clock_getres = process_cpu_clock_getres,
1469 .clock_get = process_cpu_clock_get,
1470 .timer_create = process_cpu_timer_create,
1471 .nsleep = process_cpu_nsleep,
1472 .nsleep_restart = process_cpu_nsleep_restart,
1473 };
1474 struct k_clock thread = {
1475 .clock_getres = thread_cpu_clock_getres,
1476 .clock_get = thread_cpu_clock_get,
1477 .timer_create = thread_cpu_timer_create,
1478 };
1479 struct timespec ts;
1480
1481 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1482 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1483
1484 cputime_to_timespec(cputime_one_jiffy, &ts);
1485 onecputick = ts.tv_nsec;
1486 WARN_ON(ts.tv_sec != 0);
1487
1488 return 0;
1489}
1490__initcall(init_posix_cpu_timers);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
new file mode 100644
index 000000000000..42b463ad90f2
--- /dev/null
+++ b/kernel/time/posix-timers.c
@@ -0,0 +1,1123 @@
1/*
2 * linux/kernel/posix-timers.c
3 *
4 *
5 * 2002-10-15 Posix Clocks & timers
6 * by George Anzinger george@mvista.com
7 *
8 * Copyright (C) 2002 2003 by MontaVista Software.
9 *
10 * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
11 * Copyright (C) 2004 Boris Hu
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or (at
16 * your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful, but
19 * WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA
28 */
29
30/* These are all the functions necessary to implement
31 * POSIX clocks & timers
32 */
33#include <linux/mm.h>
34#include <linux/interrupt.h>
35#include <linux/slab.h>
36#include <linux/time.h>
37#include <linux/mutex.h>
38
39#include <asm/uaccess.h>
40#include <linux/list.h>
41#include <linux/init.h>
42#include <linux/compiler.h>
43#include <linux/hash.h>
44#include <linux/posix-clock.h>
45#include <linux/posix-timers.h>
46#include <linux/syscalls.h>
47#include <linux/wait.h>
48#include <linux/workqueue.h>
49#include <linux/export.h>
50#include <linux/hashtable.h>
51
52#include "timekeeping.h"
53
54/*
55 * Management arrays for POSIX timers. Timers are kept in a static hash
56 * table with 512 entries.
57 * Timer ids are allocated by a local routine, which selects the proper
58 * hash head by a key constructed from the current->signal address and a
59 * per-signal-struct counter. This keeps timer ids unique per process,
60 * but ids may now collide between processes.
61 */
62
63/*
64 * Lets keep our timers in a slab cache :-)
65 */
66static struct kmem_cache *posix_timers_cache;
67
68static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
69static DEFINE_SPINLOCK(hash_lock);
70
71/*
72 * We assume that the new SIGEV_THREAD_ID shares no bits with the other
73 * SIGEV values. The check below fails the build if this assumption breaks.
74 */
75#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
76 ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
77#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
78#endif
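/*
 * Illustrative user-space sketch (assumptions only, not taken from this
 * file): because the bits are disjoint, SIGEV_THREAD_ID can be OR'ed onto
 * SIGEV_SIGNAL to direct the signal at one specific thread. The
 * sigev_notify_thread_id accessor and gettid() may need the raw
 * definitions below on older glibc.
 */
#define _GNU_SOURCE
#include <signal.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

#ifndef sigev_notify_thread_id
# define sigev_notify_thread_id _sigev_un._tid
#endif

static int create_thread_directed_timer(timer_t *id)
{
	struct sigevent sev = { 0 };

	sev.sigev_notify = SIGEV_SIGNAL | SIGEV_THREAD_ID;
	sev.sigev_signo = SIGRTMIN;
	sev.sigev_notify_thread_id = syscall(SYS_gettid);

	return timer_create(CLOCK_MONOTONIC, &sev, id);
}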
79
80/*
81 * parisc wants ENOTSUP instead of EOPNOTSUPP
82 */
83#ifndef ENOTSUP
84# define ENANOSLEEP_NOTSUP EOPNOTSUPP
85#else
86# define ENANOSLEEP_NOTSUP ENOTSUP
87#endif
88
89/*
90 * The timer ID is turned into a timer address by posix_timer_by_id().
91 * Verifying a valid ID consists of:
92 *
93 * a) checking that the lookup returns a timer rather than NULL.
94 * b) checking that the timer id matches the one in the timer itself.
95 * c) checking that the timer owner is in the caller's thread group.
96 */
97
98/*
99 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
100 * to implement others. This structure defines the various
101 * clocks.
102 *
103 * RESOLUTION: Clock resolution is used to round up timer and interval
104 * times, NOT to report clock times, which are reported with as
105 * much resolution as the system can muster. In some cases this
106 * resolution may depend on the underlying clock hardware and
107 * may not be quantifiable until run time, and only then can the
108 * necessary code be written. The standard says we should say
109 * something about this issue in the documentation...
110 *
111 * FUNCTIONS: The CLOCKs structure defines possible functions to
112 * handle various clock functions.
113 *
114 * The standard POSIX timer management code assumes the
115 * following: 1.) The k_itimer struct (sched.h) is used for
116 * the timer. 2.) The list, it_lock, it_clock, it_id and
117 * it_pid fields are not modified by timer code.
118 *
119 * Permissions: It is assumed that the clock_settime() function defined
120 * for each clock will take care of permission checks. Some
121 * clocks may be settable by any user (i.e. local process
122 * clocks), others not. Currently the only settable clock we
123 * have is CLOCK_REALTIME and its high-res counterpart, both of
124 * which we beg off on and pass to do_sys_settimeofday().
125 */
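/*
 * Illustration (user-space sketch, not from this file): clock_getres()
 * reports the rounding granularity described above, while clock_gettime()
 * still returns the best reading the clock can provide.
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec res, now;

	clock_getres(CLOCK_MONOTONIC, &res);
	clock_gettime(CLOCK_MONOTONIC, &now);
	printf("resolution: %ld ns, now: %lld.%09ld s\n",
	       res.tv_nsec, (long long)now.tv_sec, now.tv_nsec);
	return 0;
}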
126
127static struct k_clock posix_clocks[MAX_CLOCKS];
128
129/*
130 * These ones are defined below.
131 */
132static int common_nsleep(const clockid_t, int flags, struct timespec *t,
133 struct timespec __user *rmtp);
134static int common_timer_create(struct k_itimer *new_timer);
135static void common_timer_get(struct k_itimer *, struct itimerspec *);
136static int common_timer_set(struct k_itimer *, int,
137 struct itimerspec *, struct itimerspec *);
138static int common_timer_del(struct k_itimer *timer);
139
140static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
141
142static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
143
144#define lock_timer(tid, flags) \
145({ struct k_itimer *__timr; \
146 __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
147 __timr; \
148})
149
150static int hash(struct signal_struct *sig, unsigned int nr)
151{
152 return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
153}
154
155static struct k_itimer *__posix_timers_find(struct hlist_head *head,
156 struct signal_struct *sig,
157 timer_t id)
158{
159 struct k_itimer *timer;
160
161 hlist_for_each_entry_rcu(timer, head, t_hash) {
162 if ((timer->it_signal == sig) && (timer->it_id == id))
163 return timer;
164 }
165 return NULL;
166}
167
168static struct k_itimer *posix_timer_by_id(timer_t id)
169{
170 struct signal_struct *sig = current->signal;
171 struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
172
173 return __posix_timers_find(head, sig, id);
174}
175
176static int posix_timer_add(struct k_itimer *timer)
177{
178 struct signal_struct *sig = current->signal;
179 int first_free_id = sig->posix_timer_id;
180 struct hlist_head *head;
181 int ret = -ENOENT;
182
183 do {
184 spin_lock(&hash_lock);
185 head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
186 if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
187 hlist_add_head_rcu(&timer->t_hash, head);
188 ret = sig->posix_timer_id;
189 }
190 if (++sig->posix_timer_id < 0)
191 sig->posix_timer_id = 0;
192 if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
193 /* Loop over all possible ids completed */
194 ret = -EAGAIN;
195 spin_unlock(&hash_lock);
196 } while (ret == -ENOENT);
197 return ret;
198}
199
200static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
201{
202 spin_unlock_irqrestore(&timr->it_lock, flags);
203}
204
205/* Get clock_realtime */
206static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
207{
208 ktime_get_real_ts(tp);
209 return 0;
210}
211
212/* Set clock_realtime */
213static int posix_clock_realtime_set(const clockid_t which_clock,
214 const struct timespec *tp)
215{
216 return do_sys_settimeofday(tp, NULL);
217}
218
219static int posix_clock_realtime_adj(const clockid_t which_clock,
220 struct timex *t)
221{
222 return do_adjtimex(t);
223}
224
225/*
226 * Get monotonic time for posix timers
227 */
228static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
229{
230 ktime_get_ts(tp);
231 return 0;
232}
233
234/*
235 * Get monotonic-raw time for posix timers
236 */
237static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
238{
239 getrawmonotonic(tp);
240 return 0;
241}
242
243
244static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
245{
246 *tp = current_kernel_time();
247 return 0;
248}
249
250static int posix_get_monotonic_coarse(clockid_t which_clock,
251 struct timespec *tp)
252{
253 *tp = get_monotonic_coarse();
254 return 0;
255}
256
257static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
258{
259 *tp = ktime_to_timespec(KTIME_LOW_RES);
260 return 0;
261}
262
263static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
264{
265 get_monotonic_boottime(tp);
266 return 0;
267}
268
269static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
270{
271 timekeeping_clocktai(tp);
272 return 0;
273}
274
275/*
276 * Initialize everything, well, just everything in Posix clocks/timers ;)
277 */
278static __init int init_posix_timers(void)
279{
280 struct k_clock clock_realtime = {
281 .clock_getres = hrtimer_get_res,
282 .clock_get = posix_clock_realtime_get,
283 .clock_set = posix_clock_realtime_set,
284 .clock_adj = posix_clock_realtime_adj,
285 .nsleep = common_nsleep,
286 .nsleep_restart = hrtimer_nanosleep_restart,
287 .timer_create = common_timer_create,
288 .timer_set = common_timer_set,
289 .timer_get = common_timer_get,
290 .timer_del = common_timer_del,
291 };
292 struct k_clock clock_monotonic = {
293 .clock_getres = hrtimer_get_res,
294 .clock_get = posix_ktime_get_ts,
295 .nsleep = common_nsleep,
296 .nsleep_restart = hrtimer_nanosleep_restart,
297 .timer_create = common_timer_create,
298 .timer_set = common_timer_set,
299 .timer_get = common_timer_get,
300 .timer_del = common_timer_del,
301 };
302 struct k_clock clock_monotonic_raw = {
303 .clock_getres = hrtimer_get_res,
304 .clock_get = posix_get_monotonic_raw,
305 };
306 struct k_clock clock_realtime_coarse = {
307 .clock_getres = posix_get_coarse_res,
308 .clock_get = posix_get_realtime_coarse,
309 };
310 struct k_clock clock_monotonic_coarse = {
311 .clock_getres = posix_get_coarse_res,
312 .clock_get = posix_get_monotonic_coarse,
313 };
314 struct k_clock clock_tai = {
315 .clock_getres = hrtimer_get_res,
316 .clock_get = posix_get_tai,
317 .nsleep = common_nsleep,
318 .nsleep_restart = hrtimer_nanosleep_restart,
319 .timer_create = common_timer_create,
320 .timer_set = common_timer_set,
321 .timer_get = common_timer_get,
322 .timer_del = common_timer_del,
323 };
324 struct k_clock clock_boottime = {
325 .clock_getres = hrtimer_get_res,
326 .clock_get = posix_get_boottime,
327 .nsleep = common_nsleep,
328 .nsleep_restart = hrtimer_nanosleep_restart,
329 .timer_create = common_timer_create,
330 .timer_set = common_timer_set,
331 .timer_get = common_timer_get,
332 .timer_del = common_timer_del,
333 };
334
335 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
336 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
337 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
338 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
339 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
340 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
341 posix_timers_register_clock(CLOCK_TAI, &clock_tai);
342
343 posix_timers_cache = kmem_cache_create("posix_timers_cache",
344 sizeof (struct k_itimer), 0, SLAB_PANIC,
345 NULL);
346 return 0;
347}
348
349__initcall(init_posix_timers);
350
351static void schedule_next_timer(struct k_itimer *timr)
352{
353 struct hrtimer *timer = &timr->it.real.timer;
354
355 if (timr->it.real.interval.tv64 == 0)
356 return;
357
358 timr->it_overrun += (unsigned int) hrtimer_forward(timer,
359 timer->base->get_time(),
360 timr->it.real.interval);
361
362 timr->it_overrun_last = timr->it_overrun;
363 timr->it_overrun = -1;
364 ++timr->it_requeue_pending;
365 hrtimer_restart(timer);
366}
367
368/*
369 * This function is exported for use by the signal delivery code. It is
370 * called just prior to the info block being released and passes that
371 * block to us. Its function is to update the overrun entry AND to
372 * restart the timer. It should only be called if the timer is to be
373 * restarted (i.e. we have flagged this in the sys_private entry of the
374 * info block).
375 *
376 * To protect against the timer going away while the interrupt is queued,
377 * we require that the it_requeue_pending flag be set.
378 */
379void do_schedule_next_timer(struct siginfo *info)
380{
381 struct k_itimer *timr;
382 unsigned long flags;
383
384 timr = lock_timer(info->si_tid, &flags);
385
386 if (timr && timr->it_requeue_pending == info->si_sys_private) {
387 if (timr->it_clock < 0)
388 posix_cpu_timer_schedule(timr);
389 else
390 schedule_next_timer(timr);
391
392 info->si_overrun += timr->it_overrun_last;
393 }
394
395 if (timr)
396 unlock_timer(timr, flags);
397}
398
399int posix_timer_event(struct k_itimer *timr, int si_private)
400{
401 struct task_struct *task;
402 int shared, ret = -1;
403 /*
404 * FIXME: if ->sigq is queued we can race with
405 * dequeue_signal()->do_schedule_next_timer().
406 *
407 * If dequeue_signal() sees the "right" value of
408 * si_sys_private it calls do_schedule_next_timer().
409 * We re-queue ->sigq and drop ->it_lock().
410 * do_schedule_next_timer() locks the timer
411 * and re-schedules it while ->sigq is pending.
412 * Not really bad, but not what we want.
413 */
414 timr->sigq->info.si_sys_private = si_private;
415
416 rcu_read_lock();
417 task = pid_task(timr->it_pid, PIDTYPE_PID);
418 if (task) {
419 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
420 ret = send_sigqueue(timr->sigq, task, shared);
421 }
422 rcu_read_unlock();
423 /* If we failed to send the signal the timer stops. */
424 return ret > 0;
425}
426EXPORT_SYMBOL_GPL(posix_timer_event);
427
428/*
429 * This function gets called when a POSIX.1b interval timer expires. It
430 * is used as a callback from the kernel internal timer. The
431 * run_timer_list code ALWAYS calls with interrupts on.
432 *
433 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
434 */
435static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
436{
437 struct k_itimer *timr;
438 unsigned long flags;
439 int si_private = 0;
440 enum hrtimer_restart ret = HRTIMER_NORESTART;
441
442 timr = container_of(timer, struct k_itimer, it.real.timer);
443 spin_lock_irqsave(&timr->it_lock, flags);
444
445 if (timr->it.real.interval.tv64 != 0)
446 si_private = ++timr->it_requeue_pending;
447
448 if (posix_timer_event(timr, si_private)) {
449 /*
450 * The signal was not sent because of SIG_IGN;
451 * we will not get a callback to restart it AND
452 * it should be restarted.
453 */
454 if (timr->it.real.interval.tv64 != 0) {
455 ktime_t now = hrtimer_cb_get_time(timer);
456
457 /*
458 * FIXME: What we really want, is to stop this
459 * timer completely and restart it in case the
460 * SIG_IGN is removed. This is a non trivial
461 * change which involves sighand locking
462 * (sigh !), which we don't want to do late in
463 * the release cycle.
464 *
465 * For now we just let timers with an interval
466 * less than a jiffie expire every jiffie to
467 * avoid softirq starvation in case of SIG_IGN
468 * and a very small interval, which would put
469 * the timer right back on the softirq pending
470 * list. By moving now ahead of time we trick
471 * hrtimer_forward() to expire the timer
472 * later, while we still maintain the overrun
473 * accuracy, but have some inconsistency in
474 * the timer_gettime() case. This is at least
475 * better than a starved softirq. A more
476 * complex fix which solves also another related
477 * inconsistency is already in the pipeline.
478 */
479#ifdef CONFIG_HIGH_RES_TIMERS
480 {
481 ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ);
482
483 if (timr->it.real.interval.tv64 < kj.tv64)
484 now = ktime_add(now, kj);
485 }
486#endif
487 timr->it_overrun += (unsigned int)
488 hrtimer_forward(timer, now,
489 timr->it.real.interval);
490 ret = HRTIMER_RESTART;
491 ++timr->it_requeue_pending;
492 }
493 }
494
495 unlock_timer(timr, flags);
496 return ret;
497}
498
499static struct pid *good_sigevent(sigevent_t * event)
500{
501 struct task_struct *rtn = current->group_leader;
502
503 if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
504 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
505 !same_thread_group(rtn, current) ||
506 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
507 return NULL;
508
509 if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
510 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
511 return NULL;
512
513 return task_pid(rtn);
514}
515
516void posix_timers_register_clock(const clockid_t clock_id,
517 struct k_clock *new_clock)
518{
519 if ((unsigned) clock_id >= MAX_CLOCKS) {
520 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
521 clock_id);
522 return;
523 }
524
525 if (!new_clock->clock_get) {
526 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
527 clock_id);
528 return;
529 }
530 if (!new_clock->clock_getres) {
531 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
532 clock_id);
533 return;
534 }
535
536 posix_clocks[clock_id] = *new_clock;
537}
538EXPORT_SYMBOL_GPL(posix_timers_register_clock);
539
540static struct k_itimer * alloc_posix_timer(void)
541{
542 struct k_itimer *tmr;
543 tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
544 if (!tmr)
545 return tmr;
546 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
547 kmem_cache_free(posix_timers_cache, tmr);
548 return NULL;
549 }
550 memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
551 return tmr;
552}
553
554static void k_itimer_rcu_free(struct rcu_head *head)
555{
556 struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
557
558 kmem_cache_free(posix_timers_cache, tmr);
559}
560
561#define IT_ID_SET 1
562#define IT_ID_NOT_SET 0
563static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
564{
565 if (it_id_set) {
566 unsigned long flags;
567 spin_lock_irqsave(&hash_lock, flags);
568 hlist_del_rcu(&tmr->t_hash);
569 spin_unlock_irqrestore(&hash_lock, flags);
570 }
571 put_pid(tmr->it_pid);
572 sigqueue_free(tmr->sigq);
573 call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
574}
575
576static struct k_clock *clockid_to_kclock(const clockid_t id)
577{
578 if (id < 0)
579 return (id & CLOCKFD_MASK) == CLOCKFD ?
580 &clock_posix_dynamic : &clock_posix_cpu;
581
582 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
583 return NULL;
584 return &posix_clocks[id];
585}
586
587static int common_timer_create(struct k_itimer *new_timer)
588{
589 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
590 return 0;
591}
592
593/* Create a POSIX.1b interval timer. */
594
595SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
596 struct sigevent __user *, timer_event_spec,
597 timer_t __user *, created_timer_id)
598{
599 struct k_clock *kc = clockid_to_kclock(which_clock);
600 struct k_itimer *new_timer;
601 int error, new_timer_id;
602 sigevent_t event;
603 int it_id_set = IT_ID_NOT_SET;
604
605 if (!kc)
606 return -EINVAL;
607 if (!kc->timer_create)
608 return -EOPNOTSUPP;
609
610 new_timer = alloc_posix_timer();
611 if (unlikely(!new_timer))
612 return -EAGAIN;
613
614 spin_lock_init(&new_timer->it_lock);
615 new_timer_id = posix_timer_add(new_timer);
616 if (new_timer_id < 0) {
617 error = new_timer_id;
618 goto out;
619 }
620
621 it_id_set = IT_ID_SET;
622 new_timer->it_id = (timer_t) new_timer_id;
623 new_timer->it_clock = which_clock;
624 new_timer->it_overrun = -1;
625
626 if (timer_event_spec) {
627 if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
628 error = -EFAULT;
629 goto out;
630 }
631 rcu_read_lock();
632 new_timer->it_pid = get_pid(good_sigevent(&event));
633 rcu_read_unlock();
634 if (!new_timer->it_pid) {
635 error = -EINVAL;
636 goto out;
637 }
638 } else {
639 event.sigev_notify = SIGEV_SIGNAL;
640 event.sigev_signo = SIGALRM;
641 event.sigev_value.sival_int = new_timer->it_id;
642 new_timer->it_pid = get_pid(task_tgid(current));
643 }
644
645 new_timer->it_sigev_notify = event.sigev_notify;
646 new_timer->sigq->info.si_signo = event.sigev_signo;
647 new_timer->sigq->info.si_value = event.sigev_value;
648 new_timer->sigq->info.si_tid = new_timer->it_id;
649 new_timer->sigq->info.si_code = SI_TIMER;
650
651 if (copy_to_user(created_timer_id,
652 &new_timer_id, sizeof (new_timer_id))) {
653 error = -EFAULT;
654 goto out;
655 }
656
657 error = kc->timer_create(new_timer);
658 if (error)
659 goto out;
660
661 spin_lock_irq(&current->sighand->siglock);
662 new_timer->it_signal = current->signal;
663 list_add(&new_timer->list, &current->signal->posix_timers);
664 spin_unlock_irq(&current->sighand->siglock);
665
666 return 0;
667 /*
668 * In the case of the timer belonging to another task, after
669 * the task is unlocked, the timer is owned by the other task
670 * and may cease to exist at any time. Don't use or modify
671 * new_timer after the unlock call.
672 */
673out:
674 release_posix_timer(new_timer, it_id_set);
675 return error;
676}
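/*
 * For illustration (user-space sketch, not part of this file): passing a
 * NULL sigevent to timer_create() takes the default branch above, i.e.
 * SIGEV_SIGNAL with SIGALRM and the new timer id in sigev_value. Assumes
 * the librt wrapper (link with -lrt).
 */
#include <signal.h>
#include <time.h>
#include <unistd.h>

static void on_alarm(int sig, siginfo_t *si, void *uc)
{
	/* si->si_code is SI_TIMER, si->si_value.sival_int is the timer id */
	(void)sig; (void)si; (void)uc;
}

int main(void)
{
	struct itimerspec its = {
		.it_value    = { .tv_sec = 1 },	/* first expiry after 1s */
		.it_interval = { .tv_sec = 1 },	/* then every second */
	};
	struct sigaction sa;
	timer_t tid;

	sa.sa_flags = SA_SIGINFO;
	sa.sa_sigaction = on_alarm;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGALRM, &sa, NULL);

	if (timer_create(CLOCK_MONOTONIC, NULL, &tid))
		return 1;
	timer_settime(tid, 0, &its, NULL);
	pause();			/* wait for the first SIGALRM */
	timer_delete(tid);
	return 0;
}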
677
678/*
679 * Locking issues: We need to protect the result of the id lookup until
680 * we get the timer locked down so it is not deleted under us. The
681 * removal is done under the hash_lock spinlock, so we use that here to
682 * bridge the find to the timer lock. To avoid a deadlock, the timer id
683 * MUST be released without holding the timer lock.
684 */
685static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
686{
687 struct k_itimer *timr;
688
689 /*
690 * timer_t could be any type >= int and we want to make sure any
691 * @timer_id outside positive int range fails lookup.
692 */
693 if ((unsigned long long)timer_id > INT_MAX)
694 return NULL;
695
696 rcu_read_lock();
697 timr = posix_timer_by_id(timer_id);
698 if (timr) {
699 spin_lock_irqsave(&timr->it_lock, *flags);
700 if (timr->it_signal == current->signal) {
701 rcu_read_unlock();
702 return timr;
703 }
704 spin_unlock_irqrestore(&timr->it_lock, *flags);
705 }
706 rcu_read_unlock();
707
708 return NULL;
709}
710
711/*
712 * Get the time remaining on a POSIX.1b interval timer. This function
713 * is ALWAYS called with spin_lock_irq on the timer, thus it must not
714 * mess with irq.
715 *
716 * We have a couple of messes to clean up here. First there is the case
717 * of a timer that has a requeue pending. These timers should appear to
718 * be in the timer list with an expiry as if we were to requeue them
719 * now.
720 *
721 * The second issue is the SIGEV_NONE timer which may be active but is
722 * not really ever put in the timer list (to save system resources).
723 * This timer may already have expired, and if so, we expire it here.
724 * Otherwise it is treated the same as a requeue-pending timer with
725 * respect to what we report.
726 */
727static void
728common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
729{
730 ktime_t now, remaining, iv;
731 struct hrtimer *timer = &timr->it.real.timer;
732
733 memset(cur_setting, 0, sizeof(struct itimerspec));
734
735 iv = timr->it.real.interval;
736
737 /* interval timer ? */
738 if (iv.tv64)
739 cur_setting->it_interval = ktime_to_timespec(iv);
740 else if (!hrtimer_active(timer) &&
741 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
742 return;
743
744 now = timer->base->get_time();
745
746 /*
747 * When a requeue is pending or this is a SIGEV_NONE
748 * timer move the expiry time forward by intervals, so
749 * expiry is > now.
750 */
751 if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
752 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
753 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
754
755 remaining = ktime_sub(hrtimer_get_expires(timer), now);
756 /* Return 0 only, when the timer is expired and not pending */
757 if (remaining.tv64 <= 0) {
758 /*
759 * A single shot SIGEV_NONE timer must return 0, when
760 * it is expired !
761 */
762 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
763 cur_setting->it_value.tv_nsec = 1;
764 } else
765 cur_setting->it_value = ktime_to_timespec(remaining);
766}
767
768/* Get the time remaining on a POSIX.1b interval timer. */
769SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
770 struct itimerspec __user *, setting)
771{
772 struct itimerspec cur_setting;
773 struct k_itimer *timr;
774 struct k_clock *kc;
775 unsigned long flags;
776 int ret = 0;
777
778 timr = lock_timer(timer_id, &flags);
779 if (!timr)
780 return -EINVAL;
781
782 kc = clockid_to_kclock(timr->it_clock);
783 if (WARN_ON_ONCE(!kc || !kc->timer_get))
784 ret = -EINVAL;
785 else
786 kc->timer_get(timr, &cur_setting);
787
788 unlock_timer(timr, flags);
789
790 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
791 return -EFAULT;
792
793 return ret;
794}
795
796/*
797 * Get the number of overruns of a POSIX.1b interval timer. This is
798 * the overrun count of the last delivered expiry; at the same time we
799 * are accumulating overruns for the next one. The overrun is frozen when
800 * the signal is delivered, either at the notify time (if the info block
801 * is not queued) or at the actual delivery time (as we are informed by
802 * the callback to do_schedule_next_timer()). So all we need to do is
803 * pick up the frozen overrun.
804 */
805SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
806{
807 struct k_itimer *timr;
808 int overrun;
809 unsigned long flags;
810
811 timr = lock_timer(timer_id, &flags);
812 if (!timr)
813 return -EINVAL;
814
815 overrun = timr->it_overrun_last;
816 unlock_timer(timr, flags);
817
818 return overrun;
819}
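/*
 * Illustrative sketch (not from this file; assumes the timer was created
 * with sigev_value.sival_ptr pointing at its timer_t): picking up the
 * frozen overrun count from the signal handler.
 */
#include <signal.h>
#include <time.h>

static void timer_handler(int sig, siginfo_t *si, void *ctx)
{
	timer_t *tidp = si->si_value.sival_ptr;
	int missed = timer_getoverrun(*tidp);

	(void)sig; (void)ctx; (void)missed;
	/* 'missed' expiries were coalesced into this one signal */
}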
820
821/* Set a POSIX.1b interval timer. */
822/* timr->it_lock is taken. */
823static int
824common_timer_set(struct k_itimer *timr, int flags,
825 struct itimerspec *new_setting, struct itimerspec *old_setting)
826{
827 struct hrtimer *timer = &timr->it.real.timer;
828 enum hrtimer_mode mode;
829
830 if (old_setting)
831 common_timer_get(timr, old_setting);
832
833 /* disable the timer */
834 timr->it.real.interval.tv64 = 0;
835 /*
836 * careful here. If smp we could be in the "fire" routine which will
837 * be spinning as we hold the lock. But this is ONLY an SMP issue.
838 */
839 if (hrtimer_try_to_cancel(timer) < 0)
840 return TIMER_RETRY;
841
842 timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
843 ~REQUEUE_PENDING;
844 timr->it_overrun_last = 0;
845
846 /* switch off the timer when it_value is zero */
847 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
848 return 0;
849
850 mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
851 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
852 timr->it.real.timer.function = posix_timer_fn;
853
854 hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
855
856 /* Convert interval */
857 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
858
859 /* SIGEV_NONE timers are not queued ! See common_timer_get */
860 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
861 /* Setup correct expiry time for relative timers */
862 if (mode == HRTIMER_MODE_REL) {
863 hrtimer_add_expires(timer, timer->base->get_time());
864 }
865 return 0;
866 }
867
868 hrtimer_start_expires(timer, mode);
869 return 0;
870}
871
872/* Set a POSIX.1b interval timer */
873SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
874 const struct itimerspec __user *, new_setting,
875 struct itimerspec __user *, old_setting)
876{
877 struct k_itimer *timr;
878 struct itimerspec new_spec, old_spec;
879 int error = 0;
880 unsigned long flag;
881 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
882 struct k_clock *kc;
883
884 if (!new_setting)
885 return -EINVAL;
886
887 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
888 return -EFAULT;
889
890 if (!timespec_valid(&new_spec.it_interval) ||
891 !timespec_valid(&new_spec.it_value))
892 return -EINVAL;
893retry:
894 timr = lock_timer(timer_id, &flag);
895 if (!timr)
896 return -EINVAL;
897
898 kc = clockid_to_kclock(timr->it_clock);
899 if (WARN_ON_ONCE(!kc || !kc->timer_set))
900 error = -EINVAL;
901 else
902 error = kc->timer_set(timr, flags, &new_spec, rtn);
903
904 unlock_timer(timr, flag);
905 if (error == TIMER_RETRY) {
906 rtn = NULL; // We already got the old time...
907 goto retry;
908 }
909
910 if (old_setting && !error &&
911 copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
912 error = -EFAULT;
913
914 return error;
915}
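/*
 * Illustrative user-space sketch (assumes a timer 'tid' already created
 * on CLOCK_REALTIME; not taken from this file): arming an absolute
 * one-shot expiry five seconds from now with TIMER_ABSTIME, while
 * retrieving the previous setting in the same call.
 */
#include <time.h>

static int arm_absolute(timer_t tid)
{
	struct itimerspec new_its = { 0 }, old_its;

	clock_gettime(CLOCK_REALTIME, &new_its.it_value);
	new_its.it_value.tv_sec += 5;	/* one-shot: it_interval stays zero */

	return timer_settime(tid, TIMER_ABSTIME, &new_its, &old_its);
}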
916
917static int common_timer_del(struct k_itimer *timer)
918{
919 timer->it.real.interval.tv64 = 0;
920
921 if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
922 return TIMER_RETRY;
923 return 0;
924}
925
926static inline int timer_delete_hook(struct k_itimer *timer)
927{
928 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
929
930 if (WARN_ON_ONCE(!kc || !kc->timer_del))
931 return -EINVAL;
932 return kc->timer_del(timer);
933}
934
935/* Delete a POSIX.1b interval timer. */
936SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
937{
938 struct k_itimer *timer;
939 unsigned long flags;
940
941retry_delete:
942 timer = lock_timer(timer_id, &flags);
943 if (!timer)
944 return -EINVAL;
945
946 if (timer_delete_hook(timer) == TIMER_RETRY) {
947 unlock_timer(timer, flags);
948 goto retry_delete;
949 }
950
951 spin_lock(&current->sighand->siglock);
952 list_del(&timer->list);
953 spin_unlock(&current->sighand->siglock);
954 /*
955 * This keeps any tasks waiting on the spin lock from thinking
956 * they got something (see the lock code above).
957 */
958 timer->it_signal = NULL;
959
960 unlock_timer(timer, flags);
961 release_posix_timer(timer, IT_ID_SET);
962 return 0;
963}
964
965/*
966 * Delete a timer owned by the process; used by exit_itimers().
967 */
968static void itimer_delete(struct k_itimer *timer)
969{
970 unsigned long flags;
971
972retry_delete:
973 spin_lock_irqsave(&timer->it_lock, flags);
974
975 if (timer_delete_hook(timer) == TIMER_RETRY) {
976 unlock_timer(timer, flags);
977 goto retry_delete;
978 }
979 list_del(&timer->list);
980 /*
981 * This keeps any tasks waiting on the spin lock from thinking
982 * they got something (see the lock code above).
983 */
984 timer->it_signal = NULL;
985
986 unlock_timer(timer, flags);
987 release_posix_timer(timer, IT_ID_SET);
988}
989
990/*
991 * This is called by do_exit or de_thread, only when there are no more
992 * references to the shared signal_struct.
993 */
994void exit_itimers(struct signal_struct *sig)
995{
996 struct k_itimer *tmr;
997
998 while (!list_empty(&sig->posix_timers)) {
999 tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
1000 itimer_delete(tmr);
1001 }
1002}
1003
1004SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
1005 const struct timespec __user *, tp)
1006{
1007 struct k_clock *kc = clockid_to_kclock(which_clock);
1008 struct timespec new_tp;
1009
1010 if (!kc || !kc->clock_set)
1011 return -EINVAL;
1012
1013 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
1014 return -EFAULT;
1015
1016 return kc->clock_set(which_clock, &new_tp);
1017}
1018
1019SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
1020 struct timespec __user *,tp)
1021{
1022 struct k_clock *kc = clockid_to_kclock(which_clock);
1023 struct timespec kernel_tp;
1024 int error;
1025
1026 if (!kc)
1027 return -EINVAL;
1028
1029 error = kc->clock_get(which_clock, &kernel_tp);
1030
1031 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
1032 error = -EFAULT;
1033
1034 return error;
1035}
1036
1037SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
1038 struct timex __user *, utx)
1039{
1040 struct k_clock *kc = clockid_to_kclock(which_clock);
1041 struct timex ktx;
1042 int err;
1043
1044 if (!kc)
1045 return -EINVAL;
1046 if (!kc->clock_adj)
1047 return -EOPNOTSUPP;
1048
1049 if (copy_from_user(&ktx, utx, sizeof(ktx)))
1050 return -EFAULT;
1051
1052 err = kc->clock_adj(which_clock, &ktx);
1053
1054 if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
1055 return -EFAULT;
1056
1057 return err;
1058}
1059
1060SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
1061 struct timespec __user *, tp)
1062{
1063 struct k_clock *kc = clockid_to_kclock(which_clock);
1064 struct timespec rtn_tp;
1065 int error;
1066
1067 if (!kc)
1068 return -EINVAL;
1069
1070 error = kc->clock_getres(which_clock, &rtn_tp);
1071
1072 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
1073 error = -EFAULT;
1074
1075 return error;
1076}
1077
1078/*
1079 * nanosleep for monotonic and realtime clocks
1080 */
1081static int common_nsleep(const clockid_t which_clock, int flags,
1082 struct timespec *tsave, struct timespec __user *rmtp)
1083{
1084 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
1085 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
1086 which_clock);
1087}
1088
1089SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1090 const struct timespec __user *, rqtp,
1091 struct timespec __user *, rmtp)
1092{
1093 struct k_clock *kc = clockid_to_kclock(which_clock);
1094 struct timespec t;
1095
1096 if (!kc)
1097 return -EINVAL;
1098 if (!kc->nsleep)
1099 return -ENANOSLEEP_NOTSUP;
1100
1101 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1102 return -EFAULT;
1103
1104 if (!timespec_valid(&t))
1105 return -EINVAL;
1106
1107 return kc->nsleep(which_clock, flags, &t, rmtp);
1108}
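/*
 * Illustration (user-space sketch, not from this file): an absolute sleep
 * needs no rmtp -- when a signal interrupts it, the call can simply be
 * reissued with the same target time, which is why the TIMER_ABSTIME case
 * never sets up a restart block with a remaining time.
 */
#include <errno.h>
#include <time.h>

static void sleep_until(const struct timespec *deadline)
{
	while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
			       deadline, NULL) == EINTR)
		;	/* interrupted: retry with the unchanged deadline */
}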
1109
1110/*
1111 * This will restart clock_nanosleep. This is required only by
1112 * compat_clock_nanosleep_restart for now.
1113 */
1114long clock_nanosleep_restart(struct restart_block *restart_block)
1115{
1116 clockid_t which_clock = restart_block->nanosleep.clockid;
1117 struct k_clock *kc = clockid_to_kclock(which_clock);
1118
1119 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1120 return -EINVAL;
1121
1122 return kc->nsleep_restart(restart_block);
1123}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 7ab92b19965a..c19c1d84b6f3 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
 
+#include "timekeeping.h"
+
 extern seqlock_t jiffies_lock;
 
 #define CS_NAME_LEN 32
diff --git a/kernel/time/time.c b/kernel/time/time.c
new file mode 100644
index 000000000000..f0294ba14634
--- /dev/null
+++ b/kernel/time/time.c
@@ -0,0 +1,778 @@
1/*
2 * linux/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * This file contains the interface functions for the various
7 * time related system calls: time, stime, gettimeofday, settimeofday,
8 * adjtime
9 */
10/*
11 * Modification history kernel/time.c
12 *
13 * 1993-09-02 Philip Gladstone
14 * Created file with time related functions from sched/core.c and adjtimex()
15 * 1993-10-08 Torsten Duwe
16 * adjtime interface update and CMOS clock write code
17 * 1995-08-13 Torsten Duwe
18 * kernel PLL updated to 1994-12-13 specs (rfc-1589)
19 * 1999-01-16 Ulrich Windl
20 * Introduced error checking for many cases in adjtimex().
21 * Updated NTP code according to technical memorandum Jan '96
22 * "A Kernel Model for Precision Timekeeping" by Dave Mills
23 * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
24 * (Even though the technical memorandum forbids it)
25 * 2004-07-14 Christoph Lameter
26 * Added getnstimeofday to allow the posix timer functions to return
27 * with nanosecond accuracy
28 */
29
30#include <linux/export.h>
31#include <linux/timex.h>
32#include <linux/capability.h>
33#include <linux/timekeeper_internal.h>
34#include <linux/errno.h>
35#include <linux/syscalls.h>
36#include <linux/security.h>
37#include <linux/fs.h>
38#include <linux/math64.h>
39#include <linux/ptrace.h>
40
41#include <asm/uaccess.h>
42#include <asm/unistd.h>
43
44#include "timeconst.h"
45#include "timekeeping.h"
46
47/*
48 * The timezone where the local system is located. Used as a default by some
49 * programs that obtain this value by using gettimeofday.
50 */
51struct timezone sys_tz;
52
53EXPORT_SYMBOL(sys_tz);
54
55#ifdef __ARCH_WANT_SYS_TIME
56
57/*
58 * sys_time() can be implemented in user-level using
59 * sys_gettimeofday(). Is this for backwards compatibility? If so,
60 * why not move it into the appropriate arch directory (for those
61 * architectures that need it).
62 */
63SYSCALL_DEFINE1(time, time_t __user *, tloc)
64{
65 time_t i = get_seconds();
66
67 if (tloc) {
68 if (put_user(i,tloc))
69 return -EFAULT;
70 }
71 force_successful_syscall_return();
72 return i;
73}
74
75/*
76 * sys_stime() can be implemented in user-level using
77 * sys_settimeofday(). Is this for backwards compatibility? If so,
78 * why not move it into the appropriate arch directory (for those
79 * architectures that need it).
80 */
81
82SYSCALL_DEFINE1(stime, time_t __user *, tptr)
83{
84 struct timespec tv;
85 int err;
86
87 if (get_user(tv.tv_sec, tptr))
88 return -EFAULT;
89
90 tv.tv_nsec = 0;
91
92 err = security_settime(&tv, NULL);
93 if (err)
94 return err;
95
96 do_settimeofday(&tv);
97 return 0;
98}
99
100#endif /* __ARCH_WANT_SYS_TIME */
101
102SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
103 struct timezone __user *, tz)
104{
105 if (likely(tv != NULL)) {
106 struct timeval ktv;
107 do_gettimeofday(&ktv);
108 if (copy_to_user(tv, &ktv, sizeof(ktv)))
109 return -EFAULT;
110 }
111 if (unlikely(tz != NULL)) {
112 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
113 return -EFAULT;
114 }
115 return 0;
116}
117
118/*
119 * Indicates if there is an offset between the system clock and the hardware
120 * clock/persistent clock/rtc.
121 */
122int persistent_clock_is_local;
123
124/*
125 * Adjust the time obtained from the CMOS to be UTC time instead of
126 * local time.
127 *
128 * This is ugly, but preferable to the alternatives. Otherwise we
129 * would either need to write a program to do it in /etc/rc (and risk
130 * confusion if the program gets run more than once; it would also be
131 * hard to make the program warp the clock precisely n hours) or
132 * compile in the timezone information into the kernel. Bad, bad....
133 *
134 * - TYT, 1992-01-01
135 *
136 * The best thing to do is to keep the CMOS clock in universal time (UTC)
137 * as real UNIX machines always do it. This avoids all headaches about
138 * daylight saving times and warping kernel clocks.
139 */
140static inline void warp_clock(void)
141{
142 if (sys_tz.tz_minuteswest != 0) {
143 struct timespec adjust;
144
145 persistent_clock_is_local = 1;
146 adjust.tv_sec = sys_tz.tz_minuteswest * 60;
147 adjust.tv_nsec = 0;
148 timekeeping_inject_offset(&adjust);
149 }
150}
151
152/*
153 * In case for some reason the CMOS clock has not already been running
154 * in UTC, but in some local time: The first time we set the timezone,
155 * we will warp the clock so that it is ticking UTC time instead of
156 * local time. Presumably, if someone is setting the timezone then we
157 * are running in an environment where the programs understand about
158 * timezones. This should be done at boot time in the /etc/rc script,
159 * as soon as possible, so that the clock can be set right. Otherwise,
160 * various programs will get confused when the clock gets warped.
161 */
162
163int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
164{
165 static int firsttime = 1;
166 int error = 0;
167
168 if (tv && !timespec_valid(tv))
169 return -EINVAL;
170
171 error = security_settime(tv, tz);
172 if (error)
173 return error;
174
175 if (tz) {
176 sys_tz = *tz;
177 update_vsyscall_tz();
178 if (firsttime) {
179 firsttime = 0;
180 if (!tv)
181 warp_clock();
182 }
183 }
184 if (tv)
185 return do_settimeofday(tv);
186 return 0;
187}
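/*
 * User-space sketch of the warp described above (illustrative only,
 * needs CAP_SYS_TIME): the first settimeofday() call since boot that
 * supplies a timezone but no time makes the kernel shift the clock by
 * tz_minuteswest, on the assumption that the RTC was running local time.
 */
#include <sys/time.h>

static int declare_rtc_is_local(int minutes_west_of_utc)
{
	struct timezone tz = { .tz_minuteswest = minutes_west_of_utc };

	return settimeofday(NULL, &tz);
}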
188
189SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
190 struct timezone __user *, tz)
191{
192 struct timeval user_tv;
193 struct timespec new_ts;
194 struct timezone new_tz;
195
196 if (tv) {
197 if (copy_from_user(&user_tv, tv, sizeof(*tv)))
198 return -EFAULT;
199 new_ts.tv_sec = user_tv.tv_sec;
200 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
201 }
202 if (tz) {
203 if (copy_from_user(&new_tz, tz, sizeof(*tz)))
204 return -EFAULT;
205 }
206
207 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
208}
209
210SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
211{
212 struct timex txc; /* Local copy of parameter */
213 int ret;
214
215 /* Copy the user data space into the kernel copy
216 * structure. But bear in mind that the structures
217 * may change
218 */
219 if(copy_from_user(&txc, txc_p, sizeof(struct timex)))
220 return -EFAULT;
221 ret = do_adjtimex(&txc);
222 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
223}
224
225/**
226 * current_fs_time - Return FS time
227 * @sb: Superblock.
228 *
229 * Return the current time truncated to the time granularity supported by
230 * the fs.
231 */
232struct timespec current_fs_time(struct super_block *sb)
233{
234 struct timespec now = current_kernel_time();
235 return timespec_trunc(now, sb->s_time_gran);
236}
237EXPORT_SYMBOL(current_fs_time);
238
239/*
240 * Convert jiffies to milliseconds and back.
241 *
242 * Avoid unnecessary multiplications/divisions in the
243 * two most common HZ cases:
244 */
245unsigned int jiffies_to_msecs(const unsigned long j)
246{
247#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
248 return (MSEC_PER_SEC / HZ) * j;
249#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
250 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
251#else
252# if BITS_PER_LONG == 32
253 return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
254# else
255 return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
256# endif
257#endif
258}
259EXPORT_SYMBOL(jiffies_to_msecs);
260
261unsigned int jiffies_to_usecs(const unsigned long j)
262{
263#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
264 return (USEC_PER_SEC / HZ) * j;
265#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
266 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
267#else
268# if BITS_PER_LONG == 32
269 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
270# else
271 return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
272# endif
273#endif
274}
275EXPORT_SYMBOL(jiffies_to_usecs);
276
277/**
278 * timespec_trunc - Truncate timespec to a granularity
279 * @t: Timespec
280 * @gran: Granularity in ns.
281 *
282 * Truncate a timespec to a granularity. gran must be smaller than a second.
283 * Always rounds down.
284 *
285 * This function should only be used for timestamps returned by
286 * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
287 * it doesn't handle the better resolution of the latter.
288 */
289struct timespec timespec_trunc(struct timespec t, unsigned gran)
290{
291 /*
292 * Division is pretty slow so avoid it for common cases.
293 * Currently current_kernel_time() never returns better than
294 * jiffies resolution. Exploit that.
295 */
296 if (gran <= jiffies_to_usecs(1) * 1000) {
297 /* nothing */
298 } else if (gran == 1000000000) {
299 t.tv_nsec = 0;
300 } else {
301 t.tv_nsec -= t.tv_nsec % gran;
302 }
303 return t;
304}
305EXPORT_SYMBOL(timespec_trunc);
306
307/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
308 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
309 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
310 *
311 * [For the Julian calendar (which was used in Russia before 1917,
312 * Britain & colonies before 1752, anywhere else before 1582,
313 * and is still in use by some communities) leave out the
314 * -year/100+year/400 terms, and add 10.]
315 *
316 * This algorithm was first published by Gauss (I think).
317 *
318 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
319 * machines where long is 32-bit! (However, as time_t is signed, we
320 * will already get problems at other places on 2038-01-19 03:14:08)
321 */
322unsigned long
323mktime(const unsigned int year0, const unsigned int mon0,
324 const unsigned int day, const unsigned int hour,
325 const unsigned int min, const unsigned int sec)
326{
327 unsigned int mon = mon0, year = year0;
328
329 /* 1..12 -> 11,12,1..10 */
330 if (0 >= (int) (mon -= 2)) {
331 mon += 12; /* Puts Feb last since it has leap day */
332 year -= 1;
333 }
334
335 return ((((unsigned long)
336 (year/4 - year/100 + year/400 + 367*mon/12 + day) +
337 year*365 - 719499
338 )*24 + hour /* now have hours */
339 )*60 + min /* now have minutes */
340 )*60 + sec; /* finally seconds */
341}
342
343EXPORT_SYMBOL(mktime);
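/*
 * Spot checks for the formula above (illustrative):
 *
 *	mktime(1970,  1,  1, 0, 0, 0) == 0		(the epoch)
 *	mktime(1970,  1,  2, 0, 0, 0) == 86400		(one day later)
 *	mktime(2000,  3,  1, 0, 0, 0) == 951868800	(leap day counted,
 *							 Feb shifted to "month 12")
 */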
344
345/**
346 * set_normalized_timespec - set timespec sec and nsec parts and normalize
347 *
348 * @ts: pointer to timespec variable to be set
349 * @sec: seconds to set
350 * @nsec: nanoseconds to set
351 *
352 * Set seconds and nanoseconds field of a timespec variable and
353 * normalize to the timespec storage format
354 *
355 * Note: The tv_nsec part is always in the range of
356 * 0 <= tv_nsec < NSEC_PER_SEC
357 * For negative values only the tv_sec field is negative !
358 */
359void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
360{
361 while (nsec >= NSEC_PER_SEC) {
362 /*
363 * The following asm() prevents the compiler from
364 * optimising this loop into a modulo operation. See
365 * also __iter_div_u64_rem() in include/linux/time.h
366 */
367 asm("" : "+rm"(nsec));
368 nsec -= NSEC_PER_SEC;
369 ++sec;
370 }
371 while (nsec < 0) {
372 asm("" : "+rm"(nsec));
373 nsec += NSEC_PER_SEC;
374 --sec;
375 }
376 ts->tv_sec = sec;
377 ts->tv_nsec = nsec;
378}
379EXPORT_SYMBOL(set_normalized_timespec);
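/*
 * Normalization examples (illustrative): an out-of-range input such as
 * { .tv_sec = 1, .tv_nsec = 1500000000 } becomes { 2, 500000000 }, and
 * { .tv_sec = 0, .tv_nsec = -300000000 } becomes { -1, 700000000 },
 * i.e. only tv_sec goes negative, as noted above.
 */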
380
381/**
382 * ns_to_timespec - Convert nanoseconds to timespec
383 * @nsec: the nanoseconds value to be converted
384 *
385 * Returns the timespec representation of the nsec parameter.
386 */
387struct timespec ns_to_timespec(const s64 nsec)
388{
389 struct timespec ts;
390 s32 rem;
391
392 if (!nsec)
393 return (struct timespec) {0, 0};
394
395 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
396 if (unlikely(rem < 0)) {
397 ts.tv_sec--;
398 rem += NSEC_PER_SEC;
399 }
400 ts.tv_nsec = rem;
401
402 return ts;
403}
404EXPORT_SYMBOL(ns_to_timespec);
405
406/**
407 * ns_to_timeval - Convert nanoseconds to timeval
408 * @nsec: the nanoseconds value to be converted
409 *
410 * Returns the timeval representation of the nsec parameter.
411 */
412struct timeval ns_to_timeval(const s64 nsec)
413{
414 struct timespec ts = ns_to_timespec(nsec);
415 struct timeval tv;
416
417 tv.tv_sec = ts.tv_sec;
418 tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000;
419
420 return tv;
421}
422EXPORT_SYMBOL(ns_to_timeval);
423
424#if BITS_PER_LONG == 32
425/**
426 * set_normalized_timespec64 - set timespec64 sec and nsec parts and normalize
427 *
428 * @ts: pointer to timespec64 variable to be set
429 * @sec: seconds to set
430 * @nsec: nanoseconds to set
431 *
432 * Set seconds and nanoseconds field of a timespec64 variable and
433 * normalize to the timespec64 storage format
434 *
435 * Note: The tv_nsec part is always in the range of
436 * 0 <= tv_nsec < NSEC_PER_SEC
437 * For negative values only the tv_sec field is negative !
438 */
439void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
440{
441 while (nsec >= NSEC_PER_SEC) {
442 /*
443 * The following asm() prevents the compiler from
444 * optimising this loop into a modulo operation. See
445 * also __iter_div_u64_rem() in include/linux/time.h
446 */
447 asm("" : "+rm"(nsec));
448 nsec -= NSEC_PER_SEC;
449 ++sec;
450 }
451 while (nsec < 0) {
452 asm("" : "+rm"(nsec));
453 nsec += NSEC_PER_SEC;
454 --sec;
455 }
456 ts->tv_sec = sec;
457 ts->tv_nsec = nsec;
458}
459EXPORT_SYMBOL(set_normalized_timespec64);
460
461/**
462 * ns_to_timespec64 - Convert nanoseconds to timespec64
463 * @nsec: the nanoseconds value to be converted
464 *
465 * Returns the timespec64 representation of the nsec parameter.
466 */
467struct timespec64 ns_to_timespec64(const s64 nsec)
468{
469 struct timespec64 ts;
470 s32 rem;
471
472 if (!nsec)
473 return (struct timespec64) {0, 0};
474
475 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
476 if (unlikely(rem < 0)) {
477 ts.tv_sec--;
478 rem += NSEC_PER_SEC;
479 }
480 ts.tv_nsec = rem;
481
482 return ts;
483}
484EXPORT_SYMBOL(ns_to_timespec64);
485#endif
486/*
487 * When we convert to jiffies then we interpret incoming values
488 * the following way:
489 *
490 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
491 *
492 * - 'too large' values [that would result in larger than
493 * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
494 *
495 * - all other values are converted to jiffies by either multiplying
496 * the input value by a factor or dividing it with a factor
497 *
498 * We must also be careful about 32-bit overflows.
499 */
500unsigned long msecs_to_jiffies(const unsigned int m)
501{
502 /*
503 * Negative value, means infinite timeout:
504 */
505 if ((int)m < 0)
506 return MAX_JIFFY_OFFSET;
507
508#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
509 /*
510 * HZ is equal to or smaller than 1000, and 1000 is a nice
511 * round multiple of HZ, divide with the factor between them,
512 * but round upwards:
513 */
514 return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
515#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
516 /*
517 * HZ is larger than 1000, and HZ is a nice round multiple of
518 * 1000 - simply multiply with the factor between them.
519 *
520 * But first make sure the multiplication result cannot
521 * overflow:
522 */
523 if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
524 return MAX_JIFFY_OFFSET;
525
526 return m * (HZ / MSEC_PER_SEC);
527#else
528 /*
529 * Generic case - multiply, round and divide. But first
530 * check that if we are doing a net multiplication, that
531 * we wouldn't overflow:
532 */
533 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
534 return MAX_JIFFY_OFFSET;
535
536 return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
537 >> MSEC_TO_HZ_SHR32;
538#endif
539}
540EXPORT_SYMBOL(msecs_to_jiffies);
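/*
 * Illustration (assuming HZ=250, a 4ms tick): msecs_to_jiffies(1) rounds
 * up to 1 jiffy, msecs_to_jiffies(10) gives 3 jiffies (12ms -- always
 * rounded up, never early), and any negative input is treated as an
 * infinite timeout (MAX_JIFFY_OFFSET).
 */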
541
542unsigned long usecs_to_jiffies(const unsigned int u)
543{
544 if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
545 return MAX_JIFFY_OFFSET;
546#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
547 return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
548#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
549 return u * (HZ / USEC_PER_SEC);
550#else
551 return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
552 >> USEC_TO_HZ_SHR32;
553#endif
554}
555EXPORT_SYMBOL(usecs_to_jiffies);
556
557/*
558 * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
559 * that a remainder subtract here would not do the right thing as the
560 * resolution values don't fall on second boundaries. I.e. the line:
561 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
562 *
563 * Rather, we just shift the bits off the right.
564 *
565 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
566 * value to a scaled second value.
567 */
568unsigned long
569timespec_to_jiffies(const struct timespec *value)
570{
571 unsigned long sec = value->tv_sec;
572 long nsec = value->tv_nsec + TICK_NSEC - 1;
573
574 if (sec >= MAX_SEC_IN_JIFFIES){
575 sec = MAX_SEC_IN_JIFFIES;
576 nsec = 0;
577 }
578 return (((u64)sec * SEC_CONVERSION) +
579 (((u64)nsec * NSEC_CONVERSION) >>
580 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
581
582}
583EXPORT_SYMBOL(timespec_to_jiffies);
584
585void
586jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
587{
588 /*
589 * Convert jiffies to nanoseconds and separate with
590 * one divide.
591 */
592 u32 rem;
593 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
594 NSEC_PER_SEC, &rem);
595 value->tv_nsec = rem;
596}
597EXPORT_SYMBOL(jiffies_to_timespec);
598
599/* Same for "timeval"
600 *
601 * Well, almost. The problem here is that the real system resolution is
602 * in nanoseconds and the value being converted is in micro seconds.
603 * Also for some machines (those that use HZ = 1024, in particular),
604 * there is a LARGE error in the tick size in microseconds.
605 *
606 * The solution we use is to do the rounding AFTER we convert the
607 * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
608 * Instruction-wise, this should cost only an additional add-with-carry
609 * instruction over the timespec conversion above.
610 */
611unsigned long
612timeval_to_jiffies(const struct timeval *value)
613{
614 unsigned long sec = value->tv_sec;
615 long usec = value->tv_usec;
616
617 if (sec >= MAX_SEC_IN_JIFFIES){
618 sec = MAX_SEC_IN_JIFFIES;
619 usec = 0;
620 }
621 return (((u64)sec * SEC_CONVERSION) +
622 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
623 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
624}
625EXPORT_SYMBOL(timeval_to_jiffies);
626
627void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
628{
629 /*
630 * Convert jiffies to nanoseconds and separate with
631 * one divide.
632 */
633 u32 rem;
634
635 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
636 NSEC_PER_SEC, &rem);
637 value->tv_usec = rem / NSEC_PER_USEC;
638}
639EXPORT_SYMBOL(jiffies_to_timeval);
640
641/*
642 * Convert jiffies/jiffies_64 to clock_t and back.
643 */
644clock_t jiffies_to_clock_t(unsigned long x)
645{
646#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
647# if HZ < USER_HZ
648 return x * (USER_HZ / HZ);
649# else
650 return x / (HZ / USER_HZ);
651# endif
652#else
653 return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
654#endif
655}
656EXPORT_SYMBOL(jiffies_to_clock_t);
657
658unsigned long clock_t_to_jiffies(unsigned long x)
659{
660#if (HZ % USER_HZ)==0
661 if (x >= ~0UL / (HZ / USER_HZ))
662 return ~0UL;
663 return x * (HZ / USER_HZ);
664#else
665 /* Don't worry about loss of precision here .. */
666 if (x >= ~0UL / HZ * USER_HZ)
667 return ~0UL;
668
669 /* .. but do try to contain it here */
670 return div_u64((u64)x * HZ, USER_HZ);
671#endif
672}
673EXPORT_SYMBOL(clock_t_to_jiffies);
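
For a configuration such as HZ = 250 with USER_HZ = 100, both conversions above fall through to their #else branches. A hedged user-space model (hypothetical model_* names, plain division standing in for div_u64) illustrates the round trip, including the truncation in clock_t_to_jiffies():

/* Hypothetical model of the non-divisible branches for HZ = 250, USER_HZ = 100. */
#include <stdio.h>
#include <stdint.h>

#define MODEL_HZ		250ULL
#define MODEL_USER_HZ		100ULL
#define MODEL_NSEC_PER_SEC	1000000000ULL
#define MODEL_TICK_NSEC		((MODEL_NSEC_PER_SEC + MODEL_HZ / 2) / MODEL_HZ)

static uint64_t model_jiffies_to_clock_t(uint64_t x)
{
	/* x * TICK_NSEC / (NSEC_PER_SEC / USER_HZ), as in the #else branch above */
	return x * MODEL_TICK_NSEC / (MODEL_NSEC_PER_SEC / MODEL_USER_HZ);
}

static uint64_t model_clock_t_to_jiffies(uint64_t x)
{
	/* x * HZ / USER_HZ, ignoring the ~0UL overflow clamp */
	return x * MODEL_HZ / MODEL_USER_HZ;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)model_jiffies_to_clock_t(250));	/* 100 */
	printf("%llu\n", (unsigned long long)model_clock_t_to_jiffies(3));	/* 7 (7.5 truncated) */
	return 0;
}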
674
675u64 jiffies_64_to_clock_t(u64 x)
676{
677#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
678# if HZ < USER_HZ
679 x = div_u64(x * USER_HZ, HZ);
680# elif HZ > USER_HZ
681 x = div_u64(x, HZ / USER_HZ);
682# else
683 /* Nothing to do */
684# endif
685#else
686 /*
687 * There are better ways that don't overflow early,
688 * but even this doesn't overflow in hundreds of years
689 * in 64 bits, so..
690 */
691 x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
692#endif
693 return x;
694}
695EXPORT_SYMBOL(jiffies_64_to_clock_t);
696
697u64 nsec_to_clock_t(u64 x)
698{
699#if (NSEC_PER_SEC % USER_HZ) == 0
700 return div_u64(x, NSEC_PER_SEC / USER_HZ);
701#elif (USER_HZ % 512) == 0
702 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
703#else
704 /*
705 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
706 * overflow after 64.99 years.
707 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
708 */
709 return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
710#endif
711}
712
713/**
714 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
715 *
716 * @n: nsecs in u64
717 *
718 * Unlike {m,u}secs_to_jiffies, the type of the input is not unsigned int but u64.
719 * It also doesn't return MAX_JIFFY_OFFSET, since this function is designed
720 * for the scheduler, not for use in device drivers to calculate timeout values.
721 *
722 * note:
723 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
724 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
725 */
726u64 nsecs_to_jiffies64(u64 n)
727{
728#if (NSEC_PER_SEC % HZ) == 0
729 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
730 return div_u64(n, NSEC_PER_SEC / HZ);
731#elif (HZ % 512) == 0
732 /* overflow after 292 years if HZ = 1024 */
733 return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
734#else
735 /*
736 * Generic case - optimized for cases where HZ is a multiple of 3.
737 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
738 */
739 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
740#endif
741}
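
In the (HZ % 512) == 0 branch above, dividing both numerator and denominator by 512 keeps the intermediate n * HZ / 512 inside 64 bits for roughly 292 years of nanoseconds while leaving the result exact for HZ = 1024. A quick stand-alone check, with hypothetical model_* names:

/* Hypothetical check of the (HZ % 512) == 0 branch for HZ = 1024. */
#include <stdio.h>
#include <stdint.h>

#define MODEL_HZ		1024ULL
#define MODEL_NSEC_PER_SEC	1000000000ULL	/* 1953125 * 512 */

static uint64_t model_nsecs_to_jiffies64(uint64_t n)
{
	/* n * HZ / 512 stays in 64 bits until n is about 2^63 ns (~292 years). */
	return (n * MODEL_HZ / 512) / (MODEL_NSEC_PER_SEC / 512);
}

int main(void)
{
	/* One second of nanoseconds maps to exactly HZ jiffies. */
	printf("%llu\n", (unsigned long long)model_nsecs_to_jiffies64(MODEL_NSEC_PER_SEC));
	return 0;
}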
742
743/**
744 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
745 *
746 * @n: nsecs in u64
747 *
748 * Unlike {m,u}secs_to_jiffies, the type of the input is not unsigned int but u64.
749 * It also doesn't return MAX_JIFFY_OFFSET, since this function is designed
750 * for the scheduler, not for use in device drivers to calculate timeout values.
751 *
752 * note:
753 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
754 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
755 */
756unsigned long nsecs_to_jiffies(u64 n)
757{
758 return (unsigned long)nsecs_to_jiffies64(n);
759}
760EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
761
762/*
763 * Add two timespec values and do a safety check for overflow.
764 * It's assumed that both values are valid (>= 0)
765 */
766struct timespec timespec_add_safe(const struct timespec lhs,
767 const struct timespec rhs)
768{
769 struct timespec res;
770
771 set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
772 lhs.tv_nsec + rhs.tv_nsec);
773
774 if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
775 res.tv_sec = TIME_T_MAX;
776
777 return res;
778}
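
The overflow test relies on the usual wrap-around property: if the addition wrapped, the sum compares smaller than either operand, and the result is clamped to TIME_T_MAX. The sketch below illustrates that test with unsigned 32-bit arithmetic, where wrap-around is well defined; the real tv_sec is a signed time_t and is normalized by set_normalized_timespec() first:

/* Hypothetical demonstration of the "sum smaller than an operand" overflow test. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t lhs = UINT32_MAX - 5, rhs = 10;
	uint32_t sum = lhs + rhs;		/* wraps around to 4 */

	if (sum < lhs || sum < rhs)
		sum = UINT32_MAX;		/* clamp, as timespec_add_safe() does */

	printf("%u\n", (unsigned)sum);		/* prints 4294967295 */
	return 0;
}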
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
new file mode 100644
index 000000000000..511bdf2cafda
--- /dev/null
+++ b/kernel/time/timeconst.bc
@@ -0,0 +1,108 @@
1scale=0
2
3define gcd(a,b) {
4 auto t;
5 while (b) {
6 t = b;
7 b = a % b;
8 a = t;
9 }
10 return a;
11}
12
13/* Division by reciprocal multiplication. */
14define fmul(b,n,d) {
15 return (2^b*n+d-1)/d;
16}
17
18/* Adjustment factor when a ceiling value is used. Use as:
19 (imul * n) + ((fmulxx * n + fadjxx) >> xx) */
20define fadj(b,n,d) {
21 auto v;
22 d = d/gcd(n,d);
23 v = 2^b*(d-1)/d;
24 return v;
25}
26
27/* Compute the appropriate mul/adj values as well as a shift count,
28 which brings the mul value into the range 2^(b-1) <= x < 2^b. Such
29 a shift value will be correct in the signed integer range and off
30 by at most one in the upper half of the unsigned range. */
31define fmuls(b,n,d) {
32 auto s, m;
33 for (s = 0; 1; s++) {
34 m = fmul(s,n,d);
35 if (m >= 2^(b-1))
36 return s;
37 }
38 return 0;
39}
40
41define timeconst(hz) {
42 print "/* Automatically generated by kernel/timeconst.bc */\n"
43 print "/* Time conversion constants for HZ == ", hz, " */\n"
44 print "\n"
45
46 print "#ifndef KERNEL_TIMECONST_H\n"
47 print "#define KERNEL_TIMECONST_H\n\n"
48
49 print "#include <linux/param.h>\n"
50 print "#include <linux/types.h>\n\n"
51
52 print "#if HZ != ", hz, "\n"
53 print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
54 print "#endif\n\n"
55
56 if (hz < 2) {
57 print "#error Totally bogus HZ value!\n"
58 } else {
59 s=fmuls(32,1000,hz)
60 obase=16
61 print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
62 print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
63 obase=10
64 print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
65
66 s=fmuls(32,hz,1000)
67 obase=16
68 print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
69 print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
70 obase=10
71 print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
72
73 obase=10
74 cd=gcd(hz,1000)
75 print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
76 print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
77 print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
78 print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
79 print "\n"
80
81 s=fmuls(32,1000000,hz)
82 obase=16
83 print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
84 print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
85 obase=10
86 print "#define HZ_TO_USEC_SHR32\t", s, "\n"
87
88 s=fmuls(32,hz,1000000)
89 obase=16
90 print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
91 print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
92 obase=10
93 print "#define USEC_TO_HZ_SHR32\t", s, "\n"
94
95 obase=10
96 cd=gcd(hz,1000000)
97 print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
98 print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
99 print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
100 print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
101 print "\n"
102
103 print "#endif /* KERNEL_TIMECONST_H */\n"
104 }
105 halt
106}
107
108timeconst(hz)
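
The constants emitted by this script implement division by reciprocal multiplication: x * n / d becomes (mul * x + adj) >> shift, with adj chosen so the result rounds up, which is exactly the shape of the (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) >> USEC_TO_HZ_SHR32 expression in usecs_to_jiffies() above. A hedged user-space re-implementation of fmul()/fadj()/fmuls() that checks the identity for HZ = 1000 (model code only, not part of the kernel build):

/* Hypothetical user-space check of the reciprocal-multiplication constants. */
#include <stdio.h>
#include <stdint.h>

static uint64_t gcd(uint64_t a, uint64_t b)
{
	while (b) {
		uint64_t t = b;
		b = a % b;
		a = t;
	}
	return a;
}

/* Same math as fmul()/fadj()/fmuls() in timeconst.bc, using 64-bit intermediates. */
static uint64_t fmul(unsigned int s, uint64_t n, uint64_t d)
{
	return ((1ULL << s) * n + d - 1) / d;
}

static uint64_t fadj(unsigned int s, uint64_t n, uint64_t d)
{
	d /= gcd(n, d);
	return (1ULL << s) * (d - 1) / d;
}

static unsigned int fmuls(unsigned int b, uint64_t n, uint64_t d)
{
	unsigned int s = 0;

	while (fmul(s, n, d) < (1ULL << (b - 1)))
		s++;
	return s;
}

int main(void)
{
	uint64_t hz = 1000, usec = 1500;		/* 1.5 ms */
	unsigned int s = fmuls(32, hz, 1000000);
	uint64_t mul = fmul(s, hz, 1000000);
	uint64_t adj = fadj(s, hz, 1000000);

	/* Same shape as usecs_to_jiffies(): (MUL32 * u + ADJ32) >> SHR32 */
	printf("%llu\n", (unsigned long long)((mul * usec + adj) >> s));
	return 0;
}

Compiled and run, this prints 2, the rounded-up number of HZ=1000 jiffies in 1.5 ms, and the computed mul/shift pair matches the generated USEC_TO_HZ constants for that HZ.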
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 32d8d6aaedb8..f36b02838a47 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,11 +32,34 @@
32#define TK_MIRROR (1 << 1) 32#define TK_MIRROR (1 << 1)
33#define TK_CLOCK_WAS_SET (1 << 2) 33#define TK_CLOCK_WAS_SET (1 << 2)
34 34
35static struct timekeeper timekeeper; 35/*
36 * The most important data for readout fits into a single 64 byte
37 * cache line.
38 */
39static struct {
40 seqcount_t seq;
41 struct timekeeper timekeeper;
42} tk_core ____cacheline_aligned;
43
36static DEFINE_RAW_SPINLOCK(timekeeper_lock); 44static DEFINE_RAW_SPINLOCK(timekeeper_lock);
37static seqcount_t timekeeper_seq;
38static struct timekeeper shadow_timekeeper; 45static struct timekeeper shadow_timekeeper;
39 46
47/**
48 * struct tk_fast - NMI safe timekeeper
49 * @seq: Sequence counter for protecting updates. The lowest bit
50 * is the index for the tk_read_base array
51 * @base: tk_read_base array. Access is indexed by the lowest bit of
52 * @seq.
53 *
54 * See @update_fast_timekeeper() below.
55 */
56struct tk_fast {
57 seqcount_t seq;
58 struct tk_read_base base[2];
59};
60
61static struct tk_fast tk_fast_mono ____cacheline_aligned;
62
40/* flag for if timekeeping is suspended */ 63/* flag for if timekeeping is suspended */
41int __read_mostly timekeeping_suspended; 64int __read_mostly timekeeping_suspended;
42 65
@@ -45,49 +68,54 @@ bool __read_mostly persistent_clock_exist = false;
45 68
46static inline void tk_normalize_xtime(struct timekeeper *tk) 69static inline void tk_normalize_xtime(struct timekeeper *tk)
47{ 70{
48 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { 71 while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
49 tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; 72 tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
50 tk->xtime_sec++; 73 tk->xtime_sec++;
51 } 74 }
52} 75}
53 76
54static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 77static inline struct timespec64 tk_xtime(struct timekeeper *tk)
78{
79 struct timespec64 ts;
80
81 ts.tv_sec = tk->xtime_sec;
82 ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
83 return ts;
84}
85
86static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
55{ 87{
56 tk->xtime_sec = ts->tv_sec; 88 tk->xtime_sec = ts->tv_sec;
57 tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; 89 tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
58} 90}
59 91
60static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) 92static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
61{ 93{
62 tk->xtime_sec += ts->tv_sec; 94 tk->xtime_sec += ts->tv_sec;
63 tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; 95 tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
64 tk_normalize_xtime(tk); 96 tk_normalize_xtime(tk);
65} 97}
66 98
67static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) 99static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
68{ 100{
69 struct timespec tmp; 101 struct timespec64 tmp;
70 102
71 /* 103 /*
72 * Verify consistency of: offset_real = -wall_to_monotonic 104 * Verify consistency of: offset_real = -wall_to_monotonic
73 * before modifying anything 105 * before modifying anything
74 */ 106 */
75 set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, 107 set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
76 -tk->wall_to_monotonic.tv_nsec); 108 -tk->wall_to_monotonic.tv_nsec);
77 WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); 109 WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64);
78 tk->wall_to_monotonic = wtm; 110 tk->wall_to_monotonic = wtm;
79 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 111 set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
80 tk->offs_real = timespec_to_ktime(tmp); 112 tk->offs_real = timespec64_to_ktime(tmp);
81 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); 113 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
82} 114}
83 115
84static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 116static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
85{ 117{
86 /* Verify consistency before modifying */ 118 tk->offs_boot = ktime_add(tk->offs_boot, delta);
87 WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
88
89 tk->total_sleep_time = t;
90 tk->offs_boot = timespec_to_ktime(t);
91} 119}
92 120
93/** 121/**
@@ -107,9 +135,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
107 u64 tmp, ntpinterval; 135 u64 tmp, ntpinterval;
108 struct clocksource *old_clock; 136 struct clocksource *old_clock;
109 137
110 old_clock = tk->clock; 138 old_clock = tk->tkr.clock;
111 tk->clock = clock; 139 tk->tkr.clock = clock;
112 tk->cycle_last = clock->cycle_last = clock->read(clock); 140 tk->tkr.read = clock->read;
141 tk->tkr.mask = clock->mask;
142 tk->tkr.cycle_last = tk->tkr.read(clock);
113 143
114 /* Do the ns -> cycle conversion first, using original mult */ 144 /* Do the ns -> cycle conversion first, using original mult */
115 tmp = NTP_INTERVAL_LENGTH; 145 tmp = NTP_INTERVAL_LENGTH;
@@ -133,78 +163,212 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
133 if (old_clock) { 163 if (old_clock) {
134 int shift_change = clock->shift - old_clock->shift; 164 int shift_change = clock->shift - old_clock->shift;
135 if (shift_change < 0) 165 if (shift_change < 0)
136 tk->xtime_nsec >>= -shift_change; 166 tk->tkr.xtime_nsec >>= -shift_change;
137 else 167 else
138 tk->xtime_nsec <<= shift_change; 168 tk->tkr.xtime_nsec <<= shift_change;
139 } 169 }
140 tk->shift = clock->shift; 170 tk->tkr.shift = clock->shift;
141 171
142 tk->ntp_error = 0; 172 tk->ntp_error = 0;
143 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; 173 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
174 tk->ntp_tick = ntpinterval << tk->ntp_error_shift;
144 175
145 /* 176 /*
146 * The timekeeper keeps its own mult values for the currently 177 * The timekeeper keeps its own mult values for the currently
147 * active clocksource. These values will be adjusted via NTP 178
148 * to counteract clock drifting. 179 * to counteract clock drifting.
149 */ 180 */
150 tk->mult = clock->mult; 181 tk->tkr.mult = clock->mult;
182 tk->ntp_err_mult = 0;
151} 183}
152 184
153/* Timekeeper helper functions. */ 185/* Timekeeper helper functions. */
154 186
155#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 187#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
156u32 (*arch_gettimeoffset)(void); 188static u32 default_arch_gettimeoffset(void) { return 0; }
157 189u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
158u32 get_arch_timeoffset(void)
159{
160 if (likely(arch_gettimeoffset))
161 return arch_gettimeoffset();
162 return 0;
163}
164#else 190#else
165static inline u32 get_arch_timeoffset(void) { return 0; } 191static inline u32 arch_gettimeoffset(void) { return 0; }
166#endif 192#endif
167 193
168static inline s64 timekeeping_get_ns(struct timekeeper *tk) 194static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
169{ 195{
170 cycle_t cycle_now, cycle_delta; 196 cycle_t cycle_now, delta;
171 struct clocksource *clock;
172 s64 nsec; 197 s64 nsec;
173 198
174 /* read clocksource: */ 199 /* read clocksource: */
175 clock = tk->clock; 200 cycle_now = tkr->read(tkr->clock);
176 cycle_now = clock->read(clock);
177 201
178 /* calculate the delta since the last update_wall_time: */ 202 /* calculate the delta since the last update_wall_time: */
179 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 203 delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
180 204
181 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 205 nsec = delta * tkr->mult + tkr->xtime_nsec;
182 nsec >>= tk->shift; 206 nsec >>= tkr->shift;
183 207
184 /* If arch requires, add in get_arch_timeoffset() */ 208 /* If arch requires, add in get_arch_timeoffset() */
185 return nsec + get_arch_timeoffset(); 209 return nsec + arch_gettimeoffset();
186} 210}
187 211
188static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 212static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
189{ 213{
190 cycle_t cycle_now, cycle_delta; 214 struct clocksource *clock = tk->tkr.clock;
191 struct clocksource *clock; 215 cycle_t cycle_now, delta;
192 s64 nsec; 216 s64 nsec;
193 217
194 /* read clocksource: */ 218 /* read clocksource: */
195 clock = tk->clock; 219 cycle_now = tk->tkr.read(clock);
196 cycle_now = clock->read(clock);
197 220
198 /* calculate the delta since the last update_wall_time: */ 221 /* calculate the delta since the last update_wall_time: */
199 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 222 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
200 223
201 /* convert delta to nanoseconds. */ 224 /* convert delta to nanoseconds. */
202 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 225 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
203 226
204 /* If arch requires, add in get_arch_timeoffset() */ 227 /* If arch requires, add in get_arch_timeoffset() */
205 return nsec + get_arch_timeoffset(); 228 return nsec + arch_gettimeoffset();
229}
230
231/**
232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
233 * @tk: The timekeeper from which we take the update
234 * @tkf: The fast timekeeper to update
235 * @tbase: The time base for the fast timekeeper (mono/raw)
236 *
237 * We want to use this from any context including NMI and tracing /
238 * instrumenting the timekeeping code itself.
239 *
240 * So we handle this differently than the other timekeeping accessor
241 * functions which retry when the sequence count has changed. The
242 * update side does:
243 *
244 * smp_wmb(); <- Ensure that the last base[1] update is visible
245 * tkf->seq++;
246 * smp_wmb(); <- Ensure that the seqcount update is visible
247 * update(tkf->base[0], tk);
248 * smp_wmb(); <- Ensure that the base[0] update is visible
249 * tkf->seq++;
250 * smp_wmb(); <- Ensure that the seqcount update is visible
251 * update(tkf->base[1], tk);
252 *
253 * The reader side does:
254 *
255 * do {
256 * seq = tkf->seq;
257 * smp_rmb();
258 * idx = seq & 0x01;
259 * now = now(tkf->base[idx]);
260 * smp_rmb();
261 * } while (seq != tkf->seq)
262 *
263 * As long as we update base[0] readers are forced off to
264 * base[1]. Once base[0] is updated readers are redirected to base[0]
265 * and the base[1] update takes place.
266 *
267 * So if an NMI hits the update of base[0] then it will use base[1]
268 * which is still consistent. In the worst case this can result in a
269 * slightly wrong timestamp (a few nanoseconds). See
270 * @ktime_get_mono_fast_ns.
271 */
272static void update_fast_timekeeper(struct timekeeper *tk)
273{
274 struct tk_read_base *base = tk_fast_mono.base;
275
276 /* Force readers off to base[1] */
277 raw_write_seqcount_latch(&tk_fast_mono.seq);
278
279 /* Update base[0] */
280 memcpy(base, &tk->tkr, sizeof(*base));
281
282 /* Force readers back to base[0] */
283 raw_write_seqcount_latch(&tk_fast_mono.seq);
284
285 /* Update base[1] */
286 memcpy(base + 1, base, sizeof(*base));
206} 287}
207 288
289/**
290 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
291 *
292 * This timestamp is not guaranteed to be monotonic across an update.
293 * The timestamp is calculated by:
294 *
295 * now = base_mono + clock_delta * slope
296 *
297 * So if the update lowers the slope, readers who are forced to the
298 * not yet updated second array are still using the old steeper slope.
299 *
300 * tmono
301 * ^
302 * | o n
303 * | o n
304 * | u
305 * | o
306 * |o
307 * |12345678---> reader order
308 *
309 * o = old slope
310 * u = update
311 * n = new slope
312 *
313 * So reader 6 will observe time going backwards versus reader 5.
314 *
315 * While other CPUs are likely to be able to observe that, the only way
316 * for a CPU local observation is when an NMI hits in the middle of
317 * the update. Timestamps taken from that NMI context might be ahead
318 * of the following timestamps. Callers need to be aware of that and
319 * deal with it.
320 */
321u64 notrace ktime_get_mono_fast_ns(void)
322{
323 struct tk_read_base *tkr;
324 unsigned int seq;
325 u64 now;
326
327 do {
328 seq = raw_read_seqcount(&tk_fast_mono.seq);
329 tkr = tk_fast_mono.base + (seq & 0x01);
330 now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
331
332 } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
333 return now;
334}
335EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
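
The protocol described in the comment above can be condensed into a compilable sketch: the writer bumps the sequence count before touching each half, so a reader indexing with seq & 1 always lands on the half that is not currently being rewritten. This is a single-threaded user-space approximation with hypothetical model_* names; the real code depends on raw_write_seqcount_latch() and the smp_wmb()/smp_rmb() placement shown above, which is only marked in comments here:

/* Hypothetical model of the seqcount latch; barriers are marked, not implemented. */
#include <stdio.h>
#include <stdint.h>

struct model_base {
	uint64_t base_mono;	/* accumulated nanoseconds */
	uint64_t mult;		/* cycles -> nanoseconds slope (shift omitted) */
};

struct model_fast {
	unsigned int seq;
	struct model_base base[2];
};

static void model_update(struct model_fast *f, const struct model_base *b)
{
	f->seq++;		/* kernel: raw_write_seqcount_latch(); readers move to base[1] */
	f->base[0] = *b;
	f->seq++;		/* readers move back to base[0] */
	f->base[1] = *b;
}

static uint64_t model_read(struct model_fast *f, uint64_t cycle_delta)
{
	unsigned int seq;
	uint64_t now;

	do {
		seq = f->seq;	/* kernel: raw_read_seqcount(), followed by smp_rmb() */
		now = f->base[seq & 1].base_mono + cycle_delta * f->base[seq & 1].mult;
	} while (seq != f->seq);	/* kernel: read_seqcount_retry() */

	return now;
}

int main(void)
{
	struct model_fast f = { 0, { { 0, 1 }, { 0, 1 } } };
	struct model_base b = { 1000000, 2 };

	model_update(&f, &b);
	printf("%llu\n", (unsigned long long)model_read(&f, 10));	/* 1000020 */
	return 0;
}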
336
337#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
338
339static inline void update_vsyscall(struct timekeeper *tk)
340{
341 struct timespec xt;
342
343 xt = timespec64_to_timespec(tk_xtime(tk));
344 update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult,
345 tk->tkr.cycle_last);
346}
347
348static inline void old_vsyscall_fixup(struct timekeeper *tk)
349{
350 s64 remainder;
351
352 /*
353 * Store only full nanoseconds into xtime_nsec after rounding
354 * it up and add the remainder to the error difference.
355 * XXX - This is necessary to avoid small 1ns inconsistencies caused
356 * by truncating the remainder in vsyscalls. However, it causes
357 * additional work to be done in timekeeping_adjust(). Once
358 * the vsyscall implementations are converted to use xtime_nsec
359 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
360 * users are removed, this can be killed.
361 */
362 remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
363 tk->tkr.xtime_nsec -= remainder;
364 tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
365 tk->ntp_error += remainder << tk->ntp_error_shift;
366 tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
367}
368#else
369#define old_vsyscall_fixup(tk)
370#endif
371
208static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 372static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
209 373
210static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) 374static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
@@ -217,7 +381,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
217 */ 381 */
218int pvclock_gtod_register_notifier(struct notifier_block *nb) 382int pvclock_gtod_register_notifier(struct notifier_block *nb)
219{ 383{
220 struct timekeeper *tk = &timekeeper; 384 struct timekeeper *tk = &tk_core.timekeeper;
221 unsigned long flags; 385 unsigned long flags;
222 int ret; 386 int ret;
223 387
@@ -247,6 +411,29 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
247} 411}
248EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 412EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
249 413
414/*
415 * Update the ktime_t based scalar nsec members of the timekeeper
416 */
417static inline void tk_update_ktime_data(struct timekeeper *tk)
418{
419 s64 nsec;
420
421 /*
422 * The xtime based monotonic readout is:
423 * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
424 * The ktime based monotonic readout is:
425 * nsec = base_mono + now();
426 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
427 */
428 nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
429 nsec *= NSEC_PER_SEC;
430 nsec += tk->wall_to_monotonic.tv_nsec;
431 tk->tkr.base_mono = ns_to_ktime(nsec);
432
433 /* Update the monotonic raw base */
434 tk->base_raw = timespec64_to_ktime(tk->raw_time);
435}
436
250/* must hold timekeeper_lock */ 437/* must hold timekeeper_lock */
251static void timekeeping_update(struct timekeeper *tk, unsigned int action) 438static void timekeeping_update(struct timekeeper *tk, unsigned int action)
252{ 439{
@@ -257,8 +444,13 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
257 update_vsyscall(tk); 444 update_vsyscall(tk);
258 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); 445 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
259 446
447 tk_update_ktime_data(tk);
448
260 if (action & TK_MIRROR) 449 if (action & TK_MIRROR)
261 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); 450 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
451 sizeof(tk_core.timekeeper));
452
453 update_fast_timekeeper(tk);
262} 454}
263 455
264/** 456/**
@@ -270,49 +462,48 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
270 */ 462 */
271static void timekeeping_forward_now(struct timekeeper *tk) 463static void timekeeping_forward_now(struct timekeeper *tk)
272{ 464{
273 cycle_t cycle_now, cycle_delta; 465 struct clocksource *clock = tk->tkr.clock;
274 struct clocksource *clock; 466 cycle_t cycle_now, delta;
275 s64 nsec; 467 s64 nsec;
276 468
277 clock = tk->clock; 469 cycle_now = tk->tkr.read(clock);
278 cycle_now = clock->read(clock); 470 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
279 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 471 tk->tkr.cycle_last = cycle_now;
280 tk->cycle_last = clock->cycle_last = cycle_now;
281 472
282 tk->xtime_nsec += cycle_delta * tk->mult; 473 tk->tkr.xtime_nsec += delta * tk->tkr.mult;
283 474
284 /* If arch requires, add in get_arch_timeoffset() */ 475 /* If arch requires, add in get_arch_timeoffset() */
285 tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; 476 tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
286 477
287 tk_normalize_xtime(tk); 478 tk_normalize_xtime(tk);
288 479
289 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 480 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
290 timespec_add_ns(&tk->raw_time, nsec); 481 timespec64_add_ns(&tk->raw_time, nsec);
291} 482}
292 483
293/** 484/**
294 * __getnstimeofday - Returns the time of day in a timespec. 485 * __getnstimeofday64 - Returns the time of day in a timespec64.
295 * @ts: pointer to the timespec to be set 486 * @ts: pointer to the timespec to be set
296 * 487 *
297 * Updates the time of day in the timespec. 488 * Updates the time of day in the timespec.
298 * Returns 0 on success, or -ve when suspended (timespec will be undefined). 489 * Returns 0 on success, or -ve when suspended (timespec will be undefined).
299 */ 490 */
300int __getnstimeofday(struct timespec *ts) 491int __getnstimeofday64(struct timespec64 *ts)
301{ 492{
302 struct timekeeper *tk = &timekeeper; 493 struct timekeeper *tk = &tk_core.timekeeper;
303 unsigned long seq; 494 unsigned long seq;
304 s64 nsecs = 0; 495 s64 nsecs = 0;
305 496
306 do { 497 do {
307 seq = read_seqcount_begin(&timekeeper_seq); 498 seq = read_seqcount_begin(&tk_core.seq);
308 499
309 ts->tv_sec = tk->xtime_sec; 500 ts->tv_sec = tk->xtime_sec;
310 nsecs = timekeeping_get_ns(tk); 501 nsecs = timekeeping_get_ns(&tk->tkr);
311 502
312 } while (read_seqcount_retry(&timekeeper_seq, seq)); 503 } while (read_seqcount_retry(&tk_core.seq, seq));
313 504
314 ts->tv_nsec = 0; 505 ts->tv_nsec = 0;
315 timespec_add_ns(ts, nsecs); 506 timespec64_add_ns(ts, nsecs);
316 507
317 /* 508 /*
318 * Do not bail out early, in case there were callers still using 509 * Do not bail out early, in case there were callers still using
@@ -322,116 +513,138 @@ int __getnstimeofday(struct timespec *ts)
322 return -EAGAIN; 513 return -EAGAIN;
323 return 0; 514 return 0;
324} 515}
325EXPORT_SYMBOL(__getnstimeofday); 516EXPORT_SYMBOL(__getnstimeofday64);
326 517
327/** 518/**
328 * getnstimeofday - Returns the time of day in a timespec. 519 * getnstimeofday64 - Returns the time of day in a timespec64.
329 * @ts: pointer to the timespec to be set 520 * @ts: pointer to the timespec to be set
330 * 521 *
331 * Returns the time of day in a timespec (WARN if suspended). 522 * Returns the time of day in a timespec (WARN if suspended).
332 */ 523 */
333void getnstimeofday(struct timespec *ts) 524void getnstimeofday64(struct timespec64 *ts)
334{ 525{
335 WARN_ON(__getnstimeofday(ts)); 526 WARN_ON(__getnstimeofday64(ts));
336} 527}
337EXPORT_SYMBOL(getnstimeofday); 528EXPORT_SYMBOL(getnstimeofday64);
338 529
339ktime_t ktime_get(void) 530ktime_t ktime_get(void)
340{ 531{
341 struct timekeeper *tk = &timekeeper; 532 struct timekeeper *tk = &tk_core.timekeeper;
342 unsigned int seq; 533 unsigned int seq;
343 s64 secs, nsecs; 534 ktime_t base;
535 s64 nsecs;
344 536
345 WARN_ON(timekeeping_suspended); 537 WARN_ON(timekeeping_suspended);
346 538
347 do { 539 do {
348 seq = read_seqcount_begin(&timekeeper_seq); 540 seq = read_seqcount_begin(&tk_core.seq);
349 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 541 base = tk->tkr.base_mono;
350 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; 542 nsecs = timekeeping_get_ns(&tk->tkr);
351 543
352 } while (read_seqcount_retry(&timekeeper_seq, seq)); 544 } while (read_seqcount_retry(&tk_core.seq, seq));
353 /* 545
354 * Use ktime_set/ktime_add_ns to create a proper ktime on 546 return ktime_add_ns(base, nsecs);
355 * 32-bit architectures without CONFIG_KTIME_SCALAR.
356 */
357 return ktime_add_ns(ktime_set(secs, 0), nsecs);
358} 547}
359EXPORT_SYMBOL_GPL(ktime_get); 548EXPORT_SYMBOL_GPL(ktime_get);
360 549
361/** 550static ktime_t *offsets[TK_OFFS_MAX] = {
362 * ktime_get_ts - get the monotonic clock in timespec format 551 [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
363 * @ts: pointer to timespec variable 552 [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
364 * 553 [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
365 * The function calculates the monotonic clock from the realtime 554};
366 * clock and the wall_to_monotonic offset and stores the result 555
367 * in normalized timespec format in the variable pointed to by @ts. 556ktime_t ktime_get_with_offset(enum tk_offsets offs)
368 */
369void ktime_get_ts(struct timespec *ts)
370{ 557{
371 struct timekeeper *tk = &timekeeper; 558 struct timekeeper *tk = &tk_core.timekeeper;
372 struct timespec tomono;
373 s64 nsec;
374 unsigned int seq; 559 unsigned int seq;
560 ktime_t base, *offset = offsets[offs];
561 s64 nsecs;
375 562
376 WARN_ON(timekeeping_suspended); 563 WARN_ON(timekeeping_suspended);
377 564
378 do { 565 do {
379 seq = read_seqcount_begin(&timekeeper_seq); 566 seq = read_seqcount_begin(&tk_core.seq);
380 ts->tv_sec = tk->xtime_sec; 567 base = ktime_add(tk->tkr.base_mono, *offset);
381 nsec = timekeeping_get_ns(tk); 568 nsecs = timekeeping_get_ns(&tk->tkr);
382 tomono = tk->wall_to_monotonic;
383 569
384 } while (read_seqcount_retry(&timekeeper_seq, seq)); 570 } while (read_seqcount_retry(&tk_core.seq, seq));
385 571
386 ts->tv_sec += tomono.tv_sec; 572 return ktime_add_ns(base, nsecs);
387 ts->tv_nsec = 0;
388 timespec_add_ns(ts, nsec + tomono.tv_nsec);
389}
390EXPORT_SYMBOL_GPL(ktime_get_ts);
391 573
574}
575EXPORT_SYMBOL_GPL(ktime_get_with_offset);
392 576
393/** 577/**
394 * timekeeping_clocktai - Returns the TAI time of day in a timespec 578 * ktime_mono_to_any() - convert monotonic time to any other time
395 * @ts: pointer to the timespec to be set 579 * @tmono: time to convert.
396 * 580 * @offs: which offset to use
397 * Returns the time of day in a timespec.
398 */ 581 */
399void timekeeping_clocktai(struct timespec *ts) 582ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
400{ 583{
401 struct timekeeper *tk = &timekeeper; 584 ktime_t *offset = offsets[offs];
402 unsigned long seq; 585 unsigned long seq;
403 u64 nsecs; 586 ktime_t tconv;
404
405 WARN_ON(timekeeping_suspended);
406 587
407 do { 588 do {
408 seq = read_seqcount_begin(&timekeeper_seq); 589 seq = read_seqcount_begin(&tk_core.seq);
590 tconv = ktime_add(tmono, *offset);
591 } while (read_seqcount_retry(&tk_core.seq, seq));
409 592
410 ts->tv_sec = tk->xtime_sec + tk->tai_offset; 593 return tconv;
411 nsecs = timekeeping_get_ns(tk); 594}
595EXPORT_SYMBOL_GPL(ktime_mono_to_any);
412 596
413 } while (read_seqcount_retry(&timekeeper_seq, seq)); 597/**
598 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
599 */
600ktime_t ktime_get_raw(void)
601{
602 struct timekeeper *tk = &tk_core.timekeeper;
603 unsigned int seq;
604 ktime_t base;
605 s64 nsecs;
414 606
415 ts->tv_nsec = 0; 607 do {
416 timespec_add_ns(ts, nsecs); 608 seq = read_seqcount_begin(&tk_core.seq);
609 base = tk->base_raw;
610 nsecs = timekeeping_get_ns_raw(tk);
417 611
418} 612 } while (read_seqcount_retry(&tk_core.seq, seq));
419EXPORT_SYMBOL(timekeeping_clocktai);
420 613
614 return ktime_add_ns(base, nsecs);
615}
616EXPORT_SYMBOL_GPL(ktime_get_raw);
421 617
422/** 618/**
423 * ktime_get_clocktai - Returns the TAI time of day in a ktime 619 * ktime_get_ts64 - get the monotonic clock in timespec64 format
620 * @ts: pointer to timespec variable
424 * 621 *
425 * Returns the time of day in a ktime. 622 * The function calculates the monotonic clock from the realtime
623 * clock and the wall_to_monotonic offset and stores the result
624 * in normalized timespec format in the variable pointed to by @ts.
426 */ 625 */
427ktime_t ktime_get_clocktai(void) 626void ktime_get_ts64(struct timespec64 *ts)
428{ 627{
429 struct timespec ts; 628 struct timekeeper *tk = &tk_core.timekeeper;
629 struct timespec64 tomono;
630 s64 nsec;
631 unsigned int seq;
632
633 WARN_ON(timekeeping_suspended);
430 634
431 timekeeping_clocktai(&ts); 635 do {
432 return timespec_to_ktime(ts); 636 seq = read_seqcount_begin(&tk_core.seq);
637 ts->tv_sec = tk->xtime_sec;
638 nsec = timekeeping_get_ns(&tk->tkr);
639 tomono = tk->wall_to_monotonic;
640
641 } while (read_seqcount_retry(&tk_core.seq, seq));
642
643 ts->tv_sec += tomono.tv_sec;
644 ts->tv_nsec = 0;
645 timespec64_add_ns(ts, nsec + tomono.tv_nsec);
433} 646}
434EXPORT_SYMBOL(ktime_get_clocktai); 647EXPORT_SYMBOL_GPL(ktime_get_ts64);
435 648
436#ifdef CONFIG_NTP_PPS 649#ifdef CONFIG_NTP_PPS
437 650
@@ -446,23 +659,23 @@ EXPORT_SYMBOL(ktime_get_clocktai);
446 */ 659 */
447void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 660void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
448{ 661{
449 struct timekeeper *tk = &timekeeper; 662 struct timekeeper *tk = &tk_core.timekeeper;
450 unsigned long seq; 663 unsigned long seq;
451 s64 nsecs_raw, nsecs_real; 664 s64 nsecs_raw, nsecs_real;
452 665
453 WARN_ON_ONCE(timekeeping_suspended); 666 WARN_ON_ONCE(timekeeping_suspended);
454 667
455 do { 668 do {
456 seq = read_seqcount_begin(&timekeeper_seq); 669 seq = read_seqcount_begin(&tk_core.seq);
457 670
458 *ts_raw = tk->raw_time; 671 *ts_raw = timespec64_to_timespec(tk->raw_time);
459 ts_real->tv_sec = tk->xtime_sec; 672 ts_real->tv_sec = tk->xtime_sec;
460 ts_real->tv_nsec = 0; 673 ts_real->tv_nsec = 0;
461 674
462 nsecs_raw = timekeeping_get_ns_raw(tk); 675 nsecs_raw = timekeeping_get_ns_raw(tk);
463 nsecs_real = timekeeping_get_ns(tk); 676 nsecs_real = timekeeping_get_ns(&tk->tkr);
464 677
465 } while (read_seqcount_retry(&timekeeper_seq, seq)); 678 } while (read_seqcount_retry(&tk_core.seq, seq));
466 679
467 timespec_add_ns(ts_raw, nsecs_raw); 680 timespec_add_ns(ts_raw, nsecs_raw);
468 timespec_add_ns(ts_real, nsecs_real); 681 timespec_add_ns(ts_real, nsecs_real);
@@ -479,9 +692,9 @@ EXPORT_SYMBOL(getnstime_raw_and_real);
479 */ 692 */
480void do_gettimeofday(struct timeval *tv) 693void do_gettimeofday(struct timeval *tv)
481{ 694{
482 struct timespec now; 695 struct timespec64 now;
483 696
484 getnstimeofday(&now); 697 getnstimeofday64(&now);
485 tv->tv_sec = now.tv_sec; 698 tv->tv_sec = now.tv_sec;
486 tv->tv_usec = now.tv_nsec/1000; 699 tv->tv_usec = now.tv_nsec/1000;
487} 700}
@@ -495,15 +708,15 @@ EXPORT_SYMBOL(do_gettimeofday);
495 */ 708 */
496int do_settimeofday(const struct timespec *tv) 709int do_settimeofday(const struct timespec *tv)
497{ 710{
498 struct timekeeper *tk = &timekeeper; 711 struct timekeeper *tk = &tk_core.timekeeper;
499 struct timespec ts_delta, xt; 712 struct timespec64 ts_delta, xt, tmp;
500 unsigned long flags; 713 unsigned long flags;
501 714
502 if (!timespec_valid_strict(tv)) 715 if (!timespec_valid_strict(tv))
503 return -EINVAL; 716 return -EINVAL;
504 717
505 raw_spin_lock_irqsave(&timekeeper_lock, flags); 718 raw_spin_lock_irqsave(&timekeeper_lock, flags);
506 write_seqcount_begin(&timekeeper_seq); 719 write_seqcount_begin(&tk_core.seq);
507 720
508 timekeeping_forward_now(tk); 721 timekeeping_forward_now(tk);
509 722
@@ -511,13 +724,14 @@ int do_settimeofday(const struct timespec *tv)
511 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 724 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
512 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 725 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
513 726
514 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); 727 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
515 728
516 tk_set_xtime(tk, tv); 729 tmp = timespec_to_timespec64(*tv);
730 tk_set_xtime(tk, &tmp);
517 731
518 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 732 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
519 733
520 write_seqcount_end(&timekeeper_seq); 734 write_seqcount_end(&tk_core.seq);
521 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 735 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
522 736
523 /* signal hrtimers about time change */ 737 /* signal hrtimers about time change */
@@ -535,33 +749,35 @@ EXPORT_SYMBOL(do_settimeofday);
535 */ 749 */
536int timekeeping_inject_offset(struct timespec *ts) 750int timekeeping_inject_offset(struct timespec *ts)
537{ 751{
538 struct timekeeper *tk = &timekeeper; 752 struct timekeeper *tk = &tk_core.timekeeper;
539 unsigned long flags; 753 unsigned long flags;
540 struct timespec tmp; 754 struct timespec64 ts64, tmp;
541 int ret = 0; 755 int ret = 0;
542 756
543 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 757 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
544 return -EINVAL; 758 return -EINVAL;
545 759
760 ts64 = timespec_to_timespec64(*ts);
761
546 raw_spin_lock_irqsave(&timekeeper_lock, flags); 762 raw_spin_lock_irqsave(&timekeeper_lock, flags);
547 write_seqcount_begin(&timekeeper_seq); 763 write_seqcount_begin(&tk_core.seq);
548 764
549 timekeeping_forward_now(tk); 765 timekeeping_forward_now(tk);
550 766
551 /* Make sure the proposed value is valid */ 767 /* Make sure the proposed value is valid */
552 tmp = timespec_add(tk_xtime(tk), *ts); 768 tmp = timespec64_add(tk_xtime(tk), ts64);
553 if (!timespec_valid_strict(&tmp)) { 769 if (!timespec64_valid_strict(&tmp)) {
554 ret = -EINVAL; 770 ret = -EINVAL;
555 goto error; 771 goto error;
556 } 772 }
557 773
558 tk_xtime_add(tk, ts); 774 tk_xtime_add(tk, &ts64);
559 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 775 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));
560 776
561error: /* even if we error out, we forwarded the time, so call update */ 777error: /* even if we error out, we forwarded the time, so call update */
562 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 778 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
563 779
564 write_seqcount_end(&timekeeper_seq); 780 write_seqcount_end(&tk_core.seq);
565 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 781 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
566 782
567 /* signal hrtimers about time change */ 783 /* signal hrtimers about time change */
@@ -578,14 +794,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
578 */ 794 */
579s32 timekeeping_get_tai_offset(void) 795s32 timekeeping_get_tai_offset(void)
580{ 796{
581 struct timekeeper *tk = &timekeeper; 797 struct timekeeper *tk = &tk_core.timekeeper;
582 unsigned int seq; 798 unsigned int seq;
583 s32 ret; 799 s32 ret;
584 800
585 do { 801 do {
586 seq = read_seqcount_begin(&timekeeper_seq); 802 seq = read_seqcount_begin(&tk_core.seq);
587 ret = tk->tai_offset; 803 ret = tk->tai_offset;
588 } while (read_seqcount_retry(&timekeeper_seq, seq)); 804 } while (read_seqcount_retry(&tk_core.seq, seq));
589 805
590 return ret; 806 return ret;
591} 807}
@@ -606,14 +822,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
606 */ 822 */
607void timekeeping_set_tai_offset(s32 tai_offset) 823void timekeeping_set_tai_offset(s32 tai_offset)
608{ 824{
609 struct timekeeper *tk = &timekeeper; 825 struct timekeeper *tk = &tk_core.timekeeper;
610 unsigned long flags; 826 unsigned long flags;
611 827
612 raw_spin_lock_irqsave(&timekeeper_lock, flags); 828 raw_spin_lock_irqsave(&timekeeper_lock, flags);
613 write_seqcount_begin(&timekeeper_seq); 829 write_seqcount_begin(&tk_core.seq);
614 __timekeeping_set_tai_offset(tk, tai_offset); 830 __timekeeping_set_tai_offset(tk, tai_offset);
615 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 831 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
616 write_seqcount_end(&timekeeper_seq); 832 write_seqcount_end(&tk_core.seq);
617 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 833 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
618 clock_was_set(); 834 clock_was_set();
619} 835}
@@ -625,14 +841,14 @@ void timekeeping_set_tai_offset(s32 tai_offset)
625 */ 841 */
626static int change_clocksource(void *data) 842static int change_clocksource(void *data)
627{ 843{
628 struct timekeeper *tk = &timekeeper; 844 struct timekeeper *tk = &tk_core.timekeeper;
629 struct clocksource *new, *old; 845 struct clocksource *new, *old;
630 unsigned long flags; 846 unsigned long flags;
631 847
632 new = (struct clocksource *) data; 848 new = (struct clocksource *) data;
633 849
634 raw_spin_lock_irqsave(&timekeeper_lock, flags); 850 raw_spin_lock_irqsave(&timekeeper_lock, flags);
635 write_seqcount_begin(&timekeeper_seq); 851 write_seqcount_begin(&tk_core.seq);
636 852
637 timekeeping_forward_now(tk); 853 timekeeping_forward_now(tk);
638 /* 854 /*
@@ -641,7 +857,7 @@ static int change_clocksource(void *data)
641 */ 857 */
642 if (try_module_get(new->owner)) { 858 if (try_module_get(new->owner)) {
643 if (!new->enable || new->enable(new) == 0) { 859 if (!new->enable || new->enable(new) == 0) {
644 old = tk->clock; 860 old = tk->tkr.clock;
645 tk_setup_internals(tk, new); 861 tk_setup_internals(tk, new);
646 if (old->disable) 862 if (old->disable)
647 old->disable(old); 863 old->disable(old);
@@ -652,7 +868,7 @@ static int change_clocksource(void *data)
652 } 868 }
653 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 869 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
654 870
655 write_seqcount_end(&timekeeper_seq); 871 write_seqcount_end(&tk_core.seq);
656 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 872 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
657 873
658 return 0; 874 return 0;
@@ -667,29 +883,14 @@ static int change_clocksource(void *data)
667 */ 883 */
668int timekeeping_notify(struct clocksource *clock) 884int timekeeping_notify(struct clocksource *clock)
669{ 885{
670 struct timekeeper *tk = &timekeeper; 886 struct timekeeper *tk = &tk_core.timekeeper;
671 887
672 if (tk->clock == clock) 888 if (tk->tkr.clock == clock)
673 return 0; 889 return 0;
674 stop_machine(change_clocksource, clock, NULL); 890 stop_machine(change_clocksource, clock, NULL);
675 tick_clock_notify(); 891 tick_clock_notify();
676 return tk->clock == clock ? 0 : -1; 892 return tk->tkr.clock == clock ? 0 : -1;
677}
678
679/**
680 * ktime_get_real - get the real (wall-) time in ktime_t format
681 *
682 * returns the time in ktime_t format
683 */
684ktime_t ktime_get_real(void)
685{
686 struct timespec now;
687
688 getnstimeofday(&now);
689
690 return timespec_to_ktime(now);
691} 893}
692EXPORT_SYMBOL_GPL(ktime_get_real);
693 894
694/** 895/**
695 * getrawmonotonic - Returns the raw monotonic time in a timespec 896 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -699,18 +900,20 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
699 */ 900 */
700void getrawmonotonic(struct timespec *ts) 901void getrawmonotonic(struct timespec *ts)
701{ 902{
702 struct timekeeper *tk = &timekeeper; 903 struct timekeeper *tk = &tk_core.timekeeper;
904 struct timespec64 ts64;
703 unsigned long seq; 905 unsigned long seq;
704 s64 nsecs; 906 s64 nsecs;
705 907
706 do { 908 do {
707 seq = read_seqcount_begin(&timekeeper_seq); 909 seq = read_seqcount_begin(&tk_core.seq);
708 nsecs = timekeeping_get_ns_raw(tk); 910 nsecs = timekeeping_get_ns_raw(tk);
709 *ts = tk->raw_time; 911 ts64 = tk->raw_time;
710 912
711 } while (read_seqcount_retry(&timekeeper_seq, seq)); 913 } while (read_seqcount_retry(&tk_core.seq, seq));
712 914
713 timespec_add_ns(ts, nsecs); 915 timespec64_add_ns(&ts64, nsecs);
916 *ts = timespec64_to_timespec(ts64);
714} 917}
715EXPORT_SYMBOL(getrawmonotonic); 918EXPORT_SYMBOL(getrawmonotonic);
716 919
@@ -719,16 +922,16 @@ EXPORT_SYMBOL(getrawmonotonic);
719 */ 922 */
720int timekeeping_valid_for_hres(void) 923int timekeeping_valid_for_hres(void)
721{ 924{
722 struct timekeeper *tk = &timekeeper; 925 struct timekeeper *tk = &tk_core.timekeeper;
723 unsigned long seq; 926 unsigned long seq;
724 int ret; 927 int ret;
725 928
726 do { 929 do {
727 seq = read_seqcount_begin(&timekeeper_seq); 930 seq = read_seqcount_begin(&tk_core.seq);
728 931
729 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 932 ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
730 933
731 } while (read_seqcount_retry(&timekeeper_seq, seq)); 934 } while (read_seqcount_retry(&tk_core.seq, seq));
732 935
733 return ret; 936 return ret;
734} 937}
@@ -738,16 +941,16 @@ int timekeeping_valid_for_hres(void)
738 */ 941 */
739u64 timekeeping_max_deferment(void) 942u64 timekeeping_max_deferment(void)
740{ 943{
741 struct timekeeper *tk = &timekeeper; 944 struct timekeeper *tk = &tk_core.timekeeper;
742 unsigned long seq; 945 unsigned long seq;
743 u64 ret; 946 u64 ret;
744 947
745 do { 948 do {
746 seq = read_seqcount_begin(&timekeeper_seq); 949 seq = read_seqcount_begin(&tk_core.seq);
747 950
748 ret = tk->clock->max_idle_ns; 951 ret = tk->tkr.clock->max_idle_ns;
749 952
750 } while (read_seqcount_retry(&timekeeper_seq, seq)); 953 } while (read_seqcount_retry(&tk_core.seq, seq));
751 954
752 return ret; 955 return ret;
753} 956}
@@ -787,14 +990,15 @@ void __weak read_boot_clock(struct timespec *ts)
787 */ 990 */
788void __init timekeeping_init(void) 991void __init timekeeping_init(void)
789{ 992{
790 struct timekeeper *tk = &timekeeper; 993 struct timekeeper *tk = &tk_core.timekeeper;
791 struct clocksource *clock; 994 struct clocksource *clock;
792 unsigned long flags; 995 unsigned long flags;
793 struct timespec now, boot, tmp; 996 struct timespec64 now, boot, tmp;
794 997 struct timespec ts;
795 read_persistent_clock(&now);
796 998
797 if (!timespec_valid_strict(&now)) { 999 read_persistent_clock(&ts);
1000 now = timespec_to_timespec64(ts);
1001 if (!timespec64_valid_strict(&now)) {
798 pr_warn("WARNING: Persistent clock returned invalid value!\n" 1002 pr_warn("WARNING: Persistent clock returned invalid value!\n"
799 " Check your CMOS/BIOS settings.\n"); 1003 " Check your CMOS/BIOS settings.\n");
800 now.tv_sec = 0; 1004 now.tv_sec = 0;
@@ -802,8 +1006,9 @@ void __init timekeeping_init(void)
802 } else if (now.tv_sec || now.tv_nsec) 1006 } else if (now.tv_sec || now.tv_nsec)
803 persistent_clock_exist = true; 1007 persistent_clock_exist = true;
804 1008
805 read_boot_clock(&boot); 1009 read_boot_clock(&ts);
806 if (!timespec_valid_strict(&boot)) { 1010 boot = timespec_to_timespec64(ts);
1011 if (!timespec64_valid_strict(&boot)) {
807 pr_warn("WARNING: Boot clock returned invalid value!\n" 1012 pr_warn("WARNING: Boot clock returned invalid value!\n"
808 " Check your CMOS/BIOS settings.\n"); 1013 " Check your CMOS/BIOS settings.\n");
809 boot.tv_sec = 0; 1014 boot.tv_sec = 0;
@@ -811,7 +1016,7 @@ void __init timekeeping_init(void)
811 } 1016 }
812 1017
813 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1018 raw_spin_lock_irqsave(&timekeeper_lock, flags);
814 write_seqcount_begin(&timekeeper_seq); 1019 write_seqcount_begin(&tk_core.seq);
815 ntp_init(); 1020 ntp_init();
816 1021
817 clock = clocksource_default_clock(); 1022 clock = clocksource_default_clock();
@@ -822,24 +1027,21 @@ void __init timekeeping_init(void)
822 tk_set_xtime(tk, &now); 1027 tk_set_xtime(tk, &now);
823 tk->raw_time.tv_sec = 0; 1028 tk->raw_time.tv_sec = 0;
824 tk->raw_time.tv_nsec = 0; 1029 tk->raw_time.tv_nsec = 0;
1030 tk->base_raw.tv64 = 0;
825 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 1031 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
826 boot = tk_xtime(tk); 1032 boot = tk_xtime(tk);
827 1033
828 set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); 1034 set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
829 tk_set_wall_to_mono(tk, tmp); 1035 tk_set_wall_to_mono(tk, tmp);
830 1036
831 tmp.tv_sec = 0; 1037 timekeeping_update(tk, TK_MIRROR);
832 tmp.tv_nsec = 0;
833 tk_set_sleep_time(tk, tmp);
834
835 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
836 1038
837 write_seqcount_end(&timekeeper_seq); 1039 write_seqcount_end(&tk_core.seq);
838 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1040 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
839} 1041}
840 1042
841/* time in seconds when suspend began */ 1043/* time in seconds when suspend began */
842static struct timespec timekeeping_suspend_time; 1044static struct timespec64 timekeeping_suspend_time;
843 1045
844/** 1046/**
845 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 1047 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
@@ -849,17 +1051,17 @@ static struct timespec timekeeping_suspend_time;
849 * adds the sleep offset to the timekeeping variables. 1051 * adds the sleep offset to the timekeeping variables.
850 */ 1052 */
851static void __timekeeping_inject_sleeptime(struct timekeeper *tk, 1053static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
852 struct timespec *delta) 1054 struct timespec64 *delta)
853{ 1055{
854 if (!timespec_valid_strict(delta)) { 1056 if (!timespec64_valid_strict(delta)) {
855 printk_deferred(KERN_WARNING 1057 printk_deferred(KERN_WARNING
856 "__timekeeping_inject_sleeptime: Invalid " 1058 "__timekeeping_inject_sleeptime: Invalid "
857 "sleep delta value!\n"); 1059 "sleep delta value!\n");
858 return; 1060 return;
859 } 1061 }
860 tk_xtime_add(tk, delta); 1062 tk_xtime_add(tk, delta);
861 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); 1063 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
862 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); 1064 tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
863 tk_debug_account_sleep_time(delta); 1065 tk_debug_account_sleep_time(delta);
864} 1066}
865 1067
@@ -875,7 +1077,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
875 */ 1077 */
876void timekeeping_inject_sleeptime(struct timespec *delta) 1078void timekeeping_inject_sleeptime(struct timespec *delta)
877{ 1079{
878 struct timekeeper *tk = &timekeeper; 1080 struct timekeeper *tk = &tk_core.timekeeper;
1081 struct timespec64 tmp;
879 unsigned long flags; 1082 unsigned long flags;
880 1083
881 /* 1084 /*
@@ -886,15 +1089,16 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
886 return; 1089 return;
887 1090
888 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1091 raw_spin_lock_irqsave(&timekeeper_lock, flags);
889 write_seqcount_begin(&timekeeper_seq); 1092 write_seqcount_begin(&tk_core.seq);
890 1093
891 timekeeping_forward_now(tk); 1094 timekeeping_forward_now(tk);
892 1095
893 __timekeeping_inject_sleeptime(tk, delta); 1096 tmp = timespec_to_timespec64(*delta);
1097 __timekeeping_inject_sleeptime(tk, &tmp);
894 1098
895 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 1099 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
896 1100
897 write_seqcount_end(&timekeeper_seq); 1101 write_seqcount_end(&tk_core.seq);
898 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1102 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
899 1103
900 /* signal hrtimers about time change */ 1104 /* signal hrtimers about time change */
@@ -910,20 +1114,22 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
910 */ 1114 */
911static void timekeeping_resume(void) 1115static void timekeeping_resume(void)
912{ 1116{
913 struct timekeeper *tk = &timekeeper; 1117 struct timekeeper *tk = &tk_core.timekeeper;
914 struct clocksource *clock = tk->clock; 1118 struct clocksource *clock = tk->tkr.clock;
915 unsigned long flags; 1119 unsigned long flags;
916 struct timespec ts_new, ts_delta; 1120 struct timespec64 ts_new, ts_delta;
1121 struct timespec tmp;
917 cycle_t cycle_now, cycle_delta; 1122 cycle_t cycle_now, cycle_delta;
918 bool suspendtime_found = false; 1123 bool suspendtime_found = false;
919 1124
920 read_persistent_clock(&ts_new); 1125 read_persistent_clock(&tmp);
1126 ts_new = timespec_to_timespec64(tmp);
921 1127
922 clockevents_resume(); 1128 clockevents_resume();
923 clocksource_resume(); 1129 clocksource_resume();
924 1130
925 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1131 raw_spin_lock_irqsave(&timekeeper_lock, flags);
926 write_seqcount_begin(&timekeeper_seq); 1132 write_seqcount_begin(&tk_core.seq);
927 1133
928 /* 1134 /*
929 * After system resumes, we need to calculate the suspended time and 1135 * After system resumes, we need to calculate the suspended time and
@@ -937,15 +1143,16 @@ static void timekeeping_resume(void)
937 * The less preferred source will only be tried if there is no better 1143 * The less preferred source will only be tried if there is no better
938 * usable source. The rtc part is handled separately in rtc core code. 1144 * usable source. The rtc part is handled separately in rtc core code.
939 */ 1145 */
940 cycle_now = clock->read(clock); 1146 cycle_now = tk->tkr.read(clock);
941 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && 1147 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
942 cycle_now > clock->cycle_last) { 1148 cycle_now > tk->tkr.cycle_last) {
943 u64 num, max = ULLONG_MAX; 1149 u64 num, max = ULLONG_MAX;
944 u32 mult = clock->mult; 1150 u32 mult = clock->mult;
945 u32 shift = clock->shift; 1151 u32 shift = clock->shift;
946 s64 nsec = 0; 1152 s64 nsec = 0;
947 1153
948 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 1154 cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
1155 tk->tkr.mask);
949 1156
950 /* 1157 /*
951 * "cycle_delta * mult" may cause a 64-bit overflow, if the 1158
@@ -960,10 +1167,10 @@ static void timekeeping_resume(void)
960 } 1167 }
961 nsec += ((u64) cycle_delta * mult) >> shift; 1168 nsec += ((u64) cycle_delta * mult) >> shift;
962 1169
963 ts_delta = ns_to_timespec(nsec); 1170 ts_delta = ns_to_timespec64(nsec);
964 suspendtime_found = true; 1171 suspendtime_found = true;
965 } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { 1172 } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
966 ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); 1173 ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
967 suspendtime_found = true; 1174 suspendtime_found = true;
968 } 1175 }
969 1176
@@ -971,11 +1178,11 @@ static void timekeeping_resume(void)
971 __timekeeping_inject_sleeptime(tk, &ts_delta); 1178 __timekeeping_inject_sleeptime(tk, &ts_delta);
972 1179
973 /* Re-base the last cycle value */ 1180 /* Re-base the last cycle value */
974 tk->cycle_last = clock->cycle_last = cycle_now; 1181 tk->tkr.cycle_last = cycle_now;
975 tk->ntp_error = 0; 1182 tk->ntp_error = 0;
976 timekeeping_suspended = 0; 1183 timekeeping_suspended = 0;
977 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 1184 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
978 write_seqcount_end(&timekeeper_seq); 1185 write_seqcount_end(&tk_core.seq);
979 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1186 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
980 1187
981 touch_softlockup_watchdog(); 1188 touch_softlockup_watchdog();
@@ -988,12 +1195,14 @@ static void timekeeping_resume(void)
988 1195
989static int timekeeping_suspend(void) 1196static int timekeeping_suspend(void)
990{ 1197{
991 struct timekeeper *tk = &timekeeper; 1198 struct timekeeper *tk = &tk_core.timekeeper;
992 unsigned long flags; 1199 unsigned long flags;
993 struct timespec delta, delta_delta; 1200 struct timespec64 delta, delta_delta;
994 static struct timespec old_delta; 1201 static struct timespec64 old_delta;
1202 struct timespec tmp;
995 1203
996 read_persistent_clock(&timekeeping_suspend_time); 1204 read_persistent_clock(&tmp);
1205 timekeeping_suspend_time = timespec_to_timespec64(tmp);
997 1206
998 /* 1207 /*
999 * On some systems the persistent_clock can not be detected at 1208 * On some systems the persistent_clock can not be detected at
@@ -1004,7 +1213,7 @@ static int timekeeping_suspend(void)
1004 persistent_clock_exist = true; 1213 persistent_clock_exist = true;
1005 1214
1006 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1215 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1007 write_seqcount_begin(&timekeeper_seq); 1216 write_seqcount_begin(&tk_core.seq);
1008 timekeeping_forward_now(tk); 1217 timekeeping_forward_now(tk);
1009 timekeeping_suspended = 1; 1218 timekeeping_suspended = 1;
1010 1219
@@ -1014,8 +1223,8 @@ static int timekeeping_suspend(void)
1014 * try to compensate so the difference in system time 1223 * try to compensate so the difference in system time
1015 * and persistent_clock time stays close to constant. 1224 * and persistent_clock time stays close to constant.
1016 */ 1225 */
1017 delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); 1226 delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
1018 delta_delta = timespec_sub(delta, old_delta); 1227 delta_delta = timespec64_sub(delta, old_delta);
1019 if (abs(delta_delta.tv_sec) >= 2) { 1228 if (abs(delta_delta.tv_sec) >= 2) {
1020 /* 1229 /*
1021 * if delta_delta is too large, assume time correction 1230 * if delta_delta is too large, assume time correction
@@ -1025,11 +1234,11 @@ static int timekeeping_suspend(void)
1025 } else { 1234 } else {
1026 /* Otherwise try to adjust old_system to compensate */ 1235 /* Otherwise try to adjust old_system to compensate */
1027 timekeeping_suspend_time = 1236 timekeeping_suspend_time =
1028 timespec_add(timekeeping_suspend_time, delta_delta); 1237 timespec64_add(timekeeping_suspend_time, delta_delta);
1029 } 1238 }
1030 1239
1031 timekeeping_update(tk, TK_MIRROR); 1240 timekeeping_update(tk, TK_MIRROR);
1032 write_seqcount_end(&timekeeper_seq); 1241 write_seqcount_end(&tk_core.seq);
1033 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1242 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1034 1243
1035 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 1244 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
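
The delta/delta_delta bookkeeping a few lines above is easier to see in isolation. Below is a minimal userspace sketch (whole seconds only, hypothetical names, not kernel code) of the same idea: small read jitter of the persistent clock is folded back into the recorded suspend time, while a jump of two seconds or more is treated as a deliberate clock correction that only resets the baseline.

#include <stdio.h>
#include <stdlib.h>

/*
 * Toy model of the suspend-time compensation: hypothetical names,
 * plain seconds instead of timespec64.
 */
static long old_delta;   /* persists across "suspends", as in the kernel */

static long account_suspend(long xtime_sec, long *suspend_time_sec)
{
        long delta = xtime_sec - *suspend_time_sec;
        long delta_delta = delta - old_delta;

        if (labs(delta_delta) >= 2) {
                /* time was likely stepped: restart tracking */
                old_delta = delta;
        } else {
                /* compensate for sub-2s read jitter of the RTC */
                *suspend_time_sec += delta_delta;
        }
        return *suspend_time_sec;
}

int main(void)
{
        long suspend_time = 1000;

        printf("%ld\n", account_suspend(1500, &suspend_time)); /* 1000: first pass sets the baseline */
        suspend_time = 2000;
        printf("%ld\n", account_suspend(2501, &suspend_time)); /* 2001: +1s jitter folded in */
        suspend_time = 3000;
        printf("%ld\n", account_suspend(3600, &suspend_time)); /* 3000: 100s step resets the baseline */
        return 0;
}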
@@ -1050,125 +1259,34 @@ static int __init timekeeping_init_ops(void)
1050 register_syscore_ops(&timekeeping_syscore_ops); 1259 register_syscore_ops(&timekeeping_syscore_ops);
1051 return 0; 1260 return 0;
1052} 1261}
1053
1054device_initcall(timekeeping_init_ops); 1262device_initcall(timekeeping_init_ops);
1055 1263
1056/* 1264/*
1057 * If the error is already larger, we look ahead even further 1265 * Apply a multiplier adjustment to the timekeeper
1058 * to compensate for late or lost adjustments.
1059 */ 1266 */
1060static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, 1267static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
1061 s64 error, s64 *interval, 1268 s64 offset,
1062 s64 *offset) 1269 bool negative,
1270 int adj_scale)
1063{ 1271{
1064 s64 tick_error, i; 1272 s64 interval = tk->cycle_interval;
1065 u32 look_ahead, adj; 1273 s32 mult_adj = 1;
1066 s32 error2, mult;
1067
1068 /*
1069 * Use the current error value to determine how much to look ahead.
1070 * The larger the error the slower we adjust for it to avoid problems
1071 * with losing too many ticks, otherwise we would overadjust and
1072 * produce an even larger error. The smaller the adjustment the
1073 * faster we try to adjust for it, as lost ticks can do less harm
1074 * here. This is tuned so that an error of about 1 msec is adjusted
1075 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
1076 */
1077 error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
1078 error2 = abs(error2);
1079 for (look_ahead = 0; error2 > 0; look_ahead++)
1080 error2 >>= 2;
1081 1274
1082 /* 1275 if (negative) {
1083 * Now calculate the error in (1 << look_ahead) ticks, but first 1276 mult_adj = -mult_adj;
1084 * remove the single look ahead already included in the error. 1277 interval = -interval;
1085 */ 1278 offset = -offset;
1086 tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
1087 tick_error -= tk->xtime_interval >> 1;
1088 error = ((error - tick_error) >> look_ahead) + tick_error;
1089
1090 /* Finally calculate the adjustment shift value. */
1091 i = *interval;
1092 mult = 1;
1093 if (error < 0) {
1094 error = -error;
1095 *interval = -*interval;
1096 *offset = -*offset;
1097 mult = -1;
1098 } 1279 }
1099 for (adj = 0; error > i; adj++) 1280 mult_adj <<= adj_scale;
1100 error >>= 1; 1281 interval <<= adj_scale;
1101 1282 offset <<= adj_scale;
1102 *interval <<= adj;
1103 *offset <<= adj;
1104 return mult << adj;
1105}
1106
1107/*
1108 * Adjust the multiplier to reduce the error value,
1109 * this is optimized for the most common adjustments of -1,0,1,
1110 * for other values we can do a bit more work.
1111 */
1112static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1113{
1114 s64 error, interval = tk->cycle_interval;
1115 int adj;
1116 1283
1117 /* 1284 /*
1118 * The point of this is to check if the error is greater than half
1119 * an interval.
1120 *
1121 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
1122 *
1123 * Note we subtract one in the shift, so that error is really error*2.
1124 * This "saves" dividing(shifting) interval twice, but keeps the
1125 * (error > interval) comparison as still measuring if error is
1126 * larger than half an interval.
1127 *
1128 * Note: It does not "save" on aggravation when reading the code.
1129 */
1130 error = tk->ntp_error >> (tk->ntp_error_shift - 1);
1131 if (error > interval) {
1132 /*
1133 * We now divide error by 4(via shift), which checks if
1134 * the error is greater than twice the interval.
 1135 * If it is greater, we need a bigadjust, if it's smaller,
1136 * we can adjust by 1.
1137 */
1138 error >>= 2;
1139 if (likely(error <= interval))
1140 adj = 1;
1141 else
1142 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
1143 } else {
1144 if (error < -interval) {
1145 /* See comment above, this is just switched for the negative */
1146 error >>= 2;
1147 if (likely(error >= -interval)) {
1148 adj = -1;
1149 interval = -interval;
1150 offset = -offset;
1151 } else {
1152 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
1153 }
1154 } else {
1155 goto out_adjust;
1156 }
1157 }
1158
1159 if (unlikely(tk->clock->maxadj &&
1160 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
1161 printk_deferred_once(KERN_WARNING
1162 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1163 tk->clock->name, (long)tk->mult + adj,
1164 (long)tk->clock->mult + tk->clock->maxadj);
1165 }
1166 /*
1167 * So the following can be confusing. 1285 * So the following can be confusing.
1168 * 1286 *
 1169 * To keep things simple, let's assume adj == 1 for now. 1287 * To keep things simple, let's assume mult_adj == 1 for now.
1170 * 1288 *
1171 * When adj != 1, remember that the interval and offset values 1289 * When mult_adj != 1, remember that the interval and offset values
1172 * have been appropriately scaled so the math is the same. 1290 * have been appropriately scaled so the math is the same.
1173 * 1291 *
1174 * The basic idea here is that we're increasing the multiplier 1292 * The basic idea here is that we're increasing the multiplier
@@ -1212,12 +1330,78 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1212 * 1330 *
1213 * XXX - TODO: Doc ntp_error calculation. 1331 * XXX - TODO: Doc ntp_error calculation.
1214 */ 1332 */
1215 tk->mult += adj; 1333 tk->tkr.mult += mult_adj;
1216 tk->xtime_interval += interval; 1334 tk->xtime_interval += interval;
1217 tk->xtime_nsec -= offset; 1335 tk->tkr.xtime_nsec -= offset;
1218 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; 1336 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
1337}
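
The reason interval and offset are shifted together with mult_adj is that scaling all three by 2^adj_scale is arithmetically identical to applying the unit (+/-1) adjustment 2^adj_scale times. The following is a minimal userspace sketch of that identity with simplified stand-in fields (ntp_error omitted); it is illustrative only, not the kernel implementation.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the timekeeper fields touched above. */
struct tk_model {
        int64_t mult;       /* clocksource multiplier            */
        int64_t xtime_it;   /* xtime_interval (shifted ns/tick)  */
        int64_t xtime_nsec; /* shifted nanoseconds               */
};

/* Apply a +/-1 adjustment scaled by 2^adj_scale, as in the patch. */
static void apply_adjustment(struct tk_model *tk, int64_t interval,
                             int64_t offset, int negative, int adj_scale)
{
        int64_t mult_adj = 1;

        if (negative) {
                mult_adj = -mult_adj;
                interval = -interval;
                offset = -offset;
        }
        mult_adj <<= adj_scale;
        interval <<= adj_scale;
        offset <<= adj_scale;

        tk->mult += mult_adj;
        tk->xtime_it += interval;
        tk->xtime_nsec -= offset;
}

int main(void)
{
        struct tk_model a = { 1000, 4096, 1 << 20 };
        struct tk_model b = a;
        int64_t interval = 4096, offset = 12345;

        /* One step scaled by 2^3 ... */
        apply_adjustment(&a, interval, offset, 0, 3);
        /* ... equals eight unit steps. */
        for (int i = 0; i < 8; i++)
                apply_adjustment(&b, interval, offset, 0, 0);

        assert(a.mult == b.mult && a.xtime_it == b.xtime_it &&
               a.xtime_nsec == b.xtime_nsec);
        printf("mult=%lld xtime_interval=%lld xtime_nsec=%lld\n",
               (long long)a.mult, (long long)a.xtime_it,
               (long long)a.xtime_nsec);
        return 0;
}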
1338
1339/*
1340 * Calculate the multiplier adjustment needed to match the frequency
1341 * specified by NTP
1342 */
1343static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
1344 s64 offset)
1345{
1346 s64 interval = tk->cycle_interval;
1347 s64 xinterval = tk->xtime_interval;
1348 s64 tick_error;
1349 bool negative;
1350 u32 adj;
1351
1352 /* Remove any current error adj from freq calculation */
1353 if (tk->ntp_err_mult)
1354 xinterval -= tk->cycle_interval;
1355
1356 tk->ntp_tick = ntp_tick_length();
1357
1358 /* Calculate current error per tick */
1359 tick_error = ntp_tick_length() >> tk->ntp_error_shift;
1360 tick_error -= (xinterval + tk->xtime_remainder);
1361
 1362 /* Don't worry about correcting it if it's small */
1363 if (likely((tick_error >= 0) && (tick_error <= interval)))
1364 return;
1365
1366 /* preserve the direction of correction */
1367 negative = (tick_error < 0);
1368
1369 /* Sort out the magnitude of the correction */
1370 tick_error = abs(tick_error);
1371 for (adj = 0; tick_error > interval; adj++)
1372 tick_error >>= 1;
1373
1374 /* scale the corrections */
1375 timekeeping_apply_adjustment(tk, offset, negative, adj);
1376}
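
The loop above that halves tick_error until it fits within one cycle_interval is effectively choosing a power-of-two scale for the correction. A small standalone model of just that magnitude search (the function name is illustrative, not kernel API):

#include <stdio.h>
#include <stdlib.h>

/*
 * Find the power-of-two scale so that a correction of interval << adj
 * does not exceed the measured per-tick error, mirroring the loop in
 * timekeeping_freqadjust() above.
 */
static unsigned int pick_adj_scale(long long tick_error, long long interval)
{
        unsigned int adj;

        tick_error = llabs(tick_error);
        for (adj = 0; tick_error > interval; adj++)
                tick_error >>= 1;
        return adj;
}

int main(void)
{
        long long interval = 4096;

        printf("%u\n", pick_adj_scale(5000, interval));    /* 1: 5000>>1 = 2500 <= 4096 */
        printf("%u\n", pick_adj_scale(-40000, interval));  /* 4: 40000>>4 = 2500 <= 4096 */
        printf("%u\n", pick_adj_scale(1000, interval));    /* 0: already small enough */
        return 0;
}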
1377
1378/*
1379 * Adjust the timekeeper's multiplier to the correct frequency
1380 * and also to reduce the accumulated error value.
1381 */
1382static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1383{
1384 /* Correct for the current frequency error */
1385 timekeeping_freqadjust(tk, offset);
1386
1387 /* Next make a small adjustment to fix any cumulative error */
1388 if (!tk->ntp_err_mult && (tk->ntp_error > 0)) {
1389 tk->ntp_err_mult = 1;
1390 timekeeping_apply_adjustment(tk, offset, 0, 0);
1391 } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) {
1392 /* Undo any existing error adjustment */
1393 timekeeping_apply_adjustment(tk, offset, 1, 0);
1394 tk->ntp_err_mult = 0;
1395 }
1396
1397 if (unlikely(tk->tkr.clock->maxadj &&
1398 (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) {
1399 printk_once(KERN_WARNING
1400 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1401 tk->tkr.clock->name, (long)tk->tkr.mult,
1402 (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
1403 }
1219 1404
1220out_adjust:
1221 /* 1405 /*
1222 * It may be possible that when we entered this function, xtime_nsec 1406 * It may be possible that when we entered this function, xtime_nsec
1223 * was very small. Further, if we're slightly speeding the clocksource 1407 * was very small. Further, if we're slightly speeding the clocksource
@@ -1232,12 +1416,11 @@ out_adjust:
1232 * We'll correct this error next time through this function, when 1416 * We'll correct this error next time through this function, when
1233 * xtime_nsec is not as small. 1417 * xtime_nsec is not as small.
1234 */ 1418 */
1235 if (unlikely((s64)tk->xtime_nsec < 0)) { 1419 if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
1236 s64 neg = -(s64)tk->xtime_nsec; 1420 s64 neg = -(s64)tk->tkr.xtime_nsec;
1237 tk->xtime_nsec = 0; 1421 tk->tkr.xtime_nsec = 0;
1238 tk->ntp_error += neg << tk->ntp_error_shift; 1422 tk->ntp_error += neg << tk->ntp_error_shift;
1239 } 1423 }
1240
1241} 1424}
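
The ntp_err_mult handling in the new timekeeping_adjust() acts as a one-bit state machine: an extra multiplier unit is held while the accumulated error stays positive and released once it drops to zero or below. A toy model of that toggle, with hypothetical field names and the scaled-adjustment call reduced to +/-1 on mult:

#include <stdio.h>

struct model {
        long ntp_error;
        int ntp_err_mult;  /* 0 or 1 */
        long mult;
};

static void adjust(struct model *m)
{
        if (!m->ntp_err_mult && m->ntp_error > 0) {
                m->ntp_err_mult = 1;
                m->mult += 1;            /* apply the +1 unit */
        } else if (m->ntp_err_mult && m->ntp_error <= 0) {
                m->mult -= 1;            /* undo it */
                m->ntp_err_mult = 0;
        }
}

int main(void)
{
        struct model m = { .ntp_error = 0, .ntp_err_mult = 0, .mult = 1000 };
        long errs[] = { 5, 3, 1, -2, -1, 4 };

        for (unsigned i = 0; i < sizeof(errs) / sizeof(errs[0]); i++) {
                m.ntp_error = errs[i];
                adjust(&m);
                printf("err=%ld err_mult=%d mult=%ld\n",
                       errs[i], m.ntp_err_mult, m.mult);
        }
        return 0;
}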
1242 1425
1243/** 1426/**
@@ -1250,26 +1433,26 @@ out_adjust:
1250 */ 1433 */
1251static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) 1434static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1252{ 1435{
1253 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1436 u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
1254 unsigned int clock_set = 0; 1437 unsigned int clock_set = 0;
1255 1438
1256 while (tk->xtime_nsec >= nsecps) { 1439 while (tk->tkr.xtime_nsec >= nsecps) {
1257 int leap; 1440 int leap;
1258 1441
1259 tk->xtime_nsec -= nsecps; 1442 tk->tkr.xtime_nsec -= nsecps;
1260 tk->xtime_sec++; 1443 tk->xtime_sec++;
1261 1444
 1262 /* Figure out if it's a leap sec and apply if needed */ 1445 /* Figure out if it's a leap sec and apply if needed */
1263 leap = second_overflow(tk->xtime_sec); 1446 leap = second_overflow(tk->xtime_sec);
1264 if (unlikely(leap)) { 1447 if (unlikely(leap)) {
1265 struct timespec ts; 1448 struct timespec64 ts;
1266 1449
1267 tk->xtime_sec += leap; 1450 tk->xtime_sec += leap;
1268 1451
1269 ts.tv_sec = leap; 1452 ts.tv_sec = leap;
1270 ts.tv_nsec = 0; 1453 ts.tv_nsec = 0;
1271 tk_set_wall_to_mono(tk, 1454 tk_set_wall_to_mono(tk,
1272 timespec_sub(tk->wall_to_monotonic, ts)); 1455 timespec64_sub(tk->wall_to_monotonic, ts));
1273 1456
1274 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1457 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1275 1458
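
Apart from the leap-second handling, accumulate_nsecs_to_secs() is plain rollover arithmetic on shifted nanoseconds (xtime_nsec stores nanoseconds << shift). A self-contained sketch of that rollover, assuming a shift of 8; names are simplified stand-ins:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/*
 * Toy version of the shifted-nanosecond rollover above, ignoring the
 * leap-second handling. xtime_nsec holds nanoseconds << shift.
 */
static void accumulate(uint64_t *xtime_sec, uint64_t *xtime_nsec,
                       unsigned int shift)
{
        uint64_t nsecps = NSEC_PER_SEC << shift;

        while (*xtime_nsec >= nsecps) {
                *xtime_nsec -= nsecps;
                (*xtime_sec)++;
        }
}

int main(void)
{
        uint64_t sec = 100, nsec;
        unsigned int shift = 8;

        /* 2.5 seconds worth of shifted nanoseconds */
        nsec = (5 * NSEC_PER_SEC / 2) << shift;
        accumulate(&sec, &nsec, shift);
        printf("sec=%llu remaining_ns=%llu\n",
               (unsigned long long)sec,
               (unsigned long long)(nsec >> shift));   /* 102, 500000000 */
        return 0;
}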
@@ -1301,9 +1484,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1301 1484
1302 /* Accumulate one shifted interval */ 1485 /* Accumulate one shifted interval */
1303 offset -= interval; 1486 offset -= interval;
1304 tk->cycle_last += interval; 1487 tk->tkr.cycle_last += interval;
1305 1488
1306 tk->xtime_nsec += tk->xtime_interval << shift; 1489 tk->tkr.xtime_nsec += tk->xtime_interval << shift;
1307 *clock_set |= accumulate_nsecs_to_secs(tk); 1490 *clock_set |= accumulate_nsecs_to_secs(tk);
1308 1491
1309 /* Accumulate raw time */ 1492 /* Accumulate raw time */
@@ -1317,48 +1500,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1317 tk->raw_time.tv_nsec = raw_nsecs; 1500 tk->raw_time.tv_nsec = raw_nsecs;
1318 1501
1319 /* Accumulate error between NTP and clock interval */ 1502 /* Accumulate error between NTP and clock interval */
1320 tk->ntp_error += ntp_tick_length() << shift; 1503 tk->ntp_error += tk->ntp_tick << shift;
1321 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << 1504 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
1322 (tk->ntp_error_shift + shift); 1505 (tk->ntp_error_shift + shift);
1323 1506
1324 return offset; 1507 return offset;
1325} 1508}
1326 1509
1327#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1328static inline void old_vsyscall_fixup(struct timekeeper *tk)
1329{
1330 s64 remainder;
1331
1332 /*
1333 * Store only full nanoseconds into xtime_nsec after rounding
1334 * it up and add the remainder to the error difference.
 1335 * XXX - This is necessary to avoid small 1ns inconsistencies caused
1336 * by truncating the remainder in vsyscalls. However, it causes
1337 * additional work to be done in timekeeping_adjust(). Once
1338 * the vsyscall implementations are converted to use xtime_nsec
1339 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1340 * users are removed, this can be killed.
1341 */
1342 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1343 tk->xtime_nsec -= remainder;
1344 tk->xtime_nsec += 1ULL << tk->shift;
1345 tk->ntp_error += remainder << tk->ntp_error_shift;
1346 tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
1347}
1348#else
1349#define old_vsyscall_fixup(tk)
1350#endif
1351
1352
1353
1354/** 1510/**
1355 * update_wall_time - Uses the current clocksource to increment the wall time 1511 * update_wall_time - Uses the current clocksource to increment the wall time
1356 * 1512 *
1357 */ 1513 */
1358void update_wall_time(void) 1514void update_wall_time(void)
1359{ 1515{
1360 struct clocksource *clock; 1516 struct timekeeper *real_tk = &tk_core.timekeeper;
1361 struct timekeeper *real_tk = &timekeeper;
1362 struct timekeeper *tk = &shadow_timekeeper; 1517 struct timekeeper *tk = &shadow_timekeeper;
1363 cycle_t offset; 1518 cycle_t offset;
1364 int shift = 0, maxshift; 1519 int shift = 0, maxshift;
@@ -1371,12 +1526,11 @@ void update_wall_time(void)
1371 if (unlikely(timekeeping_suspended)) 1526 if (unlikely(timekeeping_suspended))
1372 goto out; 1527 goto out;
1373 1528
1374 clock = real_tk->clock;
1375
1376#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1529#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1377 offset = real_tk->cycle_interval; 1530 offset = real_tk->cycle_interval;
1378#else 1531#else
1379 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1532 offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
1533 tk->tkr.cycle_last, tk->tkr.mask);
1380#endif 1534#endif
1381 1535
1382 /* Check if there's really nothing to do */ 1536 /* Check if there's really nothing to do */
@@ -1418,9 +1572,7 @@ void update_wall_time(void)
1418 */ 1572 */
1419 clock_set |= accumulate_nsecs_to_secs(tk); 1573 clock_set |= accumulate_nsecs_to_secs(tk);
1420 1574
1421 write_seqcount_begin(&timekeeper_seq); 1575 write_seqcount_begin(&tk_core.seq);
1422 /* Update clock->cycle_last with the new value */
1423 clock->cycle_last = tk->cycle_last;
1424 /* 1576 /*
1425 * Update the real timekeeper. 1577 * Update the real timekeeper.
1426 * 1578 *
@@ -1428,12 +1580,12 @@ void update_wall_time(void)
1428 * requires changes to all other timekeeper usage sites as 1580 * requires changes to all other timekeeper usage sites as
1429 * well, i.e. move the timekeeper pointer getter into the 1581 * well, i.e. move the timekeeper pointer getter into the
1430 * spinlocked/seqcount protected sections. And we trade this 1582 * spinlocked/seqcount protected sections. And we trade this
1431 * memcpy under the timekeeper_seq against one before we start 1583 * memcpy under the tk_core.seq against one before we start
1432 * updating. 1584 * updating.
1433 */ 1585 */
1434 memcpy(real_tk, tk, sizeof(*tk)); 1586 memcpy(real_tk, tk, sizeof(*tk));
1435 timekeeping_update(real_tk, clock_set); 1587 timekeeping_update(real_tk, clock_set);
1436 write_seqcount_end(&timekeeper_seq); 1588 write_seqcount_end(&tk_core.seq);
1437out: 1589out:
1438 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1590 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1439 if (clock_set) 1591 if (clock_set)
@@ -1454,83 +1606,16 @@ out:
1454 */ 1606 */
1455void getboottime(struct timespec *ts) 1607void getboottime(struct timespec *ts)
1456{ 1608{
1457 struct timekeeper *tk = &timekeeper; 1609 struct timekeeper *tk = &tk_core.timekeeper;
1458 struct timespec boottime = { 1610 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
1459 .tv_sec = tk->wall_to_monotonic.tv_sec +
1460 tk->total_sleep_time.tv_sec,
1461 .tv_nsec = tk->wall_to_monotonic.tv_nsec +
1462 tk->total_sleep_time.tv_nsec
1463 };
1464
1465 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
1466}
1467EXPORT_SYMBOL_GPL(getboottime);
1468
1469/**
1470 * get_monotonic_boottime - Returns monotonic time since boot
1471 * @ts: pointer to the timespec to be set
1472 *
1473 * Returns the monotonic time since boot in a timespec.
1474 *
 1475 * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
1476 * includes the time spent in suspend.
1477 */
1478void get_monotonic_boottime(struct timespec *ts)
1479{
1480 struct timekeeper *tk = &timekeeper;
1481 struct timespec tomono, sleep;
1482 s64 nsec;
1483 unsigned int seq;
1484
1485 WARN_ON(timekeeping_suspended);
1486
1487 do {
1488 seq = read_seqcount_begin(&timekeeper_seq);
1489 ts->tv_sec = tk->xtime_sec;
1490 nsec = timekeeping_get_ns(tk);
1491 tomono = tk->wall_to_monotonic;
1492 sleep = tk->total_sleep_time;
1493
1494 } while (read_seqcount_retry(&timekeeper_seq, seq));
1495
1496 ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
1497 ts->tv_nsec = 0;
1498 timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
1499}
1500EXPORT_SYMBOL_GPL(get_monotonic_boottime);
1501
1502/**
1503 * ktime_get_boottime - Returns monotonic time since boot in a ktime
1504 *
1505 * Returns the monotonic time since boot in a ktime
1506 *
 1507 * This is similar to CLOCK_MONOTONIC/ktime_get, but also
1508 * includes the time spent in suspend.
1509 */
1510ktime_t ktime_get_boottime(void)
1511{
1512 struct timespec ts;
1513
1514 get_monotonic_boottime(&ts);
1515 return timespec_to_ktime(ts);
1516}
1517EXPORT_SYMBOL_GPL(ktime_get_boottime);
1518
1519/**
1520 * monotonic_to_bootbased - Convert the monotonic time to boot based.
1521 * @ts: pointer to the timespec to be converted
1522 */
1523void monotonic_to_bootbased(struct timespec *ts)
1524{
1525 struct timekeeper *tk = &timekeeper;
1526 1611
1527 *ts = timespec_add(*ts, tk->total_sleep_time); 1612 *ts = ktime_to_timespec(t);
1528} 1613}
1529EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1614EXPORT_SYMBOL_GPL(getboottime);
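
The new getboottime() reads as an algebraic simplification rather than a behavioural change: assuming offs_real is the negated wall_to_monotonic offset and offs_boot equals total_sleep_time (as the timekeeper maintains them), offs_real - offs_boot equals the old -(wall_to_monotonic + total_sleep_time). A quick plain-seconds check of that identity:

#include <stdio.h>

int main(void)
{
        long wall_to_monotonic = -1700000000;  /* wall clock runs ahead of monotonic */
        long total_sleep_time = 420;           /* seconds spent suspended */

        long old_way = -(wall_to_monotonic + total_sleep_time);
        long offs_real = -wall_to_monotonic;   /* assumed relationship */
        long offs_boot = total_sleep_time;     /* assumed relationship */
        long new_way = offs_real - offs_boot;

        printf("old=%ld new=%ld\n", old_way, new_way);  /* identical values */
        return 0;
}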
1530 1615
1531unsigned long get_seconds(void) 1616unsigned long get_seconds(void)
1532{ 1617{
1533 struct timekeeper *tk = &timekeeper; 1618 struct timekeeper *tk = &tk_core.timekeeper;
1534 1619
1535 return tk->xtime_sec; 1620 return tk->xtime_sec;
1536} 1621}
@@ -1538,43 +1623,44 @@ EXPORT_SYMBOL(get_seconds);
1538 1623
1539struct timespec __current_kernel_time(void) 1624struct timespec __current_kernel_time(void)
1540{ 1625{
1541 struct timekeeper *tk = &timekeeper; 1626 struct timekeeper *tk = &tk_core.timekeeper;
1542 1627
1543 return tk_xtime(tk); 1628 return timespec64_to_timespec(tk_xtime(tk));
1544} 1629}
1545 1630
1546struct timespec current_kernel_time(void) 1631struct timespec current_kernel_time(void)
1547{ 1632{
1548 struct timekeeper *tk = &timekeeper; 1633 struct timekeeper *tk = &tk_core.timekeeper;
1549 struct timespec now; 1634 struct timespec64 now;
1550 unsigned long seq; 1635 unsigned long seq;
1551 1636
1552 do { 1637 do {
1553 seq = read_seqcount_begin(&timekeeper_seq); 1638 seq = read_seqcount_begin(&tk_core.seq);
1554 1639
1555 now = tk_xtime(tk); 1640 now = tk_xtime(tk);
1556 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1641 } while (read_seqcount_retry(&tk_core.seq, seq));
1557 1642
1558 return now; 1643 return timespec64_to_timespec(now);
1559} 1644}
1560EXPORT_SYMBOL(current_kernel_time); 1645EXPORT_SYMBOL(current_kernel_time);
1561 1646
1562struct timespec get_monotonic_coarse(void) 1647struct timespec get_monotonic_coarse(void)
1563{ 1648{
1564 struct timekeeper *tk = &timekeeper; 1649 struct timekeeper *tk = &tk_core.timekeeper;
1565 struct timespec now, mono; 1650 struct timespec64 now, mono;
1566 unsigned long seq; 1651 unsigned long seq;
1567 1652
1568 do { 1653 do {
1569 seq = read_seqcount_begin(&timekeeper_seq); 1654 seq = read_seqcount_begin(&tk_core.seq);
1570 1655
1571 now = tk_xtime(tk); 1656 now = tk_xtime(tk);
1572 mono = tk->wall_to_monotonic; 1657 mono = tk->wall_to_monotonic;
1573 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1658 } while (read_seqcount_retry(&tk_core.seq, seq));
1574 1659
1575 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1660 set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
1576 now.tv_nsec + mono.tv_nsec); 1661 now.tv_nsec + mono.tv_nsec);
1577 return now; 1662
1663 return timespec64_to_timespec(now);
1578} 1664}
1579 1665
1580/* 1666/*
@@ -1587,29 +1673,38 @@ void do_timer(unsigned long ticks)
1587} 1673}
1588 1674
1589/** 1675/**
1590 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, 1676 * ktime_get_update_offsets_tick - hrtimer helper
1591 * and sleep offsets. 1677 * @offs_real: pointer to storage for monotonic -> realtime offset
1592 * @xtim: pointer to timespec to be set with xtime 1678 * @offs_boot: pointer to storage for monotonic -> boottime offset
1593 * @wtom: pointer to timespec to be set with wall_to_monotonic 1679 * @offs_tai: pointer to storage for monotonic -> clock tai offset
1594 * @sleep: pointer to timespec to be set with time in suspend 1680 *
1681 * Returns monotonic time at last tick and various offsets
1595 */ 1682 */
1596void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, 1683ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
1597 struct timespec *wtom, struct timespec *sleep) 1684 ktime_t *offs_tai)
1598{ 1685{
1599 struct timekeeper *tk = &timekeeper; 1686 struct timekeeper *tk = &tk_core.timekeeper;
1600 unsigned long seq; 1687 unsigned int seq;
1688 ktime_t base;
1689 u64 nsecs;
1601 1690
1602 do { 1691 do {
1603 seq = read_seqcount_begin(&timekeeper_seq); 1692 seq = read_seqcount_begin(&tk_core.seq);
1604 *xtim = tk_xtime(tk); 1693
1605 *wtom = tk->wall_to_monotonic; 1694 base = tk->tkr.base_mono;
1606 *sleep = tk->total_sleep_time; 1695 nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
1607 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1696
1697 *offs_real = tk->offs_real;
1698 *offs_boot = tk->offs_boot;
1699 *offs_tai = tk->offs_tai;
1700 } while (read_seqcount_retry(&tk_core.seq, seq));
1701
1702 return ktime_add_ns(base, nsecs);
1608} 1703}
1609 1704
1610#ifdef CONFIG_HIGH_RES_TIMERS 1705#ifdef CONFIG_HIGH_RES_TIMERS
1611/** 1706/**
1612 * ktime_get_update_offsets - hrtimer helper 1707 * ktime_get_update_offsets_now - hrtimer helper
1613 * @offs_real: pointer to storage for monotonic -> realtime offset 1708 * @offs_real: pointer to storage for monotonic -> realtime offset
1614 * @offs_boot: pointer to storage for monotonic -> boottime offset 1709 * @offs_boot: pointer to storage for monotonic -> boottime offset
1615 * @offs_tai: pointer to storage for monotonic -> clock tai offset 1710 * @offs_tai: pointer to storage for monotonic -> clock tai offset
@@ -1617,57 +1712,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1617 * Returns current monotonic time and updates the offsets 1712 * Returns current monotonic time and updates the offsets
1618 * Called from hrtimer_interrupt() or retrigger_next_event() 1713 * Called from hrtimer_interrupt() or retrigger_next_event()
1619 */ 1714 */
1620ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, 1715ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
1621 ktime_t *offs_tai) 1716 ktime_t *offs_tai)
1622{ 1717{
1623 struct timekeeper *tk = &timekeeper; 1718 struct timekeeper *tk = &tk_core.timekeeper;
1624 ktime_t now;
1625 unsigned int seq; 1719 unsigned int seq;
1626 u64 secs, nsecs; 1720 ktime_t base;
1721 u64 nsecs;
1627 1722
1628 do { 1723 do {
1629 seq = read_seqcount_begin(&timekeeper_seq); 1724 seq = read_seqcount_begin(&tk_core.seq);
1630 1725
1631 secs = tk->xtime_sec; 1726 base = tk->tkr.base_mono;
1632 nsecs = timekeeping_get_ns(tk); 1727 nsecs = timekeeping_get_ns(&tk->tkr);
1633 1728
1634 *offs_real = tk->offs_real; 1729 *offs_real = tk->offs_real;
1635 *offs_boot = tk->offs_boot; 1730 *offs_boot = tk->offs_boot;
1636 *offs_tai = tk->offs_tai; 1731 *offs_tai = tk->offs_tai;
1637 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1732 } while (read_seqcount_retry(&tk_core.seq, seq));
1638 1733
1639 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1734 return ktime_add_ns(base, nsecs);
1640 now = ktime_sub(now, *offs_real);
1641 return now;
1642} 1735}
1643#endif 1736#endif
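
All of these accessors share the read_seqcount_begin()/read_seqcount_retry() pattern: copy a consistent snapshot of several fields and retry if the writer bumped the sequence in the meantime. The sketch below is an illustrative single-file model using C11 atomics; it is not the kernel's seqcount (which has stronger barriers and lockdep integration) and the names are made up.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct snapshot {
        atomic_uint seq;          /* even: stable, odd: update in progress */
        uint64_t base_ns;
        uint64_t offs_real_ns;
};

static void writer_update(struct snapshot *s, uint64_t base, uint64_t offs)
{
        atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* -> odd */
        s->base_ns = base;
        s->offs_real_ns = offs;
        atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* -> even */
}

static uint64_t reader_get_real_ns(struct snapshot *s)
{
        unsigned int seq;
        uint64_t base, offs;

        for (;;) {
                seq = atomic_load_explicit(&s->seq, memory_order_acquire);
                if (seq & 1)
                        continue;               /* writer in progress, retry */
                base = s->base_ns;
                offs = s->offs_real_ns;
                if (atomic_load_explicit(&s->seq, memory_order_acquire) == seq)
                        break;                  /* consistent snapshot */
        }
        return base + offs;
}

int main(void)
{
        struct snapshot s = { .seq = 0 };

        writer_update(&s, 1000, 5);
        printf("%llu\n", (unsigned long long)reader_get_real_ns(&s)); /* 1005 */
        return 0;
}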
1644 1737
1645/** 1738/**
1646 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1647 */
1648ktime_t ktime_get_monotonic_offset(void)
1649{
1650 struct timekeeper *tk = &timekeeper;
1651 unsigned long seq;
1652 struct timespec wtom;
1653
1654 do {
1655 seq = read_seqcount_begin(&timekeeper_seq);
1656 wtom = tk->wall_to_monotonic;
1657 } while (read_seqcount_retry(&timekeeper_seq, seq));
1658
1659 return timespec_to_ktime(wtom);
1660}
1661EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1662
1663/**
1664 * do_adjtimex() - Accessor function to NTP __do_adjtimex function 1739 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
1665 */ 1740 */
1666int do_adjtimex(struct timex *txc) 1741int do_adjtimex(struct timex *txc)
1667{ 1742{
1668 struct timekeeper *tk = &timekeeper; 1743 struct timekeeper *tk = &tk_core.timekeeper;
1669 unsigned long flags; 1744 unsigned long flags;
1670 struct timespec ts; 1745 struct timespec64 ts;
1671 s32 orig_tai, tai; 1746 s32 orig_tai, tai;
1672 int ret; 1747 int ret;
1673 1748
@@ -1687,10 +1762,10 @@ int do_adjtimex(struct timex *txc)
1687 return ret; 1762 return ret;
1688 } 1763 }
1689 1764
1690 getnstimeofday(&ts); 1765 getnstimeofday64(&ts);
1691 1766
1692 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1767 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1693 write_seqcount_begin(&timekeeper_seq); 1768 write_seqcount_begin(&tk_core.seq);
1694 1769
1695 orig_tai = tai = tk->tai_offset; 1770 orig_tai = tai = tk->tai_offset;
1696 ret = __do_adjtimex(txc, &ts, &tai); 1771 ret = __do_adjtimex(txc, &ts, &tai);
@@ -1699,7 +1774,7 @@ int do_adjtimex(struct timex *txc)
1699 __timekeeping_set_tai_offset(tk, tai); 1774 __timekeeping_set_tai_offset(tk, tai);
1700 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 1775 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
1701 } 1776 }
1702 write_seqcount_end(&timekeeper_seq); 1777 write_seqcount_end(&tk_core.seq);
1703 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1778 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1704 1779
1705 if (tai != orig_tai) 1780 if (tai != orig_tai)
@@ -1719,11 +1794,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
1719 unsigned long flags; 1794 unsigned long flags;
1720 1795
1721 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1796 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1722 write_seqcount_begin(&timekeeper_seq); 1797 write_seqcount_begin(&tk_core.seq);
1723 1798
1724 __hardpps(phase_ts, raw_ts); 1799 __hardpps(phase_ts, raw_ts);
1725 1800
1726 write_seqcount_end(&timekeeper_seq); 1801 write_seqcount_end(&tk_core.seq);
1727 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1802 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1728} 1803}
1729EXPORT_SYMBOL(hardpps); 1804EXPORT_SYMBOL(hardpps);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
new file mode 100644
index 000000000000..adc1fc98bde3
--- /dev/null
+++ b/kernel/time/timekeeping.h
@@ -0,0 +1,20 @@
1#ifndef _KERNEL_TIME_TIMEKEEPING_H
2#define _KERNEL_TIME_TIMEKEEPING_H
3/*
4 * Internal interfaces for kernel/time/
5 */
6extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
7 ktime_t *offs_boot,
8 ktime_t *offs_tai);
9extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
10 ktime_t *offs_boot,
11 ktime_t *offs_tai);
12
13extern int timekeeping_valid_for_hres(void);
14extern u64 timekeeping_max_deferment(void);
15extern int timekeeping_inject_offset(struct timespec *ts);
16extern s32 timekeeping_get_tai_offset(void);
17extern void timekeeping_set_tai_offset(s32 tai_offset);
18extern void timekeeping_clocktai(struct timespec *ts);
19
20#endif
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 4d54f97558df..f6bd65236712 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -67,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void)
67} 67}
68late_initcall(tk_debug_sleep_time_init); 68late_initcall(tk_debug_sleep_time_init);
69 69
70void tk_debug_account_sleep_time(struct timespec *t) 70void tk_debug_account_sleep_time(struct timespec64 *t)
71{ 71{
72 sleep_time_bin[fls(t->tv_sec)]++; 72 sleep_time_bin[fls(t->tv_sec)]++;
73} 73}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 13323ea08ffa..4ea005a7f9da 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -3,12 +3,27 @@
3/* 3/*
4 * timekeeping debug functions 4 * timekeeping debug functions
5 */ 5 */
6#include <linux/clocksource.h>
6#include <linux/time.h> 7#include <linux/time.h>
7 8
8#ifdef CONFIG_DEBUG_FS 9#ifdef CONFIG_DEBUG_FS
9extern void tk_debug_account_sleep_time(struct timespec *t); 10extern void tk_debug_account_sleep_time(struct timespec64 *t);
10#else 11#else
11#define tk_debug_account_sleep_time(x) 12#define tk_debug_account_sleep_time(x)
12#endif 13#endif
13 14
15#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
16static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
17{
18 cycle_t ret = (now - last) & mask;
19
20 return (s64) ret > 0 ? ret : 0;
21}
22#else
23static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
24{
25 return (now - last) & mask;
26}
27#endif
28
14#endif /* _TIMEKEEPING_INTERNAL_H */ 29#endif /* _TIMEKEEPING_INTERNAL_H */
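
The two clocksource_delta() variants differ only in the sign test, which clamps an apparently backwards-moving clocksource to a zero delta. Note the test only has an effect when the mask covers bit 63 (a full 64-bit counter such as the TSC); for narrower masks the masked value can never look negative as an s64, so the clamp changes nothing. A userspace copy to show the difference:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t cycle_t;

static cycle_t delta_plain(cycle_t now, cycle_t last, cycle_t mask)
{
        return (now - last) & mask;
}

static cycle_t delta_validated(cycle_t now, cycle_t last, cycle_t mask)
{
        cycle_t ret = (now - last) & mask;

        return (int64_t)ret > 0 ? ret : 0;
}

int main(void)
{
        cycle_t mask = ~0ULL;        /* 64-bit counter, e.g. the TSC */

        /* normal forward motion across a counter wrap: both give 32 */
        printf("%llu\n", (unsigned long long)
               delta_plain(0x10, 0xfffffffffffffff0ULL, mask));

        /* 'now' slightly behind 'last' (e.g. SMP TSC skew): plain reads huge */
        printf("%llu\n", (unsigned long long)delta_plain(100, 104, mask));
        /* the validated variant clamps it to 0 */
        printf("%llu\n", (unsigned long long)delta_validated(100, 104, mask));
        return 0;
}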
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
new file mode 100644
index 000000000000..aca5dfe2fa3d
--- /dev/null
+++ b/kernel/time/timer.c
@@ -0,0 +1,1736 @@
1/*
2 * linux/kernel/timer.c
3 *
4 * Kernel internal timers
5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds
7 *
8 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
9 *
10 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11 * "A Kernel Model for Precision Timekeeping" by Dave Mills
12 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
13 * serialize accesses to xtime/lost_ticks).
14 * Copyright (C) 1998 Andrea Arcangeli
15 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
16 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
17 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
18 * Copyright (C) 2000, 2001, 2002 Ingo Molnar
19 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
20 */
21
22#include <linux/kernel_stat.h>
23#include <linux/export.h>
24#include <linux/interrupt.h>
25#include <linux/percpu.h>
26#include <linux/init.h>
27#include <linux/mm.h>
28#include <linux/swap.h>
29#include <linux/pid_namespace.h>
30#include <linux/notifier.h>
31#include <linux/thread_info.h>
32#include <linux/time.h>
33#include <linux/jiffies.h>
34#include <linux/posix-timers.h>
35#include <linux/cpu.h>
36#include <linux/syscalls.h>
37#include <linux/delay.h>
38#include <linux/tick.h>
39#include <linux/kallsyms.h>
40#include <linux/irq_work.h>
41#include <linux/sched.h>
42#include <linux/sched/sysctl.h>
43#include <linux/slab.h>
44#include <linux/compat.h>
45
46#include <asm/uaccess.h>
47#include <asm/unistd.h>
48#include <asm/div64.h>
49#include <asm/timex.h>
50#include <asm/io.h>
51
52#define CREATE_TRACE_POINTS
53#include <trace/events/timer.h>
54
55__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
56
57EXPORT_SYMBOL(jiffies_64);
58
59/*
60 * per-CPU timer vector definitions:
61 */
62#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
63#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
64#define TVN_SIZE (1 << TVN_BITS)
65#define TVR_SIZE (1 << TVR_BITS)
66#define TVN_MASK (TVN_SIZE - 1)
67#define TVR_MASK (TVR_SIZE - 1)
68#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
69
70struct tvec {
71 struct list_head vec[TVN_SIZE];
72};
73
74struct tvec_root {
75 struct list_head vec[TVR_SIZE];
76};
77
78struct tvec_base {
79 spinlock_t lock;
80 struct timer_list *running_timer;
81 unsigned long timer_jiffies;
82 unsigned long next_timer;
83 unsigned long active_timers;
84 unsigned long all_timers;
85 int cpu;
86 struct tvec_root tv1;
87 struct tvec tv2;
88 struct tvec tv3;
89 struct tvec tv4;
90 struct tvec tv5;
91} ____cacheline_aligned;
92
93struct tvec_base boot_tvec_bases;
94EXPORT_SYMBOL(boot_tvec_bases);
95static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
96
97/* Functions below help us manage 'deferrable' flag */
98static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
99{
100 return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
101}
102
103static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
104{
105 return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
106}
107
108static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
109{
110 return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
111}
112
113static inline void
114timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
115{
116 unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
117
118 timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
119}
120
121static unsigned long round_jiffies_common(unsigned long j, int cpu,
122 bool force_up)
123{
124 int rem;
125 unsigned long original = j;
126
127 /*
128 * We don't want all cpus firing their timers at once hitting the
129 * same lock or cachelines, so we skew each extra cpu with an extra
130 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
131 * already did this.
132 * The skew is done by adding 3*cpunr, then round, then subtract this
133 * extra offset again.
134 */
135 j += cpu * 3;
136
137 rem = j % HZ;
138
139 /*
 140 * If the target jiffy is just after a whole second (which can happen
141 * due to delays of the timer irq, long irq off times etc etc) then
142 * we should round down to the whole second, not up. Use 1/4th second
143 * as cutoff for this rounding as an extreme upper bound for this.
144 * But never round down if @force_up is set.
145 */
146 if (rem < HZ/4 && !force_up) /* round down */
147 j = j - rem;
148 else /* round up */
149 j = j - rem + HZ;
150
151 /* now that we have rounded, subtract the extra skew again */
152 j -= cpu * 3;
153
154 /*
155 * Make sure j is still in the future. Otherwise return the
156 * unmodified value.
157 */
158 return time_is_after_jiffies(j) ? j : original;
159}
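
A standalone model of the rounding above (per-cpu skew, round down only within the first quarter second past a boundary, never down when force_up is set), with HZ assumed to be 1000 and the time_is_after_jiffies() safety check omitted:

#include <stdio.h>

#define HZ 1000   /* assumed tick rate for the example */

static unsigned long round_common(unsigned long j, int cpu, int force_up)
{
        unsigned long rem;

        j += cpu * 3;                  /* per-cpu skew */
        rem = j % HZ;
        if (rem < HZ / 4 && !force_up)
                j = j - rem;           /* just past a second: round down */
        else
                j = j - rem + HZ;      /* otherwise round up */
        return j - cpu * 3;            /* remove the skew again */
}

int main(void)
{
        /* 100 ticks past a second boundary on cpu 0: rounds down */
        printf("%lu\n", round_common(5100, 0, 0));   /* 5000 */
        /* 600 ticks past: rounds up */
        printf("%lu\n", round_common(5600, 0, 0));   /* 6000 */
        /* force_up never rounds down */
        printf("%lu\n", round_common(5100, 0, 1));   /* 6000 */
        /* cpu 2 is skewed by 6 jiffies before rounding */
        printf("%lu\n", round_common(5100, 2, 0));   /* 4994 */
        return 0;
}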
160
161/**
162 * __round_jiffies - function to round jiffies to a full second
163 * @j: the time in (absolute) jiffies that should be rounded
164 * @cpu: the processor number on which the timeout will happen
165 *
166 * __round_jiffies() rounds an absolute time in the future (in jiffies)
167 * up or down to (approximately) full seconds. This is useful for timers
168 * for which the exact time they fire does not matter too much, as long as
169 * they fire approximately every X seconds.
170 *
171 * By rounding these timers to whole seconds, all such timers will fire
172 * at the same time, rather than at various times spread out. The goal
173 * of this is to have the CPU wake up less, which saves power.
174 *
175 * The exact rounding is skewed for each processor to avoid all
176 * processors firing at the exact same time, which could lead
177 * to lock contention or spurious cache line bouncing.
178 *
179 * The return value is the rounded version of the @j parameter.
180 */
181unsigned long __round_jiffies(unsigned long j, int cpu)
182{
183 return round_jiffies_common(j, cpu, false);
184}
185EXPORT_SYMBOL_GPL(__round_jiffies);
186
187/**
188 * __round_jiffies_relative - function to round jiffies to a full second
189 * @j: the time in (relative) jiffies that should be rounded
190 * @cpu: the processor number on which the timeout will happen
191 *
192 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
193 * up or down to (approximately) full seconds. This is useful for timers
194 * for which the exact time they fire does not matter too much, as long as
195 * they fire approximately every X seconds.
196 *
197 * By rounding these timers to whole seconds, all such timers will fire
198 * at the same time, rather than at various times spread out. The goal
199 * of this is to have the CPU wake up less, which saves power.
200 *
201 * The exact rounding is skewed for each processor to avoid all
202 * processors firing at the exact same time, which could lead
203 * to lock contention or spurious cache line bouncing.
204 *
205 * The return value is the rounded version of the @j parameter.
206 */
207unsigned long __round_jiffies_relative(unsigned long j, int cpu)
208{
209 unsigned long j0 = jiffies;
210
211 /* Use j0 because jiffies might change while we run */
212 return round_jiffies_common(j + j0, cpu, false) - j0;
213}
214EXPORT_SYMBOL_GPL(__round_jiffies_relative);
215
216/**
217 * round_jiffies - function to round jiffies to a full second
218 * @j: the time in (absolute) jiffies that should be rounded
219 *
220 * round_jiffies() rounds an absolute time in the future (in jiffies)
221 * up or down to (approximately) full seconds. This is useful for timers
222 * for which the exact time they fire does not matter too much, as long as
223 * they fire approximately every X seconds.
224 *
225 * By rounding these timers to whole seconds, all such timers will fire
226 * at the same time, rather than at various times spread out. The goal
227 * of this is to have the CPU wake up less, which saves power.
228 *
229 * The return value is the rounded version of the @j parameter.
230 */
231unsigned long round_jiffies(unsigned long j)
232{
233 return round_jiffies_common(j, raw_smp_processor_id(), false);
234}
235EXPORT_SYMBOL_GPL(round_jiffies);
236
237/**
238 * round_jiffies_relative - function to round jiffies to a full second
239 * @j: the time in (relative) jiffies that should be rounded
240 *
241 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
242 * up or down to (approximately) full seconds. This is useful for timers
243 * for which the exact time they fire does not matter too much, as long as
244 * they fire approximately every X seconds.
245 *
246 * By rounding these timers to whole seconds, all such timers will fire
247 * at the same time, rather than at various times spread out. The goal
248 * of this is to have the CPU wake up less, which saves power.
249 *
250 * The return value is the rounded version of the @j parameter.
251 */
252unsigned long round_jiffies_relative(unsigned long j)
253{
254 return __round_jiffies_relative(j, raw_smp_processor_id());
255}
256EXPORT_SYMBOL_GPL(round_jiffies_relative);
257
258/**
259 * __round_jiffies_up - function to round jiffies up to a full second
260 * @j: the time in (absolute) jiffies that should be rounded
261 * @cpu: the processor number on which the timeout will happen
262 *
263 * This is the same as __round_jiffies() except that it will never
264 * round down. This is useful for timeouts for which the exact time
265 * of firing does not matter too much, as long as they don't fire too
266 * early.
267 */
268unsigned long __round_jiffies_up(unsigned long j, int cpu)
269{
270 return round_jiffies_common(j, cpu, true);
271}
272EXPORT_SYMBOL_GPL(__round_jiffies_up);
273
274/**
275 * __round_jiffies_up_relative - function to round jiffies up to a full second
276 * @j: the time in (relative) jiffies that should be rounded
277 * @cpu: the processor number on which the timeout will happen
278 *
279 * This is the same as __round_jiffies_relative() except that it will never
280 * round down. This is useful for timeouts for which the exact time
281 * of firing does not matter too much, as long as they don't fire too
282 * early.
283 */
284unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
285{
286 unsigned long j0 = jiffies;
287
288 /* Use j0 because jiffies might change while we run */
289 return round_jiffies_common(j + j0, cpu, true) - j0;
290}
291EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
292
293/**
294 * round_jiffies_up - function to round jiffies up to a full second
295 * @j: the time in (absolute) jiffies that should be rounded
296 *
297 * This is the same as round_jiffies() except that it will never
298 * round down. This is useful for timeouts for which the exact time
299 * of firing does not matter too much, as long as they don't fire too
300 * early.
301 */
302unsigned long round_jiffies_up(unsigned long j)
303{
304 return round_jiffies_common(j, raw_smp_processor_id(), true);
305}
306EXPORT_SYMBOL_GPL(round_jiffies_up);
307
308/**
309 * round_jiffies_up_relative - function to round jiffies up to a full second
310 * @j: the time in (relative) jiffies that should be rounded
311 *
312 * This is the same as round_jiffies_relative() except that it will never
313 * round down. This is useful for timeouts for which the exact time
314 * of firing does not matter too much, as long as they don't fire too
315 * early.
316 */
317unsigned long round_jiffies_up_relative(unsigned long j)
318{
319 return __round_jiffies_up_relative(j, raw_smp_processor_id());
320}
321EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
322
323/**
324 * set_timer_slack - set the allowed slack for a timer
325 * @timer: the timer to be modified
326 * @slack_hz: the amount of time (in jiffies) allowed for rounding
327 *
328 * Set the amount of time, in jiffies, that a certain timer has
329 * in terms of slack. By setting this value, the timer subsystem
330 * will schedule the actual timer somewhere between
331 * the time mod_timer() asks for, and that time plus the slack.
332 *
333 * By setting the slack to -1, a percentage of the delay is used
334 * instead.
335 */
336void set_timer_slack(struct timer_list *timer, int slack_hz)
337{
338 timer->slack = slack_hz;
339}
340EXPORT_SYMBOL_GPL(set_timer_slack);
341
342/*
343 * If the list is empty, catch up ->timer_jiffies to the current time.
344 * The caller must hold the tvec_base lock. Returns true if the list
345 * was empty and therefore ->timer_jiffies was updated.
346 */
347static bool catchup_timer_jiffies(struct tvec_base *base)
348{
349 if (!base->all_timers) {
350 base->timer_jiffies = jiffies;
351 return true;
352 }
353 return false;
354}
355
356static void
357__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
358{
359 unsigned long expires = timer->expires;
360 unsigned long idx = expires - base->timer_jiffies;
361 struct list_head *vec;
362
363 if (idx < TVR_SIZE) {
364 int i = expires & TVR_MASK;
365 vec = base->tv1.vec + i;
366 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
367 int i = (expires >> TVR_BITS) & TVN_MASK;
368 vec = base->tv2.vec + i;
369 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
370 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
371 vec = base->tv3.vec + i;
372 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
373 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
374 vec = base->tv4.vec + i;
375 } else if ((signed long) idx < 0) {
376 /*
377 * Can happen if you add a timer with expires == jiffies,
378 * or you set a timer to go off in the past
379 */
380 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
381 } else {
382 int i;
383 /* If the timeout is larger than MAX_TVAL (on 64-bit
384 * architectures or with CONFIG_BASE_SMALL=1) then we
385 * use the maximum timeout.
386 */
387 if (idx > MAX_TVAL) {
388 idx = MAX_TVAL;
389 expires = idx + base->timer_jiffies;
390 }
391 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
392 vec = base->tv5.vec + i;
393 }
394 /*
395 * Timers are FIFO:
396 */
397 list_add_tail(&timer->entry, vec);
398}
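
The cascade of idx comparisons above maps a relative expiry onto one of five wheel levels of increasing granularity. The sketch below reimplements just that bucket selection for the !CONFIG_BASE_SMALL geometry (TVR_BITS=8, TVN_BITS=6), returning a level and slot instead of a list head and skipping the negative-idx and MAX_TVAL special cases:

#include <stdio.h>

#define TVN_BITS 6
#define TVR_BITS 8
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)

static int pick_bucket(unsigned long expires, unsigned long timer_jiffies,
                       int *slot)
{
        unsigned long idx = expires - timer_jiffies;

        if (idx < TVR_SIZE) {
                *slot = expires & TVR_MASK;
                return 1;
        } else if (idx < 1UL << (TVR_BITS + TVN_BITS)) {
                *slot = (expires >> TVR_BITS) & TVN_MASK;
                return 2;
        } else if (idx < 1UL << (TVR_BITS + 2 * TVN_BITS)) {
                *slot = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
                return 3;
        } else if (idx < 1UL << (TVR_BITS + 3 * TVN_BITS)) {
                *slot = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
                return 4;
        }
        *slot = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
        return 5;
}

int main(void)
{
        unsigned long now = 1000000;
        unsigned long deltas[] = { 1, 200, 300, 70000, 5000000 };
        int slot;

        for (unsigned i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
                int lvl = pick_bucket(now + deltas[i], now, &slot);
                printf("+%lu jiffies -> tv%d slot %d\n", deltas[i], lvl, slot);
        }
        return 0;
}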
399
400static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
401{
402 (void)catchup_timer_jiffies(base);
403 __internal_add_timer(base, timer);
404 /*
405 * Update base->active_timers and base->next_timer
406 */
407 if (!tbase_get_deferrable(timer->base)) {
408 if (!base->active_timers++ ||
409 time_before(timer->expires, base->next_timer))
410 base->next_timer = timer->expires;
411 }
412 base->all_timers++;
413
414 /*
415 * Check whether the other CPU is in dynticks mode and needs
416 * to be triggered to reevaluate the timer wheel.
417 * We are protected against the other CPU fiddling
418 * with the timer by holding the timer base lock. This also
419 * makes sure that a CPU on the way to stop its tick can not
420 * evaluate the timer wheel.
421 *
422 * Spare the IPI for deferrable timers on idle targets though.
423 * The next busy ticks will take care of it. Except full dynticks
 424 * require special care against races with idle_cpu(), let's deal
425 * with that later.
426 */
427 if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
428 wake_up_nohz_cpu(base->cpu);
429}
430
431#ifdef CONFIG_TIMER_STATS
432void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
433{
434 if (timer->start_site)
435 return;
436
437 timer->start_site = addr;
438 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
439 timer->start_pid = current->pid;
440}
441
442static void timer_stats_account_timer(struct timer_list *timer)
443{
444 unsigned int flag = 0;
445
446 if (likely(!timer->start_site))
447 return;
448 if (unlikely(tbase_get_deferrable(timer->base)))
449 flag |= TIMER_STATS_FLAG_DEFERRABLE;
450
451 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
452 timer->function, timer->start_comm, flag);
453}
454
455#else
456static void timer_stats_account_timer(struct timer_list *timer) {}
457#endif
458
459#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
460
461static struct debug_obj_descr timer_debug_descr;
462
463static void *timer_debug_hint(void *addr)
464{
465 return ((struct timer_list *) addr)->function;
466}
467
468/*
469 * fixup_init is called when:
470 * - an active object is initialized
471 */
472static int timer_fixup_init(void *addr, enum debug_obj_state state)
473{
474 struct timer_list *timer = addr;
475
476 switch (state) {
477 case ODEBUG_STATE_ACTIVE:
478 del_timer_sync(timer);
479 debug_object_init(timer, &timer_debug_descr);
480 return 1;
481 default:
482 return 0;
483 }
484}
485
486/* Stub timer callback for improperly used timers. */
487static void stub_timer(unsigned long data)
488{
489 WARN_ON(1);
490}
491
492/*
493 * fixup_activate is called when:
494 * - an active object is activated
495 * - an unknown object is activated (might be a statically initialized object)
496 */
497static int timer_fixup_activate(void *addr, enum debug_obj_state state)
498{
499 struct timer_list *timer = addr;
500
501 switch (state) {
502
503 case ODEBUG_STATE_NOTAVAILABLE:
504 /*
505 * This is not really a fixup. The timer was
506 * statically initialized. We just make sure that it
507 * is tracked in the object tracker.
508 */
509 if (timer->entry.next == NULL &&
510 timer->entry.prev == TIMER_ENTRY_STATIC) {
511 debug_object_init(timer, &timer_debug_descr);
512 debug_object_activate(timer, &timer_debug_descr);
513 return 0;
514 } else {
515 setup_timer(timer, stub_timer, 0);
516 return 1;
517 }
518 return 0;
519
520 case ODEBUG_STATE_ACTIVE:
521 WARN_ON(1);
522
523 default:
524 return 0;
525 }
526}
527
528/*
529 * fixup_free is called when:
530 * - an active object is freed
531 */
532static int timer_fixup_free(void *addr, enum debug_obj_state state)
533{
534 struct timer_list *timer = addr;
535
536 switch (state) {
537 case ODEBUG_STATE_ACTIVE:
538 del_timer_sync(timer);
539 debug_object_free(timer, &timer_debug_descr);
540 return 1;
541 default:
542 return 0;
543 }
544}
545
546/*
547 * fixup_assert_init is called when:
548 * - an untracked/uninit-ed object is found
549 */
550static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
551{
552 struct timer_list *timer = addr;
553
554 switch (state) {
555 case ODEBUG_STATE_NOTAVAILABLE:
556 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
557 /*
558 * This is not really a fixup. The timer was
559 * statically initialized. We just make sure that it
560 * is tracked in the object tracker.
561 */
562 debug_object_init(timer, &timer_debug_descr);
563 return 0;
564 } else {
565 setup_timer(timer, stub_timer, 0);
566 return 1;
567 }
568 default:
569 return 0;
570 }
571}
572
573static struct debug_obj_descr timer_debug_descr = {
574 .name = "timer_list",
575 .debug_hint = timer_debug_hint,
576 .fixup_init = timer_fixup_init,
577 .fixup_activate = timer_fixup_activate,
578 .fixup_free = timer_fixup_free,
579 .fixup_assert_init = timer_fixup_assert_init,
580};
581
582static inline void debug_timer_init(struct timer_list *timer)
583{
584 debug_object_init(timer, &timer_debug_descr);
585}
586
587static inline void debug_timer_activate(struct timer_list *timer)
588{
589 debug_object_activate(timer, &timer_debug_descr);
590}
591
592static inline void debug_timer_deactivate(struct timer_list *timer)
593{
594 debug_object_deactivate(timer, &timer_debug_descr);
595}
596
597static inline void debug_timer_free(struct timer_list *timer)
598{
599 debug_object_free(timer, &timer_debug_descr);
600}
601
602static inline void debug_timer_assert_init(struct timer_list *timer)
603{
604 debug_object_assert_init(timer, &timer_debug_descr);
605}
606
607static void do_init_timer(struct timer_list *timer, unsigned int flags,
608 const char *name, struct lock_class_key *key);
609
610void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
611 const char *name, struct lock_class_key *key)
612{
613 debug_object_init_on_stack(timer, &timer_debug_descr);
614 do_init_timer(timer, flags, name, key);
615}
616EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
617
618void destroy_timer_on_stack(struct timer_list *timer)
619{
620 debug_object_free(timer, &timer_debug_descr);
621}
622EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
623
624#else
625static inline void debug_timer_init(struct timer_list *timer) { }
626static inline void debug_timer_activate(struct timer_list *timer) { }
627static inline void debug_timer_deactivate(struct timer_list *timer) { }
628static inline void debug_timer_assert_init(struct timer_list *timer) { }
629#endif
630
631static inline void debug_init(struct timer_list *timer)
632{
633 debug_timer_init(timer);
634 trace_timer_init(timer);
635}
636
637static inline void
638debug_activate(struct timer_list *timer, unsigned long expires)
639{
640 debug_timer_activate(timer);
641 trace_timer_start(timer, expires);
642}
643
644static inline void debug_deactivate(struct timer_list *timer)
645{
646 debug_timer_deactivate(timer);
647 trace_timer_cancel(timer);
648}
649
650static inline void debug_assert_init(struct timer_list *timer)
651{
652 debug_timer_assert_init(timer);
653}
654
655static void do_init_timer(struct timer_list *timer, unsigned int flags,
656 const char *name, struct lock_class_key *key)
657{
658 struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
659
660 timer->entry.next = NULL;
661 timer->base = (void *)((unsigned long)base | flags);
662 timer->slack = -1;
663#ifdef CONFIG_TIMER_STATS
664 timer->start_site = NULL;
665 timer->start_pid = -1;
666 memset(timer->start_comm, 0, TASK_COMM_LEN);
667#endif
668 lockdep_init_map(&timer->lockdep_map, name, key, 0);
669}
670
671/**
672 * init_timer_key - initialize a timer
673 * @timer: the timer to be initialized
674 * @flags: timer flags
675 * @name: name of the timer
676 * @key: lockdep class key of the fake lock used for tracking timer
677 * sync lock dependencies
678 *
 679 * init_timer_key() must be done to a timer prior to calling *any* of the
680 * other timer functions.
681 */
682void init_timer_key(struct timer_list *timer, unsigned int flags,
683 const char *name, struct lock_class_key *key)
684{
685 debug_init(timer);
686 do_init_timer(timer, flags, name, key);
687}
688EXPORT_SYMBOL(init_timer_key);
689
690static inline void detach_timer(struct timer_list *timer, bool clear_pending)
691{
692 struct list_head *entry = &timer->entry;
693
694 debug_deactivate(timer);
695
696 __list_del(entry->prev, entry->next);
697 if (clear_pending)
698 entry->next = NULL;
699 entry->prev = LIST_POISON2;
700}
701
702static inline void
703detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
704{
705 detach_timer(timer, true);
706 if (!tbase_get_deferrable(timer->base))
707 base->active_timers--;
708 base->all_timers--;
709 (void)catchup_timer_jiffies(base);
710}
711
712static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
713 bool clear_pending)
714{
715 if (!timer_pending(timer))
716 return 0;
717
718 detach_timer(timer, clear_pending);
719 if (!tbase_get_deferrable(timer->base)) {
720 base->active_timers--;
721 if (timer->expires == base->next_timer)
722 base->next_timer = base->timer_jiffies;
723 }
724 base->all_timers--;
725 (void)catchup_timer_jiffies(base);
726 return 1;
727}
728
729/*
730 * We are using hashed locking: holding per_cpu(tvec_bases).lock
731 * means that all timers which are tied to this base via timer->base are
732 * locked, and the base itself is locked too.
733 *
734 * So __run_timers/migrate_timers can safely modify all timers which could
735 * be found on ->tvX lists.
736 *
737 * When the timer's base is locked, and the timer removed from list, it is
738 * possible to set timer->base = NULL and drop the lock: the timer remains
739 * locked.
740 */
741static struct tvec_base *lock_timer_base(struct timer_list *timer,
742 unsigned long *flags)
743 __acquires(timer->base->lock)
744{
745 struct tvec_base *base;
746
747 for (;;) {
748 struct tvec_base *prelock_base = timer->base;
749 base = tbase_get_base(prelock_base);
750 if (likely(base != NULL)) {
751 spin_lock_irqsave(&base->lock, *flags);
752 if (likely(prelock_base == timer->base))
753 return base;
754 /* The timer has migrated to another CPU */
755 spin_unlock_irqrestore(&base->lock, *flags);
756 }
757 cpu_relax();
758 }
759}
760
761static inline int
762__mod_timer(struct timer_list *timer, unsigned long expires,
763 bool pending_only, int pinned)
764{
765 struct tvec_base *base, *new_base;
766 unsigned long flags;
767 int ret = 0 , cpu;
768
769 timer_stats_timer_set_start_info(timer);
770 BUG_ON(!timer->function);
771
772 base = lock_timer_base(timer, &flags);
773
774 ret = detach_if_pending(timer, base, false);
775 if (!ret && pending_only)
776 goto out_unlock;
777
778 debug_activate(timer, expires);
779
780 cpu = get_nohz_timer_target(pinned);
781 new_base = per_cpu(tvec_bases, cpu);
782
783 if (base != new_base) {
784 /*
785 * We are trying to schedule the timer on the local CPU.
786 * However we can't change timer's base while it is running,
787 * otherwise del_timer_sync() can't detect that the timer's
788 * handler yet has not finished. This also guarantees that
789 * the timer is serialized wrt itself.
790 */
791 if (likely(base->running_timer != timer)) {
792 /* See the comment in lock_timer_base() */
793 timer_set_base(timer, NULL);
794 spin_unlock(&base->lock);
795 base = new_base;
796 spin_lock(&base->lock);
797 timer_set_base(timer, base);
798 }
799 }
800
801 timer->expires = expires;
802 internal_add_timer(base, timer);
803
804out_unlock:
805 spin_unlock_irqrestore(&base->lock, flags);
806
807 return ret;
808}
809
810/**
811 * mod_timer_pending - modify a pending timer's timeout
812 * @timer: the pending timer to be modified
813 * @expires: new timeout in jiffies
814 *
815 * mod_timer_pending() is the same for pending timers as mod_timer(),
816 * but will not re-activate and modify already deleted timers.
817 *
818 * It is useful for unserialized use of timers.
819 */
820int mod_timer_pending(struct timer_list *timer, unsigned long expires)
821{
822 return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
823}
824EXPORT_SYMBOL(mod_timer_pending);
825
826/*
827 * Decide where to put the timer while taking the slack into account
828 *
829 * Algorithm:
830 * 1) calculate the maximum (absolute) time
831 * 2) calculate the highest bit where the expires and new max are different
832 * 3) use this bit to make a mask
833 * 4) use the bitmask to round down the maximum time, so that all last
834 * bits are zeros
835 */
836static inline
837unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
838{
839 unsigned long expires_limit, mask;
840 int bit;
841
842 if (timer->slack >= 0) {
843 expires_limit = expires + timer->slack;
844 } else {
845 long delta = expires - jiffies;
846
847 if (delta < 256)
848 return expires;
849
850 expires_limit = expires + delta / 256;
851 }
852 mask = expires ^ expires_limit;
853 if (mask == 0)
854 return expires;
855
856 bit = find_last_bit(&mask, BITS_PER_LONG);
857
858 mask = (1UL << bit) - 1;
859
860 expires_limit = expires_limit & ~(mask);
861
862 return expires_limit;
863}
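/*
 * Worked example (hypothetical numbers): with expires = 0x1009 and an
 * effective slack of 0x10 jiffies, expires_limit = 0x1019.  Then
 * mask = 0x1009 ^ 0x1019 = 0x0010, the highest differing bit is bit 4,
 * and rounding expires_limit down with mask 0x000f yields 0x1010.  The
 * timer is therefore queued for 0x1010 -- still within the allowed slack,
 * but on a boundary that nearby timers will share, so they can expire
 * together.
 */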
864
865/**
866 * mod_timer - modify a timer's timeout
867 * @timer: the timer to be modified
868 * @expires: new timeout in jiffies
869 *
870 * mod_timer() is a more efficient way to update the expire field of an
871 * active timer (if the timer is inactive it will be activated)
872 *
873 * mod_timer(timer, expires) is equivalent to:
874 *
875 * del_timer(timer); timer->expires = expires; add_timer(timer);
876 *
877 * Note that if there are multiple unserialized concurrent users of the
878 * same timer, then mod_timer() is the only safe way to modify the timeout,
879 * since add_timer() cannot modify an already running timer.
880 *
881 * The function returns whether it has modified a pending timer or not.
882 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
883 * active timer returns 1.)
884 */
885int mod_timer(struct timer_list *timer, unsigned long expires)
886{
887 expires = apply_slack(timer, expires);
888
889 /*
890 * This is a common optimization triggered by the
891 * networking code - if the timer is re-modified
892 * to be the same thing then just return:
893 */
894 if (timer_pending(timer) && timer->expires == expires)
895 return 1;
896
897 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
898}
899EXPORT_SYMBOL(mod_timer);
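/*
 * Illustrative sketch (hypothetical module, 3.16-era timer API): the
 * documented mod_timer() pattern -- arm or re-arm in one call, tear down
 * with del_timer_sync() once nothing else can re-arm the timer.
 */
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

static void demo_timer_fn(unsigned long data)
{
	pr_info("demo timer fired, data=%lu\n", data);
	/* Re-arm one second from now; activates the timer if it was inactive. */
	mod_timer(&demo_timer, jiffies + HZ);
}

static int __init demo_init(void)
{
	setup_timer(&demo_timer, demo_timer_fn, 0);
	mod_timer(&demo_timer, jiffies + HZ);
	return 0;
}

static void __exit demo_exit(void)
{
	/* Only the callback re-arms the timer, so one sync delete suffices. */
	del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");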
900
901/**
902 * mod_timer_pinned - modify a timer's timeout
903 * @timer: the timer to be modified
904 * @expires: new timeout in jiffies
905 *
906 * mod_timer_pinned() is a way to update the expire field of an
907 * active timer (if the timer is inactive it will be activated)
908 * and to ensure that the timer is scheduled on the current CPU.
909 *
910 * Note that this does not prevent the timer from being migrated
911 * when the current CPU goes offline. If this is a problem for
912 * you, use CPU-hotplug notifiers to handle it correctly, for
913 * example, cancelling the timer when the corresponding CPU goes
914 * offline.
915 *
916 * mod_timer_pinned(timer, expires) is equivalent to:
917 *
918 * del_timer(timer); timer->expires = expires; add_timer(timer);
919 */
920int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
921{
922 if (timer->expires == expires && timer_pending(timer))
923 return 1;
924
925 return __mod_timer(timer, expires, false, TIMER_PINNED);
926}
927EXPORT_SYMBOL(mod_timer_pinned);
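/*
 * Illustrative fragment (hypothetical names, usual module boilerplate
 * assumed): a self-rearming poll timer that calls mod_timer_pinned()
 * from its own callback so the next expiry stays on the CPU that is
 * currently servicing it.
 */
static struct timer_list poll_timer;

static void poll_fn(unsigned long data)
{
	/* ... per-CPU polling work ... */
	mod_timer_pinned(&poll_timer, jiffies + HZ / 10);
}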
928
929/**
930 * add_timer - start a timer
931 * @timer: the timer to be added
932 *
933 * The kernel will do a ->function(->data) callback from the
934 * timer interrupt at the ->expires point in the future. The
935 * current time is 'jiffies'.
936 *
937 * The timer's ->expires, ->function (and if the handler uses it, ->data)
938 * fields must be set prior to calling this function.
939 *
940 * Timers with an ->expires field in the past will be executed in the next
941 * timer tick.
942 */
943void add_timer(struct timer_list *timer)
944{
945 BUG_ON(timer_pending(timer));
946 mod_timer(timer, timer->expires);
947}
948EXPORT_SYMBOL(add_timer);
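/*
 * Illustrative fragment (hypothetical names): the fields the comment
 * above requires -- ->expires, ->function and, if the handler uses it,
 * ->data -- filled in before add_timer() queues the timer.
 */
static void one_shot_fn(unsigned long data);

static struct timer_list one_shot;

static void arm_one_shot(void)
{
	init_timer(&one_shot);
	one_shot.function = one_shot_fn;
	one_shot.data = 0;
	one_shot.expires = jiffies + 2 * HZ;
	add_timer(&one_shot);
}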
949
950/**
951 * add_timer_on - start a timer on a particular CPU
952 * @timer: the timer to be added
953 * @cpu: the CPU to start it on
954 *
955 * This is not very scalable on SMP. Double adds are not possible.
956 */
957void add_timer_on(struct timer_list *timer, int cpu)
958{
959 struct tvec_base *base = per_cpu(tvec_bases, cpu);
960 unsigned long flags;
961
962 timer_stats_timer_set_start_info(timer);
963 BUG_ON(timer_pending(timer) || !timer->function);
964 spin_lock_irqsave(&base->lock, flags);
965 timer_set_base(timer, base);
966 debug_activate(timer, timer->expires);
967 internal_add_timer(base, timer);
968 spin_unlock_irqrestore(&base->lock, flags);
969}
970EXPORT_SYMBOL_GPL(add_timer_on);
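/*
 * Illustrative fragment (hypothetical names): queueing a housekeeping
 * timer on a specific CPU.  The timer's ->function (and ->data) must
 * already be set up, e.g. via setup_timer(), and @cpu is assumed to be
 * online.
 */
static void start_housekeeping_on(struct timer_list *t, int cpu)
{
	t->expires = jiffies + HZ;
	add_timer_on(t, cpu);
}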
971
972/**
973 * del_timer - deactivate a timer.
974 * @timer: the timer to be deactivated
975 *
976 * del_timer() deactivates a timer - this works on both active and inactive
977 * timers.
978 *
979 * The function returns whether it has deactivated a pending timer or not.
980 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
981 * active timer returns 1.)
982 */
983int del_timer(struct timer_list *timer)
984{
985 struct tvec_base *base;
986 unsigned long flags;
987 int ret = 0;
988
989 debug_assert_init(timer);
990
991 timer_stats_timer_clear_start_info(timer);
992 if (timer_pending(timer)) {
993 base = lock_timer_base(timer, &flags);
994 ret = detach_if_pending(timer, base, true);
995 spin_unlock_irqrestore(&base->lock, flags);
996 }
997
998 return ret;
999}
1000EXPORT_SYMBOL(del_timer);
1001
1002/**
1003 * try_to_del_timer_sync - Try to deactivate a timer
1004 * @timer: timer to deactivate
1005 *
1006 * This function tries to deactivate a timer. Upon successful (ret >= 0)
1007 * exit the timer is not queued and the handler is not running on any CPU.
1008 */
1009int try_to_del_timer_sync(struct timer_list *timer)
1010{
1011 struct tvec_base *base;
1012 unsigned long flags;
1013 int ret = -1;
1014
1015 debug_assert_init(timer);
1016
1017 base = lock_timer_base(timer, &flags);
1018
1019 if (base->running_timer != timer) {
1020 timer_stats_timer_clear_start_info(timer);
1021 ret = detach_if_pending(timer, base, true);
1022 }
1023 spin_unlock_irqrestore(&base->lock, flags);
1024
1025 return ret;
1026}
1027EXPORT_SYMBOL(try_to_del_timer_sync);
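/*
 * Illustrative fragment (hypothetical names): a teardown path that holds
 * a lock the timer callback also takes, so del_timer_sync() would
 * deadlock.  Instead it retries try_to_del_timer_sync() with the lock
 * dropped between attempts.
 */
struct foo {
	spinlock_t lock;
	bool shutting_down;		/* checked by the timer callback */
	struct timer_list watchdog;
};

static void foo_stop_watchdog(struct foo *f)
{
	spin_lock_bh(&f->lock);
	f->shutting_down = true;
	while (try_to_del_timer_sync(&f->watchdog) < 0) {
		spin_unlock_bh(&f->lock);
		cpu_relax();
		spin_lock_bh(&f->lock);
	}
	spin_unlock_bh(&f->lock);
}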
1028
1029#ifdef CONFIG_SMP
1030/**
1031 * del_timer_sync - deactivate a timer and wait for the handler to finish.
1032 * @timer: the timer to be deactivated
1033 *
1034 * This function only differs from del_timer() on SMP: besides deactivating
1035 * the timer it also makes sure the handler has finished executing on other
1036 * CPUs.
1037 *
1038 * Synchronization rules: Callers must prevent restarting of the timer,
1039 * otherwise this function is meaningless. It must not be called from
1040 * interrupt contexts unless the timer is an irqsafe one. The caller must
1041 * not hold locks which would prevent completion of the timer's
1042 * handler. The timer's handler must not call add_timer_on(). Upon exit the
1043 * timer is not queued and the handler is not running on any CPU.
1044 *
1045 * Note: For !irqsafe timers, you must not hold locks that are held in
1046 * interrupt context while calling this function. Even if the lock has
1047 * nothing to do with the timer in question. Here's why:
1048 *
1049 * CPU0 CPU1
1050 * ---- ----
1051 * <SOFTIRQ>
1052 * call_timer_fn();
1053 * base->running_timer = mytimer;
1054 * spin_lock_irq(somelock);
1055 * <IRQ>
1056 * spin_lock(somelock);
1057 * del_timer_sync(mytimer);
1058 * while (base->running_timer == mytimer);
1059 *
1060 * Now del_timer_sync() will never return and never release somelock.
1061 * The interrupt on the other CPU is waiting to grab somelock but
1062 * it has interrupted the softirq that CPU0 is waiting to finish.
1063 *
1064 * The function returns whether it has deactivated a pending timer or not.
1065 */
1066int del_timer_sync(struct timer_list *timer)
1067{
1068#ifdef CONFIG_LOCKDEP
1069 unsigned long flags;
1070
1071 /*
1072 * If lockdep gives a backtrace here, please reference
1073 * the synchronization rules above.
1074 */
1075 local_irq_save(flags);
1076 lock_map_acquire(&timer->lockdep_map);
1077 lock_map_release(&timer->lockdep_map);
1078 local_irq_restore(flags);
1079#endif
1080 /*
1081 * don't use it in hardirq context, because it
1082 * could lead to deadlock.
1083 */
1084 WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
1085 for (;;) {
1086 int ret = try_to_del_timer_sync(timer);
1087 if (ret >= 0)
1088 return ret;
1089 cpu_relax();
1090 }
1091}
1092EXPORT_SYMBOL(del_timer_sync);
1093#endif
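/*
 * Illustrative fragment (reusing the hypothetical "somelock"/"mytimer"
 * names from the scenario above): the safe ordering -- drop any lock an
 * interrupt handler might also take before waiting for the timer.
 */
static DEFINE_SPINLOCK(somelock);
static struct timer_list mytimer;

static void safe_teardown(void)
{
	spin_lock_irq(&somelock);
	/* ... mark the object dead so nothing re-arms mytimer ... */
	spin_unlock_irq(&somelock);

	del_timer_sync(&mytimer);	/* may now spin for the handler without deadlocking */
}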
1094
1095static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1096{
1097 /* cascade all the timers from tv up one level */
1098 struct timer_list *timer, *tmp;
1099 struct list_head tv_list;
1100
1101 list_replace_init(tv->vec + index, &tv_list);
1102
1103 /*
1104 * We are removing _all_ timers from the list, so we
1105 * don't have to detach them individually.
1106 */
1107 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
1108 BUG_ON(tbase_get_base(timer->base) != base);
1109 /* No accounting, while moving them */
1110 __internal_add_timer(base, timer);
1111 }
1112
1113 return index;
1114}
1115
1116static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1117 unsigned long data)
1118{
1119 int count = preempt_count();
1120
1121#ifdef CONFIG_LOCKDEP
1122 /*
1123 * It is permissible to free the timer from inside the
1124 * function that is called from it, this we need to take into
1125 * account for lockdep too. To avoid bogus "held lock freed"
1126 * warnings as well as problems when looking into
1127 * timer->lockdep_map, make a copy and use that here.
1128 */
1129 struct lockdep_map lockdep_map;
1130
1131 lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
1132#endif
1133 /*
1134 * Couple the lock chain with the lock chain at
1135 * del_timer_sync() by acquiring the lock_map around the fn()
1136 * call here and in del_timer_sync().
1137 */
1138 lock_map_acquire(&lockdep_map);
1139
1140 trace_timer_expire_entry(timer);
1141 fn(data);
1142 trace_timer_expire_exit(timer);
1143
1144 lock_map_release(&lockdep_map);
1145
1146 if (count != preempt_count()) {
1147 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1148 fn, count, preempt_count());
1149 /*
1150 * Restore the preempt count. That gives us a decent
1151 * chance to survive and extract information. If the
1152 * callback kept a lock held, bad luck, but not worse
1153 * than the BUG() we had.
1154 */
1155 preempt_count_set(count);
1156 }
1157}
1158
1159#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
1160
1161/**
1162 * __run_timers - run all expired timers (if any) on this CPU.
1163 * @base: the timer vector to be processed.
1164 *
1165 * This function cascades all vectors and executes all expired timer
1166 * vectors.
1167 */
1168static inline void __run_timers(struct tvec_base *base)
1169{
1170 struct timer_list *timer;
1171
1172 spin_lock_irq(&base->lock);
1173 if (catchup_timer_jiffies(base)) {
1174 spin_unlock_irq(&base->lock);
1175 return;
1176 }
1177 while (time_after_eq(jiffies, base->timer_jiffies)) {
1178 struct list_head work_list;
1179 struct list_head *head = &work_list;
1180 int index = base->timer_jiffies & TVR_MASK;
1181
1182 /*
1183 * Cascade timers:
1184 */
1185 if (!index &&
1186 (!cascade(base, &base->tv2, INDEX(0))) &&
1187 (!cascade(base, &base->tv3, INDEX(1))) &&
1188 !cascade(base, &base->tv4, INDEX(2)))
1189 cascade(base, &base->tv5, INDEX(3));
1190 ++base->timer_jiffies;
1191 list_replace_init(base->tv1.vec + index, head);
1192 while (!list_empty(head)) {
1193 void (*fn)(unsigned long);
1194 unsigned long data;
1195 bool irqsafe;
1196
1197			timer = list_first_entry(head, struct timer_list, entry);
1198 fn = timer->function;
1199 data = timer->data;
1200 irqsafe = tbase_get_irqsafe(timer->base);
1201
1202 timer_stats_account_timer(timer);
1203
1204 base->running_timer = timer;
1205 detach_expired_timer(timer, base);
1206
1207 if (irqsafe) {
1208 spin_unlock(&base->lock);
1209 call_timer_fn(timer, fn, data);
1210 spin_lock(&base->lock);
1211 } else {
1212 spin_unlock_irq(&base->lock);
1213 call_timer_fn(timer, fn, data);
1214 spin_lock_irq(&base->lock);
1215 }
1216 }
1217 }
1218 base->running_timer = NULL;
1219 spin_unlock_irq(&base->lock);
1220}
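/*
 * Worked example of the wheel arithmetic above (assuming the common
 * TVR_BITS = 8, TVN_BITS = 6 configuration, which is defined outside
 * this excerpt): tv1 has 256 one-jiffy slots, so
 * "index = base->timer_jiffies & TVR_MASK" wraps every 256 jiffies.
 * Each time it wraps to 0, INDEX(0) = (timer_jiffies >> 8) & 63 names
 * the tv2 bucket whose timers now fall within the next 256 jiffies, and
 * cascade() re-hashes them into tv1.  Only when that tv2 index is also 0
 * does the cascade continue into tv3, and so on up to tv5.
 */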
1221
1222#ifdef CONFIG_NO_HZ_COMMON
1223/*
1224 * Find out when the next timer event is due to happen. This
1225 * is used on S/390 to stop all activity when a CPU is idle.
1226 * This function needs to be called with interrupts disabled.
1227 */
1228static unsigned long __next_timer_interrupt(struct tvec_base *base)
1229{
1230 unsigned long timer_jiffies = base->timer_jiffies;
1231 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
1232 int index, slot, array, found = 0;
1233 struct timer_list *nte;
1234 struct tvec *varray[4];
1235
1236 /* Look for timer events in tv1. */
1237 index = slot = timer_jiffies & TVR_MASK;
1238 do {
1239 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
1240 if (tbase_get_deferrable(nte->base))
1241 continue;
1242
1243 found = 1;
1244 expires = nte->expires;
1245 /* Look at the cascade bucket(s)? */
1246 if (!index || slot < index)
1247 goto cascade;
1248 return expires;
1249 }
1250 slot = (slot + 1) & TVR_MASK;
1251 } while (slot != index);
1252
1253cascade:
1254 /* Calculate the next cascade event */
1255 if (index)
1256 timer_jiffies += TVR_SIZE - index;
1257 timer_jiffies >>= TVR_BITS;
1258
1259 /* Check tv2-tv5. */
1260 varray[0] = &base->tv2;
1261 varray[1] = &base->tv3;
1262 varray[2] = &base->tv4;
1263 varray[3] = &base->tv5;
1264
1265 for (array = 0; array < 4; array++) {
1266 struct tvec *varp = varray[array];
1267
1268 index = slot = timer_jiffies & TVN_MASK;
1269 do {
1270 list_for_each_entry(nte, varp->vec + slot, entry) {
1271 if (tbase_get_deferrable(nte->base))
1272 continue;
1273
1274 found = 1;
1275 if (time_before(nte->expires, expires))
1276 expires = nte->expires;
1277 }
1278 /*
1279 * Do we still search for the first timer or are
1280		 * we looking up the cascade buckets?
1281 */
1282 if (found) {
1283 /* Look at the cascade bucket(s)? */
1284 if (!index || slot < index)
1285 break;
1286 return expires;
1287 }
1288 slot = (slot + 1) & TVN_MASK;
1289 } while (slot != index);
1290
1291 if (index)
1292 timer_jiffies += TVN_SIZE - index;
1293 timer_jiffies >>= TVN_BITS;
1294 }
1295 return expires;
1296}
1297
1298/*
1299 * Check, if the next hrtimer event is before the next timer wheel
1300 * event:
1301 */
1302static unsigned long cmp_next_hrtimer_event(unsigned long now,
1303 unsigned long expires)
1304{
1305 ktime_t hr_delta = hrtimer_get_next_event();
1306 struct timespec tsdelta;
1307 unsigned long delta;
1308
1309 if (hr_delta.tv64 == KTIME_MAX)
1310 return expires;
1311
1312 /*
1313 * Expired timer available, let it expire in the next tick
1314 */
1315 if (hr_delta.tv64 <= 0)
1316 return now + 1;
1317
1318 tsdelta = ktime_to_timespec(hr_delta);
1319 delta = timespec_to_jiffies(&tsdelta);
1320
1321 /*
1322 * Limit the delta to the max value, which is checked in
1323 * tick_nohz_stop_sched_tick():
1324 */
1325 if (delta > NEXT_TIMER_MAX_DELTA)
1326 delta = NEXT_TIMER_MAX_DELTA;
1327
1328 /*
1329	 * Take rounding errors into account and make sure that it
1330 * expires in the next tick. Otherwise we go into an endless
1331 * ping pong due to tick_nohz_stop_sched_tick() retriggering
1332 * the timer softirq
1333 */
1334 if (delta < 1)
1335 delta = 1;
1336 now += delta;
1337 if (time_before(now, expires))
1338 return now;
1339 return expires;
1340}
1341
1342/**
1343 * get_next_timer_interrupt - return the jiffy of the next pending timer
1344 * @now: current time (in jiffies)
1345 */
1346unsigned long get_next_timer_interrupt(unsigned long now)
1347{
1348 struct tvec_base *base = __this_cpu_read(tvec_bases);
1349 unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
1350
1351 /*
1352 * Pretend that there is no timer pending if the cpu is offline.
1353 * Possible pending timers will be migrated later to an active cpu.
1354 */
1355 if (cpu_is_offline(smp_processor_id()))
1356 return expires;
1357
1358 spin_lock(&base->lock);
1359 if (base->active_timers) {
1360 if (time_before_eq(base->next_timer, base->timer_jiffies))
1361 base->next_timer = __next_timer_interrupt(base);
1362 expires = base->next_timer;
1363 }
1364 spin_unlock(&base->lock);
1365
1366 if (time_before_eq(expires, now))
1367 return now;
1368
1369 return cmp_next_hrtimer_event(now, expires);
1370}
1371#endif
1372
1373/*
1374 * Called from the timer interrupt handler to charge one tick to the current
1375 * process. user_tick is 1 if the tick is user time, 0 for system.
1376 */
1377void update_process_times(int user_tick)
1378{
1379 struct task_struct *p = current;
1380 int cpu = smp_processor_id();
1381
1382 /* Note: this timer irq context must be accounted for as well. */
1383 account_process_tick(p, user_tick);
1384 run_local_timers();
1385 rcu_check_callbacks(cpu, user_tick);
1386#ifdef CONFIG_IRQ_WORK
1387 if (in_irq())
1388 irq_work_run();
1389#endif
1390 scheduler_tick();
1391 run_posix_cpu_timers(p);
1392}
1393
1394/*
1395 * This function runs timers and the timer-tq in bottom half context.
1396 */
1397static void run_timer_softirq(struct softirq_action *h)
1398{
1399 struct tvec_base *base = __this_cpu_read(tvec_bases);
1400
1401 hrtimer_run_pending();
1402
1403 if (time_after_eq(jiffies, base->timer_jiffies))
1404 __run_timers(base);
1405}
1406
1407/*
1408 * Called by the local, per-CPU timer interrupt on SMP.
1409 */
1410void run_local_timers(void)
1411{
1412 hrtimer_run_queues();
1413 raise_softirq(TIMER_SOFTIRQ);
1414}
1415
1416#ifdef __ARCH_WANT_SYS_ALARM
1417
1418/*
1419 * For backwards compatibility? This can be done in libc so Alpha
1420 * and all newer ports shouldn't need it.
1421 */
1422SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1423{
1424 return alarm_setitimer(seconds);
1425}
1426
1427#endif
1428
1429static void process_timeout(unsigned long __data)
1430{
1431 wake_up_process((struct task_struct *)__data);
1432}
1433
1434/**
1435 * schedule_timeout - sleep until timeout
1436 * @timeout: timeout value in jiffies
1437 *
1438 * Make the current task sleep until @timeout jiffies have
1439 * elapsed. The routine will return immediately unless
1440 * the current task state has been set (see set_current_state()).
1441 *
1442 * You can set the task state as follows -
1443 *
1444 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
1445 * pass before the routine returns. The routine will return 0
1446 *
1447 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1448 * delivered to the current task. In this case the remaining time
1449 * in jiffies will be returned, or 0 if the timer expired in time
1450 *
1451 * The current task state is guaranteed to be TASK_RUNNING when this
1452 * routine returns.
1453 *
1454 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
1455 * the CPU away without a bound on the timeout. In this case the return
1456 * value will be %MAX_SCHEDULE_TIMEOUT.
1457 *
1458 * In all cases the return value is guaranteed to be non-negative.
1459 */
1460signed long __sched schedule_timeout(signed long timeout)
1461{
1462 struct timer_list timer;
1463 unsigned long expire;
1464
1465 switch (timeout)
1466 {
1467 case MAX_SCHEDULE_TIMEOUT:
1468 /*
1469		 * These two special cases exist purely for the caller's
1470		 * convenience. Nothing more. We could take
1471		 * MAX_SCHEDULE_TIMEOUT from one of the negative values,
1472		 * but I'd like to return a valid offset (>= 0) to allow
1473		 * the caller to do everything it wants with the retval.
1474 */
1475 schedule();
1476 goto out;
1477 default:
1478 /*
1479		 * Another bit of paranoia. Note that the retval will be
1480		 * 0 since no piece of the kernel is supposed to check
1481		 * for a negative retval of schedule_timeout() (since it
1482		 * should never happen anyway). You just have the printk()
1483		 * that will tell you if something has gone wrong and where.
1484 */
1485 if (timeout < 0) {
1486 printk(KERN_ERR "schedule_timeout: wrong timeout "
1487 "value %lx\n", timeout);
1488 dump_stack();
1489 current->state = TASK_RUNNING;
1490 goto out;
1491 }
1492 }
1493
1494 expire = timeout + jiffies;
1495
1496 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1497 __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
1498 schedule();
1499 del_singleshot_timer_sync(&timer);
1500
1501 /* Remove the timer from the object tracker */
1502 destroy_timer_on_stack(&timer);
1503
1504 timeout = expire - jiffies;
1505
1506 out:
1507 return timeout < 0 ? 0 : timeout;
1508}
1509EXPORT_SYMBOL(schedule_timeout);
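/*
 * Illustrative fragment: the documented calling convention -- set the
 * task state first, then let schedule_timeout() put the task to sleep.
 * Sleeps for up to two seconds, returning early (with the remaining
 * jiffies) if the task is woken or, in TASK_INTERRUPTIBLE, receives a
 * signal.
 */
static signed long nap_up_to_two_seconds(void)
{
	set_current_state(TASK_INTERRUPTIBLE);
	return schedule_timeout(2 * HZ);
}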
1510
1511/*
1512 * We can use __set_current_state() here because schedule_timeout() calls
1513 * schedule() unconditionally.
1514 */
1515signed long __sched schedule_timeout_interruptible(signed long timeout)
1516{
1517 __set_current_state(TASK_INTERRUPTIBLE);
1518 return schedule_timeout(timeout);
1519}
1520EXPORT_SYMBOL(schedule_timeout_interruptible);
1521
1522signed long __sched schedule_timeout_killable(signed long timeout)
1523{
1524 __set_current_state(TASK_KILLABLE);
1525 return schedule_timeout(timeout);
1526}
1527EXPORT_SYMBOL(schedule_timeout_killable);
1528
1529signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1530{
1531 __set_current_state(TASK_UNINTERRUPTIBLE);
1532 return schedule_timeout(timeout);
1533}
1534EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1535
1536static int init_timers_cpu(int cpu)
1537{
1538 int j;
1539 struct tvec_base *base;
1540 static char tvec_base_done[NR_CPUS];
1541
1542 if (!tvec_base_done[cpu]) {
1543 static char boot_done;
1544
1545 if (boot_done) {
1546 /*
1547 * The APs use this path later in boot
1548 */
1549 base = kzalloc_node(sizeof(*base), GFP_KERNEL,
1550 cpu_to_node(cpu));
1551 if (!base)
1552 return -ENOMEM;
1553
1554 /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
1555 if (WARN_ON(base != tbase_get_base(base))) {
1556 kfree(base);
1557 return -ENOMEM;
1558 }
1559 per_cpu(tvec_bases, cpu) = base;
1560 } else {
1561 /*
1562 * This is for the boot CPU - we use compile-time
1563 * static initialisation because per-cpu memory isn't
1564 * ready yet and because the memory allocators are not
1565 * initialised either.
1566 */
1567 boot_done = 1;
1568 base = &boot_tvec_bases;
1569 }
1570 spin_lock_init(&base->lock);
1571 tvec_base_done[cpu] = 1;
1572 base->cpu = cpu;
1573 } else {
1574 base = per_cpu(tvec_bases, cpu);
1575 }
1576
1577
1578 for (j = 0; j < TVN_SIZE; j++) {
1579 INIT_LIST_HEAD(base->tv5.vec + j);
1580 INIT_LIST_HEAD(base->tv4.vec + j);
1581 INIT_LIST_HEAD(base->tv3.vec + j);
1582 INIT_LIST_HEAD(base->tv2.vec + j);
1583 }
1584 for (j = 0; j < TVR_SIZE; j++)
1585 INIT_LIST_HEAD(base->tv1.vec + j);
1586
1587 base->timer_jiffies = jiffies;
1588 base->next_timer = base->timer_jiffies;
1589 base->active_timers = 0;
1590 base->all_timers = 0;
1591 return 0;
1592}
1593
1594#ifdef CONFIG_HOTPLUG_CPU
1595static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
1596{
1597 struct timer_list *timer;
1598
1599 while (!list_empty(head)) {
1600 timer = list_first_entry(head, struct timer_list, entry);
1601 /* We ignore the accounting on the dying cpu */
1602 detach_timer(timer, false);
1603 timer_set_base(timer, new_base);
1604 internal_add_timer(new_base, timer);
1605 }
1606}
1607
1608static void migrate_timers(int cpu)
1609{
1610 struct tvec_base *old_base;
1611 struct tvec_base *new_base;
1612 int i;
1613
1614 BUG_ON(cpu_online(cpu));
1615 old_base = per_cpu(tvec_bases, cpu);
1616 new_base = get_cpu_var(tvec_bases);
1617 /*
1618 * The caller is globally serialized and nobody else
1619 * takes two locks at once, deadlock is not possible.
1620 */
1621 spin_lock_irq(&new_base->lock);
1622 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1623
1624 BUG_ON(old_base->running_timer);
1625
1626 for (i = 0; i < TVR_SIZE; i++)
1627 migrate_timer_list(new_base, old_base->tv1.vec + i);
1628 for (i = 0; i < TVN_SIZE; i++) {
1629 migrate_timer_list(new_base, old_base->tv2.vec + i);
1630 migrate_timer_list(new_base, old_base->tv3.vec + i);
1631 migrate_timer_list(new_base, old_base->tv4.vec + i);
1632 migrate_timer_list(new_base, old_base->tv5.vec + i);
1633 }
1634
1635 spin_unlock(&old_base->lock);
1636 spin_unlock_irq(&new_base->lock);
1637 put_cpu_var(tvec_bases);
1638}
1639#endif /* CONFIG_HOTPLUG_CPU */
1640
1641static int timer_cpu_notify(struct notifier_block *self,
1642 unsigned long action, void *hcpu)
1643{
1644 long cpu = (long)hcpu;
1645 int err;
1646
1647 switch(action) {
1648 case CPU_UP_PREPARE:
1649 case CPU_UP_PREPARE_FROZEN:
1650 err = init_timers_cpu(cpu);
1651 if (err < 0)
1652 return notifier_from_errno(err);
1653 break;
1654#ifdef CONFIG_HOTPLUG_CPU
1655 case CPU_DEAD:
1656 case CPU_DEAD_FROZEN:
1657 migrate_timers(cpu);
1658 break;
1659#endif
1660 default:
1661 break;
1662 }
1663 return NOTIFY_OK;
1664}
1665
1666static struct notifier_block timers_nb = {
1667 .notifier_call = timer_cpu_notify,
1668};
1669
1670
1671void __init init_timers(void)
1672{
1673 int err;
1674
1675 /* ensure there are enough low bits for flags in timer->base pointer */
1676 BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
1677
1678 err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1679 (void *)(long)smp_processor_id());
1680 BUG_ON(err != NOTIFY_OK);
1681
1682 init_timer_stats();
1683 register_cpu_notifier(&timers_nb);
1684 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1685}
1686
1687/**
1688 * msleep - sleep safely even with waitqueue interruptions
1689 * @msecs: Time in milliseconds to sleep for
1690 */
1691void msleep(unsigned int msecs)
1692{
1693 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1694
1695 while (timeout)
1696 timeout = schedule_timeout_uninterruptible(timeout);
1697}
1698
1699EXPORT_SYMBOL(msleep);
1700
1701/**
1702 * msleep_interruptible - sleep waiting for signals
1703 * @msecs: Time in milliseconds to sleep for
1704 */
1705unsigned long msleep_interruptible(unsigned int msecs)
1706{
1707 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1708
1709 while (timeout && !signal_pending(current))
1710 timeout = schedule_timeout_interruptible(timeout);
1711 return jiffies_to_msecs(timeout);
1712}
1713
1714EXPORT_SYMBOL(msleep_interruptible);
1715
1716static int __sched do_usleep_range(unsigned long min, unsigned long max)
1717{
1718 ktime_t kmin;
1719 unsigned long delta;
1720
1721 kmin = ktime_set(0, min * NSEC_PER_USEC);
1722 delta = (max - min) * NSEC_PER_USEC;
1723 return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
1724}
1725
1726/**
1727 * usleep_range - Drop in replacement for udelay where wakeup is flexible
1728 * @min: Minimum time in usecs to sleep
1729 * @max: Maximum time in usecs to sleep
1730 */
1731void usleep_range(unsigned long min, unsigned long max)
1732{
1733 __set_current_state(TASK_UNINTERRUPTIBLE);
1734 do_usleep_range(min, max);
1735}
1736EXPORT_SYMBOL(usleep_range);
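/*
 * Illustrative fragment (hypothetical delays): picking between the
 * sleeping primitives above -- usleep_range() as the documented flexible
 * replacement for udelay() in sleepable context, msleep() once the wait
 * is long enough that jiffy resolution is fine.
 */
static void wait_for_chip_ready(void)
{
	usleep_range(200, 300);		/* ~200us needed; allow coalescing up to 300us */
}

static void wait_for_chip_reset(void)
{
	msleep(20);			/* multi-millisecond wait */
}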
diff --git a/kernel/time/udelay_test.c b/kernel/time/udelay_test.c
new file mode 100644
index 000000000000..e622ba365a13
--- /dev/null
+++ b/kernel/time/udelay_test.c
@@ -0,0 +1,168 @@
1/*
2 * udelay() test kernel module
3 *
4 * Test is executed by writing and reading to /sys/kernel/debug/udelay_test
5 * Tests are configured by writing: USECS ITERATIONS
6 * Tests are executed by reading from the same file.
7 * Specifying usecs of 0 or negative values will run multiple tests.
8 *
9 * Copyright (C) 2014 Google, Inc.
10 *
11 * This software is licensed under the terms of the GNU General Public
12 * License version 2, as published by the Free Software Foundation, and
13 * may be copied, distributed, and modified under those terms.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 */
20
21#include <linux/debugfs.h>
22#include <linux/delay.h>
23#include <linux/ktime.h>
24#include <linux/module.h>
25#include <linux/uaccess.h>
26
27#define DEFAULT_ITERATIONS 100
28
29#define DEBUGFS_FILENAME "udelay_test"
30
31static DEFINE_MUTEX(udelay_test_lock);
32static struct dentry *udelay_test_debugfs_file;
33static int udelay_test_usecs;
34static int udelay_test_iterations = DEFAULT_ITERATIONS;
35
36static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
37{
38 int min = 0, max = 0, fail_count = 0;
39 uint64_t sum = 0;
40 uint64_t avg;
41 int i;
42 /* Allow udelay to be up to 0.5% fast */
43 int allowed_error_ns = usecs * 5;
44
45 for (i = 0; i < iters; ++i) {
46 struct timespec ts1, ts2;
47 int time_passed;
48
49 ktime_get_ts(&ts1);
50 udelay(usecs);
51 ktime_get_ts(&ts2);
52 time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1);
53
54 if (i == 0 || time_passed < min)
55 min = time_passed;
56 if (i == 0 || time_passed > max)
57 max = time_passed;
58 if ((time_passed + allowed_error_ns) / 1000 < usecs)
59 ++fail_count;
60 WARN_ON(time_passed < 0);
61 sum += time_passed;
62 }
63
64 avg = sum;
65 do_div(avg, iters);
66 seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d",
67 usecs, iters, usecs * 1000,
68 (usecs * 1000) - allowed_error_ns, min, avg, max);
69 if (fail_count)
70 seq_printf(s, " FAIL=%d", fail_count);
71 seq_puts(s, "\n");
72
73 return 0;
74}
75
76static int udelay_test_show(struct seq_file *s, void *v)
77{
78 int usecs;
79 int iters;
80 int ret = 0;
81
82 mutex_lock(&udelay_test_lock);
83 usecs = udelay_test_usecs;
84 iters = udelay_test_iterations;
85 mutex_unlock(&udelay_test_lock);
86
87 if (usecs > 0 && iters > 0) {
88 return udelay_test_single(s, usecs, iters);
89 } else if (usecs == 0) {
90 struct timespec ts;
91
92 ktime_get_ts(&ts);
93 seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n",
94 loops_per_jiffy, ts.tv_sec, ts.tv_nsec);
95 seq_puts(s, "usage:\n");
96 seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
97 seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
98 }
99
100 return ret;
101}
102
103static int udelay_test_open(struct inode *inode, struct file *file)
104{
105 return single_open(file, udelay_test_show, inode->i_private);
106}
107
108static ssize_t udelay_test_write(struct file *file, const char __user *buf,
109 size_t count, loff_t *pos)
110{
111 char lbuf[32];
112 int ret;
113 int usecs;
114 int iters;
115
116 if (count >= sizeof(lbuf))
117 return -EINVAL;
118
119 if (copy_from_user(lbuf, buf, count))
120 return -EFAULT;
121 lbuf[count] = '\0';
122
123 ret = sscanf(lbuf, "%d %d", &usecs, &iters);
124 if (ret < 1)
125 return -EINVAL;
126 else if (ret < 2)
127 iters = DEFAULT_ITERATIONS;
128
129 mutex_lock(&udelay_test_lock);
130 udelay_test_usecs = usecs;
131 udelay_test_iterations = iters;
132 mutex_unlock(&udelay_test_lock);
133
134 return count;
135}
136
137static const struct file_operations udelay_test_debugfs_ops = {
138 .owner = THIS_MODULE,
139 .open = udelay_test_open,
140 .read = seq_read,
141 .write = udelay_test_write,
142 .llseek = seq_lseek,
143 .release = single_release,
144};
145
146static int __init udelay_test_init(void)
147{
148 mutex_lock(&udelay_test_lock);
149 udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME,
150 S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops);
151 mutex_unlock(&udelay_test_lock);
152
153 return 0;
154}
155
156module_init(udelay_test_init);
157
158static void __exit udelay_test_exit(void)
159{
160 mutex_lock(&udelay_test_lock);
161 debugfs_remove(udelay_test_debugfs_file);
162 mutex_unlock(&udelay_test_lock);
163}
164
165module_exit(udelay_test_exit);
166
167MODULE_AUTHOR("David Riley <davidriley@chromium.org>");
168MODULE_LICENSE("GPL");