Diffstat (limited to 'kernel/time')
-rw-r--r--   kernel/time/Kconfig            25
-rw-r--r--   kernel/time/Makefile            9
-rw-r--r--   kernel/time/clockevents.c     345
-rw-r--r--   kernel/time/clocksource.c     246
-rw-r--r--   kernel/time/jiffies.c           1
-rw-r--r--   kernel/time/ntp.c              30
-rw-r--r--   kernel/time/tick-broadcast.c  480
-rw-r--r--   kernel/time/tick-common.c     346
-rw-r--r--   kernel/time/tick-internal.h   110
-rw-r--r--   kernel/time/tick-oneshot.c     84
-rw-r--r--   kernel/time/tick-sched.c      563
-rw-r--r--   kernel/time/timer_list.c      287
-rw-r--r--   kernel/time/timer_stats.c     411
13 files changed, 2862 insertions, 75 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
new file mode 100644
index 000000000000..f66351126544
--- /dev/null
+++ b/kernel/time/Kconfig
@@ -0,0 +1,25 @@
1#
2# Timer subsystem related configuration options
3#
4config TICK_ONESHOT
5 bool
6 default n
7
8config NO_HZ
9 bool "Tickless System (Dynamic Ticks)"
10 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
11 select TICK_ONESHOT
12 help
13 This option enables a tickless system: timer interrupts will
14 only trigger on an as-needed basis both when the system is
15 busy and when the system is idle.
16
17config HIGH_RES_TIMERS
18 bool "High Resolution Timer Support"
19 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
20 select TICK_ONESHOT
21 help
22 This option enables high resolution timer support. If your
23 hardware is not capable, then this option only increases
24 the size of the kernel image.
25
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 61a3907d16fb..93bccba1f265 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1,8 @@
-obj-y += ntp.o clocksource.o jiffies.o
+obj-y += ntp.o clocksource.o jiffies.o timer_list.o
+
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
+obj-$(CONFIG_TIMER_STATS) += timer_stats.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 000000000000..67932ea78c17
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,345 @@
1/*
2 * linux/kernel/time/clockevents.c
3 *
4 * This file contains functions which manage clock event devices.
5 *
6 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
7 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
8 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
9 *
10 * This code is licenced under the GPL version 2. For details see
11 * kernel-base/COPYING.
12 */
13
14#include <linux/clockchips.h>
15#include <linux/hrtimer.h>
16#include <linux/init.h>
17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/smp.h>
20#include <linux/sysdev.h>
21
22/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices);
24static LIST_HEAD(clockevents_released);
25
26/* Notification for clock events */
27static RAW_NOTIFIER_HEAD(clockevents_chain);
28
29/* Protection for the above */
30static DEFINE_SPINLOCK(clockevents_lock);
31
32/**
33 * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds
34 * @latch: value to convert
35 * @evt: pointer to clock event device descriptor
36 *
37 * Math helper, returns latch value converted to nanoseconds (bound checked)
38 */
39unsigned long clockevent_delta2ns(unsigned long latch,
40 struct clock_event_device *evt)
41{
42 u64 clc = ((u64) latch << evt->shift);
43
44 do_div(clc, evt->mult);
45 if (clc < 1000)
46 clc = 1000;
47 if (clc > LONG_MAX)
48 clc = LONG_MAX;
49
50 return (unsigned long) clc;
51}
52
53/**
54 * clockevents_set_mode - set the operating mode of a clock event device
55 * @dev: device to modify
56 * @mode: new mode
57 *
58 * Must be called with interrupts disabled !
59 */
60void clockevents_set_mode(struct clock_event_device *dev,
61 enum clock_event_mode mode)
62{
63 if (dev->mode != mode) {
64 dev->set_mode(mode, dev);
65 dev->mode = mode;
66 }
67}
68
69/**
70 * clockevents_program_event - Reprogram the clock event device.
71 * @expires: absolute expiry time (monotonic clock)
72 *
73 * Returns 0 on success, -ETIME when the event is in the past.
74 */
75int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
76 ktime_t now)
77{
78 unsigned long long clc;
79 int64_t delta;
80
81 delta = ktime_to_ns(ktime_sub(expires, now));
82
83 if (delta <= 0)
84 return -ETIME;
85
86 dev->next_event = expires;
87
88 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
89 return 0;
90
91 if (delta > dev->max_delta_ns)
92 delta = dev->max_delta_ns;
93 if (delta < dev->min_delta_ns)
94 delta = dev->min_delta_ns;
95
96 clc = delta * dev->mult;
97 clc >>= dev->shift;
98
99 return dev->set_next_event((unsigned long) clc, dev);
100}
101
102/**
103 * clockevents_register_notifier - register a clock events change listener
104 */
105int clockevents_register_notifier(struct notifier_block *nb)
106{
107 int ret;
108
109 spin_lock(&clockevents_lock);
110 ret = raw_notifier_chain_register(&clockevents_chain, nb);
111 spin_unlock(&clockevents_lock);
112
113 return ret;
114}
115
116/**
117 * clockevents_unregister_notifier - unregister a clock events change listener
118 */
119void clockevents_unregister_notifier(struct notifier_block *nb)
120{
121 spin_lock(&clockevents_lock);
122 raw_notifier_chain_unregister(&clockevents_chain, nb);
123 spin_unlock(&clockevents_lock);
124}
125
126/*
127 * Notify about a clock event change. Called with clockevents_lock
128 * held.
129 */
130static void clockevents_do_notify(unsigned long reason, void *dev)
131{
132 raw_notifier_call_chain(&clockevents_chain, reason, dev);
133}
134
135/*
136 * Called after a notify add to make devices available which were
137 * released from the notifier call.
138 */
139static void clockevents_notify_released(void)
140{
141 struct clock_event_device *dev;
142
143 while (!list_empty(&clockevents_released)) {
144 dev = list_entry(clockevents_released.next,
145 struct clock_event_device, list);
146 list_del(&dev->list);
147 list_add(&dev->list, &clockevent_devices);
148 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
149 }
150}
151
152/**
153 * clockevents_register_device - register a clock event device
154 * @dev: device to register
155 */
156void clockevents_register_device(struct clock_event_device *dev)
157{
158 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
159
160 spin_lock(&clockevents_lock);
161
162 list_add(&dev->list, &clockevent_devices);
163 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
164 clockevents_notify_released();
165
166 spin_unlock(&clockevents_lock);
167}
168
169/*
170 * Noop handler when we shut down an event device
171 */
172static void clockevents_handle_noop(struct clock_event_device *dev)
173{
174}
175
176/**
177 * clockevents_exchange_device - release and request clock devices
178 * @old: device to release (can be NULL)
179 * @new: device to request (can be NULL)
180 *
181 * Called from the notifier chain. clockevents_lock is held already
182 */
183void clockevents_exchange_device(struct clock_event_device *old,
184 struct clock_event_device *new)
185{
186 unsigned long flags;
187
188 local_irq_save(flags);
189 /*
190 * Caller releases a clock event device. We queue it into the
191 * released list and do a notify add later.
192 */
193 if (old) {
194 old->event_handler = clockevents_handle_noop;
195 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
196 list_del(&old->list);
197 list_add(&old->list, &clockevents_released);
198 }
199
200 if (new) {
201 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
202 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
203 }
204 local_irq_restore(flags);
205}
206
207/**
208 * clockevents_request_device
209 */
210struct clock_event_device *clockevents_request_device(unsigned int features,
211 cpumask_t cpumask)
212{
213 struct clock_event_device *cur, *dev = NULL;
214 struct list_head *tmp;
215
216 spin_lock(&clockevents_lock);
217
218 list_for_each(tmp, &clockevent_devices) {
219 cur = list_entry(tmp, struct clock_event_device, list);
220
221 if ((cur->features & features) == features &&
222 cpus_equal(cpumask, cur->cpumask)) {
223 if (!dev || dev->rating < cur->rating)
224 dev = cur;
225 }
226 }
227
228 clockevents_exchange_device(NULL, dev);
229
230 spin_unlock(&clockevents_lock);
231
232 return dev;
233}
234
235/**
236 * clockevents_release_device
237 */
238void clockevents_release_device(struct clock_event_device *dev)
239{
240 spin_lock(&clockevents_lock);
241
242 clockevents_exchange_device(dev, NULL);
243 clockevents_notify_released();
244
245 spin_unlock(&clockevents_lock);
246}
247
248/**
249 * clockevents_notify - notification about relevant events
250 */
251void clockevents_notify(unsigned long reason, void *arg)
252{
253 spin_lock(&clockevents_lock);
254 clockevents_do_notify(reason, arg);
255
256 switch (reason) {
257 case CLOCK_EVT_NOTIFY_CPU_DEAD:
258 /*
259 * Unregister the clock event devices which were
260 * released from the users in the notify chain.
261 */
262 while (!list_empty(&clockevents_released)) {
263 struct clock_event_device *dev;
264
265 dev = list_entry(clockevents_released.next,
266 struct clock_event_device, list);
267 list_del(&dev->list);
268 }
269 break;
270 default:
271 break;
272 }
273 spin_unlock(&clockevents_lock);
274}
275EXPORT_SYMBOL_GPL(clockevents_notify);
276
277#ifdef CONFIG_SYSFS
278
279/**
280 * clockevents_show_registered - sysfs interface for listing clockevents
281 * @dev: unused
282 * @buf: char buffer to be filled with clock events list
283 *
284 * Provides sysfs interface for listing registered clock event devices
285 */
286static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf)
287{
288 struct list_head *tmp;
289 char *p = buf;
290 int cpu;
291
292 spin_lock(&clockevents_lock);
293
294 list_for_each(tmp, &clockevent_devices) {
295 struct clock_event_device *ce;
296
297 ce = list_entry(tmp, struct clock_event_device, list);
298 p += sprintf(p, "%-20s F:%04x M:%d", ce->name,
299 ce->features, ce->mode);
300 p += sprintf(p, " C:");
301 if (!cpus_equal(ce->cpumask, cpu_possible_map)) {
302 for_each_cpu_mask(cpu, ce->cpumask)
303 p += sprintf(p, " %d", cpu);
304 } else {
305 /*
306 * FIXME: Add the cpu which is handling this sucker
307 */
308 }
309 p += sprintf(p, "\n");
310 }
311
312 spin_unlock(&clockevents_lock);
313
314 return p - buf;
315}
316
317/*
318 * Sysfs setup bits:
319 */
320static SYSDEV_ATTR(registered, 0600,
321 clockevents_show_registered, NULL);
322
323static struct sysdev_class clockevents_sysclass = {
324 set_kset_name("clockevents"),
325};
326
327static struct sys_device clockevents_sys_device = {
328 .id = 0,
329 .cls = &clockevents_sysclass,
330};
331
332static int __init clockevents_sysfs_init(void)
333{
334 int error = sysdev_class_register(&clockevents_sysclass);
335
336 if (!error)
337 error = sysdev_register(&clockevents_sys_device);
338 if (!error)
339 error = sysdev_create_file(
340 &clockevents_sys_device,
341 &attr_registered);
342 return error;
343}
344device_initcall(clockevents_sysfs_init);
345#endif
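
As a rough sketch of how this interface is meant to be consumed (not part of this patch): a hypothetical per-CPU timer driver, here called "fancytimer" with made-up frequency, shift and rating values, would fill in a struct clock_event_device and hand it to clockevents_register_device(). div_sc() is assumed to be the usual mult conversion helper from <linux/clockchips.h>.

/*
 * Illustrative sketch only -- driver name, frequency and ratings are
 * placeholders, not something introduced by this patch.
 */
#include <linux/clockchips.h>

#define FANCYTIMER_FREQ 1000000         /* hypothetical 1 MHz comparator */

static int fancytimer_set_next_event(unsigned long delta,
                                     struct clock_event_device *evt)
{
        /* program the hardware comparator 'delta' ticks into the future */
        return 0;
}

static void fancytimer_set_mode(enum clock_event_mode mode,
                                struct clock_event_device *evt)
{
        /* switch the hardware between periodic, oneshot and shutdown */
}

static struct clock_event_device fancytimer_clockevent = {
        .name           = "fancytimer",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 32,
        .rating         = 300,
        .set_next_event = fancytimer_set_next_event,
        .set_mode       = fancytimer_set_mode,
};

static void __init fancytimer_clockevent_init(void)
{
        struct clock_event_device *evt = &fancytimer_clockevent;

        evt->mult = div_sc(FANCYTIMER_FREQ, NSEC_PER_SEC, evt->shift);
        evt->max_delta_ns = clockevent_delta2ns(0xffffffff, evt);
        evt->min_delta_ns = clockevent_delta2ns(0xf, evt);
        evt->cpumask = cpumask_of_cpu(smp_processor_id());

        /* mode must still be CLOCK_EVT_MODE_UNUSED at this point */
        clockevents_register_device(evt);
}

Registration then raises CLOCK_EVT_NOTIFY_ADD on the notifier chain, which the tick layer added further below picks up.
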
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d9ef176c4e09..193a0793af95 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -29,6 +29,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
+#include <linux/tick.h>
 
 /* XXX - Would like a better way for initializing curr_clocksource */
 extern struct clocksource clocksource_jiffies;
@@ -48,6 +49,7 @@ extern struct clocksource clocksource_jiffies;
  */
 static struct clocksource *curr_clocksource = &clocksource_jiffies;
 static struct clocksource *next_clocksource;
+static struct clocksource *clocksource_override;
 static LIST_HEAD(clocksource_list);
 static DEFINE_SPINLOCK(clocksource_lock);
 static char override_name[32];
@@ -62,9 +64,123 @@ static int __init clocksource_done_booting(void)
         finished_booting = 1;
         return 0;
 }
-
 late_initcall(clocksource_done_booting);
 
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
+static LIST_HEAD(watchdog_list);
+static struct clocksource *watchdog;
+static struct timer_list watchdog_timer;
+static DEFINE_SPINLOCK(watchdog_lock);
+static cycle_t watchdog_last;
+/*
+ * Interval: 0.5sec Treshold: 0.0625s
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4)
+
+static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
+{
+        if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD)
+                return;
+
+        printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
+               cs->name, delta);
+        cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
+        clocksource_change_rating(cs, 0);
+        cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+        list_del(&cs->wd_list);
+}
+
+static void clocksource_watchdog(unsigned long data)
+{
+        struct clocksource *cs, *tmp;
+        cycle_t csnow, wdnow;
+        int64_t wd_nsec, cs_nsec;
+
+        spin_lock(&watchdog_lock);
+
+        wdnow = watchdog->read();
+        wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
+        watchdog_last = wdnow;
+
+        list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
+                csnow = cs->read();
+                /* Initialized ? */
+                if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
+                        if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
+                            (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+                                cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+                                /*
+                                 * We just marked the clocksource as
+                                 * highres-capable, notify the rest of the
+                                 * system as well so that we transition
+                                 * into high-res mode:
+                                 */
+                                tick_clock_notify();
+                        }
+                        cs->flags |= CLOCK_SOURCE_WATCHDOG;
+                        cs->wd_last = csnow;
+                } else {
+                        cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
+                        cs->wd_last = csnow;
+                        /* Check the delta. Might remove from the list ! */
+                        clocksource_ratewd(cs, cs_nsec - wd_nsec);
+                }
+        }
+
+        if (!list_empty(&watchdog_list)) {
+                __mod_timer(&watchdog_timer,
+                            watchdog_timer.expires + WATCHDOG_INTERVAL);
+        }
+        spin_unlock(&watchdog_lock);
+}
+static void clocksource_check_watchdog(struct clocksource *cs)
+{
+        struct clocksource *cse;
+        unsigned long flags;
+
+        spin_lock_irqsave(&watchdog_lock, flags);
+        if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+                int started = !list_empty(&watchdog_list);
+
+                list_add(&cs->wd_list, &watchdog_list);
+                if (!started && watchdog) {
+                        watchdog_last = watchdog->read();
+                        watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
+                        add_timer(&watchdog_timer);
+                }
+        } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) {
+                cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+
+                if (!watchdog || cs->rating > watchdog->rating) {
+                        if (watchdog)
+                                del_timer(&watchdog_timer);
+                        watchdog = cs;
+                        init_timer(&watchdog_timer);
+                        watchdog_timer.function = clocksource_watchdog;
+
+                        /* Reset watchdog cycles */
+                        list_for_each_entry(cse, &watchdog_list, wd_list)
+                                cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
+                        /* Start if list is not empty */
+                        if (!list_empty(&watchdog_list)) {
+                                watchdog_last = watchdog->read();
+                                watchdog_timer.expires =
+                                        jiffies + WATCHDOG_INTERVAL;
+                                add_timer(&watchdog_timer);
+                        }
+                }
+        }
+        spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+#else
+static void clocksource_check_watchdog(struct clocksource *cs)
+{
+        if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
+                cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+}
+#endif
+
 /**
  * clocksource_get_next - Returns the selected clocksource
  *
@@ -84,60 +200,54 @@ struct clocksource *clocksource_get_next(void)
 }
 
 /**
- * select_clocksource - Finds the best registered clocksource.
+ * select_clocksource - Selects the best registered clocksource.
  *
  * Private function. Must hold clocksource_lock when called.
  *
- * Looks through the list of registered clocksources, returning
- * the one with the highest rating value. If there is a clocksource
- * name that matches the override string, it returns that clocksource.
+ * Select the clocksource with the best rating, or the clocksource,
+ * which is selected by userspace override.
  */
 static struct clocksource *select_clocksource(void)
 {
-        struct clocksource *best = NULL;
-        struct list_head *tmp;
+        struct clocksource *next;
 
-        list_for_each(tmp, &clocksource_list) {
-                struct clocksource *src;
+        if (list_empty(&clocksource_list))
+                return NULL;
 
-                src = list_entry(tmp, struct clocksource, list);
-                if (!best)
-                        best = src;
-
-                /* check for override: */
-                if (strlen(src->name) == strlen(override_name) &&
-                    !strcmp(src->name, override_name)) {
-                        best = src;
-                        break;
-                }
-                /* pick the highest rating: */
-                if (src->rating > best->rating)
-                        best = src;
-        }
+        if (clocksource_override)
+                next = clocksource_override;
+        else
+                next = list_entry(clocksource_list.next, struct clocksource,
+                                  list);
+
+        if (next == curr_clocksource)
+                return NULL;
 
-        return best;
+        return next;
 }
 
-/**
- * is_registered_source - Checks if clocksource is registered
- * @c: pointer to a clocksource
- *
- * Private helper function. Must hold clocksource_lock when called.
- *
- * Returns one if the clocksource is already registered, zero otherwise.
+/*
+ * Enqueue the clocksource sorted by rating
  */
-static int is_registered_source(struct clocksource *c)
+static int clocksource_enqueue(struct clocksource *c)
 {
-        int len = strlen(c->name);
-        struct list_head *tmp;
+        struct list_head *tmp, *entry = &clocksource_list;
 
         list_for_each(tmp, &clocksource_list) {
-                struct clocksource *src;
+                struct clocksource *cs;
 
-                src = list_entry(tmp, struct clocksource, list);
-                if (strlen(src->name) == len && !strcmp(src->name, c->name))
-                        return 1;
+                cs = list_entry(tmp, struct clocksource, list);
+                if (cs == c)
+                        return -EBUSY;
+                /* Keep track of the place, where to insert */
+                if (cs->rating >= c->rating)
+                        entry = tmp;
         }
+        list_add(&c->list, entry);
+
+        if (strlen(c->name) == strlen(override_name) &&
+            !strcmp(c->name, override_name))
+                clocksource_override = c;
 
         return 0;
 }
@@ -150,42 +260,35 @@ static int is_registered_source(struct clocksource *c)
  */
 int clocksource_register(struct clocksource *c)
 {
-        int ret = 0;
         unsigned long flags;
+        int ret;
 
         spin_lock_irqsave(&clocksource_lock, flags);
-        /* check if clocksource is already registered */
-        if (is_registered_source(c)) {
-                printk("register_clocksource: Cannot register %s. "
-                        "Already registered!", c->name);
-                ret = -EBUSY;
-        } else {
-                /* register it */
-                list_add(&c->list, &clocksource_list);
-                /* scan the registered clocksources, and pick the best one */
+        ret = clocksource_enqueue(c);
+        if (!ret)
                 next_clocksource = select_clocksource();
-        }
         spin_unlock_irqrestore(&clocksource_lock, flags);
+        if (!ret)
+                clocksource_check_watchdog(c);
         return ret;
 }
 EXPORT_SYMBOL(clocksource_register);
 
 /**
- * clocksource_reselect - Rescan list for next clocksource
+ * clocksource_change_rating - Change the rating of a registered clocksource
  *
- * A quick helper function to be used if a clocksource changes its
- * rating. Forces the clocksource list to be re-scanned for the best
- * clocksource.
  */
-void clocksource_reselect(void)
+void clocksource_change_rating(struct clocksource *cs, int rating)
 {
         unsigned long flags;
 
         spin_lock_irqsave(&clocksource_lock, flags);
+        list_del(&cs->list);
+        cs->rating = rating;
+        clocksource_enqueue(cs);
         next_clocksource = select_clocksource();
         spin_unlock_irqrestore(&clocksource_lock, flags);
 }
-EXPORT_SYMBOL(clocksource_reselect);
 
 #ifdef CONFIG_SYSFS
 /**
@@ -221,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
                                           const char *buf, size_t count)
 {
+        struct clocksource *ovr = NULL;
+        struct list_head *tmp;
         size_t ret = count;
+        int len;
+
         /* strings from sysfs write are not 0 terminated! */
         if (count >= sizeof(override_name))
                 return -EINVAL;
@@ -229,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
         /* strip of \n: */
         if (buf[count-1] == '\n')
                 count--;
-        if (count < 1)
-                return -EINVAL;
 
         spin_lock_irq(&clocksource_lock);
 
-        /* copy the name given: */
-        memcpy(override_name, buf, count);
+        if (count > 0)
+                memcpy(override_name, buf, count);
         override_name[count] = 0;
 
-        /* try to select it: */
-        next_clocksource = select_clocksource();
+        len = strlen(override_name);
+        if (len) {
+                ovr = clocksource_override;
+                /* try to select it: */
+                list_for_each(tmp, &clocksource_list) {
+                        struct clocksource *cs;
+
+                        cs = list_entry(tmp, struct clocksource, list);
+                        if (strlen(cs->name) == len &&
+                            !strcmp(cs->name, override_name))
+                                ovr = cs;
+                }
+        }
+
+        /* Reselect, when the override name has changed */
+        if (ovr != clocksource_override) {
+                clocksource_override = ovr;
+                next_clocksource = select_clocksource();
+        }
 
         spin_unlock_irq(&clocksource_lock);
 
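
For the watchdog added above, the interesting part is which flags a clocksource registers with. A minimal, purely illustrative registration that asks to be verified might look roughly like the sketch below; the clock name, frequency and shift are invented, and clocksource_hz2mult()/CLOCKSOURCE_MASK() are assumed to be the usual helpers from <linux/clocksource.h>.

/* Illustrative sketch only -- not part of this patch. */
#define FANCYCLOCK_FREQ 14318180        /* hypothetical counter frequency */

static cycle_t fancyclock_read(void)
{
        return (cycle_t) fancyclock_read_hw();  /* hypothetical hardware read */
}

static struct clocksource clocksource_fancyclock = {
        .name   = "fancyclock",
        .rating = 300,
        .read   = fancyclock_read,
        .mask   = CLOCKSOURCE_MASK(32),
        .shift  = 22,
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY,
};

static int __init fancyclock_clocksource_init(void)
{
        clocksource_fancyclock.mult =
                clocksource_hz2mult(FANCYCLOCK_FREQ,
                                    clocksource_fancyclock.shift);
        return clocksource_register(&clocksource_fancyclock);
}

With CLOCK_SOURCE_MUST_VERIFY set, clocksource_check_watchdog() puts the new source on watchdog_list; if it drifts by more than the threshold against the continuous reference, clocksource_ratewd() clears CLOCK_SOURCE_VALID_FOR_HRES and demotes it via clocksource_change_rating(cs, 0).
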
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a99b2a6e6a07..3be8da8fed7e 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = {
         .mask = 0xffffffff, /*32bits*/
         .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
         .shift = JIFFIES_SHIFT,
-        .is_continuous = 0, /* tick based, not free running */
 };
 
 static int __init init_jiffies_clocksource(void)
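
The dropped .is_continuous field is not lost information: as the clocksource.c changes above show, the property now lives in the flags word, so (as a sketch) a free-running clocksource would carry it as

        /* instead of:  .is_continuous = 1  */
        .flags = CLOCK_SOURCE_IS_CONTINUOUS,

while the jiffies clocksource, being tick based, simply leaves .flags clear.
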
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 3afeaa3a73f9..eb12509e00bd 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base;
 
 #define MAX_TICKADJ 500 /* microsecs */
 #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
-                                TICK_LENGTH_SHIFT) / HZ)
+                                TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ)
 
 /*
  * phase-lock loop variables
@@ -46,13 +46,17 @@ long time_adjust;
 
 static void ntp_update_frequency(void)
 {
-        tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
-        tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
-        tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
+        u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
+                                << TICK_LENGTH_SHIFT;
+        second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+        second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
 
-        do_div(tick_length_base, HZ);
+        tick_length_base = second_length;
 
-        tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
+        do_div(second_length, HZ);
+        tick_nsec = second_length >> TICK_LENGTH_SHIFT;
+
+        do_div(tick_length_base, NTP_INTERVAL_FREQ);
 }
 
 /**
@@ -162,7 +166,7 @@ void second_overflow(void)
                         tick_length -= MAX_TICKADJ_SCALED;
                 } else {
                         tick_length += (s64)(time_adjust * NSEC_PER_USEC /
-                                        HZ) << TICK_LENGTH_SHIFT;
+                                        NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT;
                         time_adjust = 0;
                 }
         }
@@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc)
                         result = -EINVAL;
                         goto leave;
                 }
-                time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC);
+                time_freq = ((s64)txc->freq * NSEC_PER_USEC)
+                                >> (SHIFT_USEC - SHIFT_NSEC);
         }
 
         if (txc->modes & ADJ_MAXERROR) {
@@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc)
                         freq_adj += time_freq;
                         freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
                         time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
-                        time_offset = (time_offset / HZ) << SHIFT_UPDATE;
+                        time_offset = (time_offset / NTP_INTERVAL_FREQ)
+                                        << SHIFT_UPDATE;
                 } /* STA_PLL */
         } /* txc->modes & ADJ_OFFSET */
         if (txc->modes & ADJ_TICK)
@@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
         if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
                 txc->offset = save_adjust;
         else
-                txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000;
-        txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC);
+                txc->offset = shift_right(time_offset, SHIFT_UPDATE)
+                                * NTP_INTERVAL_FREQ / 1000;
+        txc->freq = (time_freq / NSEC_PER_USEC)
+                                << (SHIFT_USEC - SHIFT_NSEC);
         txc->maxerror = time_maxerror;
         txc->esterror = time_esterror;
         txc->status = time_status;
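
To see what the reshuffled ntp_update_frequency() computes, a rough worked example (assuming the usual defaults of USER_HZ = 100 and tick_usec = 10000): second_length starts as tick_usec * NSEC_PER_USEC * USER_HZ = 10000 * 1000 * 100 = 1,000,000,000 ns, i.e. one second in TICK_LENGTH_SHIFT fixed point once shifted, plus the CLOCK_TICK_ADJUST and time_freq corrections. tick_nsec is still that second divided by HZ, while tick_length_base is now the same second divided by NTP_INTERVAL_FREQ. In configurations where NTP_INTERVAL_FREQ is simply HZ the result is unchanged; the point of the split is that the NTP accumulation interval no longer has to equal one timer tick.
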
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 000000000000..12b3efeb9f6f
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,480 @@
1/*
2 * linux/kernel/time/tick-broadcast.c
3 *
4 * This file contains functions which emulate a local clock-event
5 * device via a broadcast event source.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Broadcast support for broken x86 hardware, where the local apic
27 * timer stops in C3 state.
28 */
29
30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock);
33
34/*
35 * Debugging: see timer_list.c
36 */
37struct tick_device *tick_get_broadcast_device(void)
38{
39 return &tick_broadcast_device;
40}
41
42cpumask_t *tick_get_broadcast_mask(void)
43{
44 return &tick_broadcast_mask;
45}
46
47/*
48 * Start the device in periodic mode
49 */
50static void tick_broadcast_start_periodic(struct clock_event_device *bc)
51{
52 if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN)
53 tick_setup_periodic(bc, 1);
54}
55
56/*
57 * Check, if the device can be utilized as broadcast device:
58 */
59int tick_check_broadcast_device(struct clock_event_device *dev)
60{
61 if (tick_broadcast_device.evtdev ||
62 (dev->features & CLOCK_EVT_FEAT_C3STOP))
63 return 0;
64
65 clockevents_exchange_device(NULL, dev);
66 tick_broadcast_device.evtdev = dev;
67 if (!cpus_empty(tick_broadcast_mask))
68 tick_broadcast_start_periodic(dev);
69 return 1;
70}
71
72/*
73 * Check, if the device is the broadcast device
74 */
75int tick_is_broadcast_device(struct clock_event_device *dev)
76{
77 return (dev && tick_broadcast_device.evtdev == dev);
78}
79
80/*
81 * Check, if the device is dysfunctional and a placeholder, which
82 * needs to be handled by the broadcast device.
83 */
84int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
85{
86 unsigned long flags;
87 int ret = 0;
88
89 spin_lock_irqsave(&tick_broadcast_lock, flags);
90
91 /*
92 * Devices might be registered with both periodic and oneshot
93 * mode disabled. This signals, that the device needs to be
94 * operated from the broadcast device and is a placeholder for
95 * the cpu local device.
96 */
97 if (!tick_device_is_functional(dev)) {
98 dev->event_handler = tick_handle_periodic;
99 cpu_set(cpu, tick_broadcast_mask);
100 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
101 ret = 1;
102 }
103
104 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
105 return ret;
106}
107
108/*
109 * Broadcast the event to the cpus, which are set in the mask
110 */
111int tick_do_broadcast(cpumask_t mask)
112{
113 int ret = 0, cpu = smp_processor_id();
114 struct tick_device *td;
115
116 /*
117 * Check, if the current cpu is in the mask
118 */
119 if (cpu_isset(cpu, mask)) {
120 cpu_clear(cpu, mask);
121 td = &per_cpu(tick_cpu_device, cpu);
122 td->evtdev->event_handler(td->evtdev);
123 ret = 1;
124 }
125
126 if (!cpus_empty(mask)) {
127 /*
128 * It might be necessary to actually check whether the devices
129 * have different broadcast functions. For now, just use the
130 * one of the first device. This works as long as we have this
131 * misfeature only on x86 (lapic)
132 */
133 cpu = first_cpu(mask);
134 td = &per_cpu(tick_cpu_device, cpu);
135 td->evtdev->broadcast(mask);
136 ret = 1;
137 }
138 return ret;
139}
140
141/*
142 * Periodic broadcast:
143 * - invoke the broadcast handlers
144 */
145static void tick_do_periodic_broadcast(void)
146{
147 cpumask_t mask;
148
149 spin_lock(&tick_broadcast_lock);
150
151 cpus_and(mask, cpu_online_map, tick_broadcast_mask);
152 tick_do_broadcast(mask);
153
154 spin_unlock(&tick_broadcast_lock);
155}
156
157/*
158 * Event handler for periodic broadcast ticks
159 */
160static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
161{
162 dev->next_event.tv64 = KTIME_MAX;
163
164 tick_do_periodic_broadcast();
165
166 /*
167 * The device is in periodic mode. No reprogramming necessary:
168 */
169 if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
170 return;
171
172 /*
173 * Setup the next period for devices, which do not have
174 * periodic mode:
175 */
176 for (;;) {
177 ktime_t next = ktime_add(dev->next_event, tick_period);
178
179 if (!clockevents_program_event(dev, next, ktime_get()))
180 return;
181 tick_do_periodic_broadcast();
182 }
183}
184
185/*
186 * Powerstate information: The system enters/leaves a state, where
187 * affected devices might stop
188 */
189static void tick_do_broadcast_on_off(void *why)
190{
191 struct clock_event_device *bc, *dev;
192 struct tick_device *td;
193 unsigned long flags, *reason = why;
194 int cpu;
195
196 spin_lock_irqsave(&tick_broadcast_lock, flags);
197
198 cpu = smp_processor_id();
199 td = &per_cpu(tick_cpu_device, cpu);
200 dev = td->evtdev;
201 bc = tick_broadcast_device.evtdev;
202
203 /*
204 * Is the device in broadcast mode forever or is it not
205 * affected by the powerstate ?
206 */
207 if (!dev || !tick_device_is_functional(dev) ||
208 !(dev->features & CLOCK_EVT_FEAT_C3STOP))
209 goto out;
210
211 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) {
212 if (!cpu_isset(cpu, tick_broadcast_mask)) {
213 cpu_set(cpu, tick_broadcast_mask);
214 if (td->mode == TICKDEV_MODE_PERIODIC)
215 clockevents_set_mode(dev,
216 CLOCK_EVT_MODE_SHUTDOWN);
217 }
218 } else {
219 if (cpu_isset(cpu, tick_broadcast_mask)) {
220 cpu_clear(cpu, tick_broadcast_mask);
221 if (td->mode == TICKDEV_MODE_PERIODIC)
222 tick_setup_periodic(dev, 0);
223 }
224 }
225
226 if (cpus_empty(tick_broadcast_mask))
227 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
228 else {
229 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
230 tick_broadcast_start_periodic(bc);
231 else
232 tick_broadcast_setup_oneshot(bc);
233 }
234out:
235 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
236}
237
238/*
239 * Powerstate information: The system enters/leaves a state, where
240 * affected devices might stop.
241 */
242void tick_broadcast_on_off(unsigned long reason, int *oncpu)
243{
244 int cpu = get_cpu();
245
246 if (cpu == *oncpu)
247 tick_do_broadcast_on_off(&reason);
248 else
249 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
250 &reason, 1, 1);
251 put_cpu();
252}
253
254/*
255 * Set the periodic handler depending on broadcast on/off
256 */
257void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
258{
259 if (!broadcast)
260 dev->event_handler = tick_handle_periodic;
261 else
262 dev->event_handler = tick_handle_periodic_broadcast;
263}
264
265/*
266 * Remove a CPU from broadcasting
267 */
268void tick_shutdown_broadcast(unsigned int *cpup)
269{
270 struct clock_event_device *bc;
271 unsigned long flags;
272 unsigned int cpu = *cpup;
273
274 spin_lock_irqsave(&tick_broadcast_lock, flags);
275
276 bc = tick_broadcast_device.evtdev;
277 cpu_clear(cpu, tick_broadcast_mask);
278
279 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
280 if (bc && cpus_empty(tick_broadcast_mask))
281 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
282 }
283
284 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
285}
286
287#ifdef CONFIG_TICK_ONESHOT
288
289static cpumask_t tick_broadcast_oneshot_mask;
290
291/*
292 * Debugging: see timer_list.c
293 */
294cpumask_t *tick_get_broadcast_oneshot_mask(void)
295{
296 return &tick_broadcast_oneshot_mask;
297}
298
299static int tick_broadcast_set_event(ktime_t expires, int force)
300{
301 struct clock_event_device *bc = tick_broadcast_device.evtdev;
302 ktime_t now = ktime_get();
303 int res;
304
305 for(;;) {
306 res = clockevents_program_event(bc, expires, now);
307 if (!res || !force)
308 return res;
309 now = ktime_get();
310 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
311 }
312}
313
314/*
315 * Reprogram the broadcast device:
316 *
317 * Called with tick_broadcast_lock held and interrupts disabled.
318 */
319static int tick_broadcast_reprogram(void)
320{
321 ktime_t expires = { .tv64 = KTIME_MAX };
322 struct tick_device *td;
323 int cpu;
324
325 /*
326 * Find the event which expires next:
327 */
328 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
329 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
330 td = &per_cpu(tick_cpu_device, cpu);
331 if (td->evtdev->next_event.tv64 < expires.tv64)
332 expires = td->evtdev->next_event;
333 }
334
335 if (expires.tv64 == KTIME_MAX)
336 return 0;
337
338 return tick_broadcast_set_event(expires, 0);
339}
340
341/*
342 * Handle oneshot mode broadcasting
343 */
344static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
345{
346 struct tick_device *td;
347 cpumask_t mask;
348 ktime_t now;
349 int cpu;
350
351 spin_lock(&tick_broadcast_lock);
352again:
353 dev->next_event.tv64 = KTIME_MAX;
354 mask = CPU_MASK_NONE;
355 now = ktime_get();
356 /* Find all expired events */
357 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
358 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
359 td = &per_cpu(tick_cpu_device, cpu);
360 if (td->evtdev->next_event.tv64 <= now.tv64)
361 cpu_set(cpu, mask);
362 }
363
364 /*
365 * Wakeup the cpus which have an expired event. The broadcast
366 * device is reprogrammed in the return from idle code.
367 */
368 if (!tick_do_broadcast(mask)) {
369 /*
370 * The global event did not expire any CPU local
371 * events. This happens in dyntick mode, as the
372 * maximum PIT delta is quite small.
373 */
374 if (tick_broadcast_reprogram())
375 goto again;
376 }
377 spin_unlock(&tick_broadcast_lock);
378}
379
380/*
381 * Powerstate information: The system enters/leaves a state, where
382 * affected devices might stop
383 */
384void tick_broadcast_oneshot_control(unsigned long reason)
385{
386 struct clock_event_device *bc, *dev;
387 struct tick_device *td;
388 unsigned long flags;
389 int cpu;
390
391 spin_lock_irqsave(&tick_broadcast_lock, flags);
392
393 /*
394 * Periodic mode does not care about the enter/exit of power
395 * states
396 */
397 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
398 goto out;
399
400 bc = tick_broadcast_device.evtdev;
401 cpu = smp_processor_id();
402 td = &per_cpu(tick_cpu_device, cpu);
403 dev = td->evtdev;
404
405 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
406 goto out;
407
408 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
409 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
410 cpu_set(cpu, tick_broadcast_oneshot_mask);
411 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
412 if (dev->next_event.tv64 < bc->next_event.tv64)
413 tick_broadcast_set_event(dev->next_event, 1);
414 }
415 } else {
416 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
417 cpu_clear(cpu, tick_broadcast_oneshot_mask);
418 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
419 if (dev->next_event.tv64 != KTIME_MAX)
420 tick_program_event(dev->next_event, 1);
421 }
422 }
423
424out:
425 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
426}
427
428/**
429 * tick_broadcast_setup_oneshot - setup the broadcast device for oneshot mode
430 */
431void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
432{
433 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) {
434 bc->event_handler = tick_handle_oneshot_broadcast;
435 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
436 bc->next_event.tv64 = KTIME_MAX;
437 }
438}
439
440/*
441 * Select oneshot operating mode for the broadcast device
442 */
443void tick_broadcast_switch_to_oneshot(void)
444{
445 struct clock_event_device *bc;
446 unsigned long flags;
447
448 spin_lock_irqsave(&tick_broadcast_lock, flags);
449
450 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
451 bc = tick_broadcast_device.evtdev;
452 if (bc)
453 tick_broadcast_setup_oneshot(bc);
454 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
455}
456
457
458/*
459 * Remove a dead CPU from broadcasting
460 */
461void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
462{
463 struct clock_event_device *bc;
464 unsigned long flags;
465 unsigned int cpu = *cpup;
466
467 spin_lock_irqsave(&tick_broadcast_lock, flags);
468
469 bc = tick_broadcast_device.evtdev;
470 cpu_clear(cpu, tick_broadcast_oneshot_mask);
471
472 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
473 if (bc && cpus_empty(tick_broadcast_oneshot_mask))
474 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
475 }
476
477 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
478}
479
480#endif
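
The enter/exit hooks above are driven from whatever low-level idle code knows that the local timer is about to stop. A hedged sketch of such a caller follows; the C-state entry helper is hypothetical, and only the two clockevents_notify() calls are the interface this file actually reacts to.

/* Illustrative sketch of a deep C-state entry path -- not part of this patch. */
static void fancyidle_enter_c3(void)
{
        int cpu = smp_processor_id();

        /* the local timer may stop: let the broadcast device cover this CPU */
        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

        fancyidle_hw_enter_c3();        /* hypothetical hardware idle entry */

        /* local timer is ticking again: leave broadcast mode */
        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
}

On ENTER, tick_broadcast_oneshot_control() shuts the CPU-local device down and arms the broadcast device for the earliest pending expiry; on EXIT it switches the local device back to oneshot mode and reprograms its next event.
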
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
new file mode 100644
index 000000000000..4500e347f1bb
--- /dev/null
+++ b/kernel/time/tick-common.c
@@ -0,0 +1,346 @@
1/*
2 * linux/kernel/time/tick-common.c
3 *
4 * This file contains the base functions to manage periodic tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Tick devices
27 */
28DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
29/*
30 * Tick next event: keeps track of the tick time
31 */
32ktime_t tick_next_period;
33ktime_t tick_period;
34static int tick_do_timer_cpu = -1;
35DEFINE_SPINLOCK(tick_device_lock);
36
37/*
38 * Debugging: see timer_list.c
39 */
40struct tick_device *tick_get_device(int cpu)
41{
42 return &per_cpu(tick_cpu_device, cpu);
43}
44
45/**
46 * tick_is_oneshot_available - check for a oneshot capable event device
47 */
48int tick_is_oneshot_available(void)
49{
50 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
51
52 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
53}
54
55/*
56 * Periodic tick
57 */
58static void tick_periodic(int cpu)
59{
60 if (tick_do_timer_cpu == cpu) {
61 write_seqlock(&xtime_lock);
62
63 /* Keep track of the next tick event */
64 tick_next_period = ktime_add(tick_next_period, tick_period);
65
66 do_timer(1);
67 write_sequnlock(&xtime_lock);
68 }
69
70 update_process_times(user_mode(get_irq_regs()));
71 profile_tick(CPU_PROFILING);
72}
73
74/*
75 * Event handler for periodic ticks
76 */
77void tick_handle_periodic(struct clock_event_device *dev)
78{
79 int cpu = smp_processor_id();
80
81 tick_periodic(cpu);
82
83 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
84 return;
85 /*
86 * Setup the next period for devices, which do not have
87 * periodic mode:
88 */
89 for (;;) {
90 ktime_t next = ktime_add(dev->next_event, tick_period);
91
92 if (!clockevents_program_event(dev, next, ktime_get()))
93 return;
94 tick_periodic(cpu);
95 }
96}
97
98/*
99 * Setup the device for a periodic tick
100 */
101void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
102{
103 tick_set_periodic_handler(dev, broadcast);
104
105 /* Broadcast setup ? */
106 if (!tick_device_is_functional(dev))
107 return;
108
109 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
110 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
111 } else {
112 unsigned long seq;
113 ktime_t next;
114
115 do {
116 seq = read_seqbegin(&xtime_lock);
117 next = tick_next_period;
118 } while (read_seqretry(&xtime_lock, seq));
119
120 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
121
122 for (;;) {
123 if (!clockevents_program_event(dev, next, ktime_get()))
124 return;
125 next = ktime_add(next, tick_period);
126 }
127 }
128}
129
130/*
131 * Setup the tick device
132 */
133static void tick_setup_device(struct tick_device *td,
134 struct clock_event_device *newdev, int cpu,
135 cpumask_t cpumask)
136{
137 ktime_t next_event;
138 void (*handler)(struct clock_event_device *) = NULL;
139
140 /*
141 * First device setup ?
142 */
143 if (!td->evtdev) {
144 /*
145 * If no cpu took the do_timer update, assign it to
146 * this cpu:
147 */
148 if (tick_do_timer_cpu == -1) {
149 tick_do_timer_cpu = cpu;
150 tick_next_period = ktime_get();
151 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
152 }
153
154 /*
155 * Startup in periodic mode first.
156 */
157 td->mode = TICKDEV_MODE_PERIODIC;
158 } else {
159 handler = td->evtdev->event_handler;
160 next_event = td->evtdev->next_event;
161 }
162
163 td->evtdev = newdev;
164
165 /*
166 * When the device is not per cpu, pin the interrupt to the
167 * current cpu:
168 */
169 if (!cpus_equal(newdev->cpumask, cpumask))
170 irq_set_affinity(newdev->irq, cpumask);
171
172 /*
173 * When global broadcasting is active, check if the current
174 * device is registered as a placeholder for broadcast mode.
175 * This allows us to handle this x86 misfeature in a generic
176 * way.
177 */
178 if (tick_device_uses_broadcast(newdev, cpu))
179 return;
180
181 if (td->mode == TICKDEV_MODE_PERIODIC)
182 tick_setup_periodic(newdev, 0);
183 else
184 tick_setup_oneshot(newdev, handler, next_event);
185}
186
187/*
188 * Check, if the new registered device should be used.
189 */
190static int tick_check_new_device(struct clock_event_device *newdev)
191{
192 struct clock_event_device *curdev;
193 struct tick_device *td;
194 int cpu, ret = NOTIFY_OK;
195 unsigned long flags;
196 cpumask_t cpumask;
197
198 spin_lock_irqsave(&tick_device_lock, flags);
199
200 cpu = smp_processor_id();
201 if (!cpu_isset(cpu, newdev->cpumask))
202 goto out;
203
204 td = &per_cpu(tick_cpu_device, cpu);
205 curdev = td->evtdev;
206 cpumask = cpumask_of_cpu(cpu);
207
208 /* cpu local device ? */
209 if (!cpus_equal(newdev->cpumask, cpumask)) {
210
211 /*
212 * If the cpu affinity of the device interrupt can not
213 * be set, ignore it.
214 */
215 if (!irq_can_set_affinity(newdev->irq))
216 goto out_bc;
217
218 /*
219 * If we have a cpu local device already, do not replace it
220 * by a non cpu local device
221 */
222 if (curdev && cpus_equal(curdev->cpumask, cpumask))
223 goto out_bc;
224 }
225
226 /*
227 * If we have an active device, then check the rating and the oneshot
228 * feature.
229 */
230 if (curdev) {
231 /*
232 * Prefer one shot capable devices !
233 */
234 if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
235 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
236 goto out_bc;
237 /*
238 * Check the rating
239 */
240 if (curdev->rating >= newdev->rating)
241 goto out_bc;
242 }
243
244 /*
245 * Replace the eventually existing device by the new
246 * device. If the current device is the broadcast device, do
247 * not give it back to the clockevents layer !
248 */
249 if (tick_is_broadcast_device(curdev)) {
250 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
251 curdev = NULL;
252 }
253 clockevents_exchange_device(curdev, newdev);
254 tick_setup_device(td, newdev, cpu, cpumask);
255 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
256 tick_oneshot_notify();
257
258 spin_unlock_irqrestore(&tick_device_lock, flags);
259 return NOTIFY_STOP;
260
261out_bc:
262 /*
263 * Can the new device be used as a broadcast device ?
264 */
265 if (tick_check_broadcast_device(newdev))
266 ret = NOTIFY_STOP;
267out:
268 spin_unlock_irqrestore(&tick_device_lock, flags);
269
270 return ret;
271}
272
273/*
274 * Shutdown an event device on a given cpu:
275 *
276 * This is called on a live CPU, when a CPU is dead. So we cannot
277 * access the hardware device itself.
278 * We just set the mode and remove it from the lists.
279 */
280static void tick_shutdown(unsigned int *cpup)
281{
282 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
283 struct clock_event_device *dev = td->evtdev;
284 unsigned long flags;
285
286 spin_lock_irqsave(&tick_device_lock, flags);
287 td->mode = TICKDEV_MODE_PERIODIC;
288 if (dev) {
289 /*
290 * Prevent the clock events layer from trying to call
291 * the set_mode function!
292 */
293 dev->mode = CLOCK_EVT_MODE_UNUSED;
294 clockevents_exchange_device(dev, NULL);
295 td->evtdev = NULL;
296 }
297 spin_unlock_irqrestore(&tick_device_lock, flags);
298}
299
300/*
301 * Notification about clock event devices
302 */
303static int tick_notify(struct notifier_block *nb, unsigned long reason,
304 void *dev)
305{
306 switch (reason) {
307
308 case CLOCK_EVT_NOTIFY_ADD:
309 return tick_check_new_device(dev);
310
311 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
312 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
313 tick_broadcast_on_off(reason, dev);
314 break;
315
316 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
317 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
318 tick_broadcast_oneshot_control(reason);
319 break;
320
321 case CLOCK_EVT_NOTIFY_CPU_DEAD:
322 tick_shutdown_broadcast_oneshot(dev);
323 tick_shutdown_broadcast(dev);
324 tick_shutdown(dev);
325 break;
326
327 default:
328 break;
329 }
330
331 return NOTIFY_OK;
332}
333
334static struct notifier_block tick_notifier = {
335 .notifier_call = tick_notify,
336};
337
338/**
339 * tick_init - initialize the tick control
340 *
341 * Register the notifier with the clockevents framework
342 */
343void __init tick_init(void)
344{
345 clockevents_register_notifier(&tick_notifier);
346}
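
Putting the pieces of this file together, the path from a driver registering a clock event device to it becoming the per-CPU tick device is roughly the following (function names as in this patch, sketched as a call chain):

/*
 * clockevents_register_device(dev)
 *   -> clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev)
 *     -> tick_notify()                          registered by tick_init()
 *       -> tick_check_new_device(dev)
 *            not usable for this cpu, or worse than the current device:
 *              -> tick_check_broadcast_device(dev)   (maybe broadcast)
 *            otherwise:
 *              -> clockevents_exchange_device(curdev, dev)
 *              -> tick_setup_device()  -> periodic or oneshot setup
 */
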
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
new file mode 100644
index 000000000000..54861a0f29ff
--- /dev/null
+++ b/kernel/time/tick-internal.h
@@ -0,0 +1,110 @@
1/*
2 * tick internal variable and functions used by low/high res code
3 */
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period;
7extern ktime_t tick_period;
8
9extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
10extern void tick_handle_periodic(struct clock_event_device *dev);
11
12/*
13 * NO_HZ / high resolution timer shared code
14 */
15#ifdef CONFIG_TICK_ONESHOT
16extern void tick_setup_oneshot(struct clock_event_device *newdev,
17 void (*handler)(struct clock_event_device *),
18 ktime_t nextevt);
19extern int tick_program_event(ktime_t expires, int force);
20extern void tick_oneshot_notify(void);
21extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
22
23# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
24extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
25extern void tick_broadcast_oneshot_control(unsigned long reason);
26extern void tick_broadcast_switch_to_oneshot(void);
27extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
28# else /* BROADCAST */
29static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
30{
31 BUG();
32}
33static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
34static inline void tick_broadcast_switch_to_oneshot(void) { }
35static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
36# endif /* !BROADCAST */
37
38#else /* !ONESHOT */
39static inline
40void tick_setup_oneshot(struct clock_event_device *newdev,
41 void (*handler)(struct clock_event_device *),
42 ktime_t nextevt)
43{
44 BUG();
45}
46static inline int tick_program_event(ktime_t expires, int force)
47{
48 return 0;
49}
50static inline void tick_oneshot_notify(void) { }
51static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
52{
53 BUG();
54}
55static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
56static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
57#endif /* !TICK_ONESHOT */
58
59/*
60 * Broadcasting support
61 */
62#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
63extern int tick_do_broadcast(cpumask_t mask);
64
65extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
66extern int tick_check_broadcast_device(struct clock_event_device *dev);
67extern int tick_is_broadcast_device(struct clock_event_device *dev);
68extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
69extern void tick_shutdown_broadcast(unsigned int *cpup);
70
71extern void
72tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
73
74#else /* !BROADCAST */
75
76static inline int tick_check_broadcast_device(struct clock_event_device *dev)
77{
78 return 0;
79}
80
81static inline int tick_is_broadcast_device(struct clock_event_device *dev)
82{
83 return 0;
84}
85static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
86 int cpu)
87{
88 return 0;
89}
90static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
91static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
92static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
93
94/*
95 * Set the periodic handler in non broadcast mode
96 */
97static inline void tick_set_periodic_handler(struct clock_event_device *dev,
98 int broadcast)
99{
100 dev->event_handler = tick_handle_periodic;
101}
102#endif /* !BROADCAST */
103
104/*
105 * Check, if the device is functional or a dummy for broadcast
106 */
107static inline int tick_device_is_functional(struct clock_event_device *dev)
108{
109 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
110}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
new file mode 100644
index 000000000000..2e8b7ff863cc
--- /dev/null
+++ b/kernel/time/tick-oneshot.c
@@ -0,0 +1,84 @@
1/*
2 * linux/kernel/time/tick-oneshot.c
3 *
4 * This file contains functions which manage high resolution tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/**
26 * tick_program_event
27 */
28int tick_program_event(ktime_t expires, int force)
29{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get();
32
33 while (1) {
34 int ret = clockevents_program_event(dev, expires, now);
35
36 if (!ret || !force)
37 return ret;
38 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
40 }
41}
42
43/**
44 * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
45 */
46void tick_setup_oneshot(struct clock_event_device *newdev,
47 void (*handler)(struct clock_event_device *),
48 ktime_t next_event)
49{
50 newdev->event_handler = handler;
51 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
52 clockevents_program_event(newdev, next_event, ktime_get());
53}
54
55/**
56 * tick_switch_to_oneshot - switch to oneshot mode
57 */
58int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
59{
60 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
61 struct clock_event_device *dev = td->evtdev;
62
63 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
64 !tick_device_is_functional(dev))
65 return -EINVAL;
66
67 td->mode = TICKDEV_MODE_ONESHOT;
68 dev->event_handler = handler;
69 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
70 tick_broadcast_switch_to_oneshot();
71 return 0;
72}
73
74#ifdef CONFIG_HIGH_RES_TIMERS
75/**
76 * tick_init_highres - switch to high resolution mode
77 *
78 * Called with interrupts disabled.
79 */
80int tick_init_highres(void)
81{
82 return tick_switch_to_oneshot(hrtimer_interrupt);
83}
84#endif
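
tick_init_highres() is only half of the switch-over; the caller on the hrtimer side (outside this file) is expected to invoke it with interrupts disabled and to stay in low resolution mode when it fails. A hedged sketch of such a caller, with an invented name:

/* Illustrative sketch of the assumed caller -- not part of this file. */
static int fancy_switch_to_highres(void)
{
        unsigned long flags;
        int ret;

        local_irq_save(flags);
        ret = tick_init_highres();      /* installs hrtimer_interrupt as handler */
        if (ret) {
                local_irq_restore(flags);
                return ret;             /* stay in low resolution mode */
        }
        /* from here on the tick is delivered through hrtimer_interrupt() */
        local_irq_restore(flags);
        return 0;
}
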
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 000000000000..95e41f7f850b
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,563 @@
1/*
2 * linux/kernel/time/tick-sched.c
3 *
4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
7 *
8 * No idle tick implementation for low and high resolution timers
9 *
10 * Started by: Thomas Gleixner and Ingo Molnar
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/interrupt.h>
18#include <linux/kernel_stat.h>
19#include <linux/percpu.h>
20#include <linux/profile.h>
21#include <linux/sched.h>
22#include <linux/tick.h>
23
24#include "tick-internal.h"
25
26/*
27 * Per cpu nohz control structure
28 */
29static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
30
31/*
32 * The time, when the last jiffy update happened. Protected by xtime_lock.
33 */
34static ktime_t last_jiffies_update;
35
36struct tick_sched *tick_get_tick_sched(int cpu)
37{
38 return &per_cpu(tick_cpu_sched, cpu);
39}
40
41/*
42 * Must be called with interrupts disabled !
43 */
44static void tick_do_update_jiffies64(ktime_t now)
45{
46 unsigned long ticks = 0;
47 ktime_t delta;
48
49	/* Reevaluate with xtime_lock held */
50 write_seqlock(&xtime_lock);
51
52 delta = ktime_sub(now, last_jiffies_update);
53 if (delta.tv64 >= tick_period.tv64) {
54
55 delta = ktime_sub(delta, tick_period);
56 last_jiffies_update = ktime_add(last_jiffies_update,
57 tick_period);
58
59 /* Slow path for long timeouts */
60 if (unlikely(delta.tv64 >= tick_period.tv64)) {
61 s64 incr = ktime_to_ns(tick_period);
62
63 ticks = ktime_divns(delta, incr);
64
65 last_jiffies_update = ktime_add_ns(last_jiffies_update,
66 incr * ticks);
67 }
68 do_timer(++ticks);
69 }
70 write_sequnlock(&xtime_lock);
71}
72
73/*
74 * Initialize last_jiffies_update if necessary and return its current value.
75 */
76static ktime_t tick_init_jiffy_update(void)
77{
78 ktime_t period;
79
80 write_seqlock(&xtime_lock);
81 /* Did we start the jiffies update yet ? */
82 if (last_jiffies_update.tv64 == 0)
83 last_jiffies_update = tick_next_period;
84 period = last_jiffies_update;
85 write_sequnlock(&xtime_lock);
86 return period;
87}
88
89/*
90 * NOHZ - aka dynamic tick functionality
91 */
92#ifdef CONFIG_NO_HZ
93/*
94 * Is NO_HZ enabled ?
95 */
96static int tick_nohz_enabled __read_mostly = 1;
97
98/*
99 * Enable / Disable tickless mode
100 */
101static int __init setup_tick_nohz(char *str)
102{
103 if (!strcmp(str, "off"))
104 tick_nohz_enabled = 0;
105 else if (!strcmp(str, "on"))
106 tick_nohz_enabled = 1;
107 else
108 return 0;
109 return 1;
110}
111
112__setup("nohz=", setup_tick_nohz);
113
114/**
115 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
116 *
117 * Called from interrupt entry when the CPU was idle
118 *
119 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
120 * must be updated. Otherwise an interrupt handler could use a stale jiffy
121 * value. We do this unconditionally on any cpu, as we don't know whether the
122 * cpu which has the update task assigned is in a long sleep.
123 */
124void tick_nohz_update_jiffies(void)
125{
126 int cpu = smp_processor_id();
127 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
128 unsigned long flags;
129 ktime_t now;
130
131 if (!ts->tick_stopped)
132 return;
133
134 cpu_clear(cpu, nohz_cpu_mask);
135 now = ktime_get();
136
137 local_irq_save(flags);
138 tick_do_update_jiffies64(now);
139 local_irq_restore(flags);
140}
141
142/**
143 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
144 *
145 * When the next event is more than a tick into the future, stop the idle tick
146 * Called either from the idle loop or from irq_exit() when an idle period was
147 * just interrupted by an interrupt which did not cause a reschedule.
148 */
149void tick_nohz_stop_sched_tick(void)
150{
151 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
152 struct tick_sched *ts;
153 ktime_t last_update, expires, now, delta;
154 int cpu;
155
156 local_irq_save(flags);
157
158 cpu = smp_processor_id();
159 ts = &per_cpu(tick_cpu_sched, cpu);
160
161 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
162 goto end;
163
164 if (need_resched())
165 goto end;
166
167 cpu = smp_processor_id();
168 BUG_ON(local_softirq_pending());
169
170 now = ktime_get();
171 /*
172 * When called from irq_exit we need to account the idle sleep time
173 * correctly.
174 */
175 if (ts->tick_stopped) {
176 delta = ktime_sub(now, ts->idle_entrytime);
177 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
178 }
179
180 ts->idle_entrytime = now;
181 ts->idle_calls++;
182
183 /* Read jiffies and the time when jiffies were updated last */
184 do {
185 seq = read_seqbegin(&xtime_lock);
186 last_update = last_jiffies_update;
187 last_jiffies = jiffies;
188 } while (read_seqretry(&xtime_lock, seq));
189
190 /* Get the next timer wheel timer */
191 next_jiffies = get_next_timer_interrupt(last_jiffies);
192 delta_jiffies = next_jiffies - last_jiffies;
193
194 /*
195 * Do not stop the tick, if we are only one off
196 * or if the cpu is required for rcu
197 */
198 if (!ts->tick_stopped && (delta_jiffies == 1 || rcu_needs_cpu(cpu)))
199 goto out;
200
201	/* Schedule the tick, if we are at least one jiffy off */
202 if ((long)delta_jiffies >= 1) {
203
204 if (rcu_needs_cpu(cpu))
205 delta_jiffies = 1;
206 else
207 cpu_set(cpu, nohz_cpu_mask);
208 /*
209 * nohz_stop_sched_tick can be called several times before
210 * the nohz_restart_sched_tick is called. This happens when
211 * interrupts arrive which do not cause a reschedule. In the
212 * first call we save the current tick time, so we can restart
213 * the scheduler tick in nohz_restart_sched_tick.
214 */
215 if (!ts->tick_stopped) {
216 ts->idle_tick = ts->sched_timer.expires;
217 ts->tick_stopped = 1;
218 ts->idle_jiffies = last_jiffies;
219 }
220 /*
221 * calculate the expiry time for the next timer wheel
222 * timer
223 */
224 expires = ktime_add_ns(last_update, tick_period.tv64 *
225 delta_jiffies);
226 ts->idle_expires = expires;
227 ts->idle_sleeps++;
228
229 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
230 hrtimer_start(&ts->sched_timer, expires,
231 HRTIMER_MODE_ABS);
232 /* Check, if the timer was already in the past */
233 if (hrtimer_active(&ts->sched_timer))
234 goto out;
235		} else if (!tick_program_event(expires, 0))
236 goto out;
237 /*
238 * We are past the event already. So we crossed a
239		 * jiffy boundary. Update jiffies and raise the
240 * softirq.
241 */
242 tick_do_update_jiffies64(ktime_get());
243 cpu_clear(cpu, nohz_cpu_mask);
244 }
245 raise_softirq_irqoff(TIMER_SOFTIRQ);
246out:
247 ts->next_jiffies = next_jiffies;
248 ts->last_jiffies = last_jiffies;
249end:
250 local_irq_restore(flags);
251}
252
253/**
254 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
255 *
256 * Restart the idle tick when the CPU is woken up from idle
257 */
258void tick_nohz_restart_sched_tick(void)
259{
260 int cpu = smp_processor_id();
261 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
262 unsigned long ticks;
263 ktime_t now, delta;
264
265 if (!ts->tick_stopped)
266 return;
267
268 /* Update jiffies first */
269 now = ktime_get();
270
271 local_irq_disable();
272 tick_do_update_jiffies64(now);
273 cpu_clear(cpu, nohz_cpu_mask);
274
275 /* Account the idle time */
276 delta = ktime_sub(now, ts->idle_entrytime);
277 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
278
279 /*
280	 * We stopped the tick in idle. update_process_times() would miss the
281	 * time we slept, as it only accounts a single tick per call.
282	 * Enforce that this time is accounted to idle!
283 */
284 ticks = jiffies - ts->idle_jiffies;
285 /*
286 * We might be one off. Do not randomly account a huge number of ticks!
287 */
288 if (ticks && ticks < LONG_MAX) {
289 add_preempt_count(HARDIRQ_OFFSET);
290 account_system_time(current, HARDIRQ_OFFSET,
291 jiffies_to_cputime(ticks));
292 sub_preempt_count(HARDIRQ_OFFSET);
293 }
294
295 /*
296 * Cancel the scheduled timer and restore the tick
297 */
298 ts->tick_stopped = 0;
299 hrtimer_cancel(&ts->sched_timer);
300 ts->sched_timer.expires = ts->idle_tick;
301
302 while (1) {
303 /* Forward the time to expire in the future */
304 hrtimer_forward(&ts->sched_timer, now, tick_period);
305
306 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
307 hrtimer_start(&ts->sched_timer,
308 ts->sched_timer.expires,
309 HRTIMER_MODE_ABS);
310 /* Check, if the timer was already in the past */
311 if (hrtimer_active(&ts->sched_timer))
312 break;
313 } else {
314 if (!tick_program_event(ts->sched_timer.expires, 0))
315 break;
316 }
317 /* Update jiffies and reread time */
318 tick_do_update_jiffies64(now);
319 now = ktime_get();
320 }
321 local_irq_enable();
322}
323
324static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
325{
326 hrtimer_forward(&ts->sched_timer, now, tick_period);
327 return tick_program_event(ts->sched_timer.expires, 0);
328}
329
330/*
331 * The nohz low res interrupt handler
332 */
333static void tick_nohz_handler(struct clock_event_device *dev)
334{
335 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
336 struct pt_regs *regs = get_irq_regs();
337 ktime_t now = ktime_get();
338
339 dev->next_event.tv64 = KTIME_MAX;
340
341 /* Check, if the jiffies need an update */
342 tick_do_update_jiffies64(now);
343
344 /*
345 * When we are idle and the tick is stopped, we have to touch
346 * the watchdog as we might not schedule for a really long
347 * time. This happens on complete idle SMP systems while
348 * waiting on the login prompt. We also increment the "start
349 * of idle" jiffy stamp so the idle accounting adjustment we
350	 * do when we go busy again does not account too many ticks.
351 */
352 if (ts->tick_stopped) {
353 touch_softlockup_watchdog();
354 ts->idle_jiffies++;
355 }
356
357 update_process_times(user_mode(regs));
358 profile_tick(CPU_PROFILING);
359
360 /* Do not restart, when we are in the idle loop */
361 if (ts->tick_stopped)
362 return;
363
364 while (tick_nohz_reprogram(ts, now)) {
365 now = ktime_get();
366 tick_do_update_jiffies64(now);
367 }
368}
369
370/**
371 * tick_nohz_switch_to_nohz - switch to nohz mode
372 */
373static void tick_nohz_switch_to_nohz(void)
374{
375 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
376 ktime_t next;
377
378 if (!tick_nohz_enabled)
379 return;
380
381 local_irq_disable();
382 if (tick_switch_to_oneshot(tick_nohz_handler)) {
383 local_irq_enable();
384 return;
385 }
386
387 ts->nohz_mode = NOHZ_MODE_LOWRES;
388
389 /*
390 * Recycle the hrtimer in ts, so we can share the
391 * hrtimer_forward with the highres code.
392 */
393 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
394 /* Get the next period */
395 next = tick_init_jiffy_update();
396
397 for (;;) {
398 ts->sched_timer.expires = next;
399 if (!tick_program_event(next, 0))
400 break;
401 next = ktime_add(next, tick_period);
402 }
403 local_irq_enable();
404
405	printk(KERN_INFO "Switched to NOHZ mode on CPU #%d\n",
406 smp_processor_id());
407}
408
409#else
410
411static inline void tick_nohz_switch_to_nohz(void) { }
412
413#endif /* NO_HZ */
414
415/*
416 * High resolution timer specific code
417 */
418#ifdef CONFIG_HIGH_RES_TIMERS
419/*
420 * We rearm the timer until we get disabled by the idle code
421 * Called with interrupts disabled and timer->base->cpu_base->lock held.
422 */
423static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
424{
425 struct tick_sched *ts =
426 container_of(timer, struct tick_sched, sched_timer);
427 struct hrtimer_cpu_base *base = timer->base->cpu_base;
428 struct pt_regs *regs = get_irq_regs();
429 ktime_t now = ktime_get();
430
431 /* Check, if the jiffies need an update */
432 tick_do_update_jiffies64(now);
433
434 /*
435	 * Do not call update_process_times() when we are not in irq
436	 * context and have no valid regs pointer
437 */
438 if (regs) {
439 /*
440 * When we are idle and the tick is stopped, we have to touch
441 * the watchdog as we might not schedule for a really long
442 * time. This happens on complete idle SMP systems while
443 * waiting on the login prompt. We also increment the "start of
444 * idle" jiffy stamp so the idle accounting adjustment we do
445		 * when we go busy again does not account too many ticks.
446 */
447 if (ts->tick_stopped) {
448 touch_softlockup_watchdog();
449 ts->idle_jiffies++;
450 }
451 /*
452 * update_process_times() might take tasklist_lock, hence
453 * drop the base lock. sched-tick hrtimers are per-CPU and
454 * never accessible by userspace APIs, so this is safe to do.
455 */
456 spin_unlock(&base->lock);
457 update_process_times(user_mode(regs));
458 profile_tick(CPU_PROFILING);
459 spin_lock(&base->lock);
460 }
461
462 /* Do not restart, when we are in the idle loop */
463 if (ts->tick_stopped)
464 return HRTIMER_NORESTART;
465
466 hrtimer_forward(timer, now, tick_period);
467
468 return HRTIMER_RESTART;
469}
470
471/**
472 * tick_setup_sched_timer - setup the tick emulation timer
473 */
474void tick_setup_sched_timer(void)
475{
476 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
477 ktime_t now = ktime_get();
478
479 /*
480 * Emulate tick processing via per-CPU hrtimers:
481 */
482 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
483 ts->sched_timer.function = tick_sched_timer;
484 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
485
486 /* Get the next period */
487 ts->sched_timer.expires = tick_init_jiffy_update();
488
489 for (;;) {
490 hrtimer_forward(&ts->sched_timer, now, tick_period);
491 hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
492 HRTIMER_MODE_ABS);
493 /* Check, if the timer was already in the past */
494 if (hrtimer_active(&ts->sched_timer))
495 break;
496 now = ktime_get();
497 }
498
499#ifdef CONFIG_NO_HZ
500 if (tick_nohz_enabled)
501 ts->nohz_mode = NOHZ_MODE_HIGHRES;
502#endif
503}
504
505void tick_cancel_sched_timer(int cpu)
506{
507 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
508
509 if (ts->sched_timer.base)
510 hrtimer_cancel(&ts->sched_timer);
511 ts->tick_stopped = 0;
512 ts->nohz_mode = NOHZ_MODE_INACTIVE;
513}
514#endif /* HIGH_RES_TIMERS */
515
516/**
517 * Async notification about clocksource changes
518 */
519void tick_clock_notify(void)
520{
521 int cpu;
522
523 for_each_possible_cpu(cpu)
524 set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
525}
526
527/*
528 * Async notification about clock event changes
529 */
530void tick_oneshot_notify(void)
531{
532 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
533
534 set_bit(0, &ts->check_clocks);
535}
536
537/**
538 * Check whether a change happened which makes oneshot mode possible.
539 *
540 * Called cyclically from the hrtimer softirq (driven by the timer
541 * softirq). allow_nohz signals that we can switch into low-res nohz
542 * mode, because high resolution timers are disabled (either at
543 * compile time or at run time).
544 */
545int tick_check_oneshot_change(int allow_nohz)
546{
547 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
548
549 if (!test_and_clear_bit(0, &ts->check_clocks))
550 return 0;
551
552 if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
553 return 0;
554
555 if (!timekeeping_is_continuous() || !tick_is_oneshot_available())
556 return 0;
557
558 if (!allow_nohz)
559 return 1;
560
561 tick_nohz_switch_to_nohz();
562 return 0;
563}
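
The catch-up arithmetic in tick_do_update_jiffies64() is what keeps jiffies correct across a long NOHZ idle sleep: one tick is consumed whenever the accumulated delta reaches tick_period, and any additional whole periods are folded in with a single division rather than a loop. A stand-alone sketch of that arithmetic, assuming an example HZ of 250 and plain signed 64-bit nanoseconds in place of ktime_t:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000LL
#define HZ		250			/* assumed example tick rate */
#define TICK_PERIOD_NS	(NSEC_PER_SEC / HZ)

static int64_t last_jiffies_update_ns;		/* models last_jiffies_update */
static uint64_t jiffies;

/* Mirrors the shape of tick_do_update_jiffies64(): consume one period,
 * then fold any remaining whole periods in a single division (slow path). */
static void update_jiffies64(int64_t now)
{
	int64_t delta = now - last_jiffies_update_ns;
	uint64_t ticks = 0;

	if (delta < TICK_PERIOD_NS)
		return;

	delta -= TICK_PERIOD_NS;
	last_jiffies_update_ns += TICK_PERIOD_NS;

	if (delta >= TICK_PERIOD_NS) {		/* long idle sleep */
		ticks = delta / TICK_PERIOD_NS;
		last_jiffies_update_ns += ticks * TICK_PERIOD_NS;
	}
	jiffies += ++ticks;			/* stands in for do_timer(++ticks) */
}

int main(void)
{
	/* Pretend we were idle for roughly 42 tick periods. */
	update_jiffies64(42 * TICK_PERIOD_NS + 123);
	printf("jiffies advanced by %llu\n", (unsigned long long)jiffies);
	return 0;
}

The sketch advances jiffies by 42 for the simulated sleep, the number of whole periods covered, which is the same count the real path hands to do_timer(++ticks).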
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
new file mode 100644
index 000000000000..f82c635c3d5c
--- /dev/null
+++ b/kernel/time/timer_list.c
@@ -0,0 +1,287 @@
1/*
2 * kernel/time/timer_list.c
3 *
4 * List pending timers
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/module.h>
15#include <linux/spinlock.h>
16#include <linux/sched.h>
17#include <linux/seq_file.h>
18#include <linux/kallsyms.h>
19#include <linux/tick.h>
20
21#include <asm/uaccess.h>
22
23typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
24
25DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26
27/*
28 * This allows printing both to /proc/timer_list and
29 * to the console (on SysRq-Q):
30 */
31#define SEQ_printf(m, x...) \
32 do { \
33 if (m) \
34 seq_printf(m, x); \
35 else \
36 printk(x); \
37 } while (0)
38
39static void print_name_offset(struct seq_file *m, void *sym)
40{
41 unsigned long addr = (unsigned long)sym;
42 char namebuf[KSYM_NAME_LEN+1];
43 unsigned long size, offset;
44 const char *sym_name;
45 char *modname;
46
47 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
48 if (sym_name)
49 SEQ_printf(m, "%s", sym_name);
50 else
51 SEQ_printf(m, "<%p>", sym);
52}
53
54static void
55print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
56{
57#ifdef CONFIG_TIMER_STATS
58 char tmp[TASK_COMM_LEN + 1];
59#endif
60 SEQ_printf(m, " #%d: ", idx);
61 print_name_offset(m, timer);
62 SEQ_printf(m, ", ");
63 print_name_offset(m, timer->function);
64 SEQ_printf(m, ", S:%02lx", timer->state);
65#ifdef CONFIG_TIMER_STATS
66 SEQ_printf(m, ", ");
67 print_name_offset(m, timer->start_site);
68 memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
69 tmp[TASK_COMM_LEN] = 0;
70 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
71#endif
72 SEQ_printf(m, "\n");
73 SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n",
74 (unsigned long long)ktime_to_ns(timer->expires),
75 (unsigned long long)(ktime_to_ns(timer->expires) - now));
76}
77
78static void
79print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
80 u64 now)
81{
82 struct hrtimer *timer, tmp;
83 unsigned long next = 0, i;
84 struct rb_node *curr;
85 unsigned long flags;
86
87next_one:
88 i = 0;
89 spin_lock_irqsave(&base->cpu_base->lock, flags);
90
91 curr = base->first;
92 /*
93 * Crude but we have to do this O(N*N) thing, because
94 * we have to unlock the base when printing:
95 */
96 while (curr && i < next) {
97 curr = rb_next(curr);
98 i++;
99 }
100
101 if (curr) {
102
103 timer = rb_entry(curr, struct hrtimer, node);
104 tmp = *timer;
105 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
106
107 print_timer(m, &tmp, i, now);
108 next++;
109 goto next_one;
110 }
111 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
112}
113
114static void
115print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
116{
117 SEQ_printf(m, " .index: %d\n",
118 base->index);
119 SEQ_printf(m, " .resolution: %Ld nsecs\n",
120 (unsigned long long)ktime_to_ns(base->resolution));
121 SEQ_printf(m, " .get_time: ");
122 print_name_offset(m, base->get_time);
123 SEQ_printf(m, "\n");
124#ifdef CONFIG_HIGH_RES_TIMERS
125 SEQ_printf(m, " .offset: %Ld nsecs\n",
126 ktime_to_ns(base->offset));
127#endif
128 SEQ_printf(m, "active timers:\n");
129 print_active_timers(m, base, now);
130}
131
132static void print_cpu(struct seq_file *m, int cpu, u64 now)
133{
134 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
135 int i;
136
137 SEQ_printf(m, "\ncpu: %d\n", cpu);
138 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
139 SEQ_printf(m, " clock %d:\n", i);
140 print_base(m, cpu_base->clock_base + i, now);
141 }
142#define P(x) \
143 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x))
144#define P_ns(x) \
145 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
146 (u64)(ktime_to_ns(cpu_base->x)))
147
148#ifdef CONFIG_HIGH_RES_TIMERS
149 P_ns(expires_next);
150 P(hres_active);
151 P(nr_events);
152#endif
153#undef P
154#undef P_ns
155
156#ifdef CONFIG_TICK_ONESHOT
157# define P(x) \
158 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x))
159# define P_ns(x) \
160 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
161 (u64)(ktime_to_ns(ts->x)))
162 {
163 struct tick_sched *ts = tick_get_tick_sched(cpu);
164 P(nohz_mode);
165 P_ns(idle_tick);
166 P(tick_stopped);
167 P(idle_jiffies);
168 P(idle_calls);
169 P(idle_sleeps);
170 P_ns(idle_entrytime);
171 P_ns(idle_sleeptime);
172 P(last_jiffies);
173 P(next_jiffies);
174 P_ns(idle_expires);
175 SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies);
176 }
177#endif
178
179#undef P
180#undef P_ns
181}
182
183#ifdef CONFIG_GENERIC_CLOCKEVENTS
184static void
185print_tickdevice(struct seq_file *m, struct tick_device *td)
186{
187 struct clock_event_device *dev = td->evtdev;
188
189 SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode);
190
191 SEQ_printf(m, "Clock Event Device: ");
192 if (!dev) {
193 SEQ_printf(m, "<NULL>\n");
194 return;
195 }
196 SEQ_printf(m, "%s\n", dev->name);
197 SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns);
198 SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns);
199 SEQ_printf(m, " mult: %ld\n", dev->mult);
200 SEQ_printf(m, " shift: %d\n", dev->shift);
201 SEQ_printf(m, " mode: %d\n", dev->mode);
202 SEQ_printf(m, " next_event: %Ld nsecs\n",
203 (unsigned long long) ktime_to_ns(dev->next_event));
204
205 SEQ_printf(m, " set_next_event: ");
206 print_name_offset(m, dev->set_next_event);
207 SEQ_printf(m, "\n");
208
209 SEQ_printf(m, " set_mode: ");
210 print_name_offset(m, dev->set_mode);
211 SEQ_printf(m, "\n");
212
213 SEQ_printf(m, " event_handler: ");
214 print_name_offset(m, dev->event_handler);
215 SEQ_printf(m, "\n");
216}
217
218static void timer_list_show_tickdevices(struct seq_file *m)
219{
220 int cpu;
221
222#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
223 print_tickdevice(m, tick_get_broadcast_device());
224 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
225 tick_get_broadcast_mask()->bits[0]);
226#ifdef CONFIG_TICK_ONESHOT
227 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
228 tick_get_broadcast_oneshot_mask()->bits[0]);
229#endif
230 SEQ_printf(m, "\n");
231#endif
232 for_each_online_cpu(cpu)
233 print_tickdevice(m, tick_get_device(cpu));
234 SEQ_printf(m, "\n");
235}
236#else
237static void timer_list_show_tickdevices(struct seq_file *m) { }
238#endif
239
240static int timer_list_show(struct seq_file *m, void *v)
241{
242 u64 now = ktime_to_ns(ktime_get());
243 int cpu;
244
245 SEQ_printf(m, "Timer List Version: v0.3\n");
246 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
247 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
248
249 for_each_online_cpu(cpu)
250 print_cpu(m, cpu, now);
251
252 SEQ_printf(m, "\n");
253 timer_list_show_tickdevices(m);
254
255 return 0;
256}
257
258void sysrq_timer_list_show(void)
259{
260 timer_list_show(NULL, NULL);
261}
262
263static int timer_list_open(struct inode *inode, struct file *filp)
264{
265 return single_open(filp, timer_list_show, NULL);
266}
267
268static struct file_operations timer_list_fops = {
269 .open = timer_list_open,
270 .read = seq_read,
271 .llseek = seq_lseek,
272 .release = seq_release,
273};
274
275static int __init init_timer_list_procfs(void)
276{
277 struct proc_dir_entry *pe;
278
279 pe = create_proc_entry("timer_list", 0644, NULL);
280 if (!pe)
281 return -ENOMEM;
282
283 pe->proc_fops = &timer_list_fops;
284
285 return 0;
286}
287__initcall(init_timer_list_procfs);
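
SEQ_printf() above is a small but load-bearing detail of timer_list.c: the same print helpers feed both the /proc/timer_list seq_file and the SysRq-Q console dump, selected only by whether a seq_file pointer was passed in. The dual-output idea can be sketched in user space as follows, with a FILE * standing in for the seq_file and stderr standing in for printk(); nothing here is kernel API:

#include <stdio.h>

/* Same idea as SEQ_printf(): write to the given stream when there is one,
 * otherwise fall back to the "console" (stderr here, printk in the kernel). */
#define SEQ_printf(m, ...)				\
	do {						\
		if (m)					\
			fprintf(m, __VA_ARGS__);	\
		else					\
			fprintf(stderr, __VA_ARGS__);	\
	} while (0)

/* A print helper in the style of print_timer(): it never needs to know
 * which of the two sinks it is writing to. */
static void print_timer_stub(FILE *m, int idx, long long expires_ns)
{
	SEQ_printf(m, " #%d: expires at %lld nsecs\n", idx, expires_ns);
}

int main(void)
{
	print_timer_stub(stdout, 0, 1000000);	/* "seq_file" path */
	print_timer_stub(NULL, 1, 2000000);	/* "SysRq-Q" / console path */
	return 0;
}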
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644
index 000000000000..1bc4882e28e0
--- /dev/null
+++ b/kernel/time/timer_stats.c
@@ -0,0 +1,411 @@
1/*
2 * kernel/time/timer_stats.c
3 *
4 * Collect timer usage statistics.
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * timer_stats is based on timer_top, a similar functionality which was part of
10 * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
11 * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
12 * on dynamic allocation of the statistics entries and linear search based
13 * lookup combined with a global lock, rather than the static array, hash
14 * and per-CPU locking which is used by timer_stats. It was written for the
15 * pre hrtimer kernel code and therefore did not take hrtimers into account.
16 * Nevertheless it provided the base for the timer_stats implementation and
17 * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
18 * for this effort.
19 *
20 * timer_top.c is
21 * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
22 * Written by Daniel Petrini <d.pensator@gmail.com>
23 * timer_top.c was released under the GNU General Public License version 2
24 *
25 * We export the addresses and call counts of the timer functions being
26 * called, plus the pid and cmdline of the owner process if applicable.
27 *
28 * Start/stop data collection:
29 * # echo 1 >/proc/timer_stats   (echo 0 to stop)
30 *
31 * Display the information collected so far:
32 * # cat /proc/timer_stats
33 *
34 * This program is free software; you can redistribute it and/or modify
35 * it under the terms of the GNU General Public License version 2 as
36 * published by the Free Software Foundation.
37 */
38
39#include <linux/proc_fs.h>
40#include <linux/module.h>
41#include <linux/spinlock.h>
42#include <linux/sched.h>
43#include <linux/seq_file.h>
44#include <linux/kallsyms.h>
45
46#include <asm/uaccess.h>
47
48/*
49 * This is our basic unit of interest: a timer expiry event identified
50 * by the timer, its start/expire functions and the PID of the task that
51 * started the timer. We count the number of times an event happens:
52 */
53struct entry {
54 /*
55 * Hash list:
56 */
57 struct entry *next;
58
59 /*
60 * Hash keys:
61 */
62 void *timer;
63 void *start_func;
64 void *expire_func;
65 pid_t pid;
66
67 /*
68 * Number of timeout events:
69 */
70 unsigned long count;
71
72 /*
73 * We save the command-line string to preserve
74 * this information past task exit:
75 */
76 char comm[TASK_COMM_LEN + 1];
77
78} ____cacheline_aligned_in_smp;
79
80/*
81 * Spinlock protecting the tables - not taken during lookup:
82 */
83static DEFINE_SPINLOCK(table_lock);
84
85/*
86 * Per-CPU lookup locks for fast hash lookup:
87 */
88static DEFINE_PER_CPU(spinlock_t, lookup_lock);
89
90/*
91 * Mutex to serialize state changes with show-stats activities:
92 */
93static DEFINE_MUTEX(show_mutex);
94
95/*
96 * Collection status, active/inactive:
97 */
98static int __read_mostly active;
99
100/*
101 * Beginning/end timestamps of measurement:
102 */
103static ktime_t time_start, time_stop;
104
105/*
106 * tstat entry structs only get allocated while collection is
107 * active and never freed during that time - this simplifies
108 * things quite a bit.
109 *
110 * They get freed when a new collection period is started.
111 */
112#define MAX_ENTRIES_BITS 10
113#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
114
115static unsigned long nr_entries;
116static struct entry entries[MAX_ENTRIES];
117
118static atomic_t overflow_count;
119
120static void reset_entries(void)
121{
122 nr_entries = 0;
123 memset(entries, 0, sizeof(entries));
124 atomic_set(&overflow_count, 0);
125}
126
127static struct entry *alloc_entry(void)
128{
129 if (nr_entries >= MAX_ENTRIES)
130 return NULL;
131
132 return entries + nr_entries++;
133}
134
135/*
136 * The entries are in a hash-table, for fast lookup:
137 */
138#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
139#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
140#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
141
142#define __tstat_hashfn(entry) \
143 (((unsigned long)(entry)->timer ^ \
144 (unsigned long)(entry)->start_func ^ \
145 (unsigned long)(entry)->expire_func ^ \
146 (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
147
148#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
149
150static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
151
152static int match_entries(struct entry *entry1, struct entry *entry2)
153{
154 return entry1->timer == entry2->timer &&
155 entry1->start_func == entry2->start_func &&
156 entry1->expire_func == entry2->expire_func &&
157 entry1->pid == entry2->pid;
158}
159
160/*
161 * Look up whether an entry matching this item is present
162 * in the hash already. Must be called with irqs off and the
163 * lookup lock held:
164 */
165static struct entry *tstat_lookup(struct entry *entry, char *comm)
166{
167 struct entry **head, *curr, *prev;
168
169 head = tstat_hashentry(entry);
170 curr = *head;
171
172 /*
173 * The fastpath is when the entry is already hashed,
174 * we do this with the lookup lock held, but with the
175 * table lock not held:
176 */
177 while (curr) {
178 if (match_entries(curr, entry))
179 return curr;
180
181 curr = curr->next;
182 }
183 /*
184 * Slowpath: allocate, set up and link a new hash entry:
185 */
186 prev = NULL;
187 curr = *head;
188
189 spin_lock(&table_lock);
190 /*
191 * Make sure we have not raced with another CPU:
192 */
193 while (curr) {
194 if (match_entries(curr, entry))
195 goto out_unlock;
196
197 prev = curr;
198 curr = curr->next;
199 }
200
201 curr = alloc_entry();
202 if (curr) {
203 *curr = *entry;
204 curr->count = 0;
205 memcpy(curr->comm, comm, TASK_COMM_LEN);
206 if (prev)
207 prev->next = curr;
208 else
209 *head = curr;
210 curr->next = NULL;
211 }
212 out_unlock:
213 spin_unlock(&table_lock);
214
215 return curr;
216}
217
218/**
219 * timer_stats_update_stats - Update the statistics for a timer.
220 * @timer: pointer to either a timer_list or a hrtimer
221 * @pid: the pid of the task which set up the timer
222 * @startf: pointer to the function which did the timer setup
223 * @timerf: pointer to the timer callback function of the timer
224 * @comm: name of the process which set up the timer
225 *
226 * When the timer is already registered, then the event counter is
227 * incremented. Otherwise the timer is registered in a free slot.
228 */
229void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
230 void *timerf, char * comm)
231{
232 /*
233	 * It doesn't matter which lock we take:
234 */
235 spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
236 struct entry *entry, input;
237 unsigned long flags;
238
239 input.timer = timer;
240 input.start_func = startf;
241 input.expire_func = timerf;
242 input.pid = pid;
243
244 spin_lock_irqsave(lock, flags);
245 if (!active)
246 goto out_unlock;
247
248 entry = tstat_lookup(&input, comm);
249 if (likely(entry))
250 entry->count++;
251 else
252 atomic_inc(&overflow_count);
253
254 out_unlock:
255 spin_unlock_irqrestore(lock, flags);
256}
257
258static void print_name_offset(struct seq_file *m, unsigned long addr)
259{
260 char namebuf[KSYM_NAME_LEN+1];
261 unsigned long size, offset;
262 const char *sym_name;
263 char *modname;
264
265 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
266 if (sym_name)
267 seq_printf(m, "%s", sym_name);
268 else
269 seq_printf(m, "<%p>", (void *)addr);
270}
271
272static int tstats_show(struct seq_file *m, void *v)
273{
274 struct timespec period;
275 struct entry *entry;
276 unsigned long ms;
277 long events = 0;
278 ktime_t time;
279 int i;
280
281 mutex_lock(&show_mutex);
282 /*
283 * If still active then calculate up to now:
284 */
285 if (active)
286 time_stop = ktime_get();
287
288 time = ktime_sub(time_stop, time_start);
289
290 period = ktime_to_timespec(time);
291 ms = period.tv_nsec / 1000000;
292
293 seq_puts(m, "Timer Stats Version: v0.1\n");
294 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
295 if (atomic_read(&overflow_count))
296 seq_printf(m, "Overflow: %d entries\n",
297 atomic_read(&overflow_count));
298
299 for (i = 0; i < nr_entries; i++) {
300 entry = entries + i;
301 seq_printf(m, "%4lu, %5d %-16s ",
302 entry->count, entry->pid, entry->comm);
303
304 print_name_offset(m, (unsigned long)entry->start_func);
305 seq_puts(m, " (");
306 print_name_offset(m, (unsigned long)entry->expire_func);
307 seq_puts(m, ")\n");
308
309 events += entry->count;
310 }
311
312 ms += period.tv_sec * 1000;
313 if (!ms)
314 ms = 1;
315
316 if (events && period.tv_sec)
317 seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events,
318 events / period.tv_sec, events * 1000 / ms);
319 else
320 seq_printf(m, "%ld total events\n", events);
321
322 mutex_unlock(&show_mutex);
323
324 return 0;
325}
326
327/*
328 * After a state change, make sure all concurrent lookup/update
329 * activities have stopped:
330 */
331static void sync_access(void)
332{
333 unsigned long flags;
334 int cpu;
335
336 for_each_online_cpu(cpu) {
337 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
338 /* nothing */
339 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
340 }
341}
342
343static ssize_t tstats_write(struct file *file, const char __user *buf,
344 size_t count, loff_t *offs)
345{
346 char ctl[2];
347
348 if (count != 2 || *offs)
349 return -EINVAL;
350
351 if (copy_from_user(ctl, buf, count))
352 return -EFAULT;
353
354 mutex_lock(&show_mutex);
355 switch (ctl[0]) {
356 case '0':
357 if (active) {
358 active = 0;
359 time_stop = ktime_get();
360 sync_access();
361 }
362 break;
363 case '1':
364 if (!active) {
365 reset_entries();
366 time_start = ktime_get();
367 active = 1;
368 }
369 break;
370 default:
371 count = -EINVAL;
372 }
373 mutex_unlock(&show_mutex);
374
375 return count;
376}
377
378static int tstats_open(struct inode *inode, struct file *filp)
379{
380 return single_open(filp, tstats_show, NULL);
381}
382
383static struct file_operations tstats_fops = {
384 .open = tstats_open,
385 .read = seq_read,
386 .write = tstats_write,
387 .llseek = seq_lseek,
388 .release = seq_release,
389};
390
391void __init init_timer_stats(void)
392{
393 int cpu;
394
395 for_each_possible_cpu(cpu)
396 spin_lock_init(&per_cpu(lookup_lock, cpu));
397}
398
399static int __init init_tstats_procfs(void)
400{
401 struct proc_dir_entry *pe;
402
403 pe = create_proc_entry("timer_stats", 0644, NULL);
404 if (!pe)
405 return -ENOMEM;
406
407 pe->proc_fops = &tstats_fops;
408
409 return 0;
410}
411__initcall(init_tstats_procfs);
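
timer_stats keys its hash on the XOR of the timer address, the start and expire functions and the pid, masked down to the table size; that is what keeps the fast path of tstat_lookup(), which runs under only the per-cpu lookup lock, cheap. A user-space sketch of the key derivation is below; the struct is trimmed to the hash keys, pid is a plain int rather than pid_t, and the example values are invented for the demonstration:

#include <stdint.h>
#include <stdio.h>

#define MAX_ENTRIES_BITS 10
#define TSTAT_HASH_BITS	 (MAX_ENTRIES_BITS - 1)
#define TSTAT_HASH_SIZE	 (1UL << TSTAT_HASH_BITS)
#define TSTAT_HASH_MASK	 (TSTAT_HASH_SIZE - 1)

/* Only the hash keys of struct entry matter for this sketch. */
struct entry_keys {
	void *timer;
	void *start_func;
	void *expire_func;
	int   pid;			/* pid_t in the kernel */
};

/* Same computation as __tstat_hashfn(): XOR the keys, mask to the table size. */
static unsigned long tstat_hashfn(const struct entry_keys *e)
{
	return ((unsigned long)(uintptr_t)e->timer ^
		(unsigned long)(uintptr_t)e->start_func ^
		(unsigned long)(uintptr_t)e->expire_func ^
		(unsigned long)e->pid) & TSTAT_HASH_MASK;
}

int main(void)
{
	/* Invented example values, not taken from a real kernel. */
	struct entry_keys e = {
		.timer		= (void *)(uintptr_t)0x1040,
		.start_func	= (void *)(uintptr_t)0x2200,
		.expire_func	= (void *)(uintptr_t)0x3310,
		.pid		= 4711,
	};

	printf("bucket %lu of %lu\n", tstat_hashfn(&e), TSTAT_HASH_SIZE);
	return 0;
}

Because the key includes the pid and both function pointers, repeated expiries of the same timer started from the same place by the same task land on one entry and simply bump its count, which is what /proc/timer_stats reports.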