aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/delayacct.c1
-rw-r--r--kernel/hrtimer.c57
-rw-r--r--kernel/kfifo.c2
-rw-r--r--kernel/marker.c930
-rw-r--r--kernel/module.c18
-rw-r--r--kernel/perf_counter.c394
-rw-r--r--kernel/posix-timers.c35
-rw-r--r--kernel/power/console.c63
-rw-r--r--kernel/profile.c45
-rw-r--r--kernel/sched_clock.c122
-rw-r--r--kernel/sched_fair.c1
-rw-r--r--kernel/time.c9
-rw-r--r--kernel/time/clocksource.c529
-rw-r--r--kernel/time/jiffies.c6
-rw-r--r--kernel/time/ntp.c7
-rw-r--r--kernel/time/timekeeping.c535
-rw-r--r--kernel/timer.c28
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/ftrace.c23
-rw-r--r--kernel/trace/power-traces.c20
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c49
-rw-r--r--kernel/trace/trace.h3
-rw-r--r--kernel/trace/trace_entries.h17
-rw-r--r--kernel/trace/trace_event_profile.c82
-rw-r--r--kernel/trace/trace_events.c49
-rw-r--r--kernel/trace/trace_power.c218
-rw-r--r--kernel/trace/trace_printk.c1
-rw-r--r--kernel/trace/trace_syscalls.c97
30 files changed, 1295 insertions, 2051 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 3d9c7e27e3f9..7c9b0a585502 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -87,7 +87,6 @@ obj-$(CONFIG_RELAY) += relay.o
87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
89obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 89obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
90obj-$(CONFIG_MARKERS) += marker.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 90obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 91obj-$(CONFIG_LATENCYTOP) += latencytop.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 92obj-$(CONFIG_FUNCTION_TRACER) += trace/
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/taskstats.h>
18#include <linux/time.h> 19#include <linux/time.h>
19#include <linux/sysctl.h> 20#include <linux/sysctl.h>
20#include <linux/delayacct.h> 21#include <linux/delayacct.h>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 05071bf6a37b..c03f221fee44 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,37 +48,6 @@
48 48
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
50 50
51/**
52 * ktime_get - get the monotonic time in ktime_t format
53 *
54 * returns the time in ktime_t format
55 */
56ktime_t ktime_get(void)
57{
58 struct timespec now;
59
60 ktime_get_ts(&now);
61
62 return timespec_to_ktime(now);
63}
64EXPORT_SYMBOL_GPL(ktime_get);
65
66/**
67 * ktime_get_real - get the real (wall-) time in ktime_t format
68 *
69 * returns the time in ktime_t format
70 */
71ktime_t ktime_get_real(void)
72{
73 struct timespec now;
74
75 getnstimeofday(&now);
76
77 return timespec_to_ktime(now);
78}
79
80EXPORT_SYMBOL_GPL(ktime_get_real);
81
82/* 51/*
83 * The timer bases: 52 * The timer bases:
84 * 53 *
@@ -106,31 +75,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
106 } 75 }
107}; 76};
108 77
109/**
110 * ktime_get_ts - get the monotonic clock in timespec format
111 * @ts: pointer to timespec variable
112 *
113 * The function calculates the monotonic clock from the realtime
114 * clock and the wall_to_monotonic offset and stores the result
115 * in normalized timespec format in the variable pointed to by @ts.
116 */
117void ktime_get_ts(struct timespec *ts)
118{
119 struct timespec tomono;
120 unsigned long seq;
121
122 do {
123 seq = read_seqbegin(&xtime_lock);
124 getnstimeofday(ts);
125 tomono = wall_to_monotonic;
126
127 } while (read_seqretry(&xtime_lock, seq));
128
129 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
130 ts->tv_nsec + tomono.tv_nsec);
131}
132EXPORT_SYMBOL_GPL(ktime_get_ts);
133
134/* 78/*
135 * Get the coarse grained time at the softirq based on xtime and 79 * Get the coarse grained time at the softirq based on xtime and
136 * wall_to_monotonic. 80 * wall_to_monotonic.
@@ -1155,7 +1099,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1155 clock_id = CLOCK_MONOTONIC; 1099 clock_id = CLOCK_MONOTONIC;
1156 1100
1157 timer->base = &cpu_base->clock_base[clock_id]; 1101 timer->base = &cpu_base->clock_base[clock_id];
1158 INIT_LIST_HEAD(&timer->cb_entry);
1159 hrtimer_init_timer_hres(timer); 1102 hrtimer_init_timer_hres(timer);
1160 1103
1161#ifdef CONFIG_TIMER_STATS 1104#ifdef CONFIG_TIMER_STATS
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 26539e3228e5..3765ff3c1bbe 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
117 * writer, you don't need extra locking to use these functions. 117 * writer, you don't need extra locking to use these functions.
118 */ 118 */
119unsigned int __kfifo_put(struct kfifo *fifo, 119unsigned int __kfifo_put(struct kfifo *fifo,
120 unsigned char *buffer, unsigned int len) 120 const unsigned char *buffer, unsigned int len)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
1/*
2 * Copyright (C) 2007 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/marker.h>
25#include <linux/err.h>
26#include <linux/slab.h>
27
28extern struct marker __start___markers[];
29extern struct marker __stop___markers[];
30
31/* Set to 1 to enable marker debug output */
32static const int marker_debug;
33
34/*
35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
36 * and module markers and the hash table.
37 */
38static DEFINE_MUTEX(markers_mutex);
39
40/*
41 * Marker hash table, containing the active markers.
42 * Protected by module_mutex.
43 */
44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
47
48/*
49 * Note about RCU :
50 * It is used to make sure every handler has finished using its private data
51 * between two consecutive operation (add or remove) on a given marker. It is
52 * also used to delay the free of multiple probes array until a quiescent state
53 * is reached.
54 * marker entries modifications are protected by the markers_mutex.
55 */
56struct marker_entry {
57 struct hlist_node hlist;
58 char *format;
59 /* Probe wrapper */
60 void (*call)(const struct marker *mdata, void *call_private, ...);
61 struct marker_probe_closure single;
62 struct marker_probe_closure *multi;
63 int refcount; /* Number of times armed. 0 if disarmed. */
64 struct rcu_head rcu;
65 void *oldptr;
66 int rcu_pending;
67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
69 char name[0]; /* Contains name'\0'format'\0' */
70};
71
72/**
73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data
75 * @call_private: call site private data
76 * @fmt: format string
77 * @...: variable argument list
78 *
79 * Empty callback provided as a probe to the markers. By providing this to a
80 * disabled marker, we make sure the execution flow is always valid even
81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code.
83 */
84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args)
86{
87}
88EXPORT_SYMBOL_GPL(__mark_empty_function);
89
90/*
91 * marker_probe_cb Callback that prepares the variable argument list for probes.
92 * @mdata: pointer of type struct marker
93 * @call_private: caller site private data
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107 * rcu_read_lock_sched does two things : disabling preemption to make
108 * sure the teardown of the callbacks can be done correctly when they
109 * are in modules and they insure RCU read coherency.
110 */
111 rcu_read_lock_sched_notrace();
112 ptype = mdata->ptype;
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = mdata->single.func;
119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, call_private);
123 func(mdata->single.probe_private, call_private, mdata->format,
124 &args);
125 va_end(args);
126 } else {
127 struct marker_probe_closure *multi;
128 int i;
129 /*
130 * Read mdata->ptype before mdata->multi.
131 */
132 smp_rmb();
133 multi = mdata->multi;
134 /*
135 * multi points to an array, therefore accessing the array
136 * depends on reading multi. However, even in this case,
137 * we must insure that the pointer is read _before_ the array
138 * data. Same as rcu_dereference, but we need a full smp_rmb()
139 * in the fast path, so put the explicit barrier here.
140 */
141 smp_read_barrier_depends();
142 for (i = 0; multi[i].func; i++) {
143 va_start(args, call_private);
144 multi[i].func(multi[i].probe_private, call_private,
145 mdata->format, &args);
146 va_end(args);
147 }
148 }
149 rcu_read_unlock_sched_notrace();
150}
151EXPORT_SYMBOL_GPL(marker_probe_cb);
152
153/*
154 * marker_probe_cb Callback that does not prepare the variable argument list.
155 * @mdata: pointer of type struct marker
156 * @call_private: caller site private data
157 * @...: Variable argument list.
158 *
159 * Should be connected to markers "MARK_NOARGS".
160 */
161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
163{
164 va_list args; /* not initialized */
165 char ptype;
166
167 rcu_read_lock_sched_notrace();
168 ptype = mdata->ptype;
169 if (likely(!ptype)) {
170 marker_probe_func *func;
171 /* Must read the ptype before ptr. They are not data dependant,
172 * so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func = mdata->single.func;
175 /* Must read the ptr before private data. They are not data
176 * dependant, so we put an explicit smp_rmb() here. */
177 smp_rmb();
178 func(mdata->single.probe_private, call_private, mdata->format,
179 &args);
180 } else {
181 struct marker_probe_closure *multi;
182 int i;
183 /*
184 * Read mdata->ptype before mdata->multi.
185 */
186 smp_rmb();
187 multi = mdata->multi;
188 /*
189 * multi points to an array, therefore accessing the array
190 * depends on reading multi. However, even in this case,
191 * we must insure that the pointer is read _before_ the array
192 * data. Same as rcu_dereference, but we need a full smp_rmb()
193 * in the fast path, so put the explicit barrier here.
194 */
195 smp_read_barrier_depends();
196 for (i = 0; multi[i].func; i++)
197 multi[i].func(multi[i].probe_private, call_private,
198 mdata->format, &args);
199 }
200 rcu_read_unlock_sched_notrace();
201}
202
203static void free_old_closure(struct rcu_head *head)
204{
205 struct marker_entry *entry = container_of(head,
206 struct marker_entry, rcu);
207 kfree(entry->oldptr);
208 /* Make sure we free the data before setting the pending flag to 0 */
209 smp_wmb();
210 entry->rcu_pending = 0;
211}
212
213static void debug_print_probes(struct marker_entry *entry)
214{
215 int i;
216
217 if (!marker_debug)
218 return;
219
220 if (!entry->ptype) {
221 printk(KERN_DEBUG "Single probe : %p %p\n",
222 entry->single.func,
223 entry->single.probe_private);
224 } else {
225 for (i = 0; entry->multi[i].func; i++)
226 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
227 entry->multi[i].func,
228 entry->multi[i].probe_private);
229 }
230}
231
232static struct marker_probe_closure *
233marker_entry_add_probe(struct marker_entry *entry,
234 marker_probe_func *probe, void *probe_private)
235{
236 int nr_probes = 0;
237 struct marker_probe_closure *old, *new;
238
239 WARN_ON(!probe);
240
241 debug_print_probes(entry);
242 old = entry->multi;
243 if (!entry->ptype) {
244 if (entry->single.func == probe &&
245 entry->single.probe_private == probe_private)
246 return ERR_PTR(-EBUSY);
247 if (entry->single.func == __mark_empty_function) {
248 /* 0 -> 1 probes */
249 entry->single.func = probe;
250 entry->single.probe_private = probe_private;
251 entry->refcount = 1;
252 entry->ptype = 0;
253 debug_print_probes(entry);
254 return NULL;
255 } else {
256 /* 1 -> 2 probes */
257 nr_probes = 1;
258 old = NULL;
259 }
260 } else {
261 /* (N -> N+1), (N != 0, 1) probes */
262 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
263 if (old[nr_probes].func == probe
264 && old[nr_probes].probe_private
265 == probe_private)
266 return ERR_PTR(-EBUSY);
267 }
268 /* + 2 : one for new probe, one for NULL func */
269 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
270 GFP_KERNEL);
271 if (new == NULL)
272 return ERR_PTR(-ENOMEM);
273 if (!old)
274 new[0] = entry->single;
275 else
276 memcpy(new, old,
277 nr_probes * sizeof(struct marker_probe_closure));
278 new[nr_probes].func = probe;
279 new[nr_probes].probe_private = probe_private;
280 entry->refcount = nr_probes + 1;
281 entry->multi = new;
282 entry->ptype = 1;
283 debug_print_probes(entry);
284 return old;
285}
286
287static struct marker_probe_closure *
288marker_entry_remove_probe(struct marker_entry *entry,
289 marker_probe_func *probe, void *probe_private)
290{
291 int nr_probes = 0, nr_del = 0, i;
292 struct marker_probe_closure *old, *new;
293
294 old = entry->multi;
295
296 debug_print_probes(entry);
297 if (!entry->ptype) {
298 /* 0 -> N is an error */
299 WARN_ON(entry->single.func == __mark_empty_function);
300 /* 1 -> 0 probes */
301 WARN_ON(probe && entry->single.func != probe);
302 WARN_ON(entry->single.probe_private != probe_private);
303 entry->single.func = __mark_empty_function;
304 entry->refcount = 0;
305 entry->ptype = 0;
306 debug_print_probes(entry);
307 return NULL;
308 } else {
309 /* (N -> M), (N > 1, M >= 0) probes */
310 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
311 if ((!probe || old[nr_probes].func == probe)
312 && old[nr_probes].probe_private
313 == probe_private)
314 nr_del++;
315 }
316 }
317
318 if (nr_probes - nr_del == 0) {
319 /* N -> 0, (N > 1) */
320 entry->single.func = __mark_empty_function;
321 entry->refcount = 0;
322 entry->ptype = 0;
323 } else if (nr_probes - nr_del == 1) {
324 /* N -> 1, (N > 1) */
325 for (i = 0; old[i].func; i++)
326 if ((probe && old[i].func != probe) ||
327 old[i].probe_private != probe_private)
328 entry->single = old[i];
329 entry->refcount = 1;
330 entry->ptype = 0;
331 } else {
332 int j = 0;
333 /* N -> M, (N > 1, M > 1) */
334 /* + 1 for NULL */
335 new = kzalloc((nr_probes - nr_del + 1)
336 * sizeof(struct marker_probe_closure), GFP_KERNEL);
337 if (new == NULL)
338 return ERR_PTR(-ENOMEM);
339 for (i = 0; old[i].func; i++)
340 if ((probe && old[i].func != probe) ||
341 old[i].probe_private != probe_private)
342 new[j++] = old[i];
343 entry->refcount = nr_probes - nr_del;
344 entry->ptype = 1;
345 entry->multi = new;
346 }
347 debug_print_probes(entry);
348 return old;
349}
350
351/*
352 * Get marker if the marker is present in the marker hash table.
353 * Must be called with markers_mutex held.
354 * Returns NULL if not present.
355 */
356static struct marker_entry *get_marker(const char *name)
357{
358 struct hlist_head *head;
359 struct hlist_node *node;
360 struct marker_entry *e;
361 u32 hash = jhash(name, strlen(name), 0);
362
363 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
364 hlist_for_each_entry(e, node, head, hlist) {
365 if (!strcmp(name, e->name))
366 return e;
367 }
368 return NULL;
369}
370
371/*
372 * Add the marker to the marker hash table. Must be called with markers_mutex
373 * held.
374 */
375static struct marker_entry *add_marker(const char *name, const char *format)
376{
377 struct hlist_head *head;
378 struct hlist_node *node;
379 struct marker_entry *e;
380 size_t name_len = strlen(name) + 1;
381 size_t format_len = 0;
382 u32 hash = jhash(name, name_len-1, 0);
383
384 if (format)
385 format_len = strlen(format) + 1;
386 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
387 hlist_for_each_entry(e, node, head, hlist) {
388 if (!strcmp(name, e->name)) {
389 printk(KERN_NOTICE
390 "Marker %s busy\n", name);
391 return ERR_PTR(-EBUSY); /* Already there */
392 }
393 }
394 /*
395 * Using kmalloc here to allocate a variable length element. Could
396 * cause some memory fragmentation if overused.
397 */
398 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
399 GFP_KERNEL);
400 if (!e)
401 return ERR_PTR(-ENOMEM);
402 memcpy(&e->name[0], name, name_len);
403 if (format) {
404 e->format = &e->name[name_len];
405 memcpy(e->format, format, format_len);
406 if (strcmp(e->format, MARK_NOARGS) == 0)
407 e->call = marker_probe_cb_noarg;
408 else
409 e->call = marker_probe_cb;
410 trace_mark(core_marker_format, "name %s format %s",
411 e->name, e->format);
412 } else {
413 e->format = NULL;
414 e->call = marker_probe_cb;
415 }
416 e->single.func = __mark_empty_function;
417 e->single.probe_private = NULL;
418 e->multi = NULL;
419 e->ptype = 0;
420 e->format_allocated = 0;
421 e->refcount = 0;
422 e->rcu_pending = 0;
423 hlist_add_head(&e->hlist, head);
424 return e;
425}
426
427/*
428 * Remove the marker from the marker hash table. Must be called with mutex_lock
429 * held.
430 */
431static int remove_marker(const char *name)
432{
433 struct hlist_head *head;
434 struct hlist_node *node;
435 struct marker_entry *e;
436 int found = 0;
437 size_t len = strlen(name) + 1;
438 u32 hash = jhash(name, len-1, 0);
439
440 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
441 hlist_for_each_entry(e, node, head, hlist) {
442 if (!strcmp(name, e->name)) {
443 found = 1;
444 break;
445 }
446 }
447 if (!found)
448 return -ENOENT;
449 if (e->single.func != __mark_empty_function)
450 return -EBUSY;
451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
454 /* Make sure the call_rcu has been executed */
455 if (e->rcu_pending)
456 rcu_barrier_sched();
457 kfree(e);
458 return 0;
459}
460
461/*
462 * Set the mark_entry format to the format found in the element.
463 */
464static int marker_set_format(struct marker_entry *entry, const char *format)
465{
466 entry->format = kstrdup(format, GFP_KERNEL);
467 if (!entry->format)
468 return -ENOMEM;
469 entry->format_allocated = 1;
470
471 trace_mark(core_marker_format, "name %s format %s",
472 entry->name, entry->format);
473 return 0;
474}
475
476/*
477 * Sets the probe callback corresponding to one marker.
478 */
479static int set_marker(struct marker_entry *entry, struct marker *elem,
480 int active)
481{
482 int ret = 0;
483 WARN_ON(strcmp(entry->name, elem->name) != 0);
484
485 if (entry->format) {
486 if (strcmp(entry->format, elem->format) != 0) {
487 printk(KERN_NOTICE
488 "Format mismatch for probe %s "
489 "(%s), marker (%s)\n",
490 entry->name,
491 entry->format,
492 elem->format);
493 return -EPERM;
494 }
495 } else {
496 ret = marker_set_format(entry, elem->format);
497 if (ret)
498 return ret;
499 }
500
501 /*
502 * probe_cb setup (statically known) is done here. It is
503 * asynchronous with the rest of execution, therefore we only
504 * pass from a "safe" callback (with argument) to an "unsafe"
505 * callback (does not set arguments).
506 */
507 elem->call = entry->call;
508 /*
509 * Sanity check :
510 * We only update the single probe private data when the ptr is
511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
512 */
513 WARN_ON(elem->single.func != __mark_empty_function
514 && elem->single.probe_private != entry->single.probe_private
515 && !elem->ptype);
516 elem->single.probe_private = entry->single.probe_private;
517 /*
518 * Make sure the private data is valid when we update the
519 * single probe ptr.
520 */
521 smp_wmb();
522 elem->single.func = entry->single.func;
523 /*
524 * We also make sure that the new probe callbacks array is consistent
525 * before setting a pointer to it.
526 */
527 rcu_assign_pointer(elem->multi, entry->multi);
528 /*
529 * Update the function or multi probe array pointer before setting the
530 * ptype.
531 */
532 smp_wmb();
533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
565 elem->state = active;
566
567 return ret;
568}
569
570/*
571 * Disable a marker and its probe callback.
572 * Note: only waiting an RCU period after setting elem->call to the empty
573 * function insures that the original callback is not used anymore. This insured
574 * by rcu_read_lock_sched around the call site.
575 */
576static void disable_marker(struct marker *elem)
577{
578 int ret;
579
580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
596 elem->state = 0;
597 elem->single.func = __mark_empty_function;
598 /* Update the function before setting the ptype */
599 smp_wmb();
600 elem->ptype = 0; /* single probe */
601 /*
602 * Leave the private data and id there, because removal is racy and
603 * should be done only after an RCU period. These are never used until
604 * the next initialization anyway.
605 */
606}
607
608/**
609 * marker_update_probe_range - Update a probe range
610 * @begin: beginning of the range
611 * @end: end of the range
612 *
613 * Updates the probe callback corresponding to a range of markers.
614 */
615void marker_update_probe_range(struct marker *begin,
616 struct marker *end)
617{
618 struct marker *iter;
619 struct marker_entry *mark_entry;
620
621 mutex_lock(&markers_mutex);
622 for (iter = begin; iter < end; iter++) {
623 mark_entry = get_marker(iter->name);
624 if (mark_entry) {
625 set_marker(mark_entry, iter, !!mark_entry->refcount);
626 /*
627 * ignore error, continue
628 */
629 } else {
630 disable_marker(iter);
631 }
632 }
633 mutex_unlock(&markers_mutex);
634}
635
636/*
637 * Update probes, removing the faulty probes.
638 *
639 * Internal callback only changed before the first probe is connected to it.
640 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
641 * transitions. All other transitions will leave the old private data valid.
642 * This makes the non-atomicity of the callback/private data updates valid.
643 *
644 * "special case" updates :
645 * 0 -> 1 callback
646 * 1 -> 0 callback
647 * 1 -> 2 callbacks
648 * 2 -> 1 callbacks
649 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
650 * Site effect : marker_set_format may delete the marker entry (creating a
651 * replacement).
652 */
653static void marker_update_probes(void)
654{
655 /* Core kernel markers */
656 marker_update_probe_range(__start___markers, __stop___markers);
657 /* Markers in modules. */
658 module_update_markers();
659 tracepoint_probe_update_all();
660}
661
662/**
663 * marker_probe_register - Connect a probe to a marker
664 * @name: marker name
665 * @format: format string
666 * @probe: probe handler
667 * @probe_private: probe private data
668 *
669 * private data must be a valid allocated memory address, or NULL.
670 * Returns 0 if ok, error value on error.
671 * The probe address must at least be aligned on the architecture pointer size.
672 */
673int marker_probe_register(const char *name, const char *format,
674 marker_probe_func *probe, void *probe_private)
675{
676 struct marker_entry *entry;
677 int ret = 0;
678 struct marker_probe_closure *old;
679
680 mutex_lock(&markers_mutex);
681 entry = get_marker(name);
682 if (!entry) {
683 entry = add_marker(name, format);
684 if (IS_ERR(entry))
685 ret = PTR_ERR(entry);
686 } else if (format) {
687 if (!entry->format)
688 ret = marker_set_format(entry, format);
689 else if (strcmp(entry->format, format))
690 ret = -EPERM;
691 }
692 if (ret)
693 goto end;
694
695 /*
696 * If we detect that a call_rcu is pending for this marker,
697 * make sure it's executed now.
698 */
699 if (entry->rcu_pending)
700 rcu_barrier_sched();
701 old = marker_entry_add_probe(entry, probe, probe_private);
702 if (IS_ERR(old)) {
703 ret = PTR_ERR(old);
704 goto end;
705 }
706 mutex_unlock(&markers_mutex);
707 marker_update_probes();
708 mutex_lock(&markers_mutex);
709 entry = get_marker(name);
710 if (!entry)
711 goto end;
712 if (entry->rcu_pending)
713 rcu_barrier_sched();
714 entry->oldptr = old;
715 entry->rcu_pending = 1;
716 /* write rcu_pending before calling the RCU callback */
717 smp_wmb();
718 call_rcu_sched(&entry->rcu, free_old_closure);
719end:
720 mutex_unlock(&markers_mutex);
721 return ret;
722}
723EXPORT_SYMBOL_GPL(marker_probe_register);
724
725/**
726 * marker_probe_unregister - Disconnect a probe from a marker
727 * @name: marker name
728 * @probe: probe function pointer
729 * @probe_private: probe private data
730 *
731 * Returns the private data given to marker_probe_register, or an ERR_PTR().
732 * We do not need to call a synchronize_sched to make sure the probes have
733 * finished running before doing a module unload, because the module unload
734 * itself uses stop_machine(), which insures that every preempt disabled section
735 * have finished.
736 */
737int marker_probe_unregister(const char *name,
738 marker_probe_func *probe, void *probe_private)
739{
740 struct marker_entry *entry;
741 struct marker_probe_closure *old;
742 int ret = -ENOENT;
743
744 mutex_lock(&markers_mutex);
745 entry = get_marker(name);
746 if (!entry)
747 goto end;
748 if (entry->rcu_pending)
749 rcu_barrier_sched();
750 old = marker_entry_remove_probe(entry, probe, probe_private);
751 mutex_unlock(&markers_mutex);
752 marker_update_probes();
753 mutex_lock(&markers_mutex);
754 entry = get_marker(name);
755 if (!entry)
756 goto end;
757 if (entry->rcu_pending)
758 rcu_barrier_sched();
759 entry->oldptr = old;
760 entry->rcu_pending = 1;
761 /* write rcu_pending before calling the RCU callback */
762 smp_wmb();
763 call_rcu_sched(&entry->rcu, free_old_closure);
764 remove_marker(name); /* Ignore busy error message */
765 ret = 0;
766end:
767 mutex_unlock(&markers_mutex);
768 return ret;
769}
770EXPORT_SYMBOL_GPL(marker_probe_unregister);
771
772static struct marker_entry *
773get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
774{
775 struct marker_entry *entry;
776 unsigned int i;
777 struct hlist_head *head;
778 struct hlist_node *node;
779
780 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
781 head = &marker_table[i];
782 hlist_for_each_entry(entry, node, head, hlist) {
783 if (!entry->ptype) {
784 if (entry->single.func == probe
785 && entry->single.probe_private
786 == probe_private)
787 return entry;
788 } else {
789 struct marker_probe_closure *closure;
790 closure = entry->multi;
791 for (i = 0; closure[i].func; i++) {
792 if (closure[i].func == probe &&
793 closure[i].probe_private
794 == probe_private)
795 return entry;
796 }
797 }
798 }
799 }
800 return NULL;
801}
802
803/**
804 * marker_probe_unregister_private_data - Disconnect a probe from a marker
805 * @probe: probe function
806 * @probe_private: probe private data
807 *
808 * Unregister a probe by providing the registered private data.
809 * Only removes the first marker found in hash table.
810 * Return 0 on success or error value.
811 * We do not need to call a synchronize_sched to make sure the probes have
812 * finished running before doing a module unload, because the module unload
813 * itself uses stop_machine(), which insures that every preempt disabled section
814 * have finished.
815 */
816int marker_probe_unregister_private_data(marker_probe_func *probe,
817 void *probe_private)
818{
819 struct marker_entry *entry;
820 int ret = 0;
821 struct marker_probe_closure *old;
822
823 mutex_lock(&markers_mutex);
824 entry = get_marker_from_private_data(probe, probe_private);
825 if (!entry) {
826 ret = -ENOENT;
827 goto end;
828 }
829 if (entry->rcu_pending)
830 rcu_barrier_sched();
831 old = marker_entry_remove_probe(entry, NULL, probe_private);
832 mutex_unlock(&markers_mutex);
833 marker_update_probes();
834 mutex_lock(&markers_mutex);
835 entry = get_marker_from_private_data(probe, probe_private);
836 if (!entry)
837 goto end;
838 if (entry->rcu_pending)
839 rcu_barrier_sched();
840 entry->oldptr = old;
841 entry->rcu_pending = 1;
842 /* write rcu_pending before calling the RCU callback */
843 smp_wmb();
844 call_rcu_sched(&entry->rcu, free_old_closure);
845 remove_marker(entry->name); /* Ignore busy error message */
846end:
847 mutex_unlock(&markers_mutex);
848 return ret;
849}
850EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
851
852/**
853 * marker_get_private_data - Get a marker's probe private data
854 * @name: marker name
855 * @probe: probe to match
856 * @num: get the nth matching probe's private data
857 *
858 * Returns the nth private data pointer (starting from 0) matching, or an
859 * ERR_PTR.
860 * Returns the private data pointer, or an ERR_PTR.
861 * The private data pointer should _only_ be dereferenced if the caller is the
862 * owner of the data, or its content could vanish. This is mostly used to
863 * confirm that a caller is the owner of a registered probe.
864 */
865void *marker_get_private_data(const char *name, marker_probe_func *probe,
866 int num)
867{
868 struct hlist_head *head;
869 struct hlist_node *node;
870 struct marker_entry *e;
871 size_t name_len = strlen(name) + 1;
872 u32 hash = jhash(name, name_len-1, 0);
873 int i;
874
875 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
876 hlist_for_each_entry(e, node, head, hlist) {
877 if (!strcmp(name, e->name)) {
878 if (!e->ptype) {
879 if (num == 0 && e->single.func == probe)
880 return e->single.probe_private;
881 } else {
882 struct marker_probe_closure *closure;
883 int match = 0;
884 closure = e->multi;
885 for (i = 0; closure[i].func; i++) {
886 if (closure[i].func != probe)
887 continue;
888 if (match++ == num)
889 return closure[i].probe_private;
890 }
891 }
892 break;
893 }
894 }
895 return ERR_PTR(-ENOENT);
896}
897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 05ce49ced8f6..b6ee424245dd 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2237,10 +2237,6 @@ static noinline struct module *load_module(void __user *umod,
2237 sizeof(*mod->ctors), &mod->num_ctors); 2237 sizeof(*mod->ctors), &mod->num_ctors);
2238#endif 2238#endif
2239 2239
2240#ifdef CONFIG_MARKERS
2241 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
2242 sizeof(*mod->markers), &mod->num_markers);
2243#endif
2244#ifdef CONFIG_TRACEPOINTS 2240#ifdef CONFIG_TRACEPOINTS
2245 mod->tracepoints = section_objs(hdr, sechdrs, secstrings, 2241 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2246 "__tracepoints", 2242 "__tracepoints",
@@ -2958,20 +2954,6 @@ void module_layout(struct module *mod,
2958EXPORT_SYMBOL(module_layout); 2954EXPORT_SYMBOL(module_layout);
2959#endif 2955#endif
2960 2956
2961#ifdef CONFIG_MARKERS
2962void module_update_markers(void)
2963{
2964 struct module *mod;
2965
2966 mutex_lock(&module_mutex);
2967 list_for_each_entry(mod, &modules, list)
2968 if (!mod->taints)
2969 marker_update_probe_range(mod->markers,
2970 mod->markers + mod->num_markers);
2971 mutex_unlock(&module_mutex);
2972}
2973#endif
2974
2975#ifdef CONFIG_TRACEPOINTS 2957#ifdef CONFIG_TRACEPOINTS
2976void module_update_tracepoints(void) 2958void module_update_tracepoints(void)
2977{ 2959{
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8cb94a52d1bb..cc768ab81ac8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2176,6 +2176,13 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
2176 data->nr_pages = nr_pages; 2176 data->nr_pages = nr_pages;
2177 atomic_set(&data->lock, -1); 2177 atomic_set(&data->lock, -1);
2178 2178
2179 if (counter->attr.watermark) {
2180 data->watermark = min_t(long, PAGE_SIZE * nr_pages,
2181 counter->attr.wakeup_watermark);
2182 }
2183 if (!data->watermark)
2184 data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
2185
2179 rcu_assign_pointer(counter->data, data); 2186 rcu_assign_pointer(counter->data, data);
2180 2187
2181 return 0; 2188 return 0;
@@ -2315,7 +2322,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2315 lock_limit >>= PAGE_SHIFT; 2322 lock_limit >>= PAGE_SHIFT;
2316 locked = vma->vm_mm->locked_vm + extra; 2323 locked = vma->vm_mm->locked_vm + extra;
2317 2324
2318 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { 2325 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2326 !capable(CAP_IPC_LOCK)) {
2319 ret = -EPERM; 2327 ret = -EPERM;
2320 goto unlock; 2328 goto unlock;
2321 } 2329 }
@@ -2504,35 +2512,15 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2504/* 2512/*
2505 * Output 2513 * Output
2506 */ 2514 */
2507 2515static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2508struct perf_output_handle { 2516 unsigned long offset, unsigned long head)
2509 struct perf_counter *counter;
2510 struct perf_mmap_data *data;
2511 unsigned long head;
2512 unsigned long offset;
2513 int nmi;
2514 int sample;
2515 int locked;
2516 unsigned long flags;
2517};
2518
2519static bool perf_output_space(struct perf_mmap_data *data,
2520 unsigned int offset, unsigned int head)
2521{ 2517{
2522 unsigned long tail;
2523 unsigned long mask; 2518 unsigned long mask;
2524 2519
2525 if (!data->writable) 2520 if (!data->writable)
2526 return true; 2521 return true;
2527 2522
2528 mask = (data->nr_pages << PAGE_SHIFT) - 1; 2523 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2529 /*
2530 * Userspace could choose to issue a mb() before updating the tail
2531 * pointer. So that all reads will be completed before the write is
2532 * issued.
2533 */
2534 tail = ACCESS_ONCE(data->user_page->data_tail);
2535 smp_rmb();
2536 2524
2537 offset = (offset - tail) & mask; 2525 offset = (offset - tail) & mask;
2538 head = (head - tail) & mask; 2526 head = (head - tail) & mask;
@@ -2633,8 +2621,8 @@ out:
2633 local_irq_restore(handle->flags); 2621 local_irq_restore(handle->flags);
2634} 2622}
2635 2623
2636static void perf_output_copy(struct perf_output_handle *handle, 2624void perf_output_copy(struct perf_output_handle *handle,
2637 const void *buf, unsigned int len) 2625 const void *buf, unsigned int len)
2638{ 2626{
2639 unsigned int pages_mask; 2627 unsigned int pages_mask;
2640 unsigned int offset; 2628 unsigned int offset;
@@ -2669,16 +2657,13 @@ static void perf_output_copy(struct perf_output_handle *handle,
2669 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); 2657 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2670} 2658}
2671 2659
2672#define perf_output_put(handle, x) \ 2660int perf_output_begin(struct perf_output_handle *handle,
2673 perf_output_copy((handle), &(x), sizeof(x)) 2661 struct perf_counter *counter, unsigned int size,
2674 2662 int nmi, int sample)
2675static int perf_output_begin(struct perf_output_handle *handle,
2676 struct perf_counter *counter, unsigned int size,
2677 int nmi, int sample)
2678{ 2663{
2679 struct perf_counter *output_counter; 2664 struct perf_counter *output_counter;
2680 struct perf_mmap_data *data; 2665 struct perf_mmap_data *data;
2681 unsigned int offset, head; 2666 unsigned long tail, offset, head;
2682 int have_lost; 2667 int have_lost;
2683 struct { 2668 struct {
2684 struct perf_event_header header; 2669 struct perf_event_header header;
@@ -2716,16 +2701,23 @@ static int perf_output_begin(struct perf_output_handle *handle,
2716 perf_output_lock(handle); 2701 perf_output_lock(handle);
2717 2702
2718 do { 2703 do {
2704 /*
2705 * Userspace could choose to issue a mb() before updating the
2706 * tail pointer. So that all reads will be completed before the
2707 * write is issued.
2708 */
2709 tail = ACCESS_ONCE(data->user_page->data_tail);
2710 smp_rmb();
2719 offset = head = atomic_long_read(&data->head); 2711 offset = head = atomic_long_read(&data->head);
2720 head += size; 2712 head += size;
2721 if (unlikely(!perf_output_space(data, offset, head))) 2713 if (unlikely(!perf_output_space(data, tail, offset, head)))
2722 goto fail; 2714 goto fail;
2723 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 2715 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2724 2716
2725 handle->offset = offset; 2717 handle->offset = offset;
2726 handle->head = head; 2718 handle->head = head;
2727 2719
2728 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) 2720 if (head - tail > data->watermark)
2729 atomic_set(&data->wakeup, 1); 2721 atomic_set(&data->wakeup, 1);
2730 2722
2731 if (have_lost) { 2723 if (have_lost) {
@@ -2749,7 +2741,7 @@ out:
2749 return -ENOSPC; 2741 return -ENOSPC;
2750} 2742}
2751 2743
2752static void perf_output_end(struct perf_output_handle *handle) 2744void perf_output_end(struct perf_output_handle *handle)
2753{ 2745{
2754 struct perf_counter *counter = handle->counter; 2746 struct perf_counter *counter = handle->counter;
2755 struct perf_mmap_data *data = handle->data; 2747 struct perf_mmap_data *data = handle->data;
@@ -2863,156 +2855,176 @@ static void perf_output_read(struct perf_output_handle *handle,
2863 perf_output_read_one(handle, counter); 2855 perf_output_read_one(handle, counter);
2864} 2856}
2865 2857
2866void perf_counter_output(struct perf_counter *counter, int nmi, 2858void perf_output_sample(struct perf_output_handle *handle,
2867 struct perf_sample_data *data) 2859 struct perf_event_header *header,
2860 struct perf_sample_data *data,
2861 struct perf_counter *counter)
2868{ 2862{
2869 int ret; 2863 u64 sample_type = data->type;
2870 u64 sample_type = counter->attr.sample_type;
2871 struct perf_output_handle handle;
2872 struct perf_event_header header;
2873 u64 ip;
2874 struct {
2875 u32 pid, tid;
2876 } tid_entry;
2877 struct perf_callchain_entry *callchain = NULL;
2878 int callchain_size = 0;
2879 u64 time;
2880 struct {
2881 u32 cpu, reserved;
2882 } cpu_entry;
2883 2864
2884 header.type = PERF_EVENT_SAMPLE; 2865 perf_output_put(handle, *header);
2885 header.size = sizeof(header);
2886 2866
2887 header.misc = 0; 2867 if (sample_type & PERF_SAMPLE_IP)
2888 header.misc |= perf_misc_flags(data->regs); 2868 perf_output_put(handle, data->ip);
2889
2890 if (sample_type & PERF_SAMPLE_IP) {
2891 ip = perf_instruction_pointer(data->regs);
2892 header.size += sizeof(ip);
2893 }
2894
2895 if (sample_type & PERF_SAMPLE_TID) {
2896 /* namespace issues */
2897 tid_entry.pid = perf_counter_pid(counter, current);
2898 tid_entry.tid = perf_counter_tid(counter, current);
2899
2900 header.size += sizeof(tid_entry);
2901 }
2902 2869
2903 if (sample_type & PERF_SAMPLE_TIME) { 2870 if (sample_type & PERF_SAMPLE_TID)
2904 /* 2871 perf_output_put(handle, data->tid_entry);
2905 * Maybe do better on x86 and provide cpu_clock_nmi()
2906 */
2907 time = sched_clock();
2908 2872
2909 header.size += sizeof(u64); 2873 if (sample_type & PERF_SAMPLE_TIME)
2910 } 2874 perf_output_put(handle, data->time);
2911 2875
2912 if (sample_type & PERF_SAMPLE_ADDR) 2876 if (sample_type & PERF_SAMPLE_ADDR)
2913 header.size += sizeof(u64); 2877 perf_output_put(handle, data->addr);
2914 2878
2915 if (sample_type & PERF_SAMPLE_ID) 2879 if (sample_type & PERF_SAMPLE_ID)
2916 header.size += sizeof(u64); 2880 perf_output_put(handle, data->id);
2917 2881
2918 if (sample_type & PERF_SAMPLE_STREAM_ID) 2882 if (sample_type & PERF_SAMPLE_STREAM_ID)
2919 header.size += sizeof(u64); 2883 perf_output_put(handle, data->stream_id);
2920
2921 if (sample_type & PERF_SAMPLE_CPU) {
2922 header.size += sizeof(cpu_entry);
2923 2884
2924 cpu_entry.cpu = raw_smp_processor_id(); 2885 if (sample_type & PERF_SAMPLE_CPU)
2925 cpu_entry.reserved = 0; 2886 perf_output_put(handle, data->cpu_entry);
2926 }
2927 2887
2928 if (sample_type & PERF_SAMPLE_PERIOD) 2888 if (sample_type & PERF_SAMPLE_PERIOD)
2929 header.size += sizeof(u64); 2889 perf_output_put(handle, data->period);
2930 2890
2931 if (sample_type & PERF_SAMPLE_READ) 2891 if (sample_type & PERF_SAMPLE_READ)
2932 header.size += perf_counter_read_size(counter); 2892 perf_output_read(handle, counter);
2933 2893
2934 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2894 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2935 callchain = perf_callchain(data->regs); 2895 if (data->callchain) {
2896 int size = 1;
2936 2897
2937 if (callchain) { 2898 if (data->callchain)
2938 callchain_size = (1 + callchain->nr) * sizeof(u64); 2899 size += data->callchain->nr;
2939 header.size += callchain_size; 2900
2940 } else 2901 size *= sizeof(u64);
2941 header.size += sizeof(u64); 2902
2903 perf_output_copy(handle, data->callchain, size);
2904 } else {
2905 u64 nr = 0;
2906 perf_output_put(handle, nr);
2907 }
2942 } 2908 }
2943 2909
2944 if (sample_type & PERF_SAMPLE_RAW) { 2910 if (sample_type & PERF_SAMPLE_RAW) {
2945 int size = sizeof(u32); 2911 if (data->raw) {
2912 perf_output_put(handle, data->raw->size);
2913 perf_output_copy(handle, data->raw->data,
2914 data->raw->size);
2915 } else {
2916 struct {
2917 u32 size;
2918 u32 data;
2919 } raw = {
2920 .size = sizeof(u32),
2921 .data = 0,
2922 };
2923 perf_output_put(handle, raw);
2924 }
2925 }
2926}
2946 2927
2947 if (data->raw) 2928void perf_prepare_sample(struct perf_event_header *header,
2948 size += data->raw->size; 2929 struct perf_sample_data *data,
2949 else 2930 struct perf_counter *counter,
2950 size += sizeof(u32); 2931 struct pt_regs *regs)
2932{
2933 u64 sample_type = counter->attr.sample_type;
2951 2934
2952 WARN_ON_ONCE(size & (sizeof(u64)-1)); 2935 data->type = sample_type;
2953 header.size += size;
2954 }
2955 2936
2956 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2937 header->type = PERF_EVENT_SAMPLE;
2957 if (ret) 2938 header->size = sizeof(*header);
2958 return;
2959 2939
2960 perf_output_put(&handle, header); 2940 header->misc = 0;
2941 header->misc |= perf_misc_flags(regs);
2961 2942
2962 if (sample_type & PERF_SAMPLE_IP) 2943 if (sample_type & PERF_SAMPLE_IP) {
2963 perf_output_put(&handle, ip); 2944 data->ip = perf_instruction_pointer(regs);
2964 2945
2965 if (sample_type & PERF_SAMPLE_TID) 2946 header->size += sizeof(data->ip);
2966 perf_output_put(&handle, tid_entry); 2947 }
2967 2948
2968 if (sample_type & PERF_SAMPLE_TIME) 2949 if (sample_type & PERF_SAMPLE_TID) {
2969 perf_output_put(&handle, time); 2950 /* namespace issues */
2951 data->tid_entry.pid = perf_counter_pid(counter, current);
2952 data->tid_entry.tid = perf_counter_tid(counter, current);
2953
2954 header->size += sizeof(data->tid_entry);
2955 }
2956
2957 if (sample_type & PERF_SAMPLE_TIME) {
2958 data->time = perf_clock();
2959
2960 header->size += sizeof(data->time);
2961 }
2970 2962
2971 if (sample_type & PERF_SAMPLE_ADDR) 2963 if (sample_type & PERF_SAMPLE_ADDR)
2972 perf_output_put(&handle, data->addr); 2964 header->size += sizeof(data->addr);
2973 2965
2974 if (sample_type & PERF_SAMPLE_ID) { 2966 if (sample_type & PERF_SAMPLE_ID) {
2975 u64 id = primary_counter_id(counter); 2967 data->id = primary_counter_id(counter);
2976 2968
2977 perf_output_put(&handle, id); 2969 header->size += sizeof(data->id);
2978 } 2970 }
2979 2971
2980 if (sample_type & PERF_SAMPLE_STREAM_ID) 2972 if (sample_type & PERF_SAMPLE_STREAM_ID) {
2981 perf_output_put(&handle, counter->id); 2973 data->stream_id = counter->id;
2982 2974
2983 if (sample_type & PERF_SAMPLE_CPU) 2975 header->size += sizeof(data->stream_id);
2984 perf_output_put(&handle, cpu_entry); 2976 }
2977
2978 if (sample_type & PERF_SAMPLE_CPU) {
2979 data->cpu_entry.cpu = raw_smp_processor_id();
2980 data->cpu_entry.reserved = 0;
2981
2982 header->size += sizeof(data->cpu_entry);
2983 }
2985 2984
2986 if (sample_type & PERF_SAMPLE_PERIOD) 2985 if (sample_type & PERF_SAMPLE_PERIOD)
2987 perf_output_put(&handle, data->period); 2986 header->size += sizeof(data->period);
2988 2987
2989 if (sample_type & PERF_SAMPLE_READ) 2988 if (sample_type & PERF_SAMPLE_READ)
2990 perf_output_read(&handle, counter); 2989 header->size += perf_counter_read_size(counter);
2991 2990
2992 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 2991 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2993 if (callchain) 2992 int size = 1;
2994 perf_output_copy(&handle, callchain, callchain_size); 2993
2995 else { 2994 data->callchain = perf_callchain(regs);
2996 u64 nr = 0; 2995
2997 perf_output_put(&handle, nr); 2996 if (data->callchain)
2998 } 2997 size += data->callchain->nr;
2998
2999 header->size += size * sizeof(u64);
2999 } 3000 }
3000 3001
3001 if (sample_type & PERF_SAMPLE_RAW) { 3002 if (sample_type & PERF_SAMPLE_RAW) {
3002 if (data->raw) { 3003 int size = sizeof(u32);
3003 perf_output_put(&handle, data->raw->size); 3004
3004 perf_output_copy(&handle, data->raw->data, data->raw->size); 3005 if (data->raw)
3005 } else { 3006 size += data->raw->size;
3006 struct { 3007 else
3007 u32 size; 3008 size += sizeof(u32);
3008 u32 data; 3009
3009 } raw = { 3010 WARN_ON_ONCE(size & (sizeof(u64)-1));
3010 .size = sizeof(u32), 3011 header->size += size;
3011 .data = 0,
3012 };
3013 perf_output_put(&handle, raw);
3014 }
3015 } 3012 }
3013}
3014
3015static void perf_counter_output(struct perf_counter *counter, int nmi,
3016 struct perf_sample_data *data,
3017 struct pt_regs *regs)
3018{
3019 struct perf_output_handle handle;
3020 struct perf_event_header header;
3021
3022 perf_prepare_sample(&header, data, counter, regs);
3023
3024 if (perf_output_begin(&handle, counter, header.size, nmi, 1))
3025 return;
3026
3027 perf_output_sample(&handle, &header, data, counter);
3016 3028
3017 perf_output_end(&handle); 3029 perf_output_end(&handle);
3018} 3030}
@@ -3071,6 +3083,7 @@ struct perf_task_event {
3071 u32 ppid; 3083 u32 ppid;
3072 u32 tid; 3084 u32 tid;
3073 u32 ptid; 3085 u32 ptid;
3086 u64 time;
3074 } event; 3087 } event;
3075}; 3088};
3076 3089
@@ -3078,9 +3091,12 @@ static void perf_counter_task_output(struct perf_counter *counter,
3078 struct perf_task_event *task_event) 3091 struct perf_task_event *task_event)
3079{ 3092{
3080 struct perf_output_handle handle; 3093 struct perf_output_handle handle;
3081 int size = task_event->event.header.size; 3094 int size;
3082 struct task_struct *task = task_event->task; 3095 struct task_struct *task = task_event->task;
3083 int ret = perf_output_begin(&handle, counter, size, 0, 0); 3096 int ret;
3097
3098 size = task_event->event.header.size;
3099 ret = perf_output_begin(&handle, counter, size, 0, 0);
3084 3100
3085 if (ret) 3101 if (ret)
3086 return; 3102 return;
@@ -3091,7 +3107,10 @@ static void perf_counter_task_output(struct perf_counter *counter,
3091 task_event->event.tid = perf_counter_tid(counter, task); 3107 task_event->event.tid = perf_counter_tid(counter, task);
3092 task_event->event.ptid = perf_counter_tid(counter, current); 3108 task_event->event.ptid = perf_counter_tid(counter, current);
3093 3109
3110 task_event->event.time = perf_clock();
3111
3094 perf_output_put(&handle, task_event->event); 3112 perf_output_put(&handle, task_event->event);
3113
3095 perf_output_end(&handle); 3114 perf_output_end(&handle);
3096} 3115}
3097 3116
@@ -3473,7 +3492,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
3473 .misc = 0, 3492 .misc = 0,
3474 .size = sizeof(throttle_event), 3493 .size = sizeof(throttle_event),
3475 }, 3494 },
3476 .time = sched_clock(), 3495 .time = perf_clock(),
3477 .id = primary_counter_id(counter), 3496 .id = primary_counter_id(counter),
3478 .stream_id = counter->id, 3497 .stream_id = counter->id,
3479 }; 3498 };
@@ -3493,14 +3512,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
3493 * Generic counter overflow handling, sampling. 3512 * Generic counter overflow handling, sampling.
3494 */ 3513 */
3495 3514
3496int perf_counter_overflow(struct perf_counter *counter, int nmi, 3515static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
3497 struct perf_sample_data *data) 3516 int throttle, struct perf_sample_data *data,
3517 struct pt_regs *regs)
3498{ 3518{
3499 int events = atomic_read(&counter->event_limit); 3519 int events = atomic_read(&counter->event_limit);
3500 int throttle = counter->pmu->unthrottle != NULL;
3501 struct hw_perf_counter *hwc = &counter->hw; 3520 struct hw_perf_counter *hwc = &counter->hw;
3502 int ret = 0; 3521 int ret = 0;
3503 3522
3523 throttle = (throttle && counter->pmu->unthrottle != NULL);
3524
3504 if (!throttle) { 3525 if (!throttle) {
3505 hwc->interrupts++; 3526 hwc->interrupts++;
3506 } else { 3527 } else {
@@ -3523,7 +3544,7 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
3523 } 3544 }
3524 3545
3525 if (counter->attr.freq) { 3546 if (counter->attr.freq) {
3526 u64 now = sched_clock(); 3547 u64 now = perf_clock();
3527 s64 delta = now - hwc->freq_stamp; 3548 s64 delta = now - hwc->freq_stamp;
3528 3549
3529 hwc->freq_stamp = now; 3550 hwc->freq_stamp = now;
@@ -3549,10 +3570,17 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
3549 perf_counter_disable(counter); 3570 perf_counter_disable(counter);
3550 } 3571 }
3551 3572
3552 perf_counter_output(counter, nmi, data); 3573 perf_counter_output(counter, nmi, data, regs);
3553 return ret; 3574 return ret;
3554} 3575}
3555 3576
3577int perf_counter_overflow(struct perf_counter *counter, int nmi,
3578 struct perf_sample_data *data,
3579 struct pt_regs *regs)
3580{
3581 return __perf_counter_overflow(counter, nmi, 1, data, regs);
3582}
3583
3556/* 3584/*
3557 * Generic software counter infrastructure 3585 * Generic software counter infrastructure
3558 */ 3586 */
@@ -3588,9 +3616,11 @@ again:
3588} 3616}
3589 3617
3590static void perf_swcounter_overflow(struct perf_counter *counter, 3618static void perf_swcounter_overflow(struct perf_counter *counter,
3591 int nmi, struct perf_sample_data *data) 3619 int nmi, struct perf_sample_data *data,
3620 struct pt_regs *regs)
3592{ 3621{
3593 struct hw_perf_counter *hwc = &counter->hw; 3622 struct hw_perf_counter *hwc = &counter->hw;
3623 int throttle = 0;
3594 u64 overflow; 3624 u64 overflow;
3595 3625
3596 data->period = counter->hw.last_period; 3626 data->period = counter->hw.last_period;
@@ -3600,13 +3630,15 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
3600 return; 3630 return;
3601 3631
3602 for (; overflow; overflow--) { 3632 for (; overflow; overflow--) {
3603 if (perf_counter_overflow(counter, nmi, data)) { 3633 if (__perf_counter_overflow(counter, nmi, throttle,
3634 data, regs)) {
3604 /* 3635 /*
3605 * We inhibit the overflow from happening when 3636 * We inhibit the overflow from happening when
3606 * hwc->interrupts == MAX_INTERRUPTS. 3637 * hwc->interrupts == MAX_INTERRUPTS.
3607 */ 3638 */
3608 break; 3639 break;
3609 } 3640 }
3641 throttle = 1;
3610 } 3642 }
3611} 3643}
3612 3644
@@ -3618,7 +3650,8 @@ static void perf_swcounter_unthrottle(struct perf_counter *counter)
3618} 3650}
3619 3651
3620static void perf_swcounter_add(struct perf_counter *counter, u64 nr, 3652static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3621 int nmi, struct perf_sample_data *data) 3653 int nmi, struct perf_sample_data *data,
3654 struct pt_regs *regs)
3622{ 3655{
3623 struct hw_perf_counter *hwc = &counter->hw; 3656 struct hw_perf_counter *hwc = &counter->hw;
3624 3657
@@ -3627,11 +3660,11 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3627 if (!hwc->sample_period) 3660 if (!hwc->sample_period)
3628 return; 3661 return;
3629 3662
3630 if (!data->regs) 3663 if (!regs)
3631 return; 3664 return;
3632 3665
3633 if (!atomic64_add_negative(nr, &hwc->period_left)) 3666 if (!atomic64_add_negative(nr, &hwc->period_left))
3634 perf_swcounter_overflow(counter, nmi, data); 3667 perf_swcounter_overflow(counter, nmi, data, regs);
3635} 3668}
3636 3669
3637static int perf_swcounter_is_counting(struct perf_counter *counter) 3670static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3690,7 +3723,8 @@ static int perf_swcounter_match(struct perf_counter *counter,
3690static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, 3723static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3691 enum perf_type_id type, 3724 enum perf_type_id type,
3692 u32 event, u64 nr, int nmi, 3725 u32 event, u64 nr, int nmi,
3693 struct perf_sample_data *data) 3726 struct perf_sample_data *data,
3727 struct pt_regs *regs)
3694{ 3728{
3695 struct perf_counter *counter; 3729 struct perf_counter *counter;
3696 3730
@@ -3699,8 +3733,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3699 3733
3700 rcu_read_lock(); 3734 rcu_read_lock();
3701 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { 3735 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3702 if (perf_swcounter_match(counter, type, event, data->regs)) 3736 if (perf_swcounter_match(counter, type, event, regs))
3703 perf_swcounter_add(counter, nr, nmi, data); 3737 perf_swcounter_add(counter, nr, nmi, data, regs);
3704 } 3738 }
3705 rcu_read_unlock(); 3739 rcu_read_unlock();
3706} 3740}
@@ -3721,7 +3755,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3721 3755
3722static void do_perf_swcounter_event(enum perf_type_id type, u32 event, 3756static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3723 u64 nr, int nmi, 3757 u64 nr, int nmi,
3724 struct perf_sample_data *data) 3758 struct perf_sample_data *data,
3759 struct pt_regs *regs)
3725{ 3760{
3726 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3761 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3727 int *recursion = perf_swcounter_recursion_context(cpuctx); 3762 int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3734,7 +3769,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3734 barrier(); 3769 barrier();
3735 3770
3736 perf_swcounter_ctx_event(&cpuctx->ctx, type, event, 3771 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3737 nr, nmi, data); 3772 nr, nmi, data, regs);
3738 rcu_read_lock(); 3773 rcu_read_lock();
3739 /* 3774 /*
3740 * doesn't really matter which of the child contexts the 3775 * doesn't really matter which of the child contexts the
@@ -3742,7 +3777,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3742 */ 3777 */
3743 ctx = rcu_dereference(current->perf_counter_ctxp); 3778 ctx = rcu_dereference(current->perf_counter_ctxp);
3744 if (ctx) 3779 if (ctx)
3745 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data); 3780 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
3746 rcu_read_unlock(); 3781 rcu_read_unlock();
3747 3782
3748 barrier(); 3783 barrier();
@@ -3756,11 +3791,11 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3756 struct pt_regs *regs, u64 addr) 3791 struct pt_regs *regs, u64 addr)
3757{ 3792{
3758 struct perf_sample_data data = { 3793 struct perf_sample_data data = {
3759 .regs = regs,
3760 .addr = addr, 3794 .addr = addr,
3761 }; 3795 };
3762 3796
3763 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data); 3797 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
3798 &data, regs);
3764} 3799}
3765 3800
3766static void perf_swcounter_read(struct perf_counter *counter) 3801static void perf_swcounter_read(struct perf_counter *counter)
@@ -3797,6 +3832,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3797{ 3832{
3798 enum hrtimer_restart ret = HRTIMER_RESTART; 3833 enum hrtimer_restart ret = HRTIMER_RESTART;
3799 struct perf_sample_data data; 3834 struct perf_sample_data data;
3835 struct pt_regs *regs;
3800 struct perf_counter *counter; 3836 struct perf_counter *counter;
3801 u64 period; 3837 u64 period;
3802 3838
@@ -3804,17 +3840,17 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3804 counter->pmu->read(counter); 3840 counter->pmu->read(counter);
3805 3841
3806 data.addr = 0; 3842 data.addr = 0;
3807 data.regs = get_irq_regs(); 3843 regs = get_irq_regs();
3808 /* 3844 /*
3809 * In case we exclude kernel IPs or are somehow not in interrupt 3845 * In case we exclude kernel IPs or are somehow not in interrupt
3810 * context, provide the next best thing, the user IP. 3846 * context, provide the next best thing, the user IP.
3811 */ 3847 */
3812 if ((counter->attr.exclude_kernel || !data.regs) && 3848 if ((counter->attr.exclude_kernel || !regs) &&
3813 !counter->attr.exclude_user) 3849 !counter->attr.exclude_user)
3814 data.regs = task_pt_regs(current); 3850 regs = task_pt_regs(current);
3815 3851
3816 if (data.regs) { 3852 if (regs) {
3817 if (perf_counter_overflow(counter, 0, &data)) 3853 if (perf_counter_overflow(counter, 0, &data, regs))
3818 ret = HRTIMER_NORESTART; 3854 ret = HRTIMER_NORESTART;
3819 } 3855 }
3820 3856
@@ -3950,15 +3986,17 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3950 }; 3986 };
3951 3987
3952 struct perf_sample_data data = { 3988 struct perf_sample_data data = {
3953 .regs = get_irq_regs(),
3954 .addr = addr, 3989 .addr = addr,
3955 .raw = &raw, 3990 .raw = &raw,
3956 }; 3991 };
3957 3992
3958 if (!data.regs) 3993 struct pt_regs *regs = get_irq_regs();
3959 data.regs = task_pt_regs(current); 3994
3995 if (!regs)
3996 regs = task_pt_regs(current);
3960 3997
3961 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); 3998 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
3999 &data, regs);
3962} 4000}
3963EXPORT_SYMBOL_GPL(perf_tpcounter_event); 4001EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3964 4002
@@ -4170,8 +4208,8 @@ done:
4170static int perf_copy_attr(struct perf_counter_attr __user *uattr, 4208static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4171 struct perf_counter_attr *attr) 4209 struct perf_counter_attr *attr)
4172{ 4210{
4173 int ret;
4174 u32 size; 4211 u32 size;
4212 int ret;
4175 4213
4176 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) 4214 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4177 return -EFAULT; 4215 return -EFAULT;
@@ -4196,19 +4234,19 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4196 4234
4197 /* 4235 /*
4198 * If we're handed a bigger struct than we know of, 4236 * If we're handed a bigger struct than we know of,
4199 * ensure all the unknown bits are 0. 4237 * ensure all the unknown bits are 0 - i.e. new
4238 * user-space does not rely on any kernel feature
4239 * extensions we dont know about yet.
4200 */ 4240 */
4201 if (size > sizeof(*attr)) { 4241 if (size > sizeof(*attr)) {
4202 unsigned long val; 4242 unsigned char __user *addr;
4203 unsigned long __user *addr; 4243 unsigned char __user *end;
4204 unsigned long __user *end; 4244 unsigned char val;
4205 4245
4206 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), 4246 addr = (void __user *)uattr + sizeof(*attr);
4207 sizeof(unsigned long)); 4247 end = (void __user *)uattr + size;
4208 end = PTR_ALIGN((void __user *)uattr + size,
4209 sizeof(unsigned long));
4210 4248
4211 for (; addr < end; addr += sizeof(unsigned long)) { 4249 for (; addr < end; addr++) {
4212 ret = get_user(val, addr); 4250 ret = get_user(val, addr);
4213 if (ret) 4251 if (ret)
4214 return ret; 4252 return ret;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d089d052c4a9..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
242 return 0; 242 return 0;
243} 243}
244 244
245
246static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
247{
248 *tp = current_kernel_time();
249 return 0;
250}
251
252static int posix_get_monotonic_coarse(clockid_t which_clock,
253 struct timespec *tp)
254{
255 *tp = get_monotonic_coarse();
256 return 0;
257}
258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0;
263}
245/* 264/*
246 * Initialize everything, well, just everything in Posix clocks/timers ;) 265 * Initialize everything, well, just everything in Posix clocks/timers ;)
247 */ 266 */
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
262 .timer_create = no_timer_create, 281 .timer_create = no_timer_create,
263 .nsleep = no_nsleep, 282 .nsleep = no_nsleep,
264 }; 283 };
284 struct k_clock clock_realtime_coarse = {
285 .clock_getres = posix_get_coarse_res,
286 .clock_get = posix_get_realtime_coarse,
287 .clock_set = do_posix_clock_nosettime,
288 .timer_create = no_timer_create,
289 .nsleep = no_nsleep,
290 };
291 struct k_clock clock_monotonic_coarse = {
292 .clock_getres = posix_get_coarse_res,
293 .clock_get = posix_get_monotonic_coarse,
294 .clock_set = do_posix_clock_nosettime,
295 .timer_create = no_timer_create,
296 .nsleep = no_nsleep,
297 };
265 298
266 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 299 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
267 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 300 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
268 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 301 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
302 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
303 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
269 304
270 posix_timers_cache = kmem_cache_create("posix_timers_cache", 305 posix_timers_cache = kmem_cache_create("posix_timers_cache",
271 sizeof (struct k_itimer), 0, SLAB_PANIC, 306 sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/power/console.c b/kernel/power/console.c
index a3961b205de7..5187136fe1de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -14,56 +14,13 @@
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 15
16static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
17static int disable_vt_switch;
18
19/*
20 * Normally during a suspend, we allocate a new console and switch to it.
21 * When we resume, we switch back to the original console. This switch
22 * can be slow, so on systems where the framebuffer can handle restoration
23 * of video registers anyways, there's little point in doing the console
24 * switch. This function allows you to disable it by passing it '0'.
25 */
26void pm_set_vt_switch(int do_switch)
27{
28 acquire_console_sem();
29 disable_vt_switch = !do_switch;
30 release_console_sem();
31}
32EXPORT_SYMBOL(pm_set_vt_switch);
33 17
34int pm_prepare_console(void) 18int pm_prepare_console(void)
35{ 19{
36 acquire_console_sem(); 20 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
37 21 if (orig_fgconsole < 0)
38 if (disable_vt_switch) {
39 release_console_sem();
40 return 0;
41 }
42
43 orig_fgconsole = fg_console;
44
45 if (vc_allocate(SUSPEND_CONSOLE)) {
46 /* we can't have a free VC for now. Too bad,
47 * we don't want to mess the screen for now. */
48 release_console_sem();
49 return 1; 22 return 1;
50 }
51 23
52 if (set_console(SUSPEND_CONSOLE)) {
53 /*
54 * We're unable to switch to the SUSPEND_CONSOLE.
55 * Let the calling function know so it can decide
56 * what to do.
57 */
58 release_console_sem();
59 return 1;
60 }
61 release_console_sem();
62
63 if (vt_waitactive(SUSPEND_CONSOLE)) {
64 pr_debug("Suspend: Can't switch VCs.");
65 return 1;
66 }
67 orig_kmsg = kmsg_redirect; 24 orig_kmsg = kmsg_redirect;
68 kmsg_redirect = SUSPEND_CONSOLE; 25 kmsg_redirect = SUSPEND_CONSOLE;
69 return 0; 26 return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
71 28
72void pm_restore_console(void) 29void pm_restore_console(void)
73{ 30{
74 acquire_console_sem(); 31 if (orig_fgconsole >= 0) {
75 if (disable_vt_switch) { 32 vt_move_to_console(orig_fgconsole, 0);
76 release_console_sem(); 33 kmsg_redirect = orig_kmsg;
77 return;
78 }
79 set_console(orig_fgconsole);
80 release_console_sem();
81
82 if (vt_waitactive(orig_fgconsole)) {
83 pr_debug("Resume: Can't switch VCs.");
84 return;
85 } 34 }
86
87 kmsg_redirect = orig_kmsg;
88} 35}
89#endif 36#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index 419250ebec4d..a55d3a367ae8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,48 +442,51 @@ void profile_tick(int type)
442 442
443#ifdef CONFIG_PROC_FS 443#ifdef CONFIG_PROC_FS
444#include <linux/proc_fs.h> 444#include <linux/proc_fs.h>
445#include <linux/seq_file.h>
445#include <asm/uaccess.h> 446#include <asm/uaccess.h>
446 447
447static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 448static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
448 int count, int *eof, void *data)
449{ 449{
450 int len = cpumask_scnprintf(page, count, data); 450 seq_cpumask(m, prof_cpu_mask);
451 if (count - len < 2) 451 seq_putc(m, '\n');
452 return -EINVAL; 452 return 0;
453 len += sprintf(page + len, "\n");
454 return len;
455} 453}
456 454
457static int prof_cpu_mask_write_proc(struct file *file, 455static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
458 const char __user *buffer, unsigned long count, void *data) 456{
457 return single_open(file, prof_cpu_mask_proc_show, NULL);
458}
459
460static ssize_t prof_cpu_mask_proc_write(struct file *file,
461 const char __user *buffer, size_t count, loff_t *pos)
459{ 462{
460 struct cpumask *mask = data;
461 unsigned long full_count = count, err;
462 cpumask_var_t new_value; 463 cpumask_var_t new_value;
464 int err;
463 465
464 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
465 return -ENOMEM; 467 return -ENOMEM;
466 468
467 err = cpumask_parse_user(buffer, count, new_value); 469 err = cpumask_parse_user(buffer, count, new_value);
468 if (!err) { 470 if (!err) {
469 cpumask_copy(mask, new_value); 471 cpumask_copy(prof_cpu_mask, new_value);
470 err = full_count; 472 err = count;
471 } 473 }
472 free_cpumask_var(new_value); 474 free_cpumask_var(new_value);
473 return err; 475 return err;
474} 476}
475 477
478static const struct file_operations prof_cpu_mask_proc_fops = {
479 .open = prof_cpu_mask_proc_open,
480 .read = seq_read,
481 .llseek = seq_lseek,
482 .release = single_release,
483 .write = prof_cpu_mask_proc_write,
484};
485
476void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 486void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
477{ 487{
478 struct proc_dir_entry *entry;
479
480 /* create /proc/irq/prof_cpu_mask */ 488 /* create /proc/irq/prof_cpu_mask */
481 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 489 proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
482 if (!entry)
483 return;
484 entry->data = prof_cpu_mask;
485 entry->read_proc = prof_cpu_mask_read_proc;
486 entry->write_proc = prof_cpu_mask_write_proc;
487} 490}
488 491
489/* 492/*
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..ac2e1dc708bd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
48__read_mostly int sched_clock_stable; 48__read_mostly int sched_clock_stable;
49 49
50struct sched_clock_data { 50struct sched_clock_data {
51 /*
52 * Raw spinlock - this is a special case: this might be called
53 * from within instrumentation code so we dont want to do any
54 * instrumentation ourselves.
55 */
56 raw_spinlock_t lock;
57
58 u64 tick_raw; 51 u64 tick_raw;
59 u64 tick_gtod; 52 u64 tick_gtod;
60 u64 clock; 53 u64 clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
80 for_each_possible_cpu(cpu) { 73 for_each_possible_cpu(cpu) {
81 struct sched_clock_data *scd = cpu_sdc(cpu); 74 struct sched_clock_data *scd = cpu_sdc(cpu);
82 75
83 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
84 scd->tick_raw = 0; 76 scd->tick_raw = 0;
85 scd->tick_gtod = ktime_now; 77 scd->tick_gtod = ktime_now;
86 scd->clock = ktime_now; 78 scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
109 * - filter out backward motion 101 * - filter out backward motion
110 * - use the GTOD tick value to create a window to filter crazy TSC values 102 * - use the GTOD tick value to create a window to filter crazy TSC values
111 */ 103 */
112static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) 104static u64 sched_clock_local(struct sched_clock_data *scd)
113{ 105{
114 s64 delta = now - scd->tick_raw; 106 u64 now, clock, old_clock, min_clock, max_clock;
115 u64 clock, min_clock, max_clock; 107 s64 delta;
116 108
109again:
110 now = sched_clock();
111 delta = now - scd->tick_raw;
117 if (unlikely(delta < 0)) 112 if (unlikely(delta < 0))
118 delta = 0; 113 delta = 0;
119 114
115 old_clock = scd->clock;
116
120 /* 117 /*
121 * scd->clock = clamp(scd->tick_gtod + delta, 118 * scd->clock = clamp(scd->tick_gtod + delta,
122 * max(scd->tick_gtod, scd->clock), 119 * max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 */ 121 */
125 122
126 clock = scd->tick_gtod + delta; 123 clock = scd->tick_gtod + delta;
127 min_clock = wrap_max(scd->tick_gtod, scd->clock); 124 min_clock = wrap_max(scd->tick_gtod, old_clock);
128 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); 125 max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
129 126
130 clock = wrap_max(clock, min_clock); 127 clock = wrap_max(clock, min_clock);
131 clock = wrap_min(clock, max_clock); 128 clock = wrap_min(clock, max_clock);
132 129
133 scd->clock = clock; 130 if (cmpxchg(&scd->clock, old_clock, clock) != old_clock)
131 goto again;
134 132
135 return scd->clock; 133 return clock;
136} 134}
137 135
138static void lock_double_clock(struct sched_clock_data *data1, 136static u64 sched_clock_remote(struct sched_clock_data *scd)
139 struct sched_clock_data *data2)
140{ 137{
141 if (data1 < data2) { 138 struct sched_clock_data *my_scd = this_scd();
142 __raw_spin_lock(&data1->lock); 139 u64 this_clock, remote_clock;
143 __raw_spin_lock(&data2->lock); 140 u64 *ptr, old_val, val;
141
142 sched_clock_local(my_scd);
143again:
144 this_clock = my_scd->clock;
145 remote_clock = scd->clock;
146
147 /*
148 * Use the opportunity that we have both locks
149 * taken to couple the two clocks: we take the
150 * larger time as the latest time for both
151 * runqueues. (this creates monotonic movement)
152 */
153 if (likely((s64)(remote_clock - this_clock) < 0)) {
154 ptr = &scd->clock;
155 old_val = remote_clock;
156 val = this_clock;
144 } else { 157 } else {
145 __raw_spin_lock(&data2->lock); 158 /*
146 __raw_spin_lock(&data1->lock); 159 * Should be rare, but possible:
160 */
161 ptr = &my_scd->clock;
162 old_val = this_clock;
163 val = remote_clock;
147 } 164 }
165
166 if (cmpxchg(ptr, old_val, val) != old_val)
167 goto again;
168
169 return val;
148} 170}
149 171
150u64 sched_clock_cpu(int cpu) 172u64 sched_clock_cpu(int cpu)
151{ 173{
152 u64 now, clock, this_clock, remote_clock;
153 struct sched_clock_data *scd; 174 struct sched_clock_data *scd;
175 u64 clock;
176
177 WARN_ON_ONCE(!irqs_disabled());
154 178
155 if (sched_clock_stable) 179 if (sched_clock_stable)
156 return sched_clock(); 180 return sched_clock();
157 181
158 scd = cpu_sdc(cpu);
159
160 /*
161 * Normally this is not called in NMI context - but if it is,
162 * trying to do any locking here is totally lethal.
163 */
164 if (unlikely(in_nmi()))
165 return scd->clock;
166
167 if (unlikely(!sched_clock_running)) 182 if (unlikely(!sched_clock_running))
168 return 0ull; 183 return 0ull;
169 184
170 WARN_ON_ONCE(!irqs_disabled()); 185 scd = cpu_sdc(cpu);
171 now = sched_clock();
172
173 if (cpu != raw_smp_processor_id()) {
174 struct sched_clock_data *my_scd = this_scd();
175
176 lock_double_clock(scd, my_scd);
177
178 this_clock = __update_sched_clock(my_scd, now);
179 remote_clock = scd->clock;
180
181 /*
182 * Use the opportunity that we have both locks
183 * taken to couple the two clocks: we take the
184 * larger time as the latest time for both
185 * runqueues. (this creates monotonic movement)
186 */
187 if (likely((s64)(remote_clock - this_clock) < 0)) {
188 clock = this_clock;
189 scd->clock = clock;
190 } else {
191 /*
192 * Should be rare, but possible:
193 */
194 clock = remote_clock;
195 my_scd->clock = remote_clock;
196 }
197
198 __raw_spin_unlock(&my_scd->lock);
199 } else {
200 __raw_spin_lock(&scd->lock);
201 clock = __update_sched_clock(scd, now);
202 }
203 186
204 __raw_spin_unlock(&scd->lock); 187 if (cpu != smp_processor_id())
188 clock = sched_clock_remote(scd);
189 else
190 clock = sched_clock_local(scd);
205 191
206 return clock; 192 return clock;
207} 193}
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
223 now_gtod = ktime_to_ns(ktime_get()); 209 now_gtod = ktime_to_ns(ktime_get());
224 now = sched_clock(); 210 now = sched_clock();
225 211
226 __raw_spin_lock(&scd->lock);
227 scd->tick_raw = now; 212 scd->tick_raw = now;
228 scd->tick_gtod = now_gtod; 213 scd->tick_gtod = now_gtod;
229 __update_sched_clock(scd, now); 214 sched_clock_local(scd);
230 __raw_spin_unlock(&scd->lock);
231} 215}
232 216
233/* 217/*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cd73738f0d5f..ecc637a0d591 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
513 if (entity_is_task(curr)) { 513 if (entity_is_task(curr)) {
514 struct task_struct *curtask = task_of(curr); 514 struct task_struct *curtask = task_of(curr);
515 515
516 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
516 cpuacct_charge(curtask, delta_exec); 517 cpuacct_charge(curtask, delta_exec);
517 account_group_exec_runtime(curtask, delta_exec); 518 account_group_exec_runtime(curtask, delta_exec);
518 } 519 }
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..2e2e469a7fec 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
370 * 0 <= tv_nsec < NSEC_PER_SEC 370 * 0 <= tv_nsec < NSEC_PER_SEC
371 * For negative values only the tv_sec field is negative ! 371 * For negative values only the tv_sec field is negative !
372 */ 372 */
373void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) 373void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
374{ 374{
375 while (nsec >= NSEC_PER_SEC) { 375 while (nsec >= NSEC_PER_SEC) {
376 /*
377 * The following asm() prevents the compiler from
378 * optimising this loop into a modulo operation. See
379 * also __iter_div_u64_rem() in include/linux/time.h
380 */
381 asm("" : "+rm"(nsec));
376 nsec -= NSEC_PER_SEC; 382 nsec -= NSEC_PER_SEC;
377 ++sec; 383 ++sec;
378 } 384 }
379 while (nsec < 0) { 385 while (nsec < 0) {
386 asm("" : "+rm"(nsec));
380 nsec += NSEC_PER_SEC; 387 nsec += NSEC_PER_SEC;
381 --sec; 388 --sec;
382 } 389 }
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb811251..09113347d328 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
21 * 21 *
22 * TODO WishList: 22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */ 24 */
26 25
27#include <linux/clocksource.h> 26#include <linux/clocksource.h>
@@ -30,6 +29,7 @@
30#include <linux/module.h> 29#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h>
33 33
34void timecounter_init(struct timecounter *tc, 34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 35 const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL(timecounter_cyc2time);
109 109
110/* XXX - Would like a better way for initializing curr_clocksource */
111extern struct clocksource clocksource_jiffies;
112
113/*[Clocksource internal variables]--------- 110/*[Clocksource internal variables]---------
114 * curr_clocksource: 111 * curr_clocksource:
115 * currently selected clocksource. Initialized to clocksource_jiffies. 112 * currently selected clocksource.
116 * next_clocksource:
117 * pending next selected clocksource.
118 * clocksource_list: 113 * clocksource_list:
119 * linked list with the registered clocksources 114 * linked list with the registered clocksources
120 * clocksource_lock: 115 * clocksource_mutex:
121 * protects manipulations to curr_clocksource and next_clocksource 116 * protects manipulations to curr_clocksource and the clocksource_list
122 * and the clocksource_list
123 * override_name: 117 * override_name:
124 * Name of the user-specified clocksource. 118 * Name of the user-specified clocksource.
125 */ 119 */
126static struct clocksource *curr_clocksource = &clocksource_jiffies; 120static struct clocksource *curr_clocksource;
127static struct clocksource *next_clocksource;
128static struct clocksource *clocksource_override;
129static LIST_HEAD(clocksource_list); 121static LIST_HEAD(clocksource_list);
130static DEFINE_SPINLOCK(clocksource_lock); 122static DEFINE_MUTEX(clocksource_mutex);
131static char override_name[32]; 123static char override_name[32];
132static int finished_booting; 124static int finished_booting;
133 125
134/* clocksource_done_booting - Called near the end of core bootup
135 *
136 * Hack to avoid lots of clocksource churn at boot time.
137 * We use fs_initcall because we want this to start before
138 * device_initcall but after subsys_initcall.
139 */
140static int __init clocksource_done_booting(void)
141{
142 finished_booting = 1;
143 return 0;
144}
145fs_initcall(clocksource_done_booting);
146
147#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 126#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
127static void clocksource_watchdog_work(struct work_struct *work);
128
148static LIST_HEAD(watchdog_list); 129static LIST_HEAD(watchdog_list);
149static struct clocksource *watchdog; 130static struct clocksource *watchdog;
150static struct timer_list watchdog_timer; 131static struct timer_list watchdog_timer;
132static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
151static DEFINE_SPINLOCK(watchdog_lock); 133static DEFINE_SPINLOCK(watchdog_lock);
152static cycle_t watchdog_last; 134static cycle_t watchdog_last;
153static unsigned long watchdog_resumed; 135static int watchdog_running;
136
137static int clocksource_watchdog_kthread(void *data);
138static void __clocksource_change_rating(struct clocksource *cs, int rating);
154 139
155/* 140/*
156 * Interval: 0.5sec Threshold: 0.0625s 141 * Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
158#define WATCHDOG_INTERVAL (HZ >> 1) 143#define WATCHDOG_INTERVAL (HZ >> 1)
159#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) 144#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
160 145
161static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 146static void clocksource_watchdog_work(struct work_struct *work)
162{ 147{
163 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) 148 /*
164 return; 149 * If kthread_run fails the next watchdog scan over the
150 * watchdog_list will find the unstable clock again.
151 */
152 kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
153}
165 154
155static void __clocksource_unstable(struct clocksource *cs)
156{
157 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
158 cs->flags |= CLOCK_SOURCE_UNSTABLE;
159 if (finished_booting)
160 schedule_work(&watchdog_work);
161}
162
163static void clocksource_unstable(struct clocksource *cs, int64_t delta)
164{
166 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 165 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
167 cs->name, delta); 166 cs->name, delta);
168 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); 167 __clocksource_unstable(cs);
169 clocksource_change_rating(cs, 0); 168}
170 list_del(&cs->wd_list); 169
170/**
171 * clocksource_mark_unstable - mark clocksource unstable via watchdog
172 * @cs: clocksource to be marked unstable
173 *
174 * This function is called instead of clocksource_change_rating from
175 * cpu hotplug code to avoid a deadlock between the clocksource mutex
176 * and the cpu hotplug mutex. It defers the update of the clocksource
177 * to the watchdog thread.
178 */
179void clocksource_mark_unstable(struct clocksource *cs)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&watchdog_lock, flags);
184 if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
185 if (list_empty(&cs->wd_list))
186 list_add(&cs->wd_list, &watchdog_list);
187 __clocksource_unstable(cs);
188 }
189 spin_unlock_irqrestore(&watchdog_lock, flags);
171} 190}
172 191
173static void clocksource_watchdog(unsigned long data) 192static void clocksource_watchdog(unsigned long data)
174{ 193{
175 struct clocksource *cs, *tmp; 194 struct clocksource *cs;
176 cycle_t csnow, wdnow; 195 cycle_t csnow, wdnow;
177 int64_t wd_nsec, cs_nsec; 196 int64_t wd_nsec, cs_nsec;
178 int resumed; 197 int next_cpu;
179 198
180 spin_lock(&watchdog_lock); 199 spin_lock(&watchdog_lock);
181 200 if (!watchdog_running)
182 resumed = test_and_clear_bit(0, &watchdog_resumed); 201 goto out;
183 202
184 wdnow = watchdog->read(watchdog); 203 wdnow = watchdog->read(watchdog);
185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 204 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
205 watchdog->mult, watchdog->shift);
186 watchdog_last = wdnow; 206 watchdog_last = wdnow;
187 207
188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 208 list_for_each_entry(cs, &watchdog_list, wd_list) {
189 csnow = cs->read(cs);
190 209
191 if (unlikely(resumed)) { 210 /* Clocksource already marked unstable? */
192 cs->wd_last = csnow; 211 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
212 if (finished_booting)
213 schedule_work(&watchdog_work);
193 continue; 214 continue;
194 } 215 }
195 216
196 /* Initialized ? */ 217 csnow = cs->read(cs);
218
219 /* Clocksource initialized ? */
197 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 220 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
198 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
199 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
200 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
201 /*
202 * We just marked the clocksource as
203 * highres-capable, notify the rest of the
204 * system as well so that we transition
205 * into high-res mode:
206 */
207 tick_clock_notify();
208 }
209 cs->flags |= CLOCK_SOURCE_WATCHDOG; 221 cs->flags |= CLOCK_SOURCE_WATCHDOG;
210 cs->wd_last = csnow; 222 cs->wd_last = csnow;
211 } else { 223 continue;
212 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
213 cs->wd_last = csnow;
214 /* Check the delta. Might remove from the list ! */
215 clocksource_ratewd(cs, cs_nsec - wd_nsec);
216 } 224 }
217 }
218 225
219 if (!list_empty(&watchdog_list)) { 226 /* Check the deviation from the watchdog clocksource. */
220 /* 227 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
221 * Cycle through CPUs to check if the CPUs stay 228 cs->mask, cs->mult, cs->shift);
222 * synchronized to each other. 229 cs->wd_last = csnow;
223 */ 230 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
224 int next_cpu = cpumask_next(raw_smp_processor_id(), 231 clocksource_unstable(cs, cs_nsec - wd_nsec);
225 cpu_online_mask); 232 continue;
233 }
226 234
227 if (next_cpu >= nr_cpu_ids) 235 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
228 next_cpu = cpumask_first(cpu_online_mask); 236 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
229 watchdog_timer.expires += WATCHDOG_INTERVAL; 237 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
230 add_timer_on(&watchdog_timer, next_cpu); 238 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
239 /*
240 * We just marked the clocksource as highres-capable,
241 * notify the rest of the system as well so that we
242 * transition into high-res mode:
243 */
244 tick_clock_notify();
245 }
231 } 246 }
247
248 /*
249 * Cycle through CPUs to check if the CPUs stay synchronized
250 * to each other.
251 */
252 next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
253 if (next_cpu >= nr_cpu_ids)
254 next_cpu = cpumask_first(cpu_online_mask);
255 watchdog_timer.expires += WATCHDOG_INTERVAL;
256 add_timer_on(&watchdog_timer, next_cpu);
257out:
232 spin_unlock(&watchdog_lock); 258 spin_unlock(&watchdog_lock);
233} 259}
260
261static inline void clocksource_start_watchdog(void)
262{
263 if (watchdog_running || !watchdog || list_empty(&watchdog_list))
264 return;
265 init_timer(&watchdog_timer);
266 watchdog_timer.function = clocksource_watchdog;
267 watchdog_last = watchdog->read(watchdog);
268 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
269 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
270 watchdog_running = 1;
271}
272
273static inline void clocksource_stop_watchdog(void)
274{
275 if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
276 return;
277 del_timer(&watchdog_timer);
278 watchdog_running = 0;
279}
280
281static inline void clocksource_reset_watchdog(void)
282{
283 struct clocksource *cs;
284
285 list_for_each_entry(cs, &watchdog_list, wd_list)
286 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
287}
288
234static void clocksource_resume_watchdog(void) 289static void clocksource_resume_watchdog(void)
235{ 290{
236 set_bit(0, &watchdog_resumed); 291 unsigned long flags;
292
293 spin_lock_irqsave(&watchdog_lock, flags);
294 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags);
237} 296}
238 297
239static void clocksource_check_watchdog(struct clocksource *cs) 298static void clocksource_enqueue_watchdog(struct clocksource *cs)
240{ 299{
241 struct clocksource *cse;
242 unsigned long flags; 300 unsigned long flags;
243 301
244 spin_lock_irqsave(&watchdog_lock, flags); 302 spin_lock_irqsave(&watchdog_lock, flags);
245 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 303 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
246 int started = !list_empty(&watchdog_list); 304 /* cs is a clocksource to be watched. */
247
248 list_add(&cs->wd_list, &watchdog_list); 305 list_add(&cs->wd_list, &watchdog_list);
249 if (!started && watchdog) { 306 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
250 watchdog_last = watchdog->read(watchdog);
251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
252 add_timer_on(&watchdog_timer,
253 cpumask_first(cpu_online_mask));
254 }
255 } else { 307 } else {
308 /* cs is a watchdog. */
256 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 309 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
257 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 310 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
258 311 /* Pick the best watchdog. */
259 if (!watchdog || cs->rating > watchdog->rating) { 312 if (!watchdog || cs->rating > watchdog->rating) {
260 if (watchdog)
261 del_timer(&watchdog_timer);
262 watchdog = cs; 313 watchdog = cs;
263 init_timer(&watchdog_timer);
264 watchdog_timer.function = clocksource_watchdog;
265
266 /* Reset watchdog cycles */ 314 /* Reset watchdog cycles */
267 list_for_each_entry(cse, &watchdog_list, wd_list) 315 clocksource_reset_watchdog();
268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG; 316 }
269 /* Start if list is not empty */ 317 }
270 if (!list_empty(&watchdog_list)) { 318 /* Check if the watchdog timer needs to be started. */
271 watchdog_last = watchdog->read(watchdog); 319 clocksource_start_watchdog();
272 watchdog_timer.expires = 320 spin_unlock_irqrestore(&watchdog_lock, flags);
273 jiffies + WATCHDOG_INTERVAL; 321}
274 add_timer_on(&watchdog_timer, 322
275 cpumask_first(cpu_online_mask)); 323static void clocksource_dequeue_watchdog(struct clocksource *cs)
276 } 324{
325 struct clocksource *tmp;
326 unsigned long flags;
327
328 spin_lock_irqsave(&watchdog_lock, flags);
329 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
330 /* cs is a watched clocksource. */
331 list_del_init(&cs->wd_list);
332 } else if (cs == watchdog) {
333 /* Reset watchdog cycles */
334 clocksource_reset_watchdog();
335 /* Current watchdog is removed. Find an alternative. */
336 watchdog = NULL;
337 list_for_each_entry(tmp, &clocksource_list, list) {
338 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
339 continue;
340 if (!watchdog || tmp->rating > watchdog->rating)
341 watchdog = tmp;
277 } 342 }
278 } 343 }
344 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
345 /* Check if the watchdog timer needs to be stopped. */
346 clocksource_stop_watchdog();
279 spin_unlock_irqrestore(&watchdog_lock, flags); 347 spin_unlock_irqrestore(&watchdog_lock, flags);
280} 348}
281#else 349
282static void clocksource_check_watchdog(struct clocksource *cs) 350static int clocksource_watchdog_kthread(void *data)
351{
352 struct clocksource *cs, *tmp;
353 unsigned long flags;
354 LIST_HEAD(unstable);
355
356 mutex_lock(&clocksource_mutex);
357 spin_lock_irqsave(&watchdog_lock, flags);
358 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
359 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
360 list_del_init(&cs->wd_list);
361 list_add(&cs->wd_list, &unstable);
362 }
363 /* Check if the watchdog timer needs to be stopped. */
364 clocksource_stop_watchdog();
365 spin_unlock_irqrestore(&watchdog_lock, flags);
366
367 /* Needs to be done outside of watchdog lock */
368 list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
369 list_del_init(&cs->wd_list);
370 __clocksource_change_rating(cs, 0);
371 }
372 mutex_unlock(&clocksource_mutex);
373 return 0;
374}
375
376#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
377
378static void clocksource_enqueue_watchdog(struct clocksource *cs)
283{ 379{
284 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 380 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
285 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 381 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
286} 382}
287 383
384static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
288static inline void clocksource_resume_watchdog(void) { } 385static inline void clocksource_resume_watchdog(void) { }
289#endif 386static inline int clocksource_watchdog_kthread(void *data) { return 0; }
387
388#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
290 389
291/** 390/**
292 * clocksource_resume - resume the clocksource(s) 391 * clocksource_resume - resume the clocksource(s)
@@ -294,18 +393,16 @@ static inline void clocksource_resume_watchdog(void) { }
294void clocksource_resume(void) 393void clocksource_resume(void)
295{ 394{
296 struct clocksource *cs; 395 struct clocksource *cs;
297 unsigned long flags;
298 396
299 spin_lock_irqsave(&clocksource_lock, flags); 397 mutex_lock(&clocksource_mutex);
300 398
301 list_for_each_entry(cs, &clocksource_list, list) { 399 list_for_each_entry(cs, &clocksource_list, list)
302 if (cs->resume) 400 if (cs->resume)
303 cs->resume(); 401 cs->resume();
304 }
305 402
306 clocksource_resume_watchdog(); 403 clocksource_resume_watchdog();
307 404
308 spin_unlock_irqrestore(&clocksource_lock, flags); 405 mutex_unlock(&clocksource_mutex);
309} 406}
310 407
311/** 408/**
@@ -320,75 +417,94 @@ void clocksource_touch_watchdog(void)
320 clocksource_resume_watchdog(); 417 clocksource_resume_watchdog();
321} 418}
322 419
420#ifdef CONFIG_GENERIC_TIME
421
323/** 422/**
324 * clocksource_get_next - Returns the selected clocksource 423 * clocksource_select - Select the best clocksource available
424 *
425 * Private function. Must hold clocksource_mutex when called.
325 * 426 *
427 * Select the clocksource with the best rating, or the clocksource,
428 * which is selected by userspace override.
326 */ 429 */
327struct clocksource *clocksource_get_next(void) 430static void clocksource_select(void)
328{ 431{
329 unsigned long flags; 432 struct clocksource *best, *cs;
330 433
331 spin_lock_irqsave(&clocksource_lock, flags); 434 if (!finished_booting || list_empty(&clocksource_list))
332 if (next_clocksource && finished_booting) { 435 return;
333 curr_clocksource = next_clocksource; 436 /* First clocksource on the list has the best rating. */
334 next_clocksource = NULL; 437 best = list_first_entry(&clocksource_list, struct clocksource, list);
438 /* Check for the override clocksource. */
439 list_for_each_entry(cs, &clocksource_list, list) {
440 if (strcmp(cs->name, override_name) != 0)
441 continue;
442 /*
443 * Check to make sure we don't switch to a non-highres
444 * capable clocksource if the tick code is in oneshot
445 * mode (highres or nohz)
446 */
447 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
448 tick_oneshot_mode_active()) {
449 /* Override clocksource cannot be used. */
450 printk(KERN_WARNING "Override clocksource %s is not "
451 "HRT compatible. Cannot switch while in "
452 "HRT/NOHZ mode\n", cs->name);
453 override_name[0] = 0;
454 } else
455 /* Override clocksource can be used. */
456 best = cs;
457 break;
458 }
459 if (curr_clocksource != best) {
460 printk(KERN_INFO "Switching to clocksource %s\n", best->name);
461 curr_clocksource = best;
462 timekeeping_notify(curr_clocksource);
335 } 463 }
336 spin_unlock_irqrestore(&clocksource_lock, flags);
337
338 return curr_clocksource;
339} 464}
340 465
341/** 466#else /* CONFIG_GENERIC_TIME */
342 * select_clocksource - Selects the best registered clocksource. 467
343 * 468static inline void clocksource_select(void) { }
344 * Private function. Must hold clocksource_lock when called. 469
470#endif
471
472/*
473 * clocksource_done_booting - Called near the end of core bootup
345 * 474 *
346 * Select the clocksource with the best rating, or the clocksource, 475 * Hack to avoid lots of clocksource churn at boot time.
347 * which is selected by userspace override. 476 * We use fs_initcall because we want this to start before
477 * device_initcall but after subsys_initcall.
348 */ 478 */
349static struct clocksource *select_clocksource(void) 479static int __init clocksource_done_booting(void)
350{ 480{
351 struct clocksource *next; 481 finished_booting = 1;
352
353 if (list_empty(&clocksource_list))
354 return NULL;
355
356 if (clocksource_override)
357 next = clocksource_override;
358 else
359 next = list_entry(clocksource_list.next, struct clocksource,
360 list);
361 482
362 if (next == curr_clocksource) 483 /*
363 return NULL; 484 * Run the watchdog first to eliminate unstable clock sources
485 */
486 clocksource_watchdog_kthread(NULL);
364 487
365 return next; 488 mutex_lock(&clocksource_mutex);
489 clocksource_select();
490 mutex_unlock(&clocksource_mutex);
491 return 0;
366} 492}
493fs_initcall(clocksource_done_booting);
367 494
368/* 495/*
369 * Enqueue the clocksource sorted by rating 496 * Enqueue the clocksource sorted by rating
370 */ 497 */
371static int clocksource_enqueue(struct clocksource *c) 498static void clocksource_enqueue(struct clocksource *cs)
372{ 499{
373 struct list_head *tmp, *entry = &clocksource_list; 500 struct list_head *entry = &clocksource_list;
501 struct clocksource *tmp;
374 502
375 list_for_each(tmp, &clocksource_list) { 503 list_for_each_entry(tmp, &clocksource_list, list)
376 struct clocksource *cs;
377
378 cs = list_entry(tmp, struct clocksource, list);
379 if (cs == c)
380 return -EBUSY;
381 /* Keep track of the place, where to insert */ 504 /* Keep track of the place, where to insert */
382 if (cs->rating >= c->rating) 505 if (tmp->rating >= cs->rating)
383 entry = tmp; 506 entry = &tmp->list;
384 } 507 list_add(&cs->list, entry);
385 list_add(&c->list, entry);
386
387 if (strlen(c->name) == strlen(override_name) &&
388 !strcmp(c->name, override_name))
389 clocksource_override = c;
390
391 return 0;
392} 508}
393 509
394/** 510/**
@@ -397,52 +513,48 @@ static int clocksource_enqueue(struct clocksource *c)
397 * 513 *
398 * Returns -EBUSY if registration fails, zero otherwise. 514 * Returns -EBUSY if registration fails, zero otherwise.
399 */ 515 */
400int clocksource_register(struct clocksource *c) 516int clocksource_register(struct clocksource *cs)
401{ 517{
402 unsigned long flags; 518 mutex_lock(&clocksource_mutex);
403 int ret; 519 clocksource_enqueue(cs);
404 520 clocksource_select();
405 spin_lock_irqsave(&clocksource_lock, flags); 521 clocksource_enqueue_watchdog(cs);
406 ret = clocksource_enqueue(c); 522 mutex_unlock(&clocksource_mutex);
407 if (!ret) 523 return 0;
408 next_clocksource = select_clocksource();
409 spin_unlock_irqrestore(&clocksource_lock, flags);
410 if (!ret)
411 clocksource_check_watchdog(c);
412 return ret;
413} 524}
414EXPORT_SYMBOL(clocksource_register); 525EXPORT_SYMBOL(clocksource_register);
415 526
527static void __clocksource_change_rating(struct clocksource *cs, int rating)
528{
529 list_del(&cs->list);
530 cs->rating = rating;
531 clocksource_enqueue(cs);
532 clocksource_select();
533}
534
416/** 535/**
417 * clocksource_change_rating - Change the rating of a registered clocksource 536 * clocksource_change_rating - Change the rating of a registered clocksource
418 *
419 */ 537 */
420void clocksource_change_rating(struct clocksource *cs, int rating) 538void clocksource_change_rating(struct clocksource *cs, int rating)
421{ 539{
422 unsigned long flags; 540 mutex_lock(&clocksource_mutex);
423 541 __clocksource_change_rating(cs, rating);
424 spin_lock_irqsave(&clocksource_lock, flags); 542 mutex_unlock(&clocksource_mutex);
425 list_del(&cs->list);
426 cs->rating = rating;
427 clocksource_enqueue(cs);
428 next_clocksource = select_clocksource();
429 spin_unlock_irqrestore(&clocksource_lock, flags);
430} 543}
544EXPORT_SYMBOL(clocksource_change_rating);
431 545
432/** 546/**
433 * clocksource_unregister - remove a registered clocksource 547 * clocksource_unregister - remove a registered clocksource
434 */ 548 */
435void clocksource_unregister(struct clocksource *cs) 549void clocksource_unregister(struct clocksource *cs)
436{ 550{
437 unsigned long flags; 551 mutex_lock(&clocksource_mutex);
438 552 clocksource_dequeue_watchdog(cs);
439 spin_lock_irqsave(&clocksource_lock, flags);
440 list_del(&cs->list); 553 list_del(&cs->list);
441 if (clocksource_override == cs) 554 clocksource_select();
442 clocksource_override = NULL; 555 mutex_unlock(&clocksource_mutex);
443 next_clocksource = select_clocksource();
444 spin_unlock_irqrestore(&clocksource_lock, flags);
445} 556}
557EXPORT_SYMBOL(clocksource_unregister);
446 558
447#ifdef CONFIG_SYSFS 559#ifdef CONFIG_SYSFS
448/** 560/**
@@ -458,9 +570,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
458{ 570{
459 ssize_t count = 0; 571 ssize_t count = 0;
460 572
461 spin_lock_irq(&clocksource_lock); 573 mutex_lock(&clocksource_mutex);
462 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); 574 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
463 spin_unlock_irq(&clocksource_lock); 575 mutex_unlock(&clocksource_mutex);
464 576
465 return count; 577 return count;
466} 578}
@@ -478,9 +590,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
478 struct sysdev_attribute *attr, 590 struct sysdev_attribute *attr,
479 const char *buf, size_t count) 591 const char *buf, size_t count)
480{ 592{
481 struct clocksource *ovr = NULL;
482 size_t ret = count; 593 size_t ret = count;
483 int len;
484 594
485 /* strings from sysfs write are not 0 terminated! */ 595 /* strings from sysfs write are not 0 terminated! */
486 if (count >= sizeof(override_name)) 596 if (count >= sizeof(override_name))
@@ -490,44 +600,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
490 if (buf[count-1] == '\n') 600 if (buf[count-1] == '\n')
491 count--; 601 count--;
492 602
493 spin_lock_irq(&clocksource_lock); 603 mutex_lock(&clocksource_mutex);
494 604
495 if (count > 0) 605 if (count > 0)
496 memcpy(override_name, buf, count); 606 memcpy(override_name, buf, count);
497 override_name[count] = 0; 607 override_name[count] = 0;
608 clocksource_select();
498 609
499 len = strlen(override_name); 610 mutex_unlock(&clocksource_mutex);
500 if (len) {
501 struct clocksource *cs;
502
503 ovr = clocksource_override;
504 /* try to select it: */
505 list_for_each_entry(cs, &clocksource_list, list) {
506 if (strlen(cs->name) == len &&
507 !strcmp(cs->name, override_name))
508 ovr = cs;
509 }
510 }
511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
524 /* Reselect, when the override name has changed */
525 if (ovr != clocksource_override) {
526 clocksource_override = ovr;
527 next_clocksource = select_clocksource();
528 }
529
530 spin_unlock_irq(&clocksource_lock);
531 611
532 return ret; 612 return ret;
533} 613}
@@ -547,7 +627,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
547 struct clocksource *src; 627 struct clocksource *src;
548 ssize_t count = 0; 628 ssize_t count = 0;
549 629
550 spin_lock_irq(&clocksource_lock); 630 mutex_lock(&clocksource_mutex);
551 list_for_each_entry(src, &clocksource_list, list) { 631 list_for_each_entry(src, &clocksource_list, list) {
552 /* 632 /*
553 * Don't show non-HRES clocksource if the tick code is 633 * Don't show non-HRES clocksource if the tick code is
@@ -559,7 +639,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 639 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
560 "%s ", src->name); 640 "%s ", src->name);
561 } 641 }
562 spin_unlock_irq(&clocksource_lock); 642 mutex_unlock(&clocksource_mutex);
563 643
564 count += snprintf(buf + count, 644 count += snprintf(buf + count,
565 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); 645 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +694,10 @@ device_initcall(init_clocksource_sysfs);
614 */ 694 */
615static int __init boot_override_clocksource(char* str) 695static int __init boot_override_clocksource(char* str)
616{ 696{
617 unsigned long flags; 697 mutex_lock(&clocksource_mutex);
618 spin_lock_irqsave(&clocksource_lock, flags);
619 if (str) 698 if (str)
620 strlcpy(override_name, str, sizeof(override_name)); 699 strlcpy(override_name, str, sizeof(override_name));
621 spin_unlock_irqrestore(&clocksource_lock, flags); 700 mutex_unlock(&clocksource_mutex);
622 return 1; 701 return 1;
623} 702}
624 703
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
65 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
66}; 65};
67 66
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
71} 70}
72 71
73core_initcall(init_jiffies_clocksource); 72core_initcall(init_jiffies_clocksource);
73
74struct clocksource * __init __weak clocksource_default_clock(void)
75{
76 return &clocksource_jiffies;
77}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
194 case TIME_OK: 194 case TIME_OK:
195 break; 195 break;
196 case TIME_INS: 196 case TIME_INS:
197 xtime.tv_sec--; 197 timekeeping_leap_insert(-1);
198 wall_to_monotonic.tv_sec++;
199 time_state = TIME_OOP; 198 time_state = TIME_OOP;
200 printk(KERN_NOTICE 199 printk(KERN_NOTICE
201 "Clock: inserting leap second 23:59:60 UTC\n"); 200 "Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
203 res = HRTIMER_RESTART; 202 res = HRTIMER_RESTART;
204 break; 203 break;
205 case TIME_DEL: 204 case TIME_DEL:
206 xtime.tv_sec++; 205 timekeeping_leap_insert(1);
207 time_tai--; 206 time_tai--;
208 wall_to_monotonic.tv_sec--;
209 time_state = TIME_WAIT; 207 time_state = TIME_WAIT;
210 printk(KERN_NOTICE 208 printk(KERN_NOTICE
211 "Clock: deleting leap second 23:59:59 UTC\n"); 209 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
219 time_state = TIME_OK; 217 time_state = TIME_OK;
220 break; 218 break;
221 } 219 }
222 update_vsyscall(&xtime, clock);
223 220
224 write_sequnlock(&xtime_lock); 221 write_sequnlock(&xtime_lock);
225 222
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..fb0f46fa1ecd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -18,7 +18,117 @@
18#include <linux/jiffies.h> 18#include <linux/jiffies.h>
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/tick.h> 20#include <linux/tick.h>
21#include <linux/stop_machine.h>
22
23/* Structure holding internal timekeeping values. */
24struct timekeeper {
25 /* Current clocksource used for timekeeping. */
26 struct clocksource *clock;
27 /* The shift value of the current clocksource. */
28 int shift;
29
30 /* Number of clock cycles in one NTP interval. */
31 cycle_t cycle_interval;
32 /* Number of clock shifted nano seconds in one NTP interval. */
33 u64 xtime_interval;
34 /* Raw nano seconds accumulated per NTP interval. */
35 u32 raw_interval;
36
37 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
38 u64 xtime_nsec;
39 /* Difference between accumulated time and NTP time in ntp
40 * shifted nano seconds. */
41 s64 ntp_error;
42 /* Shift conversion between clock shifted nano seconds and
43 * ntp shifted nano seconds. */
44 int ntp_error_shift;
45 /* NTP adjusted clock multiplier */
46 u32 mult;
47};
48
49struct timekeeper timekeeper;
50
51/**
52 * timekeeper_setup_internals - Set up internals to use clocksource clock.
53 *
54 * @clock: Pointer to clocksource.
55 *
56 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
57 * pair and interval request.
58 *
59 * Unless you're the timekeeping code, you should not be using this!
60 */
61static void timekeeper_setup_internals(struct clocksource *clock)
62{
63 cycle_t interval;
64 u64 tmp;
65
66 timekeeper.clock = clock;
67 clock->cycle_last = clock->read(clock);
21 68
69 /* Do the ns -> cycle conversion first, using original mult */
70 tmp = NTP_INTERVAL_LENGTH;
71 tmp <<= clock->shift;
72 tmp += clock->mult/2;
73 do_div(tmp, clock->mult);
74 if (tmp == 0)
75 tmp = 1;
76
77 interval = (cycle_t) tmp;
78 timekeeper.cycle_interval = interval;
79
80 /* Go back from cycles -> shifted ns */
81 timekeeper.xtime_interval = (u64) interval * clock->mult;
82 timekeeper.raw_interval =
83 ((u64) interval * clock->mult) >> clock->shift;
84
85 timekeeper.xtime_nsec = 0;
86 timekeeper.shift = clock->shift;
87
88 timekeeper.ntp_error = 0;
89 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
90
91 /*
92 * The timekeeper keeps its own mult values for the currently
93 * active clocksource. These value will be adjusted via NTP
94 * to counteract clock drifting.
95 */
96 timekeeper.mult = clock->mult;
97}
98
99/* Timekeeper helper functions. */
100static inline s64 timekeeping_get_ns(void)
101{
102 cycle_t cycle_now, cycle_delta;
103 struct clocksource *clock;
104
105 /* read clocksource: */
106 clock = timekeeper.clock;
107 cycle_now = clock->read(clock);
108
109 /* calculate the delta since the last update_wall_time: */
110 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
111
112 /* return delta convert to nanoseconds using ntp adjusted mult. */
113 return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
114 timekeeper.shift);
115}
116
117static inline s64 timekeeping_get_ns_raw(void)
118{
119 cycle_t cycle_now, cycle_delta;
120 struct clocksource *clock;
121
122 /* read clocksource: */
123 clock = timekeeper.clock;
124 cycle_now = clock->read(clock);
125
126 /* calculate the delta since the last update_wall_time: */
127 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
128
129 /* return delta convert to nanoseconds using ntp adjusted mult. */
130 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
131}
22 132
23/* 133/*
24 * This read-write spinlock protects us from races in SMP while 134 * This read-write spinlock protects us from races in SMP while
@@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
44 */ 154 */
45struct timespec xtime __attribute__ ((aligned (16))); 155struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 156struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 157static struct timespec total_sleep_time;
158
159/*
160 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
161 */
162struct timespec raw_time;
48 163
49/* flag for if timekeeping is suspended */ 164/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended; 165int __read_mostly timekeeping_suspended;
@@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec)
56 timespec_add_ns(&xtime_cache, nsec); 171 timespec_add_ns(&xtime_cache, nsec);
57} 172}
58 173
59struct clocksource *clock; 174/* must hold xtime_lock */
60 175void timekeeping_leap_insert(int leapsecond)
176{
177 xtime.tv_sec += leapsecond;
178 wall_to_monotonic.tv_sec -= leapsecond;
179 update_vsyscall(&xtime, timekeeper.clock);
180}
61 181
62#ifdef CONFIG_GENERIC_TIME 182#ifdef CONFIG_GENERIC_TIME
183
63/** 184/**
64 * clocksource_forward_now - update clock to the current time 185 * timekeeping_forward_now - update clock to the current time
65 * 186 *
66 * Forward the current clock to update its state since the last call to 187 * Forward the current clock to update its state since the last call to
67 * update_wall_time(). This is useful before significant clock changes, 188 * update_wall_time(). This is useful before significant clock changes,
68 * as it avoids having to deal with this time offset explicitly. 189 * as it avoids having to deal with this time offset explicitly.
69 */ 190 */
70static void clocksource_forward_now(void) 191static void timekeeping_forward_now(void)
71{ 192{
72 cycle_t cycle_now, cycle_delta; 193 cycle_t cycle_now, cycle_delta;
194 struct clocksource *clock;
73 s64 nsec; 195 s64 nsec;
74 196
75 cycle_now = clocksource_read(clock); 197 clock = timekeeper.clock;
198 cycle_now = clock->read(clock);
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 199 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
77 clock->cycle_last = cycle_now; 200 clock->cycle_last = cycle_now;
78 201
79 nsec = cyc2ns(clock, cycle_delta); 202 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
203 timekeeper.shift);
80 204
81 /* If arch requires, add in gettimeoffset() */ 205 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset(); 206 nsec += arch_gettimeoffset();
83 207
84 timespec_add_ns(&xtime, nsec); 208 timespec_add_ns(&xtime, nsec);
85 209
86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 210 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
87 clock->raw_time.tv_nsec += nsec; 211 timespec_add_ns(&raw_time, nsec);
88} 212}
89 213
90/** 214/**
@@ -95,7 +219,6 @@ static void clocksource_forward_now(void)
95 */ 219 */
96void getnstimeofday(struct timespec *ts) 220void getnstimeofday(struct timespec *ts)
97{ 221{
98 cycle_t cycle_now, cycle_delta;
99 unsigned long seq; 222 unsigned long seq;
100 s64 nsecs; 223 s64 nsecs;
101 224
@@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts)
105 seq = read_seqbegin(&xtime_lock); 228 seq = read_seqbegin(&xtime_lock);
106 229
107 *ts = xtime; 230 *ts = xtime;
108 231 nsecs = timekeeping_get_ns();
109 /* read clocksource: */
110 cycle_now = clocksource_read(clock);
111
112 /* calculate the delta since the last update_wall_time: */
113 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
114
115 /* convert to nanoseconds: */
116 nsecs = cyc2ns(clock, cycle_delta);
117 232
118 /* If arch requires, add in gettimeoffset() */ 233 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset(); 234 nsecs += arch_gettimeoffset();
@@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts)
125 240
126EXPORT_SYMBOL(getnstimeofday); 241EXPORT_SYMBOL(getnstimeofday);
127 242
243ktime_t ktime_get(void)
244{
245 unsigned int seq;
246 s64 secs, nsecs;
247
248 WARN_ON(timekeeping_suspended);
249
250 do {
251 seq = read_seqbegin(&xtime_lock);
252 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
253 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
254 nsecs += timekeeping_get_ns();
255
256 } while (read_seqretry(&xtime_lock, seq));
257 /*
258 * Use ktime_set/ktime_add_ns to create a proper ktime on
259 * 32-bit architectures without CONFIG_KTIME_SCALAR.
260 */
261 return ktime_add_ns(ktime_set(secs, 0), nsecs);
262}
263EXPORT_SYMBOL_GPL(ktime_get);
264
265/**
266 * ktime_get_ts - get the monotonic clock in timespec format
267 * @ts: pointer to timespec variable
268 *
269 * The function calculates the monotonic clock from the realtime
270 * clock and the wall_to_monotonic offset and stores the result
271 * in normalized timespec format in the variable pointed to by @ts.
272 */
273void ktime_get_ts(struct timespec *ts)
274{
275 struct timespec tomono;
276 unsigned int seq;
277 s64 nsecs;
278
279 WARN_ON(timekeeping_suspended);
280
281 do {
282 seq = read_seqbegin(&xtime_lock);
283 *ts = xtime;
284 tomono = wall_to_monotonic;
285 nsecs = timekeeping_get_ns();
286
287 } while (read_seqretry(&xtime_lock, seq));
288
289 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
290 ts->tv_nsec + tomono.tv_nsec + nsecs);
291}
292EXPORT_SYMBOL_GPL(ktime_get_ts);
293
128/** 294/**
129 * do_gettimeofday - Returns the time of day in a timeval 295 * do_gettimeofday - Returns the time of day in a timeval
130 * @tv: pointer to the timeval to be set 296 * @tv: pointer to the timeval to be set
@@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv)
157 323
158 write_seqlock_irqsave(&xtime_lock, flags); 324 write_seqlock_irqsave(&xtime_lock, flags);
159 325
160 clocksource_forward_now(); 326 timekeeping_forward_now();
161 327
162 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 328 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
163 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 329 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv)
167 333
168 update_xtime_cache(0); 334 update_xtime_cache(0);
169 335
170 clock->error = 0; 336 timekeeper.ntp_error = 0;
171 ntp_clear(); 337 ntp_clear();
172 338
173 update_vsyscall(&xtime, clock); 339 update_vsyscall(&xtime, timekeeper.clock);
174 340
175 write_sequnlock_irqrestore(&xtime_lock, flags); 341 write_sequnlock_irqrestore(&xtime_lock, flags);
176 342
@@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday);
187 * 353 *
188 * Accumulates current time interval and initializes new clocksource 354 * Accumulates current time interval and initializes new clocksource
189 */ 355 */
190static void change_clocksource(void) 356static int change_clocksource(void *data)
191{ 357{
192 struct clocksource *new, *old; 358 struct clocksource *new, *old;
193 359
194 new = clocksource_get_next(); 360 new = (struct clocksource *) data;
361
362 timekeeping_forward_now();
363 if (!new->enable || new->enable(new) == 0) {
364 old = timekeeper.clock;
365 timekeeper_setup_internals(new);
366 if (old->disable)
367 old->disable(old);
368 }
369 return 0;
370}
195 371
196 if (clock == new) 372/**
373 * timekeeping_notify - Install a new clock source
374 * @clock: pointer to the clock source
375 *
376 * This function is called from clocksource.c after a new, better clock
377 * source has been registered. The caller holds the clocksource_mutex.
378 */
379void timekeeping_notify(struct clocksource *clock)
380{
381 if (timekeeper.clock == clock)
197 return; 382 return;
383 stop_machine(change_clocksource, clock, NULL);
384 tick_clock_notify();
385}
198 386
199 clocksource_forward_now(); 387#else /* GENERIC_TIME */
200 388
201 if (clocksource_enable(new)) 389static inline void timekeeping_forward_now(void) { }
202 return;
203 390
204 new->raw_time = clock->raw_time; 391/**
205 old = clock; 392 * ktime_get - get the monotonic time in ktime_t format
206 clock = new; 393 *
207 clocksource_disable(old); 394 * returns the time in ktime_t format
395 */
396ktime_t ktime_get(void)
397{
398 struct timespec now;
208 399
209 clock->cycle_last = 0; 400 ktime_get_ts(&now);
210 clock->cycle_last = clocksource_read(clock);
211 clock->error = 0;
212 clock->xtime_nsec = 0;
213 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
214 401
215 tick_clock_notify(); 402 return timespec_to_ktime(now);
403}
404EXPORT_SYMBOL_GPL(ktime_get);
216 405
217 /* 406/**
218 * We're holding xtime lock and waking up klogd would deadlock 407 * ktime_get_ts - get the monotonic clock in timespec format
219 * us on enqueue. So no printing! 408 * @ts: pointer to timespec variable
220 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 409 *
221 clock->name); 410 * The function calculates the monotonic clock from the realtime
222 */ 411 * clock and the wall_to_monotonic offset and stores the result
412 * in normalized timespec format in the variable pointed to by @ts.
413 */
414void ktime_get_ts(struct timespec *ts)
415{
416 struct timespec tomono;
417 unsigned long seq;
418
419 do {
420 seq = read_seqbegin(&xtime_lock);
421 getnstimeofday(ts);
422 tomono = wall_to_monotonic;
423
424 } while (read_seqretry(&xtime_lock, seq));
425
426 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
427 ts->tv_nsec + tomono.tv_nsec);
223} 428}
224#else 429EXPORT_SYMBOL_GPL(ktime_get_ts);
225static inline void clocksource_forward_now(void) { } 430
226static inline void change_clocksource(void) { } 431#endif /* !GENERIC_TIME */
227#endif 432
433/**
434 * ktime_get_real - get the real (wall-) time in ktime_t format
435 *
436 * returns the time in ktime_t format
437 */
438ktime_t ktime_get_real(void)
439{
440 struct timespec now;
441
442 getnstimeofday(&now);
443
444 return timespec_to_ktime(now);
445}
446EXPORT_SYMBOL_GPL(ktime_get_real);
228 447
229/** 448/**
230 * getrawmonotonic - Returns the raw monotonic time in a timespec 449 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts)
236{ 455{
237 unsigned long seq; 456 unsigned long seq;
238 s64 nsecs; 457 s64 nsecs;
239 cycle_t cycle_now, cycle_delta;
240 458
241 do { 459 do {
242 seq = read_seqbegin(&xtime_lock); 460 seq = read_seqbegin(&xtime_lock);
243 461 nsecs = timekeeping_get_ns_raw();
244 /* read clocksource: */ 462 *ts = raw_time;
245 cycle_now = clocksource_read(clock);
246
247 /* calculate the delta since the last update_wall_time: */
248 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
249
250 /* convert to nanoseconds: */
251 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
252
253 *ts = clock->raw_time;
254 463
255 } while (read_seqretry(&xtime_lock, seq)); 464 } while (read_seqretry(&xtime_lock, seq));
256 465
@@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void)
270 do { 479 do {
271 seq = read_seqbegin(&xtime_lock); 480 seq = read_seqbegin(&xtime_lock);
272 481
273 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 482 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
274 483
275 } while (read_seqretry(&xtime_lock, seq)); 484 } while (read_seqretry(&xtime_lock, seq));
276 485
@@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void)
278} 487}
279 488
280/** 489/**
281 * read_persistent_clock - Return time in seconds from the persistent clock. 490 * read_persistent_clock - Return time from the persistent clock.
282 * 491 *
283 * Weak dummy function for arches that do not yet support it. 492 * Weak dummy function for arches that do not yet support it.
284 * Returns seconds from epoch using the battery backed persistent clock. 493 * Reads the time from the battery backed persistent clock.
285 * Returns zero if unsupported. 494 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
286 * 495 *
287 * XXX - Do be sure to remove it once all arches implement it. 496 * XXX - Do be sure to remove it once all arches implement it.
288 */ 497 */
289unsigned long __attribute__((weak)) read_persistent_clock(void) 498void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
290{ 499{
291 return 0; 500 ts->tv_sec = 0;
501 ts->tv_nsec = 0;
502}
503
504/**
505 * read_boot_clock - Return time of the system start.
506 *
507 * Weak dummy function for arches that do not yet support it.
508 * Function to read the exact time the system has been started.
509 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
510 *
511 * XXX - Do be sure to remove it once all arches implement it.
512 */
513void __attribute__((weak)) read_boot_clock(struct timespec *ts)
514{
515 ts->tv_sec = 0;
516 ts->tv_nsec = 0;
292} 517}
293 518
294/* 519/*
@@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
296 */ 521 */
297void __init timekeeping_init(void) 522void __init timekeeping_init(void)
298{ 523{
524 struct clocksource *clock;
299 unsigned long flags; 525 unsigned long flags;
300 unsigned long sec = read_persistent_clock(); 526 struct timespec now, boot;
527
528 read_persistent_clock(&now);
529 read_boot_clock(&boot);
301 530
302 write_seqlock_irqsave(&xtime_lock, flags); 531 write_seqlock_irqsave(&xtime_lock, flags);
303 532
304 ntp_init(); 533 ntp_init();
305 534
306 clock = clocksource_get_next(); 535 clock = clocksource_default_clock();
307 clocksource_enable(clock); 536 if (clock->enable)
308 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 537 clock->enable(clock);
309 clock->cycle_last = clocksource_read(clock); 538 timekeeper_setup_internals(clock);
310 539
311 xtime.tv_sec = sec; 540 xtime.tv_sec = now.tv_sec;
312 xtime.tv_nsec = 0; 541 xtime.tv_nsec = now.tv_nsec;
542 raw_time.tv_sec = 0;
543 raw_time.tv_nsec = 0;
544 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
545 boot.tv_sec = xtime.tv_sec;
546 boot.tv_nsec = xtime.tv_nsec;
547 }
313 set_normalized_timespec(&wall_to_monotonic, 548 set_normalized_timespec(&wall_to_monotonic,
314 -xtime.tv_sec, -xtime.tv_nsec); 549 -boot.tv_sec, -boot.tv_nsec);
315 update_xtime_cache(0); 550 update_xtime_cache(0);
316 total_sleep_time = 0; 551 total_sleep_time.tv_sec = 0;
552 total_sleep_time.tv_nsec = 0;
317 write_sequnlock_irqrestore(&xtime_lock, flags); 553 write_sequnlock_irqrestore(&xtime_lock, flags);
318} 554}
319 555
320/* time in seconds when suspend began */ 556/* time in seconds when suspend began */
321static unsigned long timekeeping_suspend_time; 557static struct timespec timekeeping_suspend_time;
322 558
323/** 559/**
324 * timekeeping_resume - Resumes the generic timekeeping subsystem. 560 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time;
331static int timekeeping_resume(struct sys_device *dev) 567static int timekeeping_resume(struct sys_device *dev)
332{ 568{
333 unsigned long flags; 569 unsigned long flags;
334 unsigned long now = read_persistent_clock(); 570 struct timespec ts;
571
572 read_persistent_clock(&ts);
335 573
336 clocksource_resume(); 574 clocksource_resume();
337 575
338 write_seqlock_irqsave(&xtime_lock, flags); 576 write_seqlock_irqsave(&xtime_lock, flags);
339 577
340 if (now && (now > timekeeping_suspend_time)) { 578 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
341 unsigned long sleep_length = now - timekeeping_suspend_time; 579 ts = timespec_sub(ts, timekeeping_suspend_time);
342 580 xtime = timespec_add_safe(xtime, ts);
343 xtime.tv_sec += sleep_length; 581 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
344 wall_to_monotonic.tv_sec -= sleep_length; 582 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
345 total_sleep_time += sleep_length;
346 } 583 }
347 update_xtime_cache(0); 584 update_xtime_cache(0);
348 /* re-base the last cycle value */ 585 /* re-base the last cycle value */
349 clock->cycle_last = 0; 586 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
350 clock->cycle_last = clocksource_read(clock); 587 timekeeper.ntp_error = 0;
351 clock->error = 0;
352 timekeeping_suspended = 0; 588 timekeeping_suspended = 0;
353 write_sequnlock_irqrestore(&xtime_lock, flags); 589 write_sequnlock_irqrestore(&xtime_lock, flags);
354 590
@@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
366{ 602{
367 unsigned long flags; 603 unsigned long flags;
368 604
369 timekeeping_suspend_time = read_persistent_clock(); 605 read_persistent_clock(&timekeeping_suspend_time);
370 606
371 write_seqlock_irqsave(&xtime_lock, flags); 607 write_seqlock_irqsave(&xtime_lock, flags);
372 clocksource_forward_now(); 608 timekeeping_forward_now();
373 timekeeping_suspended = 1; 609 timekeeping_suspended = 1;
374 write_sequnlock_irqrestore(&xtime_lock, flags); 610 write_sequnlock_irqrestore(&xtime_lock, flags);
375 611
@@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device);
404 * If the error is already larger, we look ahead even further 640 * If the error is already larger, we look ahead even further
405 * to compensate for late or lost adjustments. 641 * to compensate for late or lost adjustments.
406 */ 642 */
407static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, 643static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
408 s64 *offset) 644 s64 *offset)
409{ 645{
410 s64 tick_error, i; 646 s64 tick_error, i;
@@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
420 * here. This is tuned so that an error of about 1 msec is adjusted 656 * here. This is tuned so that an error of about 1 msec is adjusted
421 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 657 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
422 */ 658 */
423 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 659 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
424 error2 = abs(error2); 660 error2 = abs(error2);
425 for (look_ahead = 0; error2 > 0; look_ahead++) 661 for (look_ahead = 0; error2 > 0; look_ahead++)
426 error2 >>= 2; 662 error2 >>= 2;
@@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
429 * Now calculate the error in (1 << look_ahead) ticks, but first 665 * Now calculate the error in (1 << look_ahead) ticks, but first
430 * remove the single look ahead already included in the error. 666 * remove the single look ahead already included in the error.
431 */ 667 */
432 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); 668 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
433 tick_error -= clock->xtime_interval >> 1; 669 tick_error -= timekeeper.xtime_interval >> 1;
434 error = ((error - tick_error) >> look_ahead) + tick_error; 670 error = ((error - tick_error) >> look_ahead) + tick_error;
435 671
436 /* Finally calculate the adjustment shift value. */ 672 /* Finally calculate the adjustment shift value. */
@@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
455 * this is optimized for the most common adjustments of -1,0,1, 691 * this is optimized for the most common adjustments of -1,0,1,
456 * for other values we can do a bit more work. 692 * for other values we can do a bit more work.
457 */ 693 */
458static void clocksource_adjust(s64 offset) 694static void timekeeping_adjust(s64 offset)
459{ 695{
460 s64 error, interval = clock->cycle_interval; 696 s64 error, interval = timekeeper.cycle_interval;
461 int adj; 697 int adj;
462 698
463 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); 699 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
464 if (error > interval) { 700 if (error > interval) {
465 error >>= 2; 701 error >>= 2;
466 if (likely(error <= interval)) 702 if (likely(error <= interval))
467 adj = 1; 703 adj = 1;
468 else 704 else
469 adj = clocksource_bigadjust(error, &interval, &offset); 705 adj = timekeeping_bigadjust(error, &interval, &offset);
470 } else if (error < -interval) { 706 } else if (error < -interval) {
471 error >>= 2; 707 error >>= 2;
472 if (likely(error >= -interval)) { 708 if (likely(error >= -interval)) {
@@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset)
474 interval = -interval; 710 interval = -interval;
475 offset = -offset; 711 offset = -offset;
476 } else 712 } else
477 adj = clocksource_bigadjust(error, &interval, &offset); 713 adj = timekeeping_bigadjust(error, &interval, &offset);
478 } else 714 } else
479 return; 715 return;
480 716
481 clock->mult += adj; 717 timekeeper.mult += adj;
482 clock->xtime_interval += interval; 718 timekeeper.xtime_interval += interval;
483 clock->xtime_nsec -= offset; 719 timekeeper.xtime_nsec -= offset;
484 clock->error -= (interval - offset) << 720 timekeeper.ntp_error -= (interval - offset) <<
485 (NTP_SCALE_SHIFT - clock->shift); 721 timekeeper.ntp_error_shift;
486} 722}
487 723
488/** 724/**
@@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset)
492 */ 728 */
493void update_wall_time(void) 729void update_wall_time(void)
494{ 730{
731 struct clocksource *clock;
495 cycle_t offset; 732 cycle_t offset;
733 u64 nsecs;
496 734
497 /* Make sure we're fully resumed: */ 735 /* Make sure we're fully resumed: */
498 if (unlikely(timekeeping_suspended)) 736 if (unlikely(timekeeping_suspended))
499 return; 737 return;
500 738
739 clock = timekeeper.clock;
501#ifdef CONFIG_GENERIC_TIME 740#ifdef CONFIG_GENERIC_TIME
502 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; 741 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
503#else 742#else
504 offset = clock->cycle_interval; 743 offset = timekeeper.cycle_interval;
505#endif 744#endif
506 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; 745 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
507 746
508 /* normally this loop will run just once, however in the 747 /* normally this loop will run just once, however in the
509 * case of lost or late ticks, it will accumulate correctly. 748 * case of lost or late ticks, it will accumulate correctly.
510 */ 749 */
511 while (offset >= clock->cycle_interval) { 750 while (offset >= timekeeper.cycle_interval) {
751 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
752
512 /* accumulate one interval */ 753 /* accumulate one interval */
513 offset -= clock->cycle_interval; 754 offset -= timekeeper.cycle_interval;
514 clock->cycle_last += clock->cycle_interval; 755 clock->cycle_last += timekeeper.cycle_interval;
515 756
516 clock->xtime_nsec += clock->xtime_interval; 757 timekeeper.xtime_nsec += timekeeper.xtime_interval;
517 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 758 if (timekeeper.xtime_nsec >= nsecps) {
518 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 759 timekeeper.xtime_nsec -= nsecps;
519 xtime.tv_sec++; 760 xtime.tv_sec++;
520 second_overflow(); 761 second_overflow();
521 } 762 }
522 763
523 clock->raw_time.tv_nsec += clock->raw_interval; 764 raw_time.tv_nsec += timekeeper.raw_interval;
524 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { 765 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
525 clock->raw_time.tv_nsec -= NSEC_PER_SEC; 766 raw_time.tv_nsec -= NSEC_PER_SEC;
526 clock->raw_time.tv_sec++; 767 raw_time.tv_sec++;
527 } 768 }
528 769
529 /* accumulate error between NTP and clock interval */ 770 /* accumulate error between NTP and clock interval */
530 clock->error += tick_length; 771 timekeeper.ntp_error += tick_length;
531 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 772 timekeeper.ntp_error -= timekeeper.xtime_interval <<
773 timekeeper.ntp_error_shift;
532 } 774 }
533 775
534 /* correct the clock when NTP error is too big */ 776 /* correct the clock when NTP error is too big */
535 clocksource_adjust(offset); 777 timekeeping_adjust(offset);
536 778
537 /* 779 /*
538 * Since in the loop above, we accumulate any amount of time 780 * Since in the loop above, we accumulate any amount of time
539 * in xtime_nsec over a second into xtime.tv_sec, its possible for 781 * in xtime_nsec over a second into xtime.tv_sec, its possible for
540 * xtime_nsec to be fairly small after the loop. Further, if we're 782 * xtime_nsec to be fairly small after the loop. Further, if we're
541 * slightly speeding the clocksource up in clocksource_adjust(), 783 * slightly speeding the clocksource up in timekeeping_adjust(),
542 * its possible the required corrective factor to xtime_nsec could 784 * its possible the required corrective factor to xtime_nsec could
543 * cause it to underflow. 785 * cause it to underflow.
544 * 786 *
@@ -550,24 +792,25 @@ void update_wall_time(void)
550 * We'll correct this error next time through this function, when 792 * We'll correct this error next time through this function, when
551 * xtime_nsec is not as small. 793 * xtime_nsec is not as small.
552 */ 794 */
553 if (unlikely((s64)clock->xtime_nsec < 0)) { 795 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
554 s64 neg = -(s64)clock->xtime_nsec; 796 s64 neg = -(s64)timekeeper.xtime_nsec;
555 clock->xtime_nsec = 0; 797 timekeeper.xtime_nsec = 0;
556 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); 798 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
557 } 799 }
558 800
559 /* store full nanoseconds into xtime after rounding it up and 801 /* store full nanoseconds into xtime after rounding it up and
560 * add the remainder to the error difference. 802 * add the remainder to the error difference.
561 */ 803 */
562 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; 804 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
563 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 805 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
564 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); 806 timekeeper.ntp_error += timekeeper.xtime_nsec <<
807 timekeeper.ntp_error_shift;
565 808
566 update_xtime_cache(cyc2ns(clock, offset)); 809 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
810 update_xtime_cache(nsecs);
567 811
568 /* check to see if there is a new clocksource to use */ 812 /* check to see if there is a new clocksource to use */
569 change_clocksource(); 813 update_vsyscall(&xtime, timekeeper.clock);
570 update_vsyscall(&xtime, clock);
571} 814}
572 815
573/** 816/**
@@ -583,9 +826,12 @@ void update_wall_time(void)
583 */ 826 */
584void getboottime(struct timespec *ts) 827void getboottime(struct timespec *ts)
585{ 828{
586 set_normalized_timespec(ts, 829 struct timespec boottime = {
587 - (wall_to_monotonic.tv_sec + total_sleep_time), 830 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
588 - wall_to_monotonic.tv_nsec); 831 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
832 };
833
834 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
589} 835}
590 836
591/** 837/**
@@ -594,7 +840,7 @@ void getboottime(struct timespec *ts)
594 */ 840 */
595void monotonic_to_bootbased(struct timespec *ts) 841void monotonic_to_bootbased(struct timespec *ts)
596{ 842{
597 ts->tv_sec += total_sleep_time; 843 *ts = timespec_add_safe(*ts, total_sleep_time);
598} 844}
599 845
600unsigned long get_seconds(void) 846unsigned long get_seconds(void)
@@ -603,6 +849,10 @@ unsigned long get_seconds(void)
603} 849}
604EXPORT_SYMBOL(get_seconds); 850EXPORT_SYMBOL(get_seconds);
605 851
852struct timespec __current_kernel_time(void)
853{
854 return xtime_cache;
855}
606 856
607struct timespec current_kernel_time(void) 857struct timespec current_kernel_time(void)
608{ 858{
@@ -618,3 +868,20 @@ struct timespec current_kernel_time(void)
618 return now; 868 return now;
619} 869}
620EXPORT_SYMBOL(current_kernel_time); 870EXPORT_SYMBOL(current_kernel_time);
871
872struct timespec get_monotonic_coarse(void)
873{
874 struct timespec now, mono;
875 unsigned long seq;
876
877 do {
878 seq = read_seqbegin(&xtime_lock);
879
880 now = xtime_cache;
881 mono = wall_to_monotonic;
882 } while (read_seqretry(&xtime_lock, seq));
883
884 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
885 now.tv_nsec + mono.tv_nsec);
886 return now;
887}
diff --git a/kernel/timer.c b/kernel/timer.c
index a3d25f415019..bbb51074680e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -72,6 +72,7 @@ struct tvec_base {
72 spinlock_t lock; 72 spinlock_t lock;
73 struct timer_list *running_timer; 73 struct timer_list *running_timer;
74 unsigned long timer_jiffies; 74 unsigned long timer_jiffies;
75 unsigned long next_timer;
75 struct tvec_root tv1; 76 struct tvec_root tv1;
76 struct tvec tv2; 77 struct tvec tv2;
77 struct tvec tv3; 78 struct tvec tv3;
@@ -622,6 +623,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
622 623
623 if (timer_pending(timer)) { 624 if (timer_pending(timer)) {
624 detach_timer(timer, 0); 625 detach_timer(timer, 0);
626 if (timer->expires == base->next_timer &&
627 !tbase_get_deferrable(timer->base))
628 base->next_timer = base->timer_jiffies;
625 ret = 1; 629 ret = 1;
626 } else { 630 } else {
627 if (pending_only) 631 if (pending_only)
@@ -663,6 +667,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
663 } 667 }
664 668
665 timer->expires = expires; 669 timer->expires = expires;
670 if (time_before(timer->expires, base->next_timer) &&
671 !tbase_get_deferrable(timer->base))
672 base->next_timer = timer->expires;
666 internal_add_timer(base, timer); 673 internal_add_timer(base, timer);
667 674
668out_unlock: 675out_unlock:
@@ -781,6 +788,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
781 spin_lock_irqsave(&base->lock, flags); 788 spin_lock_irqsave(&base->lock, flags);
782 timer_set_base(timer, base); 789 timer_set_base(timer, base);
783 debug_timer_activate(timer); 790 debug_timer_activate(timer);
791 if (time_before(timer->expires, base->next_timer) &&
792 !tbase_get_deferrable(timer->base))
793 base->next_timer = timer->expires;
784 internal_add_timer(base, timer); 794 internal_add_timer(base, timer);
785 /* 795 /*
786 * Check whether the other CPU is idle and needs to be 796 * Check whether the other CPU is idle and needs to be
@@ -817,6 +827,9 @@ int del_timer(struct timer_list *timer)
817 base = lock_timer_base(timer, &flags); 827 base = lock_timer_base(timer, &flags);
818 if (timer_pending(timer)) { 828 if (timer_pending(timer)) {
819 detach_timer(timer, 1); 829 detach_timer(timer, 1);
830 if (timer->expires == base->next_timer &&
831 !tbase_get_deferrable(timer->base))
832 base->next_timer = base->timer_jiffies;
820 ret = 1; 833 ret = 1;
821 } 834 }
822 spin_unlock_irqrestore(&base->lock, flags); 835 spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +863,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
850 ret = 0; 863 ret = 0;
851 if (timer_pending(timer)) { 864 if (timer_pending(timer)) {
852 detach_timer(timer, 1); 865 detach_timer(timer, 1);
866 if (timer->expires == base->next_timer &&
867 !tbase_get_deferrable(timer->base))
868 base->next_timer = base->timer_jiffies;
853 ret = 1; 869 ret = 1;
854 } 870 }
855out: 871out:
@@ -1007,8 +1023,8 @@ static inline void __run_timers(struct tvec_base *base)
1007#ifdef CONFIG_NO_HZ 1023#ifdef CONFIG_NO_HZ
1008/* 1024/*
1009 * Find out when the next timer event is due to happen. This 1025 * Find out when the next timer event is due to happen. This
1010 * is used on S/390 to stop all activity when a cpus is idle. 1026 * is used on S/390 to stop all activity when a CPU is idle.
1011 * This functions needs to be called disabled. 1027 * This function needs to be called with interrupts disabled.
1012 */ 1028 */
1013static unsigned long __next_timer_interrupt(struct tvec_base *base) 1029static unsigned long __next_timer_interrupt(struct tvec_base *base)
1014{ 1030{
@@ -1134,7 +1150,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1134 unsigned long expires; 1150 unsigned long expires;
1135 1151
1136 spin_lock(&base->lock); 1152 spin_lock(&base->lock);
1137 expires = __next_timer_interrupt(base); 1153 if (time_before_eq(base->next_timer, base->timer_jiffies))
1154 base->next_timer = __next_timer_interrupt(base);
1155 expires = base->next_timer;
1138 spin_unlock(&base->lock); 1156 spin_unlock(&base->lock);
1139 1157
1140 if (time_before_eq(expires, now)) 1158 if (time_before_eq(expires, now))
@@ -1522,6 +1540,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1522 INIT_LIST_HEAD(base->tv1.vec + j); 1540 INIT_LIST_HEAD(base->tv1.vec + j);
1523 1541
1524 base->timer_jiffies = jiffies; 1542 base->timer_jiffies = jiffies;
1543 base->next_timer = base->timer_jiffies;
1525 return 0; 1544 return 0;
1526} 1545}
1527 1546
@@ -1534,6 +1553,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1534 timer = list_first_entry(head, struct timer_list, entry); 1553 timer = list_first_entry(head, struct timer_list, entry);
1535 detach_timer(timer, 0); 1554 detach_timer(timer, 0);
1536 timer_set_base(timer, new_base); 1555 timer_set_base(timer, new_base);
1556 if (time_before(timer->expires, new_base->next_timer) &&
1557 !tbase_get_deferrable(timer->base))
1558 new_base->next_timer = timer->expires;
1537 internal_add_timer(new_base, timer); 1559 internal_add_timer(new_base, timer);
1538 } 1560 }
1539} 1561}
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90a..26f03ac07c2b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o 44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_POWER_TRACER) += trace_power.o
46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
54obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 57
58libftrace-y := ftrace.o 58libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cc615f84751b..c71e91bf7372 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2414,11 +2414,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2414static void * 2414static void *
2415__g_next(struct seq_file *m, loff_t *pos) 2415__g_next(struct seq_file *m, loff_t *pos)
2416{ 2416{
2417 unsigned long *array = m->private;
2418
2419 if (*pos >= ftrace_graph_count) 2417 if (*pos >= ftrace_graph_count)
2420 return NULL; 2418 return NULL;
2421 return &array[*pos]; 2419 return &ftrace_graph_funcs[*pos];
2422} 2420}
2423 2421
2424static void * 2422static void *
@@ -2482,16 +2480,10 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2482 ftrace_graph_count = 0; 2480 ftrace_graph_count = 0;
2483 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2481 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2484 } 2482 }
2483 mutex_unlock(&graph_lock);
2485 2484
2486 if (file->f_mode & FMODE_READ) { 2485 if (file->f_mode & FMODE_READ)
2487 ret = seq_open(file, &ftrace_graph_seq_ops); 2486 ret = seq_open(file, &ftrace_graph_seq_ops);
2488 if (!ret) {
2489 struct seq_file *m = file->private_data;
2490 m->private = ftrace_graph_funcs;
2491 }
2492 } else
2493 file->private_data = ftrace_graph_funcs;
2494 mutex_unlock(&graph_lock);
2495 2487
2496 return ret; 2488 return ret;
2497} 2489}
@@ -2560,7 +2552,6 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2560 size_t cnt, loff_t *ppos) 2552 size_t cnt, loff_t *ppos)
2561{ 2553{
2562 struct trace_parser parser; 2554 struct trace_parser parser;
2563 unsigned long *array;
2564 size_t read = 0; 2555 size_t read = 0;
2565 ssize_t ret; 2556 ssize_t ret;
2566 2557
@@ -2574,12 +2565,6 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2574 goto out; 2565 goto out;
2575 } 2566 }
2576 2567
2577 if (file->f_mode & FMODE_READ) {
2578 struct seq_file *m = file->private_data;
2579 array = m->private;
2580 } else
2581 array = file->private_data;
2582
2583 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2568 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2584 ret = -ENOMEM; 2569 ret = -ENOMEM;
2585 goto out; 2570 goto out;
@@ -2591,7 +2576,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2591 parser.buffer[parser.idx] = 0; 2576 parser.buffer[parser.idx] = 0;
2592 2577
2593 /* we allow only one expression at a time */ 2578 /* we allow only one expression at a time */
2594 ret = ftrace_set_func(array, &ftrace_graph_count, 2579 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2595 parser.buffer); 2580 parser.buffer);
2596 if (ret) 2581 if (ret)
2597 goto out; 2582 goto out;
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000000..e06c6e3d56a3
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6eef38923b07..d4ff01970547 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -201,8 +201,6 @@ int tracing_is_on(void)
201} 201}
202EXPORT_SYMBOL_GPL(tracing_is_on); 202EXPORT_SYMBOL_GPL(tracing_is_on);
203 203
204#include "trace.h"
205
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 204#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 205#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 420232a1fbba..a35925d222ba 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -125,13 +125,13 @@ int ftrace_dump_on_oops;
125 125
126static int tracing_set_tracer(const char *buf); 126static int tracing_set_tracer(const char *buf);
127 127
128#define BOOTUP_TRACER_SIZE 100 128#define MAX_TRACER_SIZE 100
129static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
130static char *default_bootup_tracer; 130static char *default_bootup_tracer;
131 131
132static int __init set_ftrace(char *str) 132static int __init set_ftrace(char *str)
133{ 133{
134 strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
135 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
136 /* We are using ftrace early, expand it */ 136 /* We are using ftrace early, expand it */
137 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
@@ -242,13 +242,6 @@ static struct tracer *trace_types __read_mostly;
242static struct tracer *current_trace __read_mostly; 242static struct tracer *current_trace __read_mostly;
243 243
244/* 244/*
245 * max_tracer_type_len is used to simplify the allocating of
246 * buffers to read userspace tracer names. We keep track of
247 * the longest tracer name registered.
248 */
249static int max_tracer_type_len;
250
251/*
252 * trace_types_lock is used to protect the trace_types list. 245 * trace_types_lock is used to protect the trace_types list.
253 * This lock is also used to keep user access serialized. 246 * This lock is also used to keep user access serialized.
254 * Accesses from userspace will grab this lock while userspace 247 * Accesses from userspace will grab this lock while userspace
@@ -625,7 +618,6 @@ __releases(kernel_lock)
625__acquires(kernel_lock) 618__acquires(kernel_lock)
626{ 619{
627 struct tracer *t; 620 struct tracer *t;
628 int len;
629 int ret = 0; 621 int ret = 0;
630 622
631 if (!type->name) { 623 if (!type->name) {
@@ -633,6 +625,11 @@ __acquires(kernel_lock)
633 return -1; 625 return -1;
634 } 626 }
635 627
628 if (strlen(type->name) > MAX_TRACER_SIZE) {
629 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
630 return -1;
631 }
632
636 /* 633 /*
637 * When this gets called we hold the BKL which means that 634 * When this gets called we hold the BKL which means that
638 * preemption is disabled. Various trace selftests however 635 * preemption is disabled. Various trace selftests however
@@ -647,7 +644,7 @@ __acquires(kernel_lock)
647 for (t = trace_types; t; t = t->next) { 644 for (t = trace_types; t; t = t->next) {
648 if (strcmp(type->name, t->name) == 0) { 645 if (strcmp(type->name, t->name) == 0) {
649 /* already found */ 646 /* already found */
650 pr_info("Trace %s already registered\n", 647 pr_info("Tracer %s already registered\n",
651 type->name); 648 type->name);
652 ret = -1; 649 ret = -1;
653 goto out; 650 goto out;
@@ -698,9 +695,6 @@ __acquires(kernel_lock)
698 695
699 type->next = trace_types; 696 type->next = trace_types;
700 trace_types = type; 697 trace_types = type;
701 len = strlen(type->name);
702 if (len > max_tracer_type_len)
703 max_tracer_type_len = len;
704 698
705 out: 699 out:
706 tracing_selftest_running = false; 700 tracing_selftest_running = false;
@@ -709,7 +703,7 @@ __acquires(kernel_lock)
709 if (ret || !default_bootup_tracer) 703 if (ret || !default_bootup_tracer)
710 goto out_unlock; 704 goto out_unlock;
711 705
712 if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) 706 if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
713 goto out_unlock; 707 goto out_unlock;
714 708
715 printk(KERN_INFO "Starting tracer '%s'\n", type->name); 709 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -731,14 +725,13 @@ __acquires(kernel_lock)
731void unregister_tracer(struct tracer *type) 725void unregister_tracer(struct tracer *type)
732{ 726{
733 struct tracer **t; 727 struct tracer **t;
734 int len;
735 728
736 mutex_lock(&trace_types_lock); 729 mutex_lock(&trace_types_lock);
737 for (t = &trace_types; *t; t = &(*t)->next) { 730 for (t = &trace_types; *t; t = &(*t)->next) {
738 if (*t == type) 731 if (*t == type)
739 goto found; 732 goto found;
740 } 733 }
741 pr_info("Trace %s not registered\n", type->name); 734 pr_info("Tracer %s not registered\n", type->name);
742 goto out; 735 goto out;
743 736
744 found: 737 found:
@@ -751,17 +744,7 @@ void unregister_tracer(struct tracer *type)
751 current_trace->stop(&global_trace); 744 current_trace->stop(&global_trace);
752 current_trace = &nop_trace; 745 current_trace = &nop_trace;
753 } 746 }
754 747out:
755 if (strlen(type->name) != max_tracer_type_len)
756 goto out;
757
758 max_tracer_type_len = 0;
759 for (t = &trace_types; *t; t = &(*t)->next) {
760 len = strlen((*t)->name);
761 if (len > max_tracer_type_len)
762 max_tracer_type_len = len;
763 }
764 out:
765 mutex_unlock(&trace_types_lock); 748 mutex_unlock(&trace_types_lock);
766} 749}
767 750
@@ -2610,7 +2593,7 @@ static ssize_t
2610tracing_set_trace_read(struct file *filp, char __user *ubuf, 2593tracing_set_trace_read(struct file *filp, char __user *ubuf,
2611 size_t cnt, loff_t *ppos) 2594 size_t cnt, loff_t *ppos)
2612{ 2595{
2613 char buf[max_tracer_type_len+2]; 2596 char buf[MAX_TRACER_SIZE+2];
2614 int r; 2597 int r;
2615 2598
2616 mutex_lock(&trace_types_lock); 2599 mutex_lock(&trace_types_lock);
@@ -2760,15 +2743,15 @@ static ssize_t
2760tracing_set_trace_write(struct file *filp, const char __user *ubuf, 2743tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2761 size_t cnt, loff_t *ppos) 2744 size_t cnt, loff_t *ppos)
2762{ 2745{
2763 char buf[max_tracer_type_len+1]; 2746 char buf[MAX_TRACER_SIZE+1];
2764 int i; 2747 int i;
2765 size_t ret; 2748 size_t ret;
2766 int err; 2749 int err;
2767 2750
2768 ret = cnt; 2751 ret = cnt;
2769 2752
2770 if (cnt > max_tracer_type_len) 2753 if (cnt > MAX_TRACER_SIZE)
2771 cnt = max_tracer_type_len; 2754 cnt = MAX_TRACER_SIZE;
2772 2755
2773 if (copy_from_user(&buf, ubuf, cnt)) 2756 if (copy_from_user(&buf, ubuf, cnt))
2774 return -EFAULT; 2757 return -EFAULT;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 86bcff94791a..405cb850b75d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,7 +11,6 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h> 12#include <trace/boot.h>
13#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
14#include <trace/power.h>
15 14
16#include <linux/trace_seq.h> 15#include <linux/trace_seq.h>
17#include <linux/ftrace_event.h> 16#include <linux/ftrace_event.h>
@@ -37,7 +36,6 @@ enum trace_type {
37 TRACE_HW_BRANCHES, 36 TRACE_HW_BRANCHES,
38 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
39 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
40 TRACE_POWER,
41 TRACE_BLK, 39 TRACE_BLK,
42 40
43 __TRACE_LAST_TYPE, 41 __TRACE_LAST_TYPE,
@@ -207,7 +205,6 @@ extern void __ftrace_bad_type(void);
207 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 205 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
208 TRACE_GRAPH_RET); \ 206 TRACE_GRAPH_RET); \
209 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ 207 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
210 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
211 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 208 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
212 TRACE_KMEM_ALLOC); \ 209 TRACE_KMEM_ALLOC); \
213 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index a431748ddd6e..ead3d724599d 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -330,23 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry,
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to) 330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331); 331);
332 332
333FTRACE_ENTRY(power, trace_power,
334
335 TRACE_POWER,
336
337 F_STRUCT(
338 __field_struct( struct power_trace, state_data )
339 __field_desc( s64, state_data, stamp )
340 __field_desc( s64, state_data, end )
341 __field_desc( int, state_data, type )
342 __field_desc( int, state_data, state )
343 ),
344
345 F_printk("%llx->%llx type:%u state:%u",
346 __entry->stamp, __entry->end,
347 __entry->type, __entry->state)
348);
349
350FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, 333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
351 334
352 TRACE_KMEM_ALLOC, 335 TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 55a25c933d15..dd44b8768867 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,6 +8,57 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include "trace.h" 9#include "trace.h"
10 10
11/*
12 * We can't use a size but a type in alloc_percpu()
13 * So let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16
17char *trace_profile_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf);
19
20char *trace_profile_buf_nmi;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22
23/* Count the events in use (per event id, not per instance) */
24static int total_profile_count;
25
26static int ftrace_profile_enable_event(struct ftrace_event_call *event)
27{
28 char *buf;
29 int ret = -ENOMEM;
30
31 if (atomic_inc_return(&event->profile_count))
32 return 0;
33
34 if (!total_profile_count++) {
35 buf = (char *)alloc_percpu(profile_buf_t);
36 if (!buf)
37 goto fail_buf;
38
39 rcu_assign_pointer(trace_profile_buf, buf);
40
41 buf = (char *)alloc_percpu(profile_buf_t);
42 if (!buf)
43 goto fail_buf_nmi;
44
45 rcu_assign_pointer(trace_profile_buf_nmi, buf);
46 }
47
48 ret = event->profile_enable();
49 if (!ret)
50 return 0;
51
52 kfree(trace_profile_buf_nmi);
53fail_buf_nmi:
54 kfree(trace_profile_buf);
55fail_buf:
56 total_profile_count--;
57 atomic_dec(&event->profile_count);
58
59 return ret;
60}
61
11int ftrace_profile_enable(int event_id) 62int ftrace_profile_enable(int event_id)
12{ 63{
13 struct ftrace_event_call *event; 64 struct ftrace_event_call *event;
@@ -17,7 +68,7 @@ int ftrace_profile_enable(int event_id)
17 list_for_each_entry(event, &ftrace_events, list) { 68 list_for_each_entry(event, &ftrace_events, list) {
18 if (event->id == event_id && event->profile_enable && 69 if (event->id == event_id && event->profile_enable &&
19 try_module_get(event->mod)) { 70 try_module_get(event->mod)) {
20 ret = event->profile_enable(event); 71 ret = ftrace_profile_enable_event(event);
21 break; 72 break;
22 } 73 }
23 } 74 }
@@ -26,6 +77,33 @@ int ftrace_profile_enable(int event_id)
26 return ret; 77 return ret;
27} 78}
28 79
80static void ftrace_profile_disable_event(struct ftrace_event_call *event)
81{
82 char *buf, *nmi_buf;
83
84 if (!atomic_add_negative(-1, &event->profile_count))
85 return;
86
87 event->profile_disable();
88
89 if (!--total_profile_count) {
90 buf = trace_profile_buf;
91 rcu_assign_pointer(trace_profile_buf, NULL);
92
93 nmi_buf = trace_profile_buf_nmi;
94 rcu_assign_pointer(trace_profile_buf_nmi, NULL);
95
96 /*
97 * Ensure every events in profiling have finished before
98 * releasing the buffers
99 */
100 synchronize_sched();
101
102 free_percpu(buf);
103 free_percpu(nmi_buf);
104 }
105}
106
29void ftrace_profile_disable(int event_id) 107void ftrace_profile_disable(int event_id)
30{ 108{
31 struct ftrace_event_call *event; 109 struct ftrace_event_call *event;
@@ -33,7 +111,7 @@ void ftrace_profile_disable(int event_id)
33 mutex_lock(&event_mutex); 111 mutex_lock(&event_mutex);
34 list_for_each_entry(event, &ftrace_events, list) { 112 list_for_each_entry(event, &ftrace_events, list) {
35 if (event->id == event_id) { 113 if (event->id == event_id) {
36 event->profile_disable(event); 114 ftrace_profile_disable_event(event);
37 module_put(event->mod); 115 module_put(event->mod);
38 break; 116 break;
39 } 117 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 56c260b83a9c..6f03c8a1105e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -271,42 +271,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
271static void * 271static void *
272t_next(struct seq_file *m, void *v, loff_t *pos) 272t_next(struct seq_file *m, void *v, loff_t *pos)
273{ 273{
274 struct list_head *list = m->private; 274 struct ftrace_event_call *call = v;
275 struct ftrace_event_call *call;
276 275
277 (*pos)++; 276 (*pos)++;
278 277
279 for (;;) { 278 list_for_each_entry_continue(call, &ftrace_events, list) {
280 if (list == &ftrace_events)
281 return NULL;
282
283 call = list_entry(list, struct ftrace_event_call, list);
284
285 /* 279 /*
286 * The ftrace subsystem is for showing formats only. 280 * The ftrace subsystem is for showing formats only.
287 * They can not be enabled or disabled via the event files. 281 * They can not be enabled or disabled via the event files.
288 */ 282 */
289 if (call->regfunc) 283 if (call->regfunc)
290 break; 284 return call;
291
292 list = list->next;
293 } 285 }
294 286
295 m->private = list->next; 287 return NULL;
296
297 return call;
298} 288}
299 289
300static void *t_start(struct seq_file *m, loff_t *pos) 290static void *t_start(struct seq_file *m, loff_t *pos)
301{ 291{
302 struct ftrace_event_call *call = NULL; 292 struct ftrace_event_call *call;
303 loff_t l; 293 loff_t l;
304 294
305 mutex_lock(&event_mutex); 295 mutex_lock(&event_mutex);
306 296
307 m->private = ftrace_events.next; 297 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
308 for (l = 0; l <= *pos; ) { 298 for (l = 0; l <= *pos; ) {
309 call = t_next(m, NULL, &l); 299 call = t_next(m, call, &l);
310 if (!call) 300 if (!call)
311 break; 301 break;
312 } 302 }
@@ -316,37 +306,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
316static void * 306static void *
317s_next(struct seq_file *m, void *v, loff_t *pos) 307s_next(struct seq_file *m, void *v, loff_t *pos)
318{ 308{
319 struct list_head *list = m->private; 309 struct ftrace_event_call *call = v;
320 struct ftrace_event_call *call;
321 310
322 (*pos)++; 311 (*pos)++;
323 312
324 retry: 313 list_for_each_entry_continue(call, &ftrace_events, list) {
325 if (list == &ftrace_events) 314 if (call->enabled)
326 return NULL; 315 return call;
327
328 call = list_entry(list, struct ftrace_event_call, list);
329
330 if (!call->enabled) {
331 list = list->next;
332 goto retry;
333 } 316 }
334 317
335 m->private = list->next; 318 return NULL;
336
337 return call;
338} 319}
339 320
340static void *s_start(struct seq_file *m, loff_t *pos) 321static void *s_start(struct seq_file *m, loff_t *pos)
341{ 322{
342 struct ftrace_event_call *call = NULL; 323 struct ftrace_event_call *call;
343 loff_t l; 324 loff_t l;
344 325
345 mutex_lock(&event_mutex); 326 mutex_lock(&event_mutex);
346 327
347 m->private = ftrace_events.next; 328 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
348 for (l = 0; l <= *pos; ) { 329 for (l = 0; l <= *pos; ) {
349 call = s_next(m, NULL, &l); 330 call = s_next(m, call, &l);
350 if (!call) 331 if (!call)
351 break; 332 break;
352 } 333 }
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index fe1a00f1445a..000000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,218 +0,0 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <trace/power.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19#include "trace_output.h"
20
21static struct trace_array *power_trace;
22static int __read_mostly trace_power_enabled;
23
24static void probe_power_start(struct power_trace *it, unsigned int type,
25 unsigned int level)
26{
27 if (!trace_power_enabled)
28 return;
29
30 memset(it, 0, sizeof(struct power_trace));
31 it->state = level;
32 it->type = type;
33 it->stamp = ktime_get();
34}
35
36
37static void probe_power_end(struct power_trace *it)
38{
39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event;
41 struct ring_buffer *buffer;
42 struct trace_power *entry;
43 struct trace_array_cpu *data;
44 struct trace_array *tr = power_trace;
45
46 if (!trace_power_enabled)
47 return;
48
49 buffer = tr->buffer;
50
51 preempt_disable();
52 it->end = ktime_get();
53 data = tr->data[smp_processor_id()];
54
55 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
56 sizeof(*entry), 0, 0);
57 if (!event)
58 goto out;
59 entry = ring_buffer_event_data(event);
60 entry->state_data = *it;
61 if (!filter_check_discard(call, entry, buffer, event))
62 trace_buffer_unlock_commit(buffer, event, 0, 0);
63 out:
64 preempt_enable();
65}
66
67static void probe_power_mark(struct power_trace *it, unsigned int type,
68 unsigned int level)
69{
70 struct ftrace_event_call *call = &event_power;
71 struct ring_buffer_event *event;
72 struct ring_buffer *buffer;
73 struct trace_power *entry;
74 struct trace_array_cpu *data;
75 struct trace_array *tr = power_trace;
76
77 if (!trace_power_enabled)
78 return;
79
80 buffer = tr->buffer;
81
82 memset(it, 0, sizeof(struct power_trace));
83 it->state = level;
84 it->type = type;
85 it->stamp = ktime_get();
86 preempt_disable();
87 it->end = it->stamp;
88 data = tr->data[smp_processor_id()];
89
90 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
91 sizeof(*entry), 0, 0);
92 if (!event)
93 goto out;
94 entry = ring_buffer_event_data(event);
95 entry->state_data = *it;
96 if (!filter_check_discard(call, entry, buffer, event))
97 trace_buffer_unlock_commit(buffer, event, 0, 0);
98 out:
99 preempt_enable();
100}
101
102static int tracing_power_register(void)
103{
104 int ret;
105
106 ret = register_trace_power_start(probe_power_start);
107 if (ret) {
108 pr_info("power trace: Couldn't activate tracepoint"
109 " probe to trace_power_start\n");
110 return ret;
111 }
112 ret = register_trace_power_end(probe_power_end);
113 if (ret) {
114 pr_info("power trace: Couldn't activate tracepoint"
115 " probe to trace_power_end\n");
116 goto fail_start;
117 }
118 ret = register_trace_power_mark(probe_power_mark);
119 if (ret) {
120 pr_info("power trace: Couldn't activate tracepoint"
121 " probe to trace_power_mark\n");
122 goto fail_end;
123 }
124 return ret;
125fail_end:
126 unregister_trace_power_end(probe_power_end);
127fail_start:
128 unregister_trace_power_start(probe_power_start);
129 return ret;
130}
131
132static void start_power_trace(struct trace_array *tr)
133{
134 trace_power_enabled = 1;
135}
136
137static void stop_power_trace(struct trace_array *tr)
138{
139 trace_power_enabled = 0;
140}
141
142static void power_trace_reset(struct trace_array *tr)
143{
144 trace_power_enabled = 0;
145 unregister_trace_power_start(probe_power_start);
146 unregister_trace_power_end(probe_power_end);
147 unregister_trace_power_mark(probe_power_mark);
148}
149
150
151static int power_trace_init(struct trace_array *tr)
152{
153 power_trace = tr;
154
155 trace_power_enabled = 1;
156 tracing_power_register();
157
158 tracing_reset_online_cpus(tr);
159 return 0;
160}
161
162static enum print_line_t power_print_line(struct trace_iterator *iter)
163{
164 int ret = 0;
165 struct trace_entry *entry = iter->ent;
166 struct trace_power *field ;
167 struct power_trace *it;
168 struct trace_seq *s = &iter->seq;
169 struct timespec stamp;
170 struct timespec duration;
171
172 trace_assign_type(field, entry);
173 it = &field->state_data;
174 stamp = ktime_to_timespec(it->stamp);
175 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
176
177 if (entry->type == TRACE_POWER) {
178 if (it->type == POWER_CSTATE)
179 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
180 stamp.tv_sec,
181 stamp.tv_nsec,
182 it->state, iter->cpu,
183 duration.tv_sec,
184 duration.tv_nsec);
185 if (it->type == POWER_PSTATE)
186 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
187 stamp.tv_sec,
188 stamp.tv_nsec,
189 it->state, iter->cpu);
190 if (!ret)
191 return TRACE_TYPE_PARTIAL_LINE;
192 return TRACE_TYPE_HANDLED;
193 }
194 return TRACE_TYPE_UNHANDLED;
195}
196
197static void power_print_header(struct seq_file *s)
198{
199 seq_puts(s, "# TIMESTAMP STATE EVENT\n");
200 seq_puts(s, "# | | |\n");
201}
202
203static struct tracer power_tracer __read_mostly =
204{
205 .name = "power",
206 .init = power_trace_init,
207 .start = start_power_trace,
208 .stop = stop_power_trace,
209 .reset = power_trace_reset,
210 .print_line = power_print_line,
211 .print_header = power_print_header,
212};
213
214static int init_power_trace(void)
215{
216 return register_tracer(&power_tracer);
217}
218device_initcall(init_power_trace);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 687699d365ae..2547d8813cf0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/marker.h>
15#include <linux/mutex.h> 14#include <linux/mutex.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/list.h> 16#include <linux/list.h>
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3c6a0e..7a3550cf2597 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
384 384
385static void prof_syscall_enter(struct pt_regs *regs, long id) 385static void prof_syscall_enter(struct pt_regs *regs, long id)
386{ 386{
387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data; 387 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec;
389 unsigned long flags;
390 char *raw_data;
389 int syscall_nr; 391 int syscall_nr;
390 int size; 392 int size;
393 int cpu;
391 394
392 syscall_nr = syscall_get_nr(current, regs); 395 syscall_nr = syscall_get_nr(current, regs);
393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 396 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
402 size = ALIGN(size + sizeof(u32), sizeof(u64)); 405 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32); 406 size -= sizeof(u32);
404 407
405 do { 408 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
406 char raw_data[size]; 409 "profile buffer not large enough"))
410 return;
411
412 /* Protect the per cpu buffer, begin the rcu read side */
413 local_irq_save(flags);
407 414
408 /* zero the dead bytes from align to not leak stack to user */ 415 cpu = smp_processor_id();
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 416
417 if (in_nmi())
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421
422 if (!raw_data)
423 goto end;
410 424
411 rec = (struct syscall_trace_enter *) raw_data; 425 raw_data = per_cpu_ptr(raw_data, cpu);
412 tracing_generic_entry_update(&rec->ent, 0, 0); 426
413 rec->ent.type = sys_data->enter_id; 427 /* zero the dead bytes from align to not leak stack to user */
414 rec->nr = syscall_nr; 428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 429
416 (unsigned long *)&rec->args); 430 rec = (struct syscall_trace_enter *) raw_data;
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); 431 tracing_generic_entry_update(&rec->ent, 0, 0);
418 } while(0); 432 rec->ent.type = sys_data->enter_id;
433 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args);
436 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
437
438end:
439 local_irq_restore(flags);
419} 440}
420 441
421int reg_prof_syscall_enter(char *name) 442int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
460static void prof_syscall_exit(struct pt_regs *regs, long ret) 481static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{ 482{
462 struct syscall_metadata *sys_data; 483 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec; 484 struct syscall_trace_exit *rec;
485 unsigned long flags;
464 int syscall_nr; 486 int syscall_nr;
487 char *raw_data;
488 int size;
489 int cpu;
465 490
466 syscall_nr = syscall_get_nr(current, regs); 491 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 492 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
471 if (!sys_data) 496 if (!sys_data)
472 return; 497 return;
473 498
474 tracing_generic_entry_update(&rec.ent, 0, 0); 499 /* We can probably do that at build time */
475 rec.ent.type = sys_data->exit_id; 500 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
476 rec.nr = syscall_nr; 501 size -= sizeof(u32);
477 rec.ret = syscall_get_return_value(current, regs);
478 502
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); 503 /*
504 * Impossible, but be paranoid with the future
505 * How to put this check outside runtime?
506 */
507 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
508 "exit event has grown above profile buffer size"))
509 return;
510
511 /* Protect the per cpu buffer, begin the rcu read side */
512 local_irq_save(flags);
513 cpu = smp_processor_id();
514
515 if (in_nmi())
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519
520 if (!raw_data)
521 goto end;
522
523 raw_data = per_cpu_ptr(raw_data, cpu);
524
525 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
527
528 rec = (struct syscall_trace_exit *)raw_data;
529
530 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id;
532 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs);
534
535 perf_tpcounter_event(sys_data->exit_id, 0, 1, rec, size);
536
537end:
538 local_irq_restore(flags);
480} 539}
481 540
482int reg_prof_syscall_exit(char *name) 541int reg_prof_syscall_exit(char *name)