path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 216
-rw-r--r--  kernel/auditfilter.c | 9
-rw-r--r--  kernel/auditsc.c | 40
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/hrtimer.c | 824
-rw-r--r--  kernel/irq/chip.c | 25
-rw-r--r--  kernel/irq/manage.c | 44
-rw-r--r--  kernel/irq/proc.c | 24
-rw-r--r--  kernel/itimer.c | 18
-rw-r--r--  kernel/kmod.c | 164
-rw-r--r--  kernel/kprobes.c | 113
-rw-r--r--  kernel/lockdep.c | 4
-rw-r--r--  kernel/module.c | 47
-rw-r--r--  kernel/params.c | 29
-rw-r--r--  kernel/posix-cpu-timers.c | 15
-rw-r--r--  kernel/posix-timers.c | 15
-rw-r--r--  kernel/printk.c | 2
-rw-r--r--  kernel/rtmutex.c | 2
-rw-r--r--  kernel/sched.c | 7
-rw-r--r--  kernel/signal.c | 58
-rw-r--r--  kernel/softirq.c | 19
-rw-r--r--  kernel/time.c | 254
-rw-r--r--  kernel/time/Kconfig | 25
-rw-r--r--  kernel/time/Makefile | 9
-rw-r--r--  kernel/time/clockevents.c | 345
-rw-r--r--  kernel/time/clocksource.c | 246
-rw-r--r--  kernel/time/jiffies.c | 1
-rw-r--r--  kernel/time/ntp.c | 30
-rw-r--r--  kernel/time/tick-broadcast.c | 480
-rw-r--r--  kernel/time/tick-common.c | 346
-rw-r--r--  kernel/time/tick-internal.h | 110
-rw-r--r--  kernel/time/tick-oneshot.c | 84
-rw-r--r--  kernel/time/tick-sched.c | 565
-rw-r--r--  kernel/time/timer_list.c | 287
-rw-r--r--  kernel/time/timer_stats.c | 411
-rw-r--r--  kernel/timer.c | 290
-rw-r--r--  kernel/tsacct.c | 2
-rw-r--r--  kernel/workqueue.c | 7
39 files changed, 4658 insertions, 513 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index d9b690ac684b..76c9a11b72d6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2,7 +2,7 @@
2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. 2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
3 * System-call specific features have moved to auditsc.c 3 * System-call specific features have moved to auditsc.c
4 * 4 *
5 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 5 * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
6 * All Rights Reserved. 6 * All Rights Reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
@@ -65,7 +65,9 @@
65 * (Initialization happens after skb_init is called.) */ 65 * (Initialization happens after skb_init is called.) */
66static int audit_initialized; 66static int audit_initialized;
67 67
68/* No syscall auditing will take place unless audit_enabled != 0. */ 68/* 0 - no auditing
69 * 1 - auditing enabled
70 * 2 - auditing enabled and configuration is locked/unchangeable. */
69int audit_enabled; 71int audit_enabled;
70 72
71/* Default state when kernel boots without any parameters. */ 73/* Default state when kernel boots without any parameters. */
@@ -239,102 +241,150 @@ void audit_log_lost(const char *message)
239 241
240static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) 242static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
241{ 243{
242 int old = audit_rate_limit; 244 int res, rc = 0, old = audit_rate_limit;
245
246 /* check if we are locked */
247 if (audit_enabled == 2)
248 res = 0;
249 else
250 res = 1;
243 251
244 if (sid) { 252 if (sid) {
245 char *ctx = NULL; 253 char *ctx = NULL;
246 u32 len; 254 u32 len;
247 int rc; 255 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
248 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
249 return rc;
250 else
251 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 256 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
252 "audit_rate_limit=%d old=%d by auid=%u subj=%s", 257 "audit_rate_limit=%d old=%d by auid=%u"
253 limit, old, loginuid, ctx); 258 " subj=%s res=%d",
254 kfree(ctx); 259 limit, old, loginuid, ctx, res);
255 } else 260 kfree(ctx);
256 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 261 } else
257 "audit_rate_limit=%d old=%d by auid=%u", 262 res = 0; /* Something weird, deny request */
258 limit, old, loginuid); 263 }
259 audit_rate_limit = limit; 264 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
260 return 0; 265 "audit_rate_limit=%d old=%d by auid=%u res=%d",
266 limit, old, loginuid, res);
267
268 /* If we are allowed, make the change */
269 if (res == 1)
270 audit_rate_limit = limit;
271 /* Not allowed, update reason */
272 else if (rc == 0)
273 rc = -EPERM;
274 return rc;
261} 275}
262 276
263static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) 277static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
264{ 278{
265 int old = audit_backlog_limit; 279 int res, rc = 0, old = audit_backlog_limit;
280
281 /* check if we are locked */
282 if (audit_enabled == 2)
283 res = 0;
284 else
285 res = 1;
266 286
267 if (sid) { 287 if (sid) {
268 char *ctx = NULL; 288 char *ctx = NULL;
269 u32 len; 289 u32 len;
270 int rc; 290 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
271 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
272 return rc;
273 else
274 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 291 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
275 "audit_backlog_limit=%d old=%d by auid=%u subj=%s", 292 "audit_backlog_limit=%d old=%d by auid=%u"
276 limit, old, loginuid, ctx); 293 " subj=%s res=%d",
277 kfree(ctx); 294 limit, old, loginuid, ctx, res);
278 } else 295 kfree(ctx);
279 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 296 } else
280 "audit_backlog_limit=%d old=%d by auid=%u", 297 res = 0; /* Something weird, deny request */
281 limit, old, loginuid); 298 }
282 audit_backlog_limit = limit; 299 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
283 return 0; 300 "audit_backlog_limit=%d old=%d by auid=%u res=%d",
301 limit, old, loginuid, res);
302
303 /* If we are allowed, make the change */
304 if (res == 1)
305 audit_backlog_limit = limit;
306 /* Not allowed, update reason */
307 else if (rc == 0)
308 rc = -EPERM;
309 return rc;
284} 310}
285 311
286static int audit_set_enabled(int state, uid_t loginuid, u32 sid) 312static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
287{ 313{
288 int old = audit_enabled; 314 int res, rc = 0, old = audit_enabled;
289 315
290 if (state != 0 && state != 1) 316 if (state < 0 || state > 2)
291 return -EINVAL; 317 return -EINVAL;
292 318
319 /* check if we are locked */
320 if (audit_enabled == 2)
321 res = 0;
322 else
323 res = 1;
324
293 if (sid) { 325 if (sid) {
294 char *ctx = NULL; 326 char *ctx = NULL;
295 u32 len; 327 u32 len;
296 int rc; 328 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
297 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
298 return rc;
299 else
300 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 329 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
301 "audit_enabled=%d old=%d by auid=%u subj=%s", 330 "audit_enabled=%d old=%d by auid=%u"
302 state, old, loginuid, ctx); 331 " subj=%s res=%d",
303 kfree(ctx); 332 state, old, loginuid, ctx, res);
304 } else 333 kfree(ctx);
305 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 334 } else
306 "audit_enabled=%d old=%d by auid=%u", 335 res = 0; /* Something weird, deny request */
307 state, old, loginuid); 336 }
308 audit_enabled = state; 337 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
309 return 0; 338 "audit_enabled=%d old=%d by auid=%u res=%d",
339 state, old, loginuid, res);
340
341 /* If we are allowed, make the change */
342 if (res == 1)
343 audit_enabled = state;
344 /* Not allowed, update reason */
345 else if (rc == 0)
346 rc = -EPERM;
347 return rc;
310} 348}
311 349
312static int audit_set_failure(int state, uid_t loginuid, u32 sid) 350static int audit_set_failure(int state, uid_t loginuid, u32 sid)
313{ 351{
314 int old = audit_failure; 352 int res, rc = 0, old = audit_failure;
315 353
316 if (state != AUDIT_FAIL_SILENT 354 if (state != AUDIT_FAIL_SILENT
317 && state != AUDIT_FAIL_PRINTK 355 && state != AUDIT_FAIL_PRINTK
318 && state != AUDIT_FAIL_PANIC) 356 && state != AUDIT_FAIL_PANIC)
319 return -EINVAL; 357 return -EINVAL;
320 358
359 /* check if we are locked */
360 if (audit_enabled == 2)
361 res = 0;
362 else
363 res = 1;
364
321 if (sid) { 365 if (sid) {
322 char *ctx = NULL; 366 char *ctx = NULL;
323 u32 len; 367 u32 len;
324 int rc; 368 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
325 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
326 return rc;
327 else
328 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 369 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
329 "audit_failure=%d old=%d by auid=%u subj=%s", 370 "audit_failure=%d old=%d by auid=%u"
330 state, old, loginuid, ctx); 371 " subj=%s res=%d",
331 kfree(ctx); 372 state, old, loginuid, ctx, res);
332 } else 373 kfree(ctx);
333 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 374 } else
334 "audit_failure=%d old=%d by auid=%u", 375 res = 0; /* Something weird, deny request */
335 state, old, loginuid); 376 }
336 audit_failure = state; 377 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
337 return 0; 378 "audit_failure=%d old=%d by auid=%u res=%d",
379 state, old, loginuid, res);
380
381 /* If we are allowed, make the change */
382 if (res == 1)
383 audit_failure = state;
384 /* Not allowed, update reason */
385 else if (rc == 0)
386 rc = -EPERM;
387 return rc;
338} 388}
339 389
340static int kauditd_thread(void *dummy) 390static int kauditd_thread(void *dummy)
@@ -599,6 +649,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
599 case AUDIT_DEL: 649 case AUDIT_DEL:
600 if (nlmsg_len(nlh) < sizeof(struct audit_rule)) 650 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
601 return -EINVAL; 651 return -EINVAL;
652 if (audit_enabled == 2) {
653 ab = audit_log_start(NULL, GFP_KERNEL,
654 AUDIT_CONFIG_CHANGE);
655 if (ab) {
656 audit_log_format(ab,
657 "pid=%d uid=%u auid=%u",
658 pid, uid, loginuid);
659 if (sid) {
660 if (selinux_sid_to_string(
661 sid, &ctx, &len)) {
662 audit_log_format(ab,
663 " ssid=%u", sid);
664 /* Maybe call audit_panic? */
665 } else
666 audit_log_format(ab,
667 " subj=%s", ctx);
668 kfree(ctx);
669 }
670 audit_log_format(ab, " audit_enabled=%d res=0",
671 audit_enabled);
672 audit_log_end(ab);
673 }
674 return -EPERM;
675 }
602 /* fallthrough */ 676 /* fallthrough */
603 case AUDIT_LIST: 677 case AUDIT_LIST:
604 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 678 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
@@ -609,6 +683,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
609 case AUDIT_DEL_RULE: 683 case AUDIT_DEL_RULE:
610 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) 684 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
611 return -EINVAL; 685 return -EINVAL;
686 if (audit_enabled == 2) {
687 ab = audit_log_start(NULL, GFP_KERNEL,
688 AUDIT_CONFIG_CHANGE);
689 if (ab) {
690 audit_log_format(ab,
691 "pid=%d uid=%u auid=%u",
692 pid, uid, loginuid);
693 if (sid) {
694 if (selinux_sid_to_string(
695 sid, &ctx, &len)) {
696 audit_log_format(ab,
697 " ssid=%u", sid);
698 /* Maybe call audit_panic? */
699 } else
700 audit_log_format(ab,
701 " subj=%s", ctx);
702 kfree(ctx);
703 }
704 audit_log_format(ab, " audit_enabled=%d res=0",
705 audit_enabled);
706 audit_log_end(ab);
707 }
708 return -EPERM;
709 }
612 /* fallthrough */ 710 /* fallthrough */
613 case AUDIT_LIST_RULES: 711 case AUDIT_LIST_RULES:
614 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 712 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
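
The audit.c hunks above introduce an "immutable" audit mode: audit_enabled == 2 locks the configuration, every set-request is logged with a res= outcome, and requests against a locked configuration fail with -EPERM. The following is a condensed, hypothetical sketch of that pattern, not the kernel code itself; audit_set_example() and its parameter pointer are illustrative only:

static int audit_set_example(int *param, int new_value)
{
        /* res records whether the request is permitted; 2 == config locked */
        int res = (audit_enabled == 2) ? 0 : 1;

        /* the real setters emit an AUDIT_CONFIG_CHANGE record here, carrying
         * the old and new value, auid, optional subj= and the res= outcome */

        if (res == 1) {
                *param = new_value;     /* allowed: apply the change */
                return 0;
        }
        return -EPERM;                  /* locked: refuse the request */
}
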
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 87865f8b4ce3..3749193aed8c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -937,9 +937,10 @@ static void audit_update_watch(struct audit_parent *parent,
937 } 937 }
938 938
939 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 939 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
940 audit_log_format(ab, "audit updated rules specifying path="); 940 audit_log_format(ab, "op=updated rules specifying path=");
941 audit_log_untrustedstring(ab, owatch->path); 941 audit_log_untrustedstring(ab, owatch->path);
942 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); 942 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
943 audit_log_format(ab, " list=%d res=1", r->listnr);
943 audit_log_end(ab); 944 audit_log_end(ab);
944 945
945 audit_remove_watch(owatch); 946 audit_remove_watch(owatch);
@@ -969,14 +970,14 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
969 e = container_of(r, struct audit_entry, rule); 970 e = container_of(r, struct audit_entry, rule);
970 971
971 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 972 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
972 audit_log_format(ab, "audit implicitly removed rule path="); 973 audit_log_format(ab, "op=remove rule path=");
973 audit_log_untrustedstring(ab, w->path); 974 audit_log_untrustedstring(ab, w->path);
974 if (r->filterkey) { 975 if (r->filterkey) {
975 audit_log_format(ab, " key="); 976 audit_log_format(ab, " key=");
976 audit_log_untrustedstring(ab, r->filterkey); 977 audit_log_untrustedstring(ab, r->filterkey);
977 } else 978 } else
978 audit_log_format(ab, " key=(null)"); 979 audit_log_format(ab, " key=(null)");
979 audit_log_format(ab, " list=%d", r->listnr); 980 audit_log_format(ab, " list=%d res=1", r->listnr);
980 audit_log_end(ab); 981 audit_log_end(ab);
981 982
982 list_del(&r->rlist); 983 list_del(&r->rlist);
@@ -1410,7 +1411,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1410 audit_log_format(ab, " subj=%s", ctx); 1411 audit_log_format(ab, " subj=%s", ctx);
1411 kfree(ctx); 1412 kfree(ctx);
1412 } 1413 }
1413 audit_log_format(ab, " %s rule key=", action); 1414 audit_log_format(ab, " op=%s rule key=", action);
1414 if (rule->filterkey) 1415 if (rule->filterkey)
1415 audit_log_untrustedstring(ab, rule->filterkey); 1416 audit_log_untrustedstring(ab, rule->filterkey);
1416 else 1417 else
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 298897559ca4..359955800dd2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -170,6 +170,11 @@ struct audit_aux_data_sockaddr {
170 char a[0]; 170 char a[0];
171}; 171};
172 172
173struct audit_aux_data_fd_pair {
174 struct audit_aux_data d;
175 int fd[2];
176};
177
173struct audit_aux_data_path { 178struct audit_aux_data_path {
174 struct audit_aux_data d; 179 struct audit_aux_data d;
175 struct dentry *dentry; 180 struct dentry *dentry;
@@ -961,6 +966,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
961 audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); 966 audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
962 break; } 967 break; }
963 968
969 case AUDIT_FD_PAIR: {
970 struct audit_aux_data_fd_pair *axs = (void *)aux;
971 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
972 break; }
973
964 } 974 }
965 audit_log_end(ab); 975 audit_log_end(ab);
966 } 976 }
@@ -1815,6 +1825,36 @@ int audit_socketcall(int nargs, unsigned long *args)
1815} 1825}
1816 1826
1817/** 1827/**
1828 * __audit_fd_pair - record audit data for pipe and socketpair
1829 * @fd1: the first file descriptor
1830 * @fd2: the second file descriptor
1831 *
1832 * Returns 0 for success or NULL context or < 0 on error.
1833 */
1834int __audit_fd_pair(int fd1, int fd2)
1835{
1836 struct audit_context *context = current->audit_context;
1837 struct audit_aux_data_fd_pair *ax;
1838
1839 if (likely(!context)) {
1840 return 0;
1841 }
1842
1843 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
1844 if (!ax) {
1845 return -ENOMEM;
1846 }
1847
1848 ax->fd[0] = fd1;
1849 ax->fd[1] = fd2;
1850
1851 ax->d.type = AUDIT_FD_PAIR;
1852 ax->d.next = context->aux;
1853 context->aux = (void *)ax;
1854 return 0;
1855}
1856
1857/**
1818 * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto 1858 * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
1819 * @len: data length in user space 1859 * @len: data length in user space
1820 * @a: data address in kernel space 1860 * @a: data address in kernel space
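
The new AUDIT_FD_PAIR aux record above is filled in by __audit_fd_pair(). A hedged sketch of how a descriptor-pair syscall path might use it; the wrapper name and surrounding flow are illustrative and not taken from this patch:

/* Illustrative only: after a pipe()/socketpair()-style call has allocated
 * both descriptors, record them in the current task's audit context. */
static long example_report_fd_pair(int fd[2])
{
        int err = __audit_fd_pair(fd[0], fd[1]);

        if (err)        /* 0 on success or when there is no audit context */
                return err;
        return 0;
}
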
diff --git a/kernel/fork.c b/kernel/fork.c
index 0b6293d94d96..d154cc786489 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
858 init_sigpending(&sig->shared_pending); 858 init_sigpending(&sig->shared_pending);
859 INIT_LIST_HEAD(&sig->posix_timers); 859 INIT_LIST_HEAD(&sig->posix_timers);
860 860
861 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); 861 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
862 sig->it_real_incr.tv64 = 0; 862 sig->it_real_incr.tv64 = 0;
863 sig->real_timer.function = it_real_fn; 863 sig->real_timer.function = it_real_fn;
864 sig->tsk = tsk; 864 sig->tsk = tsk;
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a737de857d3..e749e7df14b1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1134 1134
1135 if (sec != MAX_SCHEDULE_TIMEOUT) { 1135 if (sec != MAX_SCHEDULE_TIMEOUT) {
1136 to = &timeout; 1136 to = &timeout;
1137 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); 1137 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
1138 hrtimer_init_sleeper(to, current); 1138 hrtimer_init_sleeper(to, current);
1139 to->timer.expires = ktime_set(sec, nsec); 1139 to->timer.expires = ktime_set(sec, nsec);
1140 } 1140 }
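
The fork.c and futex.c hunks above, together with the hrtimer.c changes that follow, rename the hrtimer modes (HRTIMER_REL/HRTIMER_ABS become HRTIMER_MODE_REL/HRTIMER_MODE_ABS) and make timer callbacks return enum hrtimer_restart. A hedged usage sketch of the API after this series; the example_* names are illustrative:

static enum hrtimer_restart example_timeout(struct hrtimer *t)
{
        /* ... handle expiry ... */
        return HRTIMER_NORESTART;       /* or HRTIMER_RESTART after forwarding */
}

static void example_arm_timer(struct hrtimer *t)
{
        hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        t->function = example_timeout;
        /* expire 100 ms from now, relative to CLOCK_MONOTONIC */
        hrtimer_start(t, ktime_set(0, 100 * 1000 * 1000), HRTIMER_MODE_REL);
}
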
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f44e499e8fca..476cb0c0b4a4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1,8 +1,9 @@
1/* 1/*
2 * linux/kernel/hrtimer.c 2 * linux/kernel/hrtimer.c
3 * 3 *
4 * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> 4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar 5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
6 * 7 *
7 * High-resolution kernel timers 8 * High-resolution kernel timers
8 * 9 *
@@ -31,12 +32,17 @@
31 */ 32 */
32 33
33#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/irq.h>
34#include <linux/module.h> 36#include <linux/module.h>
35#include <linux/percpu.h> 37#include <linux/percpu.h>
36#include <linux/hrtimer.h> 38#include <linux/hrtimer.h>
37#include <linux/notifier.h> 39#include <linux/notifier.h>
38#include <linux/syscalls.h> 40#include <linux/syscalls.h>
41#include <linux/kallsyms.h>
39#include <linux/interrupt.h> 42#include <linux/interrupt.h>
43#include <linux/tick.h>
44#include <linux/seq_file.h>
45#include <linux/err.h>
40 46
41#include <asm/uaccess.h> 47#include <asm/uaccess.h>
42 48
@@ -45,7 +51,7 @@
45 * 51 *
46 * returns the time in ktime_t format 52 * returns the time in ktime_t format
47 */ 53 */
48static ktime_t ktime_get(void) 54ktime_t ktime_get(void)
49{ 55{
50 struct timespec now; 56 struct timespec now;
51 57
@@ -59,7 +65,7 @@ static ktime_t ktime_get(void)
59 * 65 *
60 * returns the time in ktime_t format 66 * returns the time in ktime_t format
61 */ 67 */
62static ktime_t ktime_get_real(void) 68ktime_t ktime_get_real(void)
63{ 69{
64 struct timespec now; 70 struct timespec now;
65 71
@@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
79 * This ensures that we capture erroneous accesses to these clock ids 85 * This ensures that we capture erroneous accesses to these clock ids
80 * rather than moving them into the range of valid clock id's. 86 * rather than moving them into the range of valid clock id's.
81 */ 87 */
82 88DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
83#define MAX_HRTIMER_BASES 2
84
85static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
86{ 89{
90
91 .clock_base =
87 { 92 {
88 .index = CLOCK_REALTIME, 93 {
89 .get_time = &ktime_get_real, 94 .index = CLOCK_REALTIME,
90 .resolution = KTIME_REALTIME_RES, 95 .get_time = &ktime_get_real,
91 }, 96 .resolution = KTIME_LOW_RES,
92 { 97 },
93 .index = CLOCK_MONOTONIC, 98 {
94 .get_time = &ktime_get, 99 .index = CLOCK_MONOTONIC,
95 .resolution = KTIME_MONOTONIC_RES, 100 .get_time = &ktime_get,
96 }, 101 .resolution = KTIME_LOW_RES,
102 },
103 }
97}; 104};
98 105
99/** 106/**
@@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
125 * Get the coarse grained time at the softirq based on xtime and 132 * Get the coarse grained time at the softirq based on xtime and
126 * wall_to_monotonic. 133 * wall_to_monotonic.
127 */ 134 */
128static void hrtimer_get_softirq_time(struct hrtimer_base *base) 135static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
129{ 136{
130 ktime_t xtim, tomono; 137 ktime_t xtim, tomono;
138 struct timespec xts;
131 unsigned long seq; 139 unsigned long seq;
132 140
133 do { 141 do {
134 seq = read_seqbegin(&xtime_lock); 142 seq = read_seqbegin(&xtime_lock);
135 xtim = timespec_to_ktime(xtime); 143#ifdef CONFIG_NO_HZ
136 tomono = timespec_to_ktime(wall_to_monotonic); 144 getnstimeofday(&xts);
137 145#else
146 xts = xtime;
147#endif
138 } while (read_seqretry(&xtime_lock, seq)); 148 } while (read_seqretry(&xtime_lock, seq));
139 149
140 base[CLOCK_REALTIME].softirq_time = xtim; 150 xtim = timespec_to_ktime(xts);
141 base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); 151 tomono = timespec_to_ktime(wall_to_monotonic);
152 base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
153 base->clock_base[CLOCK_MONOTONIC].softirq_time =
154 ktime_add(xtim, tomono);
155}
156
157/*
158 * Helper function to check, whether the timer is running the callback
159 * function
160 */
161static inline int hrtimer_callback_running(struct hrtimer *timer)
162{
163 return timer->state & HRTIMER_STATE_CALLBACK;
142} 164}
143 165
144/* 166/*
@@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
147 */ 169 */
148#ifdef CONFIG_SMP 170#ifdef CONFIG_SMP
149 171
150#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0)
151
152/* 172/*
153 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock 173 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
154 * means that all timers which are tied to this base via timer->base are 174 * means that all timers which are tied to this base via timer->base are
@@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
161 * possible to set timer->base = NULL and drop the lock: the timer remains 181 * possible to set timer->base = NULL and drop the lock: the timer remains
162 * locked. 182 * locked.
163 */ 183 */
164static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, 184static
165 unsigned long *flags) 185struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
186 unsigned long *flags)
166{ 187{
167 struct hrtimer_base *base; 188 struct hrtimer_clock_base *base;
168 189
169 for (;;) { 190 for (;;) {
170 base = timer->base; 191 base = timer->base;
171 if (likely(base != NULL)) { 192 if (likely(base != NULL)) {
172 spin_lock_irqsave(&base->lock, *flags); 193 spin_lock_irqsave(&base->cpu_base->lock, *flags);
173 if (likely(base == timer->base)) 194 if (likely(base == timer->base))
174 return base; 195 return base;
175 /* The timer has migrated to another CPU: */ 196 /* The timer has migrated to another CPU: */
176 spin_unlock_irqrestore(&base->lock, *flags); 197 spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
177 } 198 }
178 cpu_relax(); 199 cpu_relax();
179 } 200 }
@@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
182/* 203/*
183 * Switch the timer base to the current CPU when possible. 204 * Switch the timer base to the current CPU when possible.
184 */ 205 */
185static inline struct hrtimer_base * 206static inline struct hrtimer_clock_base *
186switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) 207switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
187{ 208{
188 struct hrtimer_base *new_base; 209 struct hrtimer_clock_base *new_base;
210 struct hrtimer_cpu_base *new_cpu_base;
189 211
190 new_base = &__get_cpu_var(hrtimer_bases)[base->index]; 212 new_cpu_base = &__get_cpu_var(hrtimer_bases);
213 new_base = &new_cpu_base->clock_base[base->index];
191 214
192 if (base != new_base) { 215 if (base != new_base) {
193 /* 216 /*
@@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
199 * completed. There is no conflict as we hold the lock until 222 * completed. There is no conflict as we hold the lock until
200 * the timer is enqueued. 223 * the timer is enqueued.
201 */ 224 */
202 if (unlikely(base->curr_timer == timer)) 225 if (unlikely(hrtimer_callback_running(timer)))
203 return base; 226 return base;
204 227
205 /* See the comment in lock_timer_base() */ 228 /* See the comment in lock_timer_base() */
206 timer->base = NULL; 229 timer->base = NULL;
207 spin_unlock(&base->lock); 230 spin_unlock(&base->cpu_base->lock);
208 spin_lock(&new_base->lock); 231 spin_lock(&new_base->cpu_base->lock);
209 timer->base = new_base; 232 timer->base = new_base;
210 } 233 }
211 return new_base; 234 return new_base;
@@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
213 236
214#else /* CONFIG_SMP */ 237#else /* CONFIG_SMP */
215 238
216#define set_curr_timer(b, t) do { } while (0) 239static inline struct hrtimer_clock_base *
217
218static inline struct hrtimer_base *
219lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 240lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
220{ 241{
221 struct hrtimer_base *base = timer->base; 242 struct hrtimer_clock_base *base = timer->base;
222 243
223 spin_lock_irqsave(&base->lock, *flags); 244 spin_lock_irqsave(&base->cpu_base->lock, *flags);
224 245
225 return base; 246 return base;
226} 247}
227 248
228#define switch_hrtimer_base(t, b) (b) 249# define switch_hrtimer_base(t, b) (b)
229 250
230#endif /* !CONFIG_SMP */ 251#endif /* !CONFIG_SMP */
231 252
@@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
256 277
257 return ktime_add(kt, tmp); 278 return ktime_add(kt, tmp);
258} 279}
259
260#else /* CONFIG_KTIME_SCALAR */
261
262# endif /* !CONFIG_KTIME_SCALAR */ 280# endif /* !CONFIG_KTIME_SCALAR */
263 281
264/* 282/*
265 * Divide a ktime value by a nanosecond value 283 * Divide a ktime value by a nanosecond value
266 */ 284 */
267static unsigned long ktime_divns(const ktime_t kt, s64 div) 285unsigned long ktime_divns(const ktime_t kt, s64 div)
268{ 286{
269 u64 dclc, inc, dns; 287 u64 dclc, inc, dns;
270 int sft = 0; 288 int sft = 0;
@@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div)
281 299
282 return (unsigned long) dclc; 300 return (unsigned long) dclc;
283} 301}
284
285#else /* BITS_PER_LONG < 64 */
286# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
287#endif /* BITS_PER_LONG >= 64 */ 302#endif /* BITS_PER_LONG >= 64 */
288 303
304/* High resolution timer related functions */
305#ifdef CONFIG_HIGH_RES_TIMERS
306
307/*
308 * High resolution timer enabled ?
309 */
310static int hrtimer_hres_enabled __read_mostly = 1;
311
312/*
313 * Enable / Disable high resolution mode
314 */
315static int __init setup_hrtimer_hres(char *str)
316{
317 if (!strcmp(str, "off"))
318 hrtimer_hres_enabled = 0;
319 else if (!strcmp(str, "on"))
320 hrtimer_hres_enabled = 1;
321 else
322 return 0;
323 return 1;
324}
325
326__setup("highres=", setup_hrtimer_hres);
327
328/*
329 * hrtimer_high_res_enabled - query, if the highres mode is enabled
330 */
331static inline int hrtimer_is_hres_enabled(void)
332{
333 return hrtimer_hres_enabled;
334}
335
336/*
337 * Is the high resolution mode active ?
338 */
339static inline int hrtimer_hres_active(void)
340{
341 return __get_cpu_var(hrtimer_bases).hres_active;
342}
343
344/*
345 * Reprogram the event source with checking both queues for the
346 * next event
347 * Called with interrupts disabled and base->lock held
348 */
349static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
350{
351 int i;
352 struct hrtimer_clock_base *base = cpu_base->clock_base;
353 ktime_t expires;
354
355 cpu_base->expires_next.tv64 = KTIME_MAX;
356
357 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
358 struct hrtimer *timer;
359
360 if (!base->first)
361 continue;
362 timer = rb_entry(base->first, struct hrtimer, node);
363 expires = ktime_sub(timer->expires, base->offset);
364 if (expires.tv64 < cpu_base->expires_next.tv64)
365 cpu_base->expires_next = expires;
366 }
367
368 if (cpu_base->expires_next.tv64 != KTIME_MAX)
369 tick_program_event(cpu_base->expires_next, 1);
370}
371
372/*
373 * Shared reprogramming for clock_realtime and clock_monotonic
374 *
375 * When a timer is enqueued and expires earlier than the already enqueued
376 * timers, we have to check, whether it expires earlier than the timer for
377 * which the clock event device was armed.
378 *
379 * Called with interrupts disabled and base->cpu_base.lock held
380 */
381static int hrtimer_reprogram(struct hrtimer *timer,
382 struct hrtimer_clock_base *base)
383{
384 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
385 ktime_t expires = ktime_sub(timer->expires, base->offset);
386 int res;
387
388 /*
389 * When the callback is running, we do not reprogram the clock event
390 * device. The timer callback is either running on a different CPU or
 391 * the callback is executed in the hrtimer_interrupt context. The
392 * reprogramming is handled either by the softirq, which called the
393 * callback or at the end of the hrtimer_interrupt.
394 */
395 if (hrtimer_callback_running(timer))
396 return 0;
397
398 if (expires.tv64 >= expires_next->tv64)
399 return 0;
400
401 /*
402 * Clockevents returns -ETIME, when the event was in the past.
403 */
404 res = tick_program_event(expires, 0);
405 if (!IS_ERR_VALUE(res))
406 *expires_next = expires;
407 return res;
408}
409
410
411/*
412 * Retrigger next event is called after clock was set
413 *
414 * Called with interrupts disabled via on_each_cpu()
415 */
416static void retrigger_next_event(void *arg)
417{
418 struct hrtimer_cpu_base *base;
419 struct timespec realtime_offset;
420 unsigned long seq;
421
422 if (!hrtimer_hres_active())
423 return;
424
425 do {
426 seq = read_seqbegin(&xtime_lock);
427 set_normalized_timespec(&realtime_offset,
428 -wall_to_monotonic.tv_sec,
429 -wall_to_monotonic.tv_nsec);
430 } while (read_seqretry(&xtime_lock, seq));
431
432 base = &__get_cpu_var(hrtimer_bases);
433
434 /* Adjust CLOCK_REALTIME offset */
435 spin_lock(&base->lock);
436 base->clock_base[CLOCK_REALTIME].offset =
437 timespec_to_ktime(realtime_offset);
438
439 hrtimer_force_reprogram(base);
440 spin_unlock(&base->lock);
441}
442
443/*
444 * Clock realtime was set
445 *
446 * Change the offset of the realtime clock vs. the monotonic
447 * clock.
448 *
449 * We might have to reprogram the high resolution timer interrupt. On
450 * SMP we call the architecture specific code to retrigger _all_ high
451 * resolution timer interrupts. On UP we just disable interrupts and
452 * call the high resolution interrupt code.
453 */
454void clock_was_set(void)
455{
456 /* Retrigger the CPU local events everywhere */
457 on_each_cpu(retrigger_next_event, NULL, 0, 1);
458}
459
460/*
461 * Check, whether the timer is on the callback pending list
462 */
463static inline int hrtimer_cb_pending(const struct hrtimer *timer)
464{
465 return timer->state & HRTIMER_STATE_PENDING;
466}
467
468/*
469 * Remove a timer from the callback pending list
470 */
471static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
472{
473 list_del_init(&timer->cb_entry);
474}
475
476/*
477 * Initialize the high resolution related parts of cpu_base
478 */
479static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
480{
481 base->expires_next.tv64 = KTIME_MAX;
482 base->hres_active = 0;
483 INIT_LIST_HEAD(&base->cb_pending);
484}
485
486/*
487 * Initialize the high resolution related parts of a hrtimer
488 */
489static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
490{
491 INIT_LIST_HEAD(&timer->cb_entry);
492}
493
494/*
495 * When High resolution timers are active, try to reprogram. Note, that in case
496 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
497 * check happens. The timer gets enqueued into the rbtree. The reprogramming
498 * and expiry check is done in the hrtimer_interrupt or in the softirq.
499 */
500static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
501 struct hrtimer_clock_base *base)
502{
503 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
504
505 /* Timer is expired, act upon the callback mode */
506 switch(timer->cb_mode) {
507 case HRTIMER_CB_IRQSAFE_NO_RESTART:
508 /*
509 * We can call the callback from here. No restart
510 * happens, so no danger of recursion
511 */
512 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
513 return 1;
514 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
515 /*
516 * This is solely for the sched tick emulation with
517 * dynamic tick support to ensure that we do not
518 * restart the tick right on the edge and end up with
519 * the tick timer in the softirq ! The calling site
520 * takes care of this.
521 */
522 return 1;
523 case HRTIMER_CB_IRQSAFE:
524 case HRTIMER_CB_SOFTIRQ:
525 /*
526 * Move everything else into the softirq pending list !
527 */
528 list_add_tail(&timer->cb_entry,
529 &base->cpu_base->cb_pending);
530 timer->state = HRTIMER_STATE_PENDING;
531 raise_softirq(HRTIMER_SOFTIRQ);
532 return 1;
533 default:
534 BUG();
535 }
536 }
537 return 0;
538}
539
540/*
541 * Switch to high resolution mode
542 */
543static void hrtimer_switch_to_hres(void)
544{
545 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
546 unsigned long flags;
547
548 if (base->hres_active)
549 return;
550
551 local_irq_save(flags);
552
553 if (tick_init_highres()) {
554 local_irq_restore(flags);
555 return;
556 }
557 base->hres_active = 1;
558 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
559 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
560
561 tick_setup_sched_timer();
562
563 /* "Retrigger" the interrupt to get things going */
564 retrigger_next_event(NULL);
565 local_irq_restore(flags);
566 printk(KERN_INFO "Switched to high resolution mode on CPU %d\n",
567 smp_processor_id());
568}
569
570#else
571
572static inline int hrtimer_hres_active(void) { return 0; }
573static inline int hrtimer_is_hres_enabled(void) { return 0; }
574static inline void hrtimer_switch_to_hres(void) { }
575static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
576static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
577 struct hrtimer_clock_base *base)
578{
579 return 0;
580}
581static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
582static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
583static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
584static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
585
586#endif /* CONFIG_HIGH_RES_TIMERS */
587
588#ifdef CONFIG_TIMER_STATS
589void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
590{
591 if (timer->start_site)
592 return;
593
594 timer->start_site = addr;
595 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
596 timer->start_pid = current->pid;
597}
598#endif
599
289/* 600/*
290 * Counterpart to lock_timer_base above: 601 * Counterpart to lock_timer_base above:
291 */ 602 */
292static inline 603static inline
293void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 604void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
294{ 605{
295 spin_unlock_irqrestore(&timer->base->lock, *flags); 606 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
296} 607}
297 608
298/** 609/**
@@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
342 * The timer is inserted in expiry order. Insertion into the 653 * The timer is inserted in expiry order. Insertion into the
343 * red black tree is O(log(n)). Must hold the base lock. 654 * red black tree is O(log(n)). Must hold the base lock.
344 */ 655 */
345static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 656static void enqueue_hrtimer(struct hrtimer *timer,
657 struct hrtimer_clock_base *base, int reprogram)
346{ 658{
347 struct rb_node **link = &base->active.rb_node; 659 struct rb_node **link = &base->active.rb_node;
348 struct rb_node *parent = NULL; 660 struct rb_node *parent = NULL;
@@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
368 * Insert the timer to the rbtree and check whether it 680 * Insert the timer to the rbtree and check whether it
369 * replaces the first pending timer 681 * replaces the first pending timer
370 */ 682 */
371 rb_link_node(&timer->node, parent, link);
372 rb_insert_color(&timer->node, &base->active);
373
374 if (!base->first || timer->expires.tv64 < 683 if (!base->first || timer->expires.tv64 <
375 rb_entry(base->first, struct hrtimer, node)->expires.tv64) 684 rb_entry(base->first, struct hrtimer, node)->expires.tv64) {
685 /*
686 * Reprogram the clock event device. When the timer is already
687 * expired hrtimer_enqueue_reprogram has either called the
688 * callback or added it to the pending list and raised the
689 * softirq.
690 *
691 * This is a NOP for !HIGHRES
692 */
693 if (reprogram && hrtimer_enqueue_reprogram(timer, base))
694 return;
695
376 base->first = &timer->node; 696 base->first = &timer->node;
697 }
698
699 rb_link_node(&timer->node, parent, link);
700 rb_insert_color(&timer->node, &base->active);
701 /*
702 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
703 * state of a possibly running callback.
704 */
705 timer->state |= HRTIMER_STATE_ENQUEUED;
377} 706}
378 707
379/* 708/*
380 * __remove_hrtimer - internal function to remove a timer 709 * __remove_hrtimer - internal function to remove a timer
381 * 710 *
382 * Caller must hold the base lock. 711 * Caller must hold the base lock.
712 *
713 * High resolution timer mode reprograms the clock event device when the
714 * timer is the one which expires next. The caller can disable this by setting
715 * reprogram to zero. This is useful, when the context does a reprogramming
716 * anyway (e.g. timer interrupt)
383 */ 717 */
384static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 718static void __remove_hrtimer(struct hrtimer *timer,
719 struct hrtimer_clock_base *base,
720 unsigned long newstate, int reprogram)
385{ 721{
386 /* 722 /* High res. callback list. NOP for !HIGHRES */
387 * Remove the timer from the rbtree and replace the 723 if (hrtimer_cb_pending(timer))
388 * first entry pointer if necessary. 724 hrtimer_remove_cb_pending(timer);
389 */ 725 else {
390 if (base->first == &timer->node) 726 /*
391 base->first = rb_next(&timer->node); 727 * Remove the timer from the rbtree and replace the
392 rb_erase(&timer->node, &base->active); 728 * first entry pointer if necessary.
393 rb_set_parent(&timer->node, &timer->node); 729 */
730 if (base->first == &timer->node) {
731 base->first = rb_next(&timer->node);
732 /* Reprogram the clock event device. if enabled */
733 if (reprogram && hrtimer_hres_active())
734 hrtimer_force_reprogram(base->cpu_base);
735 }
736 rb_erase(&timer->node, &base->active);
737 }
738 timer->state = newstate;
394} 739}
395 740
396/* 741/*
397 * remove hrtimer, called with base lock held 742 * remove hrtimer, called with base lock held
398 */ 743 */
399static inline int 744static inline int
400remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 745remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
401{ 746{
402 if (hrtimer_active(timer)) { 747 if (hrtimer_is_queued(timer)) {
403 __remove_hrtimer(timer, base); 748 int reprogram;
749
750 /*
751 * Remove the timer and force reprogramming when high
752 * resolution mode is active and the timer is on the current
753 * CPU. If we remove a timer on another CPU, reprogramming is
754 * skipped. The interrupt event on this CPU is fired and
755 * reprogramming happens in the interrupt handler. This is a
756 * rare case and less expensive than a smp call.
757 */
758 timer_stats_hrtimer_clear_start_info(timer);
759 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
760 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
761 reprogram);
404 return 1; 762 return 1;
405 } 763 }
406 return 0; 764 return 0;
@@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
419int 777int
420hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 778hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
421{ 779{
422 struct hrtimer_base *base, *new_base; 780 struct hrtimer_clock_base *base, *new_base;
423 unsigned long flags; 781 unsigned long flags;
424 int ret; 782 int ret;
425 783
@@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
431 /* Switch the timer base, if necessary: */ 789 /* Switch the timer base, if necessary: */
432 new_base = switch_hrtimer_base(timer, base); 790 new_base = switch_hrtimer_base(timer, base);
433 791
434 if (mode == HRTIMER_REL) { 792 if (mode == HRTIMER_MODE_REL) {
435 tim = ktime_add(tim, new_base->get_time()); 793 tim = ktime_add(tim, new_base->get_time());
436 /* 794 /*
437 * CONFIG_TIME_LOW_RES is a temporary way for architectures 795 * CONFIG_TIME_LOW_RES is a temporary way for architectures
@@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
446 } 804 }
447 timer->expires = tim; 805 timer->expires = tim;
448 806
449 enqueue_hrtimer(timer, new_base); 807 timer_stats_hrtimer_set_start_info(timer);
808
809 enqueue_hrtimer(timer, new_base, base == new_base);
450 810
451 unlock_hrtimer_base(timer, &flags); 811 unlock_hrtimer_base(timer, &flags);
452 812
@@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
466 */ 826 */
467int hrtimer_try_to_cancel(struct hrtimer *timer) 827int hrtimer_try_to_cancel(struct hrtimer *timer)
468{ 828{
469 struct hrtimer_base *base; 829 struct hrtimer_clock_base *base;
470 unsigned long flags; 830 unsigned long flags;
471 int ret = -1; 831 int ret = -1;
472 832
473 base = lock_hrtimer_base(timer, &flags); 833 base = lock_hrtimer_base(timer, &flags);
474 834
475 if (base->curr_timer != timer) 835 if (!hrtimer_callback_running(timer))
476 ret = remove_hrtimer(timer, base); 836 ret = remove_hrtimer(timer, base);
477 837
478 unlock_hrtimer_base(timer, &flags); 838 unlock_hrtimer_base(timer, &flags);
@@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
508 */ 868 */
509ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 869ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
510{ 870{
511 struct hrtimer_base *base; 871 struct hrtimer_clock_base *base;
512 unsigned long flags; 872 unsigned long flags;
513 ktime_t rem; 873 ktime_t rem;
514 874
515 base = lock_hrtimer_base(timer, &flags); 875 base = lock_hrtimer_base(timer, &flags);
516 rem = ktime_sub(timer->expires, timer->base->get_time()); 876 rem = ktime_sub(timer->expires, base->get_time());
517 unlock_hrtimer_base(timer, &flags); 877 unlock_hrtimer_base(timer, &flags);
518 878
519 return rem; 879 return rem;
520} 880}
521EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 881EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
522 882
523#ifdef CONFIG_NO_IDLE_HZ 883#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
524/** 884/**
525 * hrtimer_get_next_event - get the time until next expiry event 885 * hrtimer_get_next_event - get the time until next expiry event
526 * 886 *
@@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
529 */ 889 */
530ktime_t hrtimer_get_next_event(void) 890ktime_t hrtimer_get_next_event(void)
531{ 891{
532 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 892 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
893 struct hrtimer_clock_base *base = cpu_base->clock_base;
533 ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; 894 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
534 unsigned long flags; 895 unsigned long flags;
535 int i; 896 int i;
536 897
537 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { 898 spin_lock_irqsave(&cpu_base->lock, flags);
538 struct hrtimer *timer;
539 899
540 spin_lock_irqsave(&base->lock, flags); 900 if (!hrtimer_hres_active()) {
541 if (!base->first) { 901 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
542 spin_unlock_irqrestore(&base->lock, flags); 902 struct hrtimer *timer;
543 continue; 903
904 if (!base->first)
905 continue;
906
907 timer = rb_entry(base->first, struct hrtimer, node);
908 delta.tv64 = timer->expires.tv64;
909 delta = ktime_sub(delta, base->get_time());
910 if (delta.tv64 < mindelta.tv64)
911 mindelta.tv64 = delta.tv64;
544 } 912 }
545 timer = rb_entry(base->first, struct hrtimer, node);
546 delta.tv64 = timer->expires.tv64;
547 spin_unlock_irqrestore(&base->lock, flags);
548 delta = ktime_sub(delta, base->get_time());
549 if (delta.tv64 < mindelta.tv64)
550 mindelta.tv64 = delta.tv64;
551 } 913 }
914
915 spin_unlock_irqrestore(&cpu_base->lock, flags);
916
552 if (mindelta.tv64 < 0) 917 if (mindelta.tv64 < 0)
553 mindelta.tv64 = 0; 918 mindelta.tv64 = 0;
554 return mindelta; 919 return mindelta;
@@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void)
564void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 929void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
565 enum hrtimer_mode mode) 930 enum hrtimer_mode mode)
566{ 931{
567 struct hrtimer_base *bases; 932 struct hrtimer_cpu_base *cpu_base;
568 933
569 memset(timer, 0, sizeof(struct hrtimer)); 934 memset(timer, 0, sizeof(struct hrtimer));
570 935
571 bases = __raw_get_cpu_var(hrtimer_bases); 936 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
572 937
573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 938 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
574 clock_id = CLOCK_MONOTONIC; 939 clock_id = CLOCK_MONOTONIC;
575 940
576 timer->base = &bases[clock_id]; 941 timer->base = &cpu_base->clock_base[clock_id];
577 rb_set_parent(&timer->node, &timer->node); 942 hrtimer_init_timer_hres(timer);
943
944#ifdef CONFIG_TIMER_STATS
945 timer->start_site = NULL;
946 timer->start_pid = -1;
947 memset(timer->start_comm, 0, TASK_COMM_LEN);
948#endif
578} 949}
579EXPORT_SYMBOL_GPL(hrtimer_init); 950EXPORT_SYMBOL_GPL(hrtimer_init);
580 951
@@ -588,21 +959,159 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
588 */ 959 */
589int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 960int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
590{ 961{
591 struct hrtimer_base *bases; 962 struct hrtimer_cpu_base *cpu_base;
592 963
593 bases = __raw_get_cpu_var(hrtimer_bases); 964 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
594 *tp = ktime_to_timespec(bases[which_clock].resolution); 965 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
595 966
596 return 0; 967 return 0;
597} 968}
598EXPORT_SYMBOL_GPL(hrtimer_get_res); 969EXPORT_SYMBOL_GPL(hrtimer_get_res);
599 970
971#ifdef CONFIG_HIGH_RES_TIMERS
972
973/*
974 * High resolution timer interrupt
975 * Called with interrupts disabled
976 */
977void hrtimer_interrupt(struct clock_event_device *dev)
978{
979 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
980 struct hrtimer_clock_base *base;
981 ktime_t expires_next, now;
982 int i, raise = 0;
983
984 BUG_ON(!cpu_base->hres_active);
985 cpu_base->nr_events++;
986 dev->next_event.tv64 = KTIME_MAX;
987
988 retry:
989 now = ktime_get();
990
991 expires_next.tv64 = KTIME_MAX;
992
993 base = cpu_base->clock_base;
994
995 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
996 ktime_t basenow;
997 struct rb_node *node;
998
999 spin_lock(&cpu_base->lock);
1000
1001 basenow = ktime_add(now, base->offset);
1002
1003 while ((node = base->first)) {
1004 struct hrtimer *timer;
1005
1006 timer = rb_entry(node, struct hrtimer, node);
1007
1008 if (basenow.tv64 < timer->expires.tv64) {
1009 ktime_t expires;
1010
1011 expires = ktime_sub(timer->expires,
1012 base->offset);
1013 if (expires.tv64 < expires_next.tv64)
1014 expires_next = expires;
1015 break;
1016 }
1017
1018 /* Move softirq callbacks to the pending list */
1019 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1020 __remove_hrtimer(timer, base,
1021 HRTIMER_STATE_PENDING, 0);
1022 list_add_tail(&timer->cb_entry,
1023 &base->cpu_base->cb_pending);
1024 raise = 1;
1025 continue;
1026 }
1027
1028 __remove_hrtimer(timer, base,
1029 HRTIMER_STATE_CALLBACK, 0);
1030 timer_stats_account_hrtimer(timer);
1031
1032 /*
1033 * Note: We clear the CALLBACK bit after
1034 * enqueue_hrtimer to avoid reprogramming of
1035 * the event hardware. This happens at the end
1036 * of this function anyway.
1037 */
1038 if (timer->function(timer) != HRTIMER_NORESTART) {
1039 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1040 enqueue_hrtimer(timer, base, 0);
1041 }
1042 timer->state &= ~HRTIMER_STATE_CALLBACK;
1043 }
1044 spin_unlock(&cpu_base->lock);
1045 base++;
1046 }
1047
1048 cpu_base->expires_next = expires_next;
1049
1050 /* Reprogramming necessary ? */
1051 if (expires_next.tv64 != KTIME_MAX) {
1052 if (tick_program_event(expires_next, 0))
1053 goto retry;
1054 }
1055
1056 /* Raise softirq ? */
1057 if (raise)
1058 raise_softirq(HRTIMER_SOFTIRQ);
1059}
1060
1061static void run_hrtimer_softirq(struct softirq_action *h)
1062{
1063 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1064
1065 spin_lock_irq(&cpu_base->lock);
1066
1067 while (!list_empty(&cpu_base->cb_pending)) {
1068 enum hrtimer_restart (*fn)(struct hrtimer *);
1069 struct hrtimer *timer;
1070 int restart;
1071
1072 timer = list_entry(cpu_base->cb_pending.next,
1073 struct hrtimer, cb_entry);
1074
1075 timer_stats_account_hrtimer(timer);
1076
1077 fn = timer->function;
1078 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1079 spin_unlock_irq(&cpu_base->lock);
1080
1081 restart = fn(timer);
1082
1083 spin_lock_irq(&cpu_base->lock);
1084
1085 timer->state &= ~HRTIMER_STATE_CALLBACK;
1086 if (restart == HRTIMER_RESTART) {
1087 BUG_ON(hrtimer_active(timer));
1088 /*
1089 * Enqueue the timer, allow reprogramming of the event
1090 * device
1091 */
1092 enqueue_hrtimer(timer, timer->base, 1);
1093 } else if (hrtimer_active(timer)) {
1094 /*
1095 * If the timer was rearmed on another CPU, reprogram
1096 * the event device.
1097 */
1098 if (timer->base->first == &timer->node)
1099 hrtimer_reprogram(timer, timer->base);
1100 }
1101 }
1102 spin_unlock_irq(&cpu_base->lock);
1103}
1104
1105#endif /* CONFIG_HIGH_RES_TIMERS */
1106
600/* 1107/*
601 * Expire the per base hrtimer-queue: 1108 * Expire the per base hrtimer-queue:
602 */ 1109 */
603static inline void run_hrtimer_queue(struct hrtimer_base *base) 1110static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1111 int index)
604{ 1112{
605 struct rb_node *node; 1113 struct rb_node *node;
1114 struct hrtimer_clock_base *base = &cpu_base->clock_base[index];
606 1115
607 if (!base->first) 1116 if (!base->first)
608 return; 1117 return;
@@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
610 if (base->get_softirq_time) 1119 if (base->get_softirq_time)
611 base->softirq_time = base->get_softirq_time(); 1120 base->softirq_time = base->get_softirq_time();
612 1121
613 spin_lock_irq(&base->lock); 1122 spin_lock_irq(&cpu_base->lock);
614 1123
615 while ((node = base->first)) { 1124 while ((node = base->first)) {
616 struct hrtimer *timer; 1125 struct hrtimer *timer;
617 int (*fn)(struct hrtimer *); 1126 enum hrtimer_restart (*fn)(struct hrtimer *);
618 int restart; 1127 int restart;
619 1128
620 timer = rb_entry(node, struct hrtimer, node); 1129 timer = rb_entry(node, struct hrtimer, node);
621 if (base->softirq_time.tv64 <= timer->expires.tv64) 1130 if (base->softirq_time.tv64 <= timer->expires.tv64)
622 break; 1131 break;
623 1132
1133 timer_stats_account_hrtimer(timer);
1134
624 fn = timer->function; 1135 fn = timer->function;
625 set_curr_timer(base, timer); 1136 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
626 __remove_hrtimer(timer, base); 1137 spin_unlock_irq(&cpu_base->lock);
627 spin_unlock_irq(&base->lock);
628 1138
629 restart = fn(timer); 1139 restart = fn(timer);
630 1140
631 spin_lock_irq(&base->lock); 1141 spin_lock_irq(&cpu_base->lock);
632 1142
1143 timer->state &= ~HRTIMER_STATE_CALLBACK;
633 if (restart != HRTIMER_NORESTART) { 1144 if (restart != HRTIMER_NORESTART) {
634 BUG_ON(hrtimer_active(timer)); 1145 BUG_ON(hrtimer_active(timer));
635 enqueue_hrtimer(timer, base); 1146 enqueue_hrtimer(timer, base, 0);
636 } 1147 }
637 } 1148 }
638 set_curr_timer(base, NULL); 1149 spin_unlock_irq(&cpu_base->lock);
639 spin_unlock_irq(&base->lock);
640} 1150}
641 1151
642/* 1152/*
643 * Called from timer softirq every jiffy, expire hrtimers: 1153 * Called from timer softirq every jiffy, expire hrtimers:
1154 *
1155 * For HRT its the fall back code to run the softirq in the timer
1156 * softirq context in case the hrtimer initialization failed or has
1157 * not been done yet.
644 */ 1158 */
645void hrtimer_run_queues(void) 1159void hrtimer_run_queues(void)
646{ 1160{
647 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 1161 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
648 int i; 1162 int i;
649 1163
650 hrtimer_get_softirq_time(base); 1164 if (hrtimer_hres_active())
1165 return;
1166
1167 /*
1168 * This _is_ ugly: We have to check in the softirq context,
1169 * whether we can switch to highres and / or nohz mode. The
1170 * clocksource switch happens in the timer interrupt with
1171 * xtime_lock held. Notification from there only sets the
1172 * check bit in the tick_oneshot code, otherwise we might
1173 * deadlock vs. xtime_lock.
1174 */
1175 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1176 hrtimer_switch_to_hres();
651 1177
652 for (i = 0; i < MAX_HRTIMER_BASES; i++) 1178 hrtimer_get_softirq_time(cpu_base);
653 run_hrtimer_queue(&base[i]); 1179
1180 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1181 run_hrtimer_queue(cpu_base, i);
654} 1182}
655 1183
656/* 1184/*
657 * Sleep related functions: 1185 * Sleep related functions:
658 */ 1186 */
659static int hrtimer_wakeup(struct hrtimer *timer) 1187static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
660{ 1188{
661 struct hrtimer_sleeper *t = 1189 struct hrtimer_sleeper *t =
662 container_of(timer, struct hrtimer_sleeper, timer); 1190 container_of(timer, struct hrtimer_sleeper, timer);
@@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
673{ 1201{
674 sl->timer.function = hrtimer_wakeup; 1202 sl->timer.function = hrtimer_wakeup;
675 sl->task = task; 1203 sl->task = task;
1204#ifdef CONFIG_HIGH_RES_TIMERS
1205 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
1206#endif
676} 1207}
677 1208
678static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1209static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
683 set_current_state(TASK_INTERRUPTIBLE); 1214 set_current_state(TASK_INTERRUPTIBLE);
684 hrtimer_start(&t->timer, t->timer.expires, mode); 1215 hrtimer_start(&t->timer, t->timer.expires, mode);
685 1216
686 schedule(); 1217 if (likely(t->task))
1218 schedule();
687 1219
688 hrtimer_cancel(&t->timer); 1220 hrtimer_cancel(&t->timer);
689 mode = HRTIMER_ABS; 1221 mode = HRTIMER_MODE_ABS;
690 1222
691 } while (t->task && !signal_pending(current)); 1223 } while (t->task && !signal_pending(current));
692 1224
@@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
702 1234
703 restart->fn = do_no_restart_syscall; 1235 restart->fn = do_no_restart_syscall;
704 1236
705 hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); 1237 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS);
706 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; 1238 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
707 1239
708 if (do_nanosleep(&t, HRTIMER_ABS)) 1240 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
709 return 0; 1241 return 0;
710 1242
711 rmtp = (struct timespec __user *) restart->arg1; 1243 rmtp = (struct timespec __user *) restart->arg1;
@@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
738 return 0; 1270 return 0;
739 1271
740 /* Absolute timers do not update the rmtp value and restart: */ 1272 /* Absolute timers do not update the rmtp value and restart: */
741 if (mode == HRTIMER_ABS) 1273 if (mode == HRTIMER_MODE_ABS)
742 return -ERESTARTNOHAND; 1274 return -ERESTARTNOHAND;
743 1275
744 if (rmtp) { 1276 if (rmtp) {
@@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
771 if (!timespec_valid(&tu)) 1303 if (!timespec_valid(&tu))
772 return -EINVAL; 1304 return -EINVAL;
773 1305
774 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); 1306 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
775} 1307}
776 1308
777/* 1309/*
@@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
779 */ 1311 */
780static void __devinit init_hrtimers_cpu(int cpu) 1312static void __devinit init_hrtimers_cpu(int cpu)
781{ 1313{
782 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); 1314 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
783 int i; 1315 int i;
784 1316
785 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { 1317 spin_lock_init(&cpu_base->lock);
786 spin_lock_init(&base->lock); 1318 lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key);
787 lockdep_set_class(&base->lock, &base->lock_key); 1319
788 } 1320 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1321 cpu_base->clock_base[i].cpu_base = cpu_base;
1322
1323 hrtimer_init_hres(cpu_base);
789} 1324}
790 1325
791#ifdef CONFIG_HOTPLUG_CPU 1326#ifdef CONFIG_HOTPLUG_CPU
792 1327
793static void migrate_hrtimer_list(struct hrtimer_base *old_base, 1328static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
794 struct hrtimer_base *new_base) 1329 struct hrtimer_clock_base *new_base)
795{ 1330{
796 struct hrtimer *timer; 1331 struct hrtimer *timer;
797 struct rb_node *node; 1332 struct rb_node *node;
798 1333
799 while ((node = rb_first(&old_base->active))) { 1334 while ((node = rb_first(&old_base->active))) {
800 timer = rb_entry(node, struct hrtimer, node); 1335 timer = rb_entry(node, struct hrtimer, node);
801 __remove_hrtimer(timer, old_base); 1336 BUG_ON(hrtimer_callback_running(timer));
1337 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
802 timer->base = new_base; 1338 timer->base = new_base;
803 enqueue_hrtimer(timer, new_base); 1339 /*
1340 * Enqueue the timer. Allow reprogramming of the event device
1341 */
1342 enqueue_hrtimer(timer, new_base, 1);
804 } 1343 }
805} 1344}
806 1345
807static void migrate_hrtimers(int cpu) 1346static void migrate_hrtimers(int cpu)
808{ 1347{
809 struct hrtimer_base *old_base, *new_base; 1348 struct hrtimer_cpu_base *old_base, *new_base;
810 int i; 1349 int i;
811 1350
812 BUG_ON(cpu_online(cpu)); 1351 BUG_ON(cpu_online(cpu));
813 old_base = per_cpu(hrtimer_bases, cpu); 1352 old_base = &per_cpu(hrtimer_bases, cpu);
814 new_base = get_cpu_var(hrtimer_bases); 1353 new_base = &get_cpu_var(hrtimer_bases);
815
816 local_irq_disable();
817 1354
818 for (i = 0; i < MAX_HRTIMER_BASES; i++) { 1355 tick_cancel_sched_timer(cpu);
819 1356
820 spin_lock(&new_base->lock); 1357 local_irq_disable();
821 spin_lock(&old_base->lock);
822
823 BUG_ON(old_base->curr_timer);
824 1358
825 migrate_hrtimer_list(old_base, new_base); 1359 spin_lock(&new_base->lock);
1360 spin_lock(&old_base->lock);
826 1361
827 spin_unlock(&old_base->lock); 1362 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
828 spin_unlock(&new_base->lock); 1363 migrate_hrtimer_list(&old_base->clock_base[i],
829 old_base++; 1364 &new_base->clock_base[i]);
830 new_base++;
831 } 1365 }
1366 spin_unlock(&old_base->lock);
1367 spin_unlock(&new_base->lock);
832 1368
833 local_irq_enable(); 1369 local_irq_enable();
834 put_cpu_var(hrtimer_bases); 1370 put_cpu_var(hrtimer_bases);
@@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
848 1384
849#ifdef CONFIG_HOTPLUG_CPU 1385#ifdef CONFIG_HOTPLUG_CPU
850 case CPU_DEAD: 1386 case CPU_DEAD:
1387 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
851 migrate_hrtimers(cpu); 1388 migrate_hrtimers(cpu);
852 break; 1389 break;
853#endif 1390#endif
@@ -868,5 +1405,8 @@ void __init hrtimers_init(void)
868 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1405 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
869 (void *)(long)smp_processor_id()); 1406 (void *)(long)smp_processor_id());
870 register_cpu_notifier(&hrtimers_nb); 1407 register_cpu_notifier(&hrtimers_nb);
1408#ifdef CONFIG_HIGH_RES_TIMERS
1409 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
1410#endif
871} 1411}
872 1412
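The hunks above change the hrtimer callback contract: callbacks now return enum hrtimer_restart, and the mode constants become HRTIMER_MODE_ABS / HRTIMER_MODE_REL. A minimal sketch of a timer user against the reworked API, using only calls visible in this patch; the timer name and the 100 ms period are illustrative, not taken from the patch:

	/* Sketch only: a self-rearming hrtimer under the reworked API. */
	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	static struct hrtimer example_timer;
	static ktime_t example_period;

	static enum hrtimer_restart example_timer_fn(struct hrtimer *timer)
	{
		/* Re-arm relative to the callback time and keep the timer running. */
		hrtimer_forward(timer, hrtimer_cb_get_time(timer), example_period);
		return HRTIMER_RESTART;
	}

	static void example_timer_start(void)
	{
		example_period = ktime_set(0, 100 * 1000 * 1000);	/* 100 ms */
		hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		example_timer.function = example_timer_fn;
		hrtimer_start(&example_timer, example_period, HRTIMER_MODE_REL);
	}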
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 475e8a71bcdc..0133f4f9e9f0 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -168,7 +168,7 @@ EXPORT_SYMBOL(set_irq_data);
168/** 168/**
169 * set_irq_data - set irq type data for an irq 169 * set_irq_data - set irq type data for an irq
170 * @irq: Interrupt number 170 * @irq: Interrupt number
171 * @data: Pointer to interrupt specific data 171 * @entry: Pointer to MSI descriptor data
172 * 172 *
173 * Set the hardware irq controller data for an irq 173 * Set the hardware irq controller data for an irq
174 */ 174 */
@@ -230,10 +230,6 @@ static void default_enable(unsigned int irq)
230 */ 230 */
231static void default_disable(unsigned int irq) 231static void default_disable(unsigned int irq)
232{ 232{
233 struct irq_desc *desc = irq_desc + irq;
234
235 if (!(desc->status & IRQ_DELAYED_DISABLE))
236 desc->chip->mask(irq);
237} 233}
238 234
239/* 235/*
@@ -298,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
298 294
299 if (unlikely(desc->status & IRQ_INPROGRESS)) 295 if (unlikely(desc->status & IRQ_INPROGRESS))
300 goto out_unlock; 296 goto out_unlock;
301 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
302 kstat_cpu(cpu).irqs[irq]++; 297 kstat_cpu(cpu).irqs[irq]++;
303 298
304 action = desc->action; 299 action = desc->action;
305 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 300 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
301 if (desc->chip->mask)
302 desc->chip->mask(irq);
303 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
304 desc->status |= IRQ_PENDING;
306 goto out_unlock; 305 goto out_unlock;
306 }
307 307
308 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING);
308 desc->status |= IRQ_INPROGRESS; 309 desc->status |= IRQ_INPROGRESS;
309 spin_unlock(&desc->lock); 310 spin_unlock(&desc->lock);
310 311
@@ -396,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
396 397
397 /* 398 /*
398 * If its disabled or no action available 399 * If its disabled or no action available
399 * keep it masked and get out of here 400 * then mask it and get out of here:
400 */ 401 */
401 action = desc->action; 402 action = desc->action;
402 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 403 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
403 desc->status |= IRQ_PENDING; 404 desc->status |= IRQ_PENDING;
405 if (desc->chip->mask)
406 desc->chip->mask(irq);
404 goto out; 407 goto out;
405 } 408 }
406 409
@@ -562,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
562 565
563 /* Uninstall? */ 566 /* Uninstall? */
564 if (handle == handle_bad_irq) { 567 if (handle == handle_bad_irq) {
565 if (desc->chip != &no_irq_chip) { 568 if (desc->chip != &no_irq_chip)
566 desc->chip->mask(irq); 569 mask_ack_irq(desc, irq);
567 desc->chip->ack(irq);
568 }
569 desc->status |= IRQ_DISABLED; 570 desc->status |= IRQ_DISABLED;
570 desc->depth = 1; 571 desc->depth = 1;
571 } 572 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index acc5d9fe462b..5597c157442a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq)
38} 38}
39EXPORT_SYMBOL(synchronize_irq); 39EXPORT_SYMBOL(synchronize_irq);
40 40
41/**
42 * irq_can_set_affinity - Check if the affinity of a given irq can be set
43 * @irq: Interrupt to check
44 *
45 */
46int irq_can_set_affinity(unsigned int irq)
47{
48 struct irq_desc *desc = irq_desc + irq;
49
50 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
51 !desc->chip->set_affinity)
52 return 0;
53
54 return 1;
55}
56
57/**
58 * irq_set_affinity - Set the irq affinity of a given irq
59 * @irq: Interrupt to set affinity
60 * @cpumask: cpumask
61 *
62 */
63int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
64{
65 struct irq_desc *desc = irq_desc + irq;
66
67 if (!desc->chip->set_affinity)
68 return -EINVAL;
69
70 set_balance_irq_affinity(irq, cpumask);
71
72#ifdef CONFIG_GENERIC_PENDING_IRQ
73 set_pending_irq(irq, cpumask);
74#else
75 desc->affinity = cpumask;
76 desc->chip->set_affinity(irq, cpumask);
77#endif
78 return 0;
79}
80
41#endif 81#endif
42 82
43/** 83/**
@@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
281 if (new->flags & IRQF_PERCPU) 321 if (new->flags & IRQF_PERCPU)
282 desc->status |= IRQ_PER_CPU; 322 desc->status |= IRQ_PER_CPU;
283#endif 323#endif
324 /* Exclude IRQ from balancing */
325 if (new->flags & IRQF_NOBALANCING)
326 desc->status |= IRQ_NO_BALANCING;
327
284 if (!shared) { 328 if (!shared) {
285 irq_chip_set_defaults(desc->chip); 329 irq_chip_set_defaults(desc->chip);
286 330
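The new irq_can_set_affinity()/irq_set_affinity() pair above replaces the proc-private helper removed in the kernel/irq/proc.c hunk below. A minimal usage sketch, assuming SMP and that the declarations are visible through <linux/interrupt.h>; the wrapper name and the -ENOSYS choice are illustrative:

	#include <linux/interrupt.h>
	#include <linux/cpumask.h>
	#include <linux/errno.h>

	/* Sketch only: pin an interrupt to a single CPU via the new helpers. */
	static int example_pin_irq(unsigned int irq, int cpu)
	{
		cpumask_t mask = CPU_MASK_NONE;

		if (!irq_can_set_affinity(irq))
			return -ENOSYS;

		cpu_set(cpu, mask);
		return irq_set_affinity(irq, mask);
	}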
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6d3be06e8ce6..2db91eb54ad8 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir;
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19#ifdef CONFIG_GENERIC_PENDING_IRQ
20void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
21{
22 set_balance_irq_affinity(irq, mask_val);
23
24 /*
25 * Save these away for later use. Re-progam when the
26 * interrupt is pending
27 */
28 set_pending_irq(irq, mask_val);
29}
30#else
31void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
32{
33 set_balance_irq_affinity(irq, mask_val);
34 irq_desc[irq].affinity = mask_val;
35 irq_desc[irq].chip->set_affinity(irq, mask_val);
36}
37#endif
38
39static int irq_affinity_read_proc(char *page, char **start, off_t off, 19static int irq_affinity_read_proc(char *page, char **start, off_t off,
40 int count, int *eof, void *data) 20 int count, int *eof, void *data)
41{ 21{
@@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
55 cpumask_t new_value, tmp; 35 cpumask_t new_value, tmp;
56 36
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 37 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
58 CHECK_IRQ_PER_CPU(irq_desc[irq].status)) 38 irq_balancing_disabled(irq))
59 return -EIO; 39 return -EIO;
60 40
61 err = cpumask_parse_user(buffer, count, new_value); 41 err = cpumask_parse_user(buffer, count, new_value);
@@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
73 code to set default SMP affinity. */ 53 code to set default SMP affinity. */
74 return select_smp_affinity(irq) ? -EINVAL : full_count; 54 return select_smp_affinity(irq) ? -EINVAL : full_count;
75 55
76 proc_set_irq_affinity(irq, new_value); 56 irq_set_affinity(irq, new_value);
77 57
78 return full_count; 58 return full_count;
79} 59}
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 204ed7939e75..307c6a632ef6 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
128/* 128/*
129 * The timer is automagically restarted, when interval != 0 129 * The timer is automagically restarted, when interval != 0
130 */ 130 */
131int it_real_fn(struct hrtimer *timer) 131enum hrtimer_restart it_real_fn(struct hrtimer *timer)
132{ 132{
133 struct signal_struct *sig = 133 struct signal_struct *sig =
134 container_of(timer, struct signal_struct, real_timer); 134 container_of(timer, struct signal_struct, real_timer);
135 135
136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); 136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
137 137
138 if (sig->it_real_incr.tv64 != 0) {
139 hrtimer_forward(timer, timer->base->softirq_time,
140 sig->it_real_incr);
141 return HRTIMER_RESTART;
142 }
143 return HRTIMER_NORESTART; 138 return HRTIMER_NORESTART;
144} 139}
145 140
@@ -231,11 +226,14 @@ again:
231 spin_unlock_irq(&tsk->sighand->siglock); 226 spin_unlock_irq(&tsk->sighand->siglock);
232 goto again; 227 goto again;
233 } 228 }
234 tsk->signal->it_real_incr =
235 timeval_to_ktime(value->it_interval);
236 expires = timeval_to_ktime(value->it_value); 229 expires = timeval_to_ktime(value->it_value);
237 if (expires.tv64 != 0) 230 if (expires.tv64 != 0) {
238 hrtimer_start(timer, expires, HRTIMER_REL); 231 tsk->signal->it_real_incr =
232 timeval_to_ktime(value->it_interval);
233 hrtimer_start(timer, expires, HRTIMER_MODE_REL);
234 } else
235 tsk->signal->it_real_incr.tv64 = 0;
236
239 spin_unlock_irq(&tsk->sighand->siglock); 237 spin_unlock_irq(&tsk->sighand->siglock);
240 break; 238 break;
241 case ITIMER_VIRTUAL: 239 case ITIMER_VIRTUAL:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3a7379aa31ca..9f923f8ce6a0 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,6 +36,8 @@
36#include <linux/resource.h> 36#include <linux/resource.h>
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39extern int delete_module(const char *name, unsigned int flags);
40
39extern int max_threads; 41extern int max_threads;
40 42
41static struct workqueue_struct *khelper_wq; 43static struct workqueue_struct *khelper_wq;
@@ -46,6 +48,7 @@ static struct workqueue_struct *khelper_wq;
46 modprobe_path is set via /proc/sys. 48 modprobe_path is set via /proc/sys.
47*/ 49*/
48char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51struct module_kobject kmod_mk;
49 52
50/** 53/**
51 * request_module - try to load a kernel module 54 * request_module - try to load a kernel module
@@ -75,6 +78,11 @@ int request_module(const char *fmt, ...)
75 static atomic_t kmod_concurrent = ATOMIC_INIT(0); 78 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
76#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 79#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
77 static int kmod_loop_msg; 80 static int kmod_loop_msg;
81 char modalias[16 + MODULE_NAME_LEN] = "MODALIAS=";
82 char *uevent_envp[2] = {
83 modalias,
84 NULL
85 };
78 86
79 va_start(args, fmt); 87 va_start(args, fmt);
80 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
@@ -82,6 +90,12 @@ int request_module(const char *fmt, ...)
82 if (ret >= MODULE_NAME_LEN) 90 if (ret >= MODULE_NAME_LEN)
83 return -ENAMETOOLONG; 91 return -ENAMETOOLONG;
84 92
93 strcpy(&modalias[strlen("MODALIAS=")], module_name);
94 kobject_uevent_env(&kmod_mk.kobj, KOBJ_CHANGE, uevent_envp);
95
96 if (modprobe_path[0] == '\0')
97 goto out;
98
85 /* If modprobe needs a service that is in a module, we get a recursive 99 /* If modprobe needs a service that is in a module, we get a recursive
86 * loop. Limit the number of running kmod threads to max_threads/2 or 100 * loop. Limit the number of running kmod threads to max_threads/2 or
87 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 101 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
@@ -108,9 +122,115 @@ int request_module(const char *fmt, ...)
108 122
109 ret = call_usermodehelper(modprobe_path, argv, envp, 1); 123 ret = call_usermodehelper(modprobe_path, argv, envp, 1);
110 atomic_dec(&kmod_concurrent); 124 atomic_dec(&kmod_concurrent);
125out:
111 return ret; 126 return ret;
112} 127}
113EXPORT_SYMBOL(request_module); 128EXPORT_SYMBOL(request_module);
129
130static ssize_t store_mod_request(struct module_attribute *mattr,
131 struct module *mod,
132 const char *buffer, size_t count)
133{
134 char name[MODULE_NAME_LEN];
135 int ret;
136
137 if (count < 1 || count+1 > MODULE_NAME_LEN)
138 return -EINVAL;
139 memcpy(name, buffer, count);
140 name[count] = '\0';
141 if (name[count-1] == '\n')
142 name[count-1] = '\0';
143
144 ret = request_module(name);
145 if (ret < 0)
146 return ret;
147 return count;
148}
149
150static struct module_attribute mod_request = {
151 .attr = { .name = "mod_request", .mode = S_IWUSR, .owner = THIS_MODULE },
152 .store = store_mod_request,
153};
154
155#ifdef CONFIG_MODULE_UNLOAD
156static ssize_t store_mod_unload(struct module_attribute *mattr,
157 struct module *mod,
158 const char *buffer, size_t count)
159{
160 char name[MODULE_NAME_LEN];
161 int ret;
162
163 if (count < 1 || count+1 > MODULE_NAME_LEN)
164 return -EINVAL;
165 memcpy(name, buffer, count);
166 name[count] = '\0';
167 if (name[count-1] == '\n')
168 name[count-1] = '\0';
169
170 ret = delete_module(name, O_NONBLOCK);
171 if (ret < 0)
172 return ret;
173 return count;
174}
175
176static struct module_attribute mod_unload = {
177 .attr = { .name = "mod_unload", .mode = S_IWUSR, .owner = THIS_MODULE },
178 .store = store_mod_unload,
179};
180#endif
181
182static ssize_t show_mod_request_helper(struct module_attribute *mattr,
183 struct module *mod,
184 char *buffer)
185{
186 return sprintf(buffer, "%s\n", modprobe_path);
187}
188
189static ssize_t store_mod_request_helper(struct module_attribute *mattr,
190 struct module *mod,
191 const char *buffer, size_t count)
192{
193 if (count < 1 || count+1 > KMOD_PATH_LEN)
194 return -EINVAL;
195 memcpy(modprobe_path, buffer, count);
196 modprobe_path[count] = '\0';
197 if (modprobe_path[count-1] == '\n')
198 modprobe_path[count-1] = '\0';
199 return count;
200}
201
202static struct module_attribute mod_request_helper = {
203 .attr = {
204 .name = "mod_request_helper",
205 .mode = S_IWUSR | S_IRUGO,
206 .owner = THIS_MODULE
207 },
208 .show = show_mod_request_helper,
209 .store = store_mod_request_helper,
210};
211
212void __init kmod_sysfs_init(void)
213{
214 int ret;
215
216 kmod_mk.mod = THIS_MODULE;
217 kobj_set_kset_s(&kmod_mk, module_subsys);
218 kobject_set_name(&kmod_mk.kobj, "kmod");
219 kobject_init(&kmod_mk.kobj);
220 ret = kobject_add(&kmod_mk.kobj);
221 if (ret < 0)
222 goto out;
223
224 ret = sysfs_create_file(&kmod_mk.kobj, &mod_request_helper.attr);
225 ret = sysfs_create_file(&kmod_mk.kobj, &mod_request.attr);
226#ifdef CONFIG_MODULE_UNLOAD
227 ret = sysfs_create_file(&kmod_mk.kobj, &mod_unload.attr);
228#endif
229
230 kobject_uevent(&kmod_mk.kobj, KOBJ_ADD);
231out:
232 return;
233}
114#endif /* CONFIG_KMOD */ 234#endif /* CONFIG_KMOD */
115 235
116struct subprocess_info { 236struct subprocess_info {
@@ -217,7 +337,10 @@ static int wait_for_helper(void *data)
217 sub_info->retval = ret; 337 sub_info->retval = ret;
218 } 338 }
219 339
220 complete(sub_info->complete); 340 if (sub_info->wait < 0)
341 kfree(sub_info);
342 else
343 complete(sub_info->complete);
221 return 0; 344 return 0;
222} 345}
223 346
@@ -239,6 +362,9 @@ static void __call_usermodehelper(struct work_struct *work)
239 pid = kernel_thread(____call_usermodehelper, sub_info, 362 pid = kernel_thread(____call_usermodehelper, sub_info,
240 CLONE_VFORK | SIGCHLD); 363 CLONE_VFORK | SIGCHLD);
241 364
365 if (wait < 0)
366 return;
367
242 if (pid < 0) { 368 if (pid < 0) {
243 sub_info->retval = pid; 369 sub_info->retval = pid;
244 complete(sub_info->complete); 370 complete(sub_info->complete);
@@ -253,6 +379,9 @@ static void __call_usermodehelper(struct work_struct *work)
253 * @envp: null-terminated environment list 379 * @envp: null-terminated environment list
254 * @session_keyring: session keyring for process (NULL for an empty keyring) 380 * @session_keyring: session keyring for process (NULL for an empty keyring)
255 * @wait: wait for the application to finish and return status. 381 * @wait: wait for the application to finish and return status.
382 * when -1 don't wait at all, but you get no useful error back when
383 * the program couldn't be exec'ed. This makes it safe to call
384 * from interrupt context.
256 * 385 *
257 * Runs a user-space application. The application is started 386 * Runs a user-space application. The application is started
258 * asynchronously if wait is not set, and runs as a child of keventd. 387 * asynchronously if wait is not set, and runs as a child of keventd.
@@ -265,17 +394,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
265 struct key *session_keyring, int wait) 394 struct key *session_keyring, int wait)
266{ 395{
267 DECLARE_COMPLETION_ONSTACK(done); 396 DECLARE_COMPLETION_ONSTACK(done);
268 struct subprocess_info sub_info = { 397 struct subprocess_info *sub_info;
269 .work = __WORK_INITIALIZER(sub_info.work, 398 int retval;
270 __call_usermodehelper),
271 .complete = &done,
272 .path = path,
273 .argv = argv,
274 .envp = envp,
275 .ring = session_keyring,
276 .wait = wait,
277 .retval = 0,
278 };
279 399
280 if (!khelper_wq) 400 if (!khelper_wq)
281 return -EBUSY; 401 return -EBUSY;
@@ -283,9 +403,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
283 if (path[0] == '\0') 403 if (path[0] == '\0')
284 return 0; 404 return 0;
285 405
286 queue_work(khelper_wq, &sub_info.work); 406 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
407 if (!sub_info)
408 return -ENOMEM;
409
410 INIT_WORK(&sub_info->work, __call_usermodehelper);
411 sub_info->complete = &done;
412 sub_info->path = path;
413 sub_info->argv = argv;
414 sub_info->envp = envp;
415 sub_info->ring = session_keyring;
416 sub_info->wait = wait;
417
418 queue_work(khelper_wq, &sub_info->work);
419 if (wait < 0) /* task has freed sub_info */
420 return 0;
287 wait_for_completion(&done); 421 wait_for_completion(&done);
288 return sub_info.retval; 422 retval = sub_info->retval;
423 kfree(sub_info);
424 return retval;
289} 425}
290EXPORT_SYMBOL(call_usermodehelper_keys); 426EXPORT_SYMBOL(call_usermodehelper_keys);
291 427
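With the heap-allocated subprocess_info above, call_usermodehelper() (the non-keyring wrapper in <linux/kmod.h>) gains a fire-and-forget mode. A short sketch of the wait argument; the helper path, arguments and environment are made up for illustration:

	#include <linux/kmod.h>

	static char *example_argv[] = { "/sbin/example-helper", "start", NULL };
	static char *example_envp[] = { "HOME=/", "PATH=/sbin:/bin", NULL };

	/* Sketch only: the three wait modes after this change. */
	static void example_run_helpers(void)
	{
		/* wait == 1: block until the helper exits and return its status. */
		call_usermodehelper(example_argv[0], example_argv, example_envp, 1);

		/* wait == 0: start the helper asynchronously. */
		call_usermodehelper(example_argv[0], example_argv, example_envp, 0);

		/*
		 * wait == -1: queue the work and return immediately. No useful
		 * error comes back if the exec fails, but sub_info is freed by
		 * the worker, so per the updated kernel-doc above this is safe
		 * to call from interrupt context.
		 */
		call_usermodehelper(example_argv[0], example_argv, example_envp, -1);
	}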
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6fcf8dd148d0..d25a9ada3f8e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -39,6 +39,8 @@
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
41#include <linux/freezer.h> 41#include <linux/freezer.h>
42#include <linux/seq_file.h>
43#include <linux/debugfs.h>
42#include <asm-generic/sections.h> 44#include <asm-generic/sections.h>
43#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
44#include <asm/errno.h> 46#include <asm/errno.h>
@@ -778,6 +780,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
778 return -ENOSYS; 780 return -ENOSYS;
779} 781}
780 782
783static int __kprobes pre_handler_kretprobe(struct kprobe *p,
784 struct pt_regs *regs)
785{
786 return 0;
787}
788
781#endif /* ARCH_SUPPORTS_KRETPROBES */ 789#endif /* ARCH_SUPPORTS_KRETPROBES */
782 790
783void __kprobes unregister_kretprobe(struct kretprobe *rp) 791void __kprobes unregister_kretprobe(struct kretprobe *rp)
@@ -815,7 +823,109 @@ static int __init init_kprobes(void)
815 return err; 823 return err;
816} 824}
817 825
818__initcall(init_kprobes); 826#ifdef CONFIG_DEBUG_FS
827static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
828 const char *sym, int offset,char *modname)
829{
830 char *kprobe_type;
831
832 if (p->pre_handler == pre_handler_kretprobe)
833 kprobe_type = "r";
834 else if (p->pre_handler == setjmp_pre_handler)
835 kprobe_type = "j";
836 else
837 kprobe_type = "k";
838 if (sym)
839 seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type,
840 sym, offset, (modname ? modname : " "));
841 else
842 seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr);
843}
844
845static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
846{
847 return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
848}
849
850static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
851{
852 (*pos)++;
853 if (*pos >= KPROBE_TABLE_SIZE)
854 return NULL;
855 return pos;
856}
857
858static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
859{
860 /* Nothing to do */
861}
862
863static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
864{
865 struct hlist_head *head;
866 struct hlist_node *node;
867 struct kprobe *p, *kp;
868 const char *sym = NULL;
869 unsigned int i = *(loff_t *) v;
870 unsigned long size, offset = 0;
871 char *modname, namebuf[128];
872
873 head = &kprobe_table[i];
874 preempt_disable();
875 hlist_for_each_entry_rcu(p, node, head, hlist) {
876 sym = kallsyms_lookup((unsigned long)p->addr, &size,
877 &offset, &modname, namebuf);
878 if (p->pre_handler == aggr_pre_handler) {
879 list_for_each_entry_rcu(kp, &p->list, list)
880 report_probe(pi, kp, sym, offset, modname);
881 } else
882 report_probe(pi, p, sym, offset, modname);
883 }
884 preempt_enable();
885 return 0;
886}
887
888static struct seq_operations kprobes_seq_ops = {
889 .start = kprobe_seq_start,
890 .next = kprobe_seq_next,
891 .stop = kprobe_seq_stop,
892 .show = show_kprobe_addr
893};
894
895static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
896{
897 return seq_open(filp, &kprobes_seq_ops);
898}
899
900static struct file_operations debugfs_kprobes_operations = {
901 .open = kprobes_open,
902 .read = seq_read,
903 .llseek = seq_lseek,
904 .release = seq_release,
905};
906
907static int __kprobes debugfs_kprobe_init(void)
908{
909 struct dentry *dir, *file;
910
911 dir = debugfs_create_dir("kprobes", NULL);
912 if (!dir)
913 return -ENOMEM;
914
915 file = debugfs_create_file("list", 0444, dir , 0 ,
916 &debugfs_kprobes_operations);
917 if (!file) {
918 debugfs_remove(dir);
919 return -ENOMEM;
920 }
921
922 return 0;
923}
924
925late_initcall(debugfs_kprobe_init);
926#endif /* CONFIG_DEBUG_FS */
927
928module_init(init_kprobes);
819 929
820EXPORT_SYMBOL_GPL(register_kprobe); 930EXPORT_SYMBOL_GPL(register_kprobe);
821EXPORT_SYMBOL_GPL(unregister_kprobe); 931EXPORT_SYMBOL_GPL(unregister_kprobe);
@@ -824,4 +934,3 @@ EXPORT_SYMBOL_GPL(unregister_jprobe);
824EXPORT_SYMBOL_GPL(jprobe_return); 934EXPORT_SYMBOL_GPL(jprobe_return);
825EXPORT_SYMBOL_GPL(register_kretprobe); 935EXPORT_SYMBOL_GPL(register_kretprobe);
826EXPORT_SYMBOL_GPL(unregister_kretprobe); 936EXPORT_SYMBOL_GPL(unregister_kretprobe);
827
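The debugfs interface added above lists every registered probe under <debugfs>/kprobes/list. Going by the seq_printf() format in report_probe(), each line reads "<address> <type> <symbol>+<offset> <module>", where the type is 'k' for kprobes, 'r' for kretprobes and 'j' for jprobes. Purely for illustration (addresses and the module name are invented):

	c01a2b3c k do_fork+0x0
	f8d0e124 r example_fn+0x4 example_mod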
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 592c576d77a7..a08a17218dfa 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2228,6 +2228,10 @@ out_calc_hash:
2228 2228
2229 curr->lockdep_depth++; 2229 curr->lockdep_depth++;
2230 check_chain_key(curr); 2230 check_chain_key(curr);
2231#ifdef CONFIG_DEBUG_LOCKDEP
2232 if (unlikely(!debug_locks))
2233 return 0;
2234#endif
2231 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 2235 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
2232 debug_locks_off(); 2236 debug_locks_off();
2233 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 2237 printk("BUG: MAX_LOCK_DEPTH too low!\n");
diff --git a/kernel/module.c b/kernel/module.c
index 8a94e054230c..8c25b1a04fa6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -653,20 +653,11 @@ static void wait_for_zero_refcount(struct module *mod)
653 mutex_lock(&module_mutex); 653 mutex_lock(&module_mutex);
654} 654}
655 655
656asmlinkage long 656int delete_module(const char *name, unsigned int flags)
657sys_delete_module(const char __user *name_user, unsigned int flags)
658{ 657{
659 struct module *mod; 658 struct module *mod;
660 char name[MODULE_NAME_LEN];
661 int ret, forced = 0; 659 int ret, forced = 0;
662 660
663 if (!capable(CAP_SYS_MODULE))
664 return -EPERM;
665
666 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
667 return -EFAULT;
668 name[MODULE_NAME_LEN-1] = '\0';
669
670 if (mutex_lock_interruptible(&module_mutex) != 0) 661 if (mutex_lock_interruptible(&module_mutex) != 0)
671 return -EINTR; 662 return -EINTR;
672 663
@@ -727,6 +718,21 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
727 return ret; 718 return ret;
728} 719}
729 720
721asmlinkage long
722sys_delete_module(const char __user *name_user, unsigned int flags)
723{
724 char name[MODULE_NAME_LEN];
725
726 if (!capable(CAP_SYS_MODULE))
727 return -EPERM;
728
729 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
730 return -EFAULT;
731 name[MODULE_NAME_LEN-1] = '\0';
732
733 return delete_module(name, flags);
734}
735
730static void print_unload_info(struct seq_file *m, struct module *mod) 736static void print_unload_info(struct seq_file *m, struct module *mod)
731{ 737{
732 struct module_use *use; 738 struct module_use *use;
@@ -1068,7 +1074,8 @@ static inline void remove_sect_attrs(struct module *mod)
1068} 1074}
1069#endif /* CONFIG_KALLSYMS */ 1075#endif /* CONFIG_KALLSYMS */
1070 1076
1071static int module_add_modinfo_attrs(struct module *mod) 1077#ifdef CONFIG_SYSFS
1078int module_add_modinfo_attrs(struct module *mod)
1072{ 1079{
1073 struct module_attribute *attr; 1080 struct module_attribute *attr;
1074 struct module_attribute *temp_attr; 1081 struct module_attribute *temp_attr;
@@ -1094,7 +1101,7 @@ static int module_add_modinfo_attrs(struct module *mod)
1094 return error; 1101 return error;
1095} 1102}
1096 1103
1097static void module_remove_modinfo_attrs(struct module *mod) 1104void module_remove_modinfo_attrs(struct module *mod)
1098{ 1105{
1099 struct module_attribute *attr; 1106 struct module_attribute *attr;
1100 int i; 1107 int i;
@@ -1109,8 +1116,10 @@ static void module_remove_modinfo_attrs(struct module *mod)
1109 } 1116 }
1110 kfree(mod->modinfo_attrs); 1117 kfree(mod->modinfo_attrs);
1111} 1118}
1119#endif
1112 1120
1113static int mod_sysfs_init(struct module *mod) 1121#ifdef CONFIG_SYSFS
1122int mod_sysfs_init(struct module *mod)
1114{ 1123{
1115 int err; 1124 int err;
1116 1125
@@ -1133,7 +1142,7 @@ out:
1133 return err; 1142 return err;
1134} 1143}
1135 1144
1136static int mod_sysfs_setup(struct module *mod, 1145int mod_sysfs_setup(struct module *mod,
1137 struct kernel_param *kparam, 1146 struct kernel_param *kparam,
1138 unsigned int num_params) 1147 unsigned int num_params)
1139{ 1148{
@@ -1169,16 +1178,14 @@ out_unreg:
1169out: 1178out:
1170 return err; 1179 return err;
1171} 1180}
1181#endif
1172 1182
1173static void mod_kobject_remove(struct module *mod) 1183static void mod_kobject_remove(struct module *mod)
1174{ 1184{
1175 module_remove_modinfo_attrs(mod); 1185 module_remove_modinfo_attrs(mod);
1176 module_param_sysfs_remove(mod); 1186 module_param_sysfs_remove(mod);
1177 if (mod->mkobj.drivers_dir) 1187 kobject_unregister(mod->mkobj.drivers_dir);
1178 kobject_unregister(mod->mkobj.drivers_dir); 1188 kobject_unregister(mod->holders_dir);
1179 if (mod->holders_dir)
1180 kobject_unregister(mod->holders_dir);
1181
1182 kobject_unregister(&mod->mkobj.kobj); 1189 kobject_unregister(&mod->mkobj.kobj);
1183} 1190}
1184 1191
@@ -2345,6 +2352,7 @@ void print_modules(void)
2345 printk("\n"); 2352 printk("\n");
2346} 2353}
2347 2354
2355#ifdef CONFIG_SYSFS
2348static char *make_driver_name(struct device_driver *drv) 2356static char *make_driver_name(struct device_driver *drv)
2349{ 2357{
2350 char *driver_name; 2358 char *driver_name;
@@ -2419,6 +2427,7 @@ void module_remove_driver(struct device_driver *drv)
2419 } 2427 }
2420} 2428}
2421EXPORT_SYMBOL(module_remove_driver); 2429EXPORT_SYMBOL(module_remove_driver);
2430#endif
2422 2431
2423#ifdef CONFIG_MODVERSIONS 2432#ifdef CONFIG_MODVERSIONS
2424/* Generate the signature for struct module here, too, for modversions. */ 2433/* Generate the signature for struct module here, too, for modversions. */
diff --git a/kernel/params.c b/kernel/params.c
index 553cf7d6a4be..7a751570b56d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -30,8 +30,6 @@
30#define DEBUGP(fmt, a...) 30#define DEBUGP(fmt, a...)
31#endif 31#endif
32 32
33static struct kobj_type module_ktype;
34
35static inline char dash2underscore(char c) 33static inline char dash2underscore(char c)
36{ 34{
37 if (c == '-') 35 if (c == '-')
@@ -391,6 +389,7 @@ struct module_param_attrs
391 struct param_attribute attrs[0]; 389 struct param_attribute attrs[0];
392}; 390};
393 391
392#ifdef CONFIG_SYSFS
394#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 393#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
395 394
396static ssize_t param_attr_show(struct module_attribute *mattr, 395static ssize_t param_attr_show(struct module_attribute *mattr,
@@ -426,6 +425,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
426 return len; 425 return len;
427 return err; 426 return err;
428} 427}
428#endif
429 429
430#ifdef CONFIG_MODULES 430#ifdef CONFIG_MODULES
431#define __modinit 431#define __modinit
@@ -433,6 +433,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
433#define __modinit __init 433#define __modinit __init
434#endif 434#endif
435 435
436#ifdef CONFIG_SYSFS
436/* 437/*
437 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME 438 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
438 * @mk: struct module_kobject (contains parent kobject) 439 * @mk: struct module_kobject (contains parent kobject)
@@ -500,9 +501,7 @@ param_sysfs_setup(struct module_kobject *mk,
500 return mp; 501 return mp;
501} 502}
502 503
503
504#ifdef CONFIG_MODULES 504#ifdef CONFIG_MODULES
505
506/* 505/*
507 * module_param_sysfs_setup - setup sysfs support for one module 506 * module_param_sysfs_setup - setup sysfs support for one module
508 * @mod: module 507 * @mod: module
@@ -625,7 +624,6 @@ static void __init param_sysfs_builtin(void)
625 624
626 625
627/* module-related sysfs stuff */ 626/* module-related sysfs stuff */
628#ifdef CONFIG_SYSFS
629 627
630#define to_module_attr(n) container_of(n, struct module_attribute, attr); 628#define to_module_attr(n) container_of(n, struct module_attribute, attr);
631#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 629#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
@@ -673,6 +671,8 @@ static struct sysfs_ops module_sysfs_ops = {
673 .store = module_attr_store, 671 .store = module_attr_store,
674}; 672};
675 673
674static struct kobj_type module_ktype;
675
676static int uevent_filter(struct kset *kset, struct kobject *kobj) 676static int uevent_filter(struct kset *kset, struct kobject *kobj)
677{ 677{
678 struct kobj_type *ktype = get_ktype(kobj); 678 struct kobj_type *ktype = get_ktype(kobj);
@@ -686,19 +686,12 @@ static struct kset_uevent_ops module_uevent_ops = {
686 .filter = uevent_filter, 686 .filter = uevent_filter,
687}; 687};
688 688
689#else 689decl_subsys(module, &module_ktype, &module_uevent_ops);
690static struct sysfs_ops module_sysfs_ops = {
691 .show = NULL,
692 .store = NULL,
693};
694#endif
695 690
696static struct kobj_type module_ktype = { 691static struct kobj_type module_ktype = {
697 .sysfs_ops = &module_sysfs_ops, 692 .sysfs_ops = &module_sysfs_ops,
698}; 693};
699 694
700decl_subsys(module, &module_ktype, &module_uevent_ops);
701
702/* 695/*
703 * param_sysfs_init - wrapper for built-in params support 696 * param_sysfs_init - wrapper for built-in params support
704 */ 697 */
@@ -714,11 +707,21 @@ static int __init param_sysfs_init(void)
714 } 707 }
715 708
716 param_sysfs_builtin(); 709 param_sysfs_builtin();
710 kmod_sysfs_init();
717 711
718 return 0; 712 return 0;
719} 713}
720subsys_initcall(param_sysfs_init); 714subsys_initcall(param_sysfs_init);
721 715
716#else
717#if 0
718static struct sysfs_ops module_sysfs_ops = {
719 .show = NULL,
720 .store = NULL,
721};
722#endif
723#endif
724
722EXPORT_SYMBOL(param_set_byte); 725EXPORT_SYMBOL(param_set_byte);
723EXPORT_SYMBOL(param_get_byte); 726EXPORT_SYMBOL(param_get_byte);
724EXPORT_SYMBOL(param_set_short); 727EXPORT_SYMBOL(param_set_short);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 7c3e1e6dfb5b..657f77697415 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
304 * should be able to see it. 304 * should be able to see it.
305 */ 305 */
306 struct task_struct *p; 306 struct task_struct *p;
307 read_lock(&tasklist_lock); 307 rcu_read_lock();
308 p = find_task_by_pid(pid); 308 p = find_task_by_pid(pid);
309 if (p) { 309 if (p) {
310 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 if (CPUCLOCK_PERTHREAD(which_clock)) {
@@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
312 error = cpu_clock_sample(which_clock, 312 error = cpu_clock_sample(which_clock,
313 p, &rtn); 313 p, &rtn);
314 } 314 }
315 } else if (p->tgid == pid && p->signal) { 315 } else {
316 error = cpu_clock_sample_group(which_clock, 316 read_lock(&tasklist_lock);
317 p, &rtn); 317 if (p->tgid == pid && p->signal) {
318 error =
319 cpu_clock_sample_group(which_clock,
320 p, &rtn);
321 }
322 read_unlock(&tasklist_lock);
318 } 323 }
319 } 324 }
320 read_unlock(&tasklist_lock); 325 rcu_read_unlock();
321 } 326 }
322 327
323 if (error) 328 if (error)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a1bf61617839..44318ca71978 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
145 struct itimerspec *, struct itimerspec *); 145 struct itimerspec *, struct itimerspec *);
146static int common_timer_del(struct k_itimer *timer); 146static int common_timer_del(struct k_itimer *timer);
147 147
148static int posix_timer_fn(struct hrtimer *data); 148static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
149 149
150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
151 151
@@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
334 334
335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
336 */ 336 */
337static int posix_timer_fn(struct hrtimer *timer) 337static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
338{ 338{
339 struct k_itimer *timr; 339 struct k_itimer *timr;
340 unsigned long flags; 340 unsigned long flags;
341 int si_private = 0; 341 int si_private = 0;
342 int ret = HRTIMER_NORESTART; 342 enum hrtimer_restart ret = HRTIMER_NORESTART;
343 343
344 timr = container_of(timer, struct k_itimer, it.real.timer); 344 timr = container_of(timer, struct k_itimer, it.real.timer);
345 spin_lock_irqsave(&timr->it_lock, flags); 345 spin_lock_irqsave(&timr->it_lock, flags);
@@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer)
356 if (timr->it.real.interval.tv64 != 0) { 356 if (timr->it.real.interval.tv64 != 0) {
357 timr->it_overrun += 357 timr->it_overrun +=
358 hrtimer_forward(timer, 358 hrtimer_forward(timer,
359 timer->base->softirq_time, 359 hrtimer_cb_get_time(timer),
360 timr->it.real.interval); 360 timr->it.real.interval);
361 ret = HRTIMER_RESTART; 361 ret = HRTIMER_RESTART;
362 ++timr->it_requeue_pending; 362 ++timr->it_requeue_pending;
@@ -722,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags,
722 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) 722 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
723 return 0; 723 return 0;
724 724
725 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; 725 mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
726 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 726 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
727 timr->it.real.timer.function = posix_timer_fn; 727 timr->it.real.timer.function = posix_timer_fn;
728 728
@@ -734,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags,
734 /* SIGEV_NONE timers are not queued ! See common_timer_get */ 734 /* SIGEV_NONE timers are not queued ! See common_timer_get */
735 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 735 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
736 /* Setup correct expiry time for relative timers */ 736 /* Setup correct expiry time for relative timers */
737 if (mode == HRTIMER_REL) 737 if (mode == HRTIMER_MODE_REL)
738 timer->expires = ktime_add(timer->expires, 738 timer->expires = ktime_add(timer->expires,
739 timer->base->get_time()); 739 timer->base->get_time());
740 return 0; 740 return 0;
@@ -950,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags,
950 struct timespec *tsave, struct timespec __user *rmtp) 950 struct timespec *tsave, struct timespec __user *rmtp)
951{ 951{
952 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 952 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
953 HRTIMER_ABS : HRTIMER_REL, which_clock); 953 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
954 which_clock);
954} 955}
955 956
956asmlinkage long 957asmlinkage long
diff --git a/kernel/printk.c b/kernel/printk.c
index 0c151877ff71..4b47e59248df 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -54,7 +54,7 @@ int console_printk[4] = {
54}; 54};
55 55
56/* 56/*
57 * Low lever drivers may need that to know if they can schedule in 57 * Low level drivers may need that to know if they can schedule in
58 * their unblank() callback or not. So let's export it. 58 * their unblank() callback or not. So let's export it.
59 */ 59 */
60int oops_in_progress; 60int oops_in_progress;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 4ab17da46fd8..180978cb2f75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
625 /* Setup the timer, when timeout != NULL */ 625 /* Setup the timer, when timeout != NULL */
626 if (unlikely(timeout)) 626 if (unlikely(timeout))
627 hrtimer_start(&timeout->timer, timeout->timer.expires, 627 hrtimer_start(&timeout->timer, timeout->timer.expires,
628 HRTIMER_ABS); 628 HRTIMER_MODE_ABS);
629 629
630 for (;;) { 630 for (;;) {
631 /* Try to acquire the lock: */ 631 /* Try to acquire the lock: */
diff --git a/kernel/sched.c b/kernel/sched.c
index 08f86178aa34..0dc757246d89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1853,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1853 struct mm_struct *mm = next->mm; 1853 struct mm_struct *mm = next->mm;
1854 struct mm_struct *oldmm = prev->active_mm; 1854 struct mm_struct *oldmm = prev->active_mm;
1855 1855
1856 /*
1857 * For paravirt, this is coupled with an exit in switch_to to
1858 * combine the page table reload and the switch backend into
1859 * one hypercall.
1860 */
1861 arch_enter_lazy_cpu_mode();
1862
1856 if (!mm) { 1863 if (!mm) {
1857 next->active_mm = oldmm; 1864 next->active_mm = oldmm;
1858 atomic_inc(&oldmm->mm_count); 1865 atomic_inc(&oldmm->mm_count);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8072e568bbe0..e2a7d4bf7d57 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
456int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 456int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
457{ 457{
458 int signr = __dequeue_signal(&tsk->pending, mask, info); 458 int signr = __dequeue_signal(&tsk->pending, mask, info);
459 if (!signr) 459 if (!signr) {
460 signr = __dequeue_signal(&tsk->signal->shared_pending, 460 signr = __dequeue_signal(&tsk->signal->shared_pending,
461 mask, info); 461 mask, info);
462 /*
463 * itimer signal ?
464 *
465 * itimers are process shared and we restart periodic
466 * itimers in the signal delivery path to prevent DoS
467 * attacks in the high resolution timer case. This is
468 * compliant with the old way of self restarting
469 * itimers, as the SIGALRM is a legacy signal and only
470 * queued once. Changing the restart behaviour to
471 * restart the timer in the signal dequeue path is
 472 * reducing the timer noise on heavily loaded !highres

473 * systems too.
474 */
475 if (unlikely(signr == SIGALRM)) {
476 struct hrtimer *tmr = &tsk->signal->real_timer;
477
478 if (!hrtimer_is_queued(tmr) &&
479 tsk->signal->it_real_incr.tv64 != 0) {
480 hrtimer_forward(tmr, tmr->base->get_time(),
481 tsk->signal->it_real_incr);
482 hrtimer_restart(tmr);
483 }
484 }
485 }
462 recalc_sigpending_tsk(tsk); 486 recalc_sigpending_tsk(tsk);
463 if (signr && unlikely(sig_kernel_stop(signr))) { 487 if (signr && unlikely(sig_kernel_stop(signr))) {
464 /* 488 /*
465 * Set a marker that we have dequeued a stop signal. Our 489 * Set a marker that we have dequeued a stop signal. Our
466 * caller might release the siglock and then the pending 490 * caller might release the siglock and then the pending
467 * stop signal it is about to process is no longer in the 491 * stop signal it is about to process is no longer in the
468 * pending bitmasks, but must still be cleared by a SIGCONT 492 * pending bitmasks, but must still be cleared by a SIGCONT
469 * (and overruled by a SIGKILL). So those cases clear this 493 * (and overruled by a SIGKILL). So those cases clear this
470 * shared flag after we've set it. Note that this flag may 494 * shared flag after we've set it. Note that this flag may
471 * remain set after the signal we return is ignored or 495 * remain set after the signal we return is ignored or
472 * handled. That doesn't matter because its only purpose 496 * handled. That doesn't matter because its only purpose
473 * is to alert stop-signal processing code when another 497 * is to alert stop-signal processing code when another
474 * processor has come along and cleared the flag. 498 * processor has come along and cleared the flag.
475 */ 499 */
476 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 500 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
477 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 501 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
478 } 502 }
479 if ( signr && 503 if ( signr &&
480 ((info->si_code & __SI_MASK) == __SI_TIMER) && 504 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
481 info->si_sys_private){ 505 info->si_sys_private){
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 918e52df090e..8b75008e2bd8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -17,6 +17,7 @@
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/tick.h>
20 21
21#include <asm/irq.h> 22#include <asm/irq.h>
22/* 23/*
@@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq);
273 274
274#endif 275#endif
275 276
277/*
278 * Enter an interrupt context.
279 */
280void irq_enter(void)
281{
282 __irq_enter();
283#ifdef CONFIG_NO_HZ
284 if (idle_cpu(smp_processor_id()))
285 tick_nohz_update_jiffies();
286#endif
287}
288
276#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 289#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
277# define invoke_softirq() __do_softirq() 290# define invoke_softirq() __do_softirq()
278#else 291#else
@@ -289,6 +302,12 @@ void irq_exit(void)
289 sub_preempt_count(IRQ_EXIT_OFFSET); 302 sub_preempt_count(IRQ_EXIT_OFFSET);
290 if (!in_interrupt() && local_softirq_pending()) 303 if (!in_interrupt() && local_softirq_pending())
291 invoke_softirq(); 304 invoke_softirq();
305
306#ifdef CONFIG_NO_HZ
307 /* Make sure that timer wheel updates are propagated */
308 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
309 tick_nohz_stop_sched_tick();
310#endif
292 preempt_enable_no_resched(); 311 preempt_enable_no_resched();
293} 312}
294 313
diff --git a/kernel/time.c b/kernel/time.c
index 0e017bff4c19..c6c80ea5d0ea 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec)
470 return tv; 470 return tv;
471} 471}
472 472
473/*
474 * Convert jiffies to milliseconds and back.
475 *
476 * Avoid unnecessary multiplications/divisions in the
477 * two most common HZ cases:
478 */
479unsigned int jiffies_to_msecs(const unsigned long j)
480{
481#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
482 return (MSEC_PER_SEC / HZ) * j;
483#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
484 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
485#else
486 return (j * MSEC_PER_SEC) / HZ;
487#endif
488}
489EXPORT_SYMBOL(jiffies_to_msecs);
490
491unsigned int jiffies_to_usecs(const unsigned long j)
492{
493#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
494 return (USEC_PER_SEC / HZ) * j;
495#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
496 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
497#else
498 return (j * USEC_PER_SEC) / HZ;
499#endif
500}
501EXPORT_SYMBOL(jiffies_to_usecs);
502
503/*
 504 * When we convert to jiffies we interpret incoming values
505 * the following way:
506 *
507 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
508 *
509 * - 'too large' values [that would result in larger than
510 * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
511 *
512 * - all other values are converted to jiffies by either multiplying
513 * the input value by a factor or dividing it with a factor
514 *
515 * We must also be careful about 32-bit overflows.
516 */
517unsigned long msecs_to_jiffies(const unsigned int m)
518{
519 /*
520 * Negative value, means infinite timeout:
521 */
522 if ((int)m < 0)
523 return MAX_JIFFY_OFFSET;
524
525#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
526 /*
527 * HZ is equal to or smaller than 1000, and 1000 is a nice
528 * round multiple of HZ, divide with the factor between them,
529 * but round upwards:
530 */
531 return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
532#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
533 /*
534 * HZ is larger than 1000, and HZ is a nice round multiple of
535 * 1000 - simply multiply with the factor between them.
536 *
537 * But first make sure the multiplication result cannot
538 * overflow:
539 */
540 if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
541 return MAX_JIFFY_OFFSET;
542
543 return m * (HZ / MSEC_PER_SEC);
544#else
545 /*
546 * Generic case - multiply, round and divide. But first
547 * check that if we are doing a net multiplication, that
 548 * we wouldn't overflow:
549 */
550 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
551 return MAX_JIFFY_OFFSET;
552
553 return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC;
554#endif
555}
556EXPORT_SYMBOL(msecs_to_jiffies);
557
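As a worked example of the rounding rules above, assume HZ = 250, so MSEC_PER_SEC / HZ = 4: jiffies_to_msecs(1) = 4, msecs_to_jiffies(3) = (3 + 4 - 1) / 4 = 1, i.e. a sub-tick timeout is rounded up to one jiffy rather than truncated to zero, and msecs_to_jiffies(-5) takes the negative-value branch and returns MAX_JIFFY_OFFSET, an effectively infinite timeout.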
558unsigned long usecs_to_jiffies(const unsigned int u)
559{
560 if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
561 return MAX_JIFFY_OFFSET;
562#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
563 return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
564#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
565 return u * (HZ / USEC_PER_SEC);
566#else
567 return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC;
568#endif
569}
570EXPORT_SYMBOL(usecs_to_jiffies);
571
572/*
573 * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
574 * that a remainder subtract here would not do the right thing as the
 575 * resolution values don't fall on second boundaries. I.e. the line:
576 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
577 *
578 * Rather, we just shift the bits off the right.
579 *
580 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
581 * value to a scaled second value.
582 */
583unsigned long
584timespec_to_jiffies(const struct timespec *value)
585{
586 unsigned long sec = value->tv_sec;
587 long nsec = value->tv_nsec + TICK_NSEC - 1;
588
589 if (sec >= MAX_SEC_IN_JIFFIES){
590 sec = MAX_SEC_IN_JIFFIES;
591 nsec = 0;
592 }
593 return (((u64)sec * SEC_CONVERSION) +
594 (((u64)nsec * NSEC_CONVERSION) >>
595 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
596
597}
598EXPORT_SYMBOL(timespec_to_jiffies);
599
600void
601jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
602{
603 /*
604 * Convert jiffies to nanoseconds and separate with
605 * one divide.
606 */
607 u64 nsec = (u64)jiffies * TICK_NSEC;
608 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec);
609}
610EXPORT_SYMBOL(jiffies_to_timespec);
611
612/* Same for "timeval"
613 *
614 * Well, almost. The problem here is that the real system resolution is
 615 * in nanoseconds and the value being converted is in microseconds.
 616 * Also for some machines (those that use HZ = 1024, in particular),
617 * there is a LARGE error in the tick size in microseconds.
618
619 * The solution we use is to do the rounding AFTER we convert the
620 * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
621 * Instruction wise, this should cost only an additional add with carry
622 * instruction above the way it was done above.
623 */
624unsigned long
625timeval_to_jiffies(const struct timeval *value)
626{
627 unsigned long sec = value->tv_sec;
628 long usec = value->tv_usec;
629
630 if (sec >= MAX_SEC_IN_JIFFIES){
631 sec = MAX_SEC_IN_JIFFIES;
632 usec = 0;
633 }
634 return (((u64)sec * SEC_CONVERSION) +
635 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
636 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
637}
638
639void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
640{
641 /*
642 * Convert jiffies to nanoseconds and separate with
643 * one divide.
644 */
645 u64 nsec = (u64)jiffies * TICK_NSEC;
646 long tv_usec;
647
648 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec);
649 tv_usec /= NSEC_PER_USEC;
650 value->tv_usec = tv_usec;
651}
652
653/*
654 * Convert jiffies/jiffies_64 to clock_t and back.
655 */
656clock_t jiffies_to_clock_t(long x)
657{
658#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
659 return x / (HZ / USER_HZ);
660#else
661 u64 tmp = (u64)x * TICK_NSEC;
662 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
663 return (long)tmp;
664#endif
665}
666EXPORT_SYMBOL(jiffies_to_clock_t);
667
668unsigned long clock_t_to_jiffies(unsigned long x)
669{
670#if (HZ % USER_HZ)==0
671 if (x >= ~0UL / (HZ / USER_HZ))
672 return ~0UL;
673 return x * (HZ / USER_HZ);
674#else
675 u64 jif;
676
677 /* Don't worry about loss of precision here .. */
678 if (x >= ~0UL / HZ * USER_HZ)
679 return ~0UL;
680
681 /* .. but do try to contain it here */
682 jif = x * (u64) HZ;
683 do_div(jif, USER_HZ);
684 return jif;
685#endif
686}
687EXPORT_SYMBOL(clock_t_to_jiffies);
688
689u64 jiffies_64_to_clock_t(u64 x)
690{
691#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
692 do_div(x, HZ / USER_HZ);
693#else
694 /*
695 * There are better ways that don't overflow early,
696 * but even this doesn't overflow in hundreds of years
697 * in 64 bits, so..
698 */
699 x *= TICK_NSEC;
700 do_div(x, (NSEC_PER_SEC / USER_HZ));
701#endif
702 return x;
703}
704
705EXPORT_SYMBOL(jiffies_64_to_clock_t);
706
707u64 nsec_to_clock_t(u64 x)
708{
709#if (NSEC_PER_SEC % USER_HZ) == 0
710 do_div(x, (NSEC_PER_SEC / USER_HZ));
711#elif (USER_HZ % 512) == 0
712 x *= USER_HZ/512;
713 do_div(x, (NSEC_PER_SEC / 512));
714#else
715 /*
716 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
717 * overflow after 64.99 years.
718 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
719 */
720 x *= 9;
721 do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
722 USER_HZ));
723#endif
724 return x;
725}
726
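A quick check on the fallback branch of nsec_to_clock_t() above: the exact conversion is x * USER_HZ / NSEC_PER_SEC, and the code computes x * 9 divided by the rounded constant (9 * NSEC_PER_SEC + USER_HZ/2) / USER_HZ. For USER_HZ = 60 that divisor is exactly 150000000, so x * 9 / 150000000 equals x * 60 / 10^9 with no error at all, matching the "exact for HZ=60, 72, 90, ..." note; for other rates the only inaccuracy is the rounding of that divisor, which is where the quoted 5.7e-8 relative-error bound comes from.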
473#if (BITS_PER_LONG < 64) 727#if (BITS_PER_LONG < 64)
474u64 get_jiffies_64(void) 728u64 get_jiffies_64(void)
475{ 729{
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
new file mode 100644
index 000000000000..f66351126544
--- /dev/null
+++ b/kernel/time/Kconfig
@@ -0,0 +1,25 @@
1#
2# Timer subsystem related configuration options
3#
4config TICK_ONESHOT
5 bool
6 default n
7
8config NO_HZ
9 bool "Tickless System (Dynamic Ticks)"
10 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
11 select TICK_ONESHOT
12 help
13 This option enables a tickless system: timer interrupts will
14 only trigger on an as-needed basis both when the system is
15 busy and when the system is idle.
16
17config HIGH_RES_TIMERS
18 bool "High Resolution Timer Support"
19 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
20 select TICK_ONESHOT
21 help
22 This option enables high resolution timer support. If your
23 hardware is not capable then this option only increases
24 the size of the kernel image.
25
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 61a3907d16fb..93bccba1f265 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1,8 @@
1obj-y += ntp.o clocksource.o jiffies.o 1obj-y += ntp.o clocksource.o jiffies.o timer_list.o
2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
7obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
8obj-$(CONFIG_TIMER_STATS) += timer_stats.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 000000000000..67932ea78c17
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,345 @@
1/*
2 * linux/kernel/time/clockevents.c
3 *
4 * This file contains functions which manage clock event devices.
5 *
6 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
7 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
8 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
9 *
10 * This code is licenced under the GPL version 2. For details see
11 * kernel-base/COPYING.
12 */
13
14#include <linux/clockchips.h>
15#include <linux/hrtimer.h>
16#include <linux/init.h>
17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/smp.h>
20#include <linux/sysdev.h>
21
22/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices);
24static LIST_HEAD(clockevents_released);
25
26/* Notification for clock events */
27static RAW_NOTIFIER_HEAD(clockevents_chain);
28
29/* Protection for the above */
30static DEFINE_SPINLOCK(clockevents_lock);
31
32/**
 33 * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds
34 * @latch: value to convert
35 * @evt: pointer to clock event device descriptor
36 *
37 * Math helper, returns latch value converted to nanoseconds (bound checked)
38 */
39unsigned long clockevent_delta2ns(unsigned long latch,
40 struct clock_event_device *evt)
41{
42 u64 clc = ((u64) latch << evt->shift);
43
44 do_div(clc, evt->mult);
45 if (clc < 1000)
46 clc = 1000;
47 if (clc > LONG_MAX)
48 clc = LONG_MAX;
49
50 return (unsigned long) clc;
51}
52
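The conversion above is ns = (latch << shift) / mult, with the result clamped to a sane range. The following userspace sketch repeats that arithmetic outside the kernel; the mult/shift pair is invented for illustration (chosen to resemble a 1.19 MHz PIT-style timer with shift = 32) and does not come from this patch.

    /* Userspace illustration of the latch -> ns conversion above; the
     * mult/shift values are an assumed example, not data from this patch. */
    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    static unsigned long delta2ns(unsigned long latch, uint32_t mult, uint32_t shift)
    {
        uint64_t clc = (uint64_t)latch << shift;

        clc /= mult;                    /* userspace stand-in for do_div() */
        if (clc < 1000)                 /* same lower bound as the code above */
            clc = 1000;
        if (clc > LONG_MAX)             /* and the same upper bound */
            clc = LONG_MAX;
        return (unsigned long)clc;
    }

    int main(void)
    {
        uint32_t mult = 5124677, shift = 32;    /* assumed example values */

        printf("0x10000 ticks -> %lu ns\n", delta2ns(0x10000, mult, shift));
        return 0;
    }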
53/**
54 * clockevents_set_mode - set the operating mode of a clock event device
55 * @dev: device to modify
56 * @mode: new mode
57 *
58 * Must be called with interrupts disabled !
59 */
60void clockevents_set_mode(struct clock_event_device *dev,
61 enum clock_event_mode mode)
62{
63 if (dev->mode != mode) {
64 dev->set_mode(mode, dev);
65 dev->mode = mode;
66 }
67}
68
69/**
70 * clockevents_program_event - Reprogram the clock event device.
71 * @expires: absolute expiry time (monotonic clock)
72 *
73 * Returns 0 on success, -ETIME when the event is in the past.
74 */
75int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
76 ktime_t now)
77{
78 unsigned long long clc;
79 int64_t delta;
80
81 delta = ktime_to_ns(ktime_sub(expires, now));
82
83 if (delta <= 0)
84 return -ETIME;
85
86 dev->next_event = expires;
87
88 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
89 return 0;
90
91 if (delta > dev->max_delta_ns)
92 delta = dev->max_delta_ns;
93 if (delta < dev->min_delta_ns)
94 delta = dev->min_delta_ns;
95
96 clc = delta * dev->mult;
97 clc >>= dev->shift;
98
99 return dev->set_next_event((unsigned long) clc, dev);
100}
101
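clockevents_program_event() goes the other way: the requested delta is clamped to the device limits and converted to device ticks via (delta * mult) >> shift. A hedged userspace sketch of just that clamping and conversion, with made-up device parameters:

    /* Userspace sketch of the ns -> device-ticks path above; the device
     * limits and mult/shift are assumed example numbers only. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t delta_ns  = 3000000000LL;     /* request: 3 s into the future */
        int64_t max_delta = 54925000;         /* assumed max delta (~54.9 ms) */
        int64_t min_delta = 12571;            /* assumed min programmable delta */
        uint32_t mult = 5124677, shift = 32;  /* assumed conversion factors */

        if (delta_ns > max_delta)             /* same clamping as the code above */
            delta_ns = max_delta;
        if (delta_ns < min_delta)
            delta_ns = min_delta;

        unsigned long long clc = ((unsigned long long)delta_ns * mult) >> shift;
        printf("programming %lld ns -> %llu device ticks\n",
               (long long)delta_ns, clc);
        return 0;
    }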
102/**
103 * clockevents_register_notifier - register a clock events change listener
104 */
105int clockevents_register_notifier(struct notifier_block *nb)
106{
107 int ret;
108
109 spin_lock(&clockevents_lock);
110 ret = raw_notifier_chain_register(&clockevents_chain, nb);
111 spin_unlock(&clockevents_lock);
112
113 return ret;
114}
115
116/**
117 * clockevents_unregister_notifier - unregister a clock events change listener
118 */
119void clockevents_unregister_notifier(struct notifier_block *nb)
120{
121 spin_lock(&clockevents_lock);
122 raw_notifier_chain_unregister(&clockevents_chain, nb);
123 spin_unlock(&clockevents_lock);
124}
125
126/*
127 * Notify about a clock event change. Called with clockevents_lock
128 * held.
129 */
130static void clockevents_do_notify(unsigned long reason, void *dev)
131{
132 raw_notifier_call_chain(&clockevents_chain, reason, dev);
133}
134
135/*
 136 * Called after a notify add to make devices available which were
137 * released from the notifier call.
138 */
139static void clockevents_notify_released(void)
140{
141 struct clock_event_device *dev;
142
143 while (!list_empty(&clockevents_released)) {
144 dev = list_entry(clockevents_released.next,
145 struct clock_event_device, list);
146 list_del(&dev->list);
147 list_add(&dev->list, &clockevent_devices);
148 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
149 }
150}
151
152/**
153 * clockevents_register_device - register a clock event device
154 * @dev: device to register
155 */
156void clockevents_register_device(struct clock_event_device *dev)
157{
158 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
159
160 spin_lock(&clockevents_lock);
161
162 list_add(&dev->list, &clockevent_devices);
163 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
164 clockevents_notify_released();
165
166 spin_unlock(&clockevents_lock);
167}
168
169/*
170 * Noop handler when we shut down an event device
171 */
172static void clockevents_handle_noop(struct clock_event_device *dev)
173{
174}
175
176/**
177 * clockevents_exchange_device - release and request clock devices
178 * @old: device to release (can be NULL)
179 * @new: device to request (can be NULL)
180 *
181 * Called from the notifier chain. clockevents_lock is held already
182 */
183void clockevents_exchange_device(struct clock_event_device *old,
184 struct clock_event_device *new)
185{
186 unsigned long flags;
187
188 local_irq_save(flags);
189 /*
190 * Caller releases a clock event device. We queue it into the
191 * released list and do a notify add later.
192 */
193 if (old) {
194 old->event_handler = clockevents_handle_noop;
195 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
196 list_del(&old->list);
197 list_add(&old->list, &clockevents_released);
198 }
199
200 if (new) {
201 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
202 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
203 }
204 local_irq_restore(flags);
205}
206
207/**
 208 * clockevents_request_device - request the best matching clock event device
209 */
210struct clock_event_device *clockevents_request_device(unsigned int features,
211 cpumask_t cpumask)
212{
213 struct clock_event_device *cur, *dev = NULL;
214 struct list_head *tmp;
215
216 spin_lock(&clockevents_lock);
217
218 list_for_each(tmp, &clockevent_devices) {
219 cur = list_entry(tmp, struct clock_event_device, list);
220
221 if ((cur->features & features) == features &&
222 cpus_equal(cpumask, cur->cpumask)) {
223 if (!dev || dev->rating < cur->rating)
224 dev = cur;
225 }
226 }
227
228 clockevents_exchange_device(NULL, dev);
229
230 spin_unlock(&clockevents_lock);
231
232 return dev;
233}
234
235/**
 237 * clockevents_release_device - release a requested clock event device
237 */
238void clockevents_release_device(struct clock_event_device *dev)
239{
240 spin_lock(&clockevents_lock);
241
242 clockevents_exchange_device(dev, NULL);
243 clockevents_notify_released();
244
245 spin_unlock(&clockevents_lock);
246}
247
248/**
249 * clockevents_notify - notification about relevant events
250 */
251void clockevents_notify(unsigned long reason, void *arg)
252{
253 spin_lock(&clockevents_lock);
254 clockevents_do_notify(reason, arg);
255
256 switch (reason) {
257 case CLOCK_EVT_NOTIFY_CPU_DEAD:
258 /*
259 * Unregister the clock event devices which were
260 * released from the users in the notify chain.
261 */
262 while (!list_empty(&clockevents_released)) {
263 struct clock_event_device *dev;
264
265 dev = list_entry(clockevents_released.next,
266 struct clock_event_device, list);
267 list_del(&dev->list);
268 }
269 break;
270 default:
271 break;
272 }
273 spin_unlock(&clockevents_lock);
274}
275EXPORT_SYMBOL_GPL(clockevents_notify);
276
277#ifdef CONFIG_SYSFS
278
279/**
280 * clockevents_show_registered - sysfs interface for listing clockevents
281 * @dev: unused
282 * @buf: char buffer to be filled with clock events list
283 *
284 * Provides sysfs interface for listing registered clock event devices
285 */
286static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf)
287{
288 struct list_head *tmp;
289 char *p = buf;
290 int cpu;
291
292 spin_lock(&clockevents_lock);
293
294 list_for_each(tmp, &clockevent_devices) {
295 struct clock_event_device *ce;
296
297 ce = list_entry(tmp, struct clock_event_device, list);
298 p += sprintf(p, "%-20s F:%04x M:%d", ce->name,
299 ce->features, ce->mode);
300 p += sprintf(p, " C:");
301 if (!cpus_equal(ce->cpumask, cpu_possible_map)) {
302 for_each_cpu_mask(cpu, ce->cpumask)
303 p += sprintf(p, " %d", cpu);
304 } else {
305 /*
306 * FIXME: Add the cpu which is handling this sucker
307 */
308 }
309 p += sprintf(p, "\n");
310 }
311
312 spin_unlock(&clockevents_lock);
313
314 return p - buf;
315}
316
317/*
318 * Sysfs setup bits:
319 */
320static SYSDEV_ATTR(registered, 0600,
321 clockevents_show_registered, NULL);
322
323static struct sysdev_class clockevents_sysclass = {
324 set_kset_name("clockevents"),
325};
326
327static struct sys_device clockevents_sys_device = {
328 .id = 0,
329 .cls = &clockevents_sysclass,
330};
331
332static int __init clockevents_sysfs_init(void)
333{
334 int error = sysdev_class_register(&clockevents_sysclass);
335
336 if (!error)
337 error = sysdev_register(&clockevents_sys_device);
338 if (!error)
339 error = sysdev_create_file(
340 &clockevents_sys_device,
341 &attr_registered);
342 return error;
343}
344device_initcall(clockevents_sysfs_init);
345#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d9ef176c4e09..193a0793af95 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -29,6 +29,7 @@
29#include <linux/init.h> 29#include <linux/init.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h>
32 33
33/* XXX - Would like a better way for initializing curr_clocksource */ 34/* XXX - Would like a better way for initializing curr_clocksource */
34extern struct clocksource clocksource_jiffies; 35extern struct clocksource clocksource_jiffies;
@@ -48,6 +49,7 @@ extern struct clocksource clocksource_jiffies;
48 */ 49 */
49static struct clocksource *curr_clocksource = &clocksource_jiffies; 50static struct clocksource *curr_clocksource = &clocksource_jiffies;
50static struct clocksource *next_clocksource; 51static struct clocksource *next_clocksource;
52static struct clocksource *clocksource_override;
51static LIST_HEAD(clocksource_list); 53static LIST_HEAD(clocksource_list);
52static DEFINE_SPINLOCK(clocksource_lock); 54static DEFINE_SPINLOCK(clocksource_lock);
53static char override_name[32]; 55static char override_name[32];
@@ -62,9 +64,123 @@ static int __init clocksource_done_booting(void)
62 finished_booting = 1; 64 finished_booting = 1;
63 return 0; 65 return 0;
64} 66}
65
66late_initcall(clocksource_done_booting); 67late_initcall(clocksource_done_booting);
67 68
69#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
70static LIST_HEAD(watchdog_list);
71static struct clocksource *watchdog;
72static struct timer_list watchdog_timer;
73static DEFINE_SPINLOCK(watchdog_lock);
74static cycle_t watchdog_last;
75/*
 76 * Interval: 0.5sec Threshold: 0.0625s
77 */
78#define WATCHDOG_INTERVAL (HZ >> 1)
79#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4)
80
81static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
82{
83 if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD)
84 return;
85
86 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
87 cs->name, delta);
88 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
89 clocksource_change_rating(cs, 0);
90 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
91 list_del(&cs->wd_list);
92}
93
94static void clocksource_watchdog(unsigned long data)
95{
96 struct clocksource *cs, *tmp;
97 cycle_t csnow, wdnow;
98 int64_t wd_nsec, cs_nsec;
99
100 spin_lock(&watchdog_lock);
101
102 wdnow = watchdog->read();
103 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
104 watchdog_last = wdnow;
105
106 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
107 csnow = cs->read();
108 /* Initialized ? */
109 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
110 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
111 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
112 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
113 /*
114 * We just marked the clocksource as
115 * highres-capable, notify the rest of the
116 * system as well so that we transition
117 * into high-res mode:
118 */
119 tick_clock_notify();
120 }
121 cs->flags |= CLOCK_SOURCE_WATCHDOG;
122 cs->wd_last = csnow;
123 } else {
124 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
125 cs->wd_last = csnow;
126 /* Check the delta. Might remove from the list ! */
127 clocksource_ratewd(cs, cs_nsec - wd_nsec);
128 }
129 }
130
131 if (!list_empty(&watchdog_list)) {
132 __mod_timer(&watchdog_timer,
133 watchdog_timer.expires + WATCHDOG_INTERVAL);
134 }
135 spin_unlock(&watchdog_lock);
136}
137static void clocksource_check_watchdog(struct clocksource *cs)
138{
139 struct clocksource *cse;
140 unsigned long flags;
141
142 spin_lock_irqsave(&watchdog_lock, flags);
143 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
144 int started = !list_empty(&watchdog_list);
145
146 list_add(&cs->wd_list, &watchdog_list);
147 if (!started && watchdog) {
148 watchdog_last = watchdog->read();
149 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
150 add_timer(&watchdog_timer);
151 }
152 } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) {
153 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
154
155 if (!watchdog || cs->rating > watchdog->rating) {
156 if (watchdog)
157 del_timer(&watchdog_timer);
158 watchdog = cs;
159 init_timer(&watchdog_timer);
160 watchdog_timer.function = clocksource_watchdog;
161
162 /* Reset watchdog cycles */
163 list_for_each_entry(cse, &watchdog_list, wd_list)
164 cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
165 /* Start if list is not empty */
166 if (!list_empty(&watchdog_list)) {
167 watchdog_last = watchdog->read();
168 watchdog_timer.expires =
169 jiffies + WATCHDOG_INTERVAL;
170 add_timer(&watchdog_timer);
171 }
172 }
173 }
174 spin_unlock_irqrestore(&watchdog_lock, flags);
175}
176#else
177static void clocksource_check_watchdog(struct clocksource *cs)
178{
179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
180 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
181}
182#endif
183
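The watchdog above measures the same elapsed interval with both clocksources and downgrades the watched one when the two measurements disagree by more than the threshold (1/16 of a second). A standalone sketch of that comparison, using invented interval readings:

    /* Userspace illustration of the watchdog comparison above; the two
     * interval measurements are invented numbers, not real readings. */
    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC       1000000000LL
    #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)   /* 62.5 ms, same value as above */

    int main(void)
    {
        int64_t wd_nsec = 500000000;   /* watchdog saw 0.5 s elapse */
        int64_t cs_nsec = 430000000;   /* watched clocksource saw 0.43 s (assumed) */
        int64_t delta   = cs_nsec - wd_nsec;

        if (delta <= -WATCHDOG_THRESHOLD || delta >= WATCHDOG_THRESHOLD)
            printf("clocksource unstable (delta = %lld ns), would be downgraded\n",
                   (long long)delta);
        else
            printf("clocksource agrees with watchdog (delta = %lld ns)\n",
                   (long long)delta);
        return 0;
    }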
68/** 184/**
69 * clocksource_get_next - Returns the selected clocksource 185 * clocksource_get_next - Returns the selected clocksource
70 * 186 *
@@ -84,60 +200,54 @@ struct clocksource *clocksource_get_next(void)
84} 200}
85 201
86/** 202/**
87 * select_clocksource - Finds the best registered clocksource. 203 * select_clocksource - Selects the best registered clocksource.
88 * 204 *
89 * Private function. Must hold clocksource_lock when called. 205 * Private function. Must hold clocksource_lock when called.
90 * 206 *
91 * Looks through the list of registered clocksources, returning 207 * Select the clocksource with the best rating, or the clocksource,
92 * the one with the highest rating value. If there is a clocksource 208 * which is selected by userspace override.
93 * name that matches the override string, it returns that clocksource.
94 */ 209 */
95static struct clocksource *select_clocksource(void) 210static struct clocksource *select_clocksource(void)
96{ 211{
97 struct clocksource *best = NULL; 212 struct clocksource *next;
98 struct list_head *tmp;
99 213
100 list_for_each(tmp, &clocksource_list) { 214 if (list_empty(&clocksource_list))
101 struct clocksource *src; 215 return NULL;
102 216
103 src = list_entry(tmp, struct clocksource, list); 217 if (clocksource_override)
104 if (!best) 218 next = clocksource_override;
105 best = src; 219 else
106 220 next = list_entry(clocksource_list.next, struct clocksource,
107 /* check for override: */ 221 list);
108 if (strlen(src->name) == strlen(override_name) && 222
109 !strcmp(src->name, override_name)) { 223 if (next == curr_clocksource)
110 best = src; 224 return NULL;
111 break;
112 }
113 /* pick the highest rating: */
114 if (src->rating > best->rating)
115 best = src;
116 }
117 225
118 return best; 226 return next;
119} 227}
120 228
121/** 229/*
122 * is_registered_source - Checks if clocksource is registered 230 * Enqueue the clocksource sorted by rating
123 * @c: pointer to a clocksource
124 *
125 * Private helper function. Must hold clocksource_lock when called.
126 *
127 * Returns one if the clocksource is already registered, zero otherwise.
128 */ 231 */
129static int is_registered_source(struct clocksource *c) 232static int clocksource_enqueue(struct clocksource *c)
130{ 233{
131 int len = strlen(c->name); 234 struct list_head *tmp, *entry = &clocksource_list;
132 struct list_head *tmp;
133 235
134 list_for_each(tmp, &clocksource_list) { 236 list_for_each(tmp, &clocksource_list) {
135 struct clocksource *src; 237 struct clocksource *cs;
136 238
137 src = list_entry(tmp, struct clocksource, list); 239 cs = list_entry(tmp, struct clocksource, list);
138 if (strlen(src->name) == len && !strcmp(src->name, c->name)) 240 if (cs == c)
139 return 1; 241 return -EBUSY;
242 /* Keep track of the place, where to insert */
243 if (cs->rating >= c->rating)
244 entry = tmp;
140 } 245 }
246 list_add(&c->list, entry);
247
248 if (strlen(c->name) == strlen(override_name) &&
249 !strcmp(c->name, override_name))
250 clocksource_override = c;
141 251
142 return 0; 252 return 0;
143} 253}
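clocksource_enqueue() keeps the list sorted by descending rating: the new entry is inserted after the last element whose rating is greater than or equal to its own. A standalone sketch of the same insertion rule, using a plain array instead of the kernel's list_head (names and ratings are examples only):

    /* Userspace sketch of the rating-sorted enqueue above; the clocksource
     * names and ratings are invented examples. */
    #include <stdio.h>
    #include <string.h>

    struct cs { const char *name; int rating; };

    int main(void)
    {
        struct cs list[8] = { {"tsc", 300}, {"acpi_pm", 200}, {"jiffies", 1} };
        int n = 3;
        struct cs new = { "hpet", 250 };   /* assumed new registration */

        /* find the last entry whose rating is >= the new one, insert after it */
        int pos = 0;
        for (int i = 0; i < n; i++)
            if (list[i].rating >= new.rating)
                pos = i + 1;

        memmove(&list[pos + 1], &list[pos], (n - pos) * sizeof(list[0]));
        list[pos] = new;
        n++;

        for (int i = 0; i < n; i++)
            printf("%-10s %d\n", list[i].name, list[i].rating);
        return 0;
    }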
@@ -150,42 +260,35 @@ static int is_registered_source(struct clocksource *c)
150 */ 260 */
151int clocksource_register(struct clocksource *c) 261int clocksource_register(struct clocksource *c)
152{ 262{
153 int ret = 0;
154 unsigned long flags; 263 unsigned long flags;
264 int ret;
155 265
156 spin_lock_irqsave(&clocksource_lock, flags); 266 spin_lock_irqsave(&clocksource_lock, flags);
157 /* check if clocksource is already registered */ 267 ret = clocksource_enqueue(c);
158 if (is_registered_source(c)) { 268 if (!ret)
159 printk("register_clocksource: Cannot register %s. "
160 "Already registered!", c->name);
161 ret = -EBUSY;
162 } else {
163 /* register it */
164 list_add(&c->list, &clocksource_list);
165 /* scan the registered clocksources, and pick the best one */
166 next_clocksource = select_clocksource(); 269 next_clocksource = select_clocksource();
167 }
168 spin_unlock_irqrestore(&clocksource_lock, flags); 270 spin_unlock_irqrestore(&clocksource_lock, flags);
271 if (!ret)
272 clocksource_check_watchdog(c);
169 return ret; 273 return ret;
170} 274}
171EXPORT_SYMBOL(clocksource_register); 275EXPORT_SYMBOL(clocksource_register);
172 276
173/** 277/**
174 * clocksource_reselect - Rescan list for next clocksource 278 * clocksource_change_rating - Change the rating of a registered clocksource
175 * 279 *
176 * A quick helper function to be used if a clocksource changes its
177 * rating. Forces the clocksource list to be re-scanned for the best
178 * clocksource.
179 */ 280 */
180void clocksource_reselect(void) 281void clocksource_change_rating(struct clocksource *cs, int rating)
181{ 282{
182 unsigned long flags; 283 unsigned long flags;
183 284
184 spin_lock_irqsave(&clocksource_lock, flags); 285 spin_lock_irqsave(&clocksource_lock, flags);
286 list_del(&cs->list);
287 cs->rating = rating;
288 clocksource_enqueue(cs);
185 next_clocksource = select_clocksource(); 289 next_clocksource = select_clocksource();
186 spin_unlock_irqrestore(&clocksource_lock, flags); 290 spin_unlock_irqrestore(&clocksource_lock, flags);
187} 291}
188EXPORT_SYMBOL(clocksource_reselect);
189 292
190#ifdef CONFIG_SYSFS 293#ifdef CONFIG_SYSFS
191/** 294/**
@@ -221,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
221static ssize_t sysfs_override_clocksource(struct sys_device *dev, 324static ssize_t sysfs_override_clocksource(struct sys_device *dev,
222 const char *buf, size_t count) 325 const char *buf, size_t count)
223{ 326{
327 struct clocksource *ovr = NULL;
328 struct list_head *tmp;
224 size_t ret = count; 329 size_t ret = count;
330 int len;
331
225 /* strings from sysfs write are not 0 terminated! */ 332 /* strings from sysfs write are not 0 terminated! */
226 if (count >= sizeof(override_name)) 333 if (count >= sizeof(override_name))
227 return -EINVAL; 334 return -EINVAL;
@@ -229,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 229 /* strip off \n: */ 336 /* strip off \n: */
230 if (buf[count-1] == '\n') 337 if (buf[count-1] == '\n')
231 count--; 338 count--;
232 if (count < 1)
233 return -EINVAL;
234 339
235 spin_lock_irq(&clocksource_lock); 340 spin_lock_irq(&clocksource_lock);
236 341
237 /* copy the name given: */ 342 if (count > 0)
238 memcpy(override_name, buf, count); 343 memcpy(override_name, buf, count);
239 override_name[count] = 0; 344 override_name[count] = 0;
240 345
241 /* try to select it: */ 346 len = strlen(override_name);
242 next_clocksource = select_clocksource(); 347 if (len) {
348 ovr = clocksource_override;
349 /* try to select it: */
350 list_for_each(tmp, &clocksource_list) {
351 struct clocksource *cs;
352
353 cs = list_entry(tmp, struct clocksource, list);
354 if (strlen(cs->name) == len &&
355 !strcmp(cs->name, override_name))
356 ovr = cs;
357 }
358 }
359
360 /* Reselect, when the override name has changed */
361 if (ovr != clocksource_override) {
362 clocksource_override = ovr;
363 next_clocksource = select_clocksource();
364 }
243 365
244 spin_unlock_irq(&clocksource_lock); 366 spin_unlock_irq(&clocksource_lock);
245 367
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a99b2a6e6a07..3be8da8fed7e 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = {
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66}; 65};
67 66
68static int __init init_jiffies_clocksource(void) 67static int __init init_jiffies_clocksource(void)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 3afeaa3a73f9..eb12509e00bd 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base;
24 24
25#define MAX_TICKADJ 500 /* microsecs */ 25#define MAX_TICKADJ 500 /* microsecs */
26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
27 TICK_LENGTH_SHIFT) / HZ) 27 TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ)
28 28
29/* 29/*
30 * phase-lock loop variables 30 * phase-lock loop variables
@@ -46,13 +46,17 @@ long time_adjust;
46 46
47static void ntp_update_frequency(void) 47static void ntp_update_frequency(void)
48{ 48{
49 tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; 49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
50 tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; 50 << TICK_LENGTH_SHIFT;
51 tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 51 second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
52 53
53 do_div(tick_length_base, HZ); 54 tick_length_base = second_length;
54 55
55 tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; 56 do_div(second_length, HZ);
57 tick_nsec = second_length >> TICK_LENGTH_SHIFT;
58
59 do_div(tick_length_base, NTP_INTERVAL_FREQ);
56} 60}
57 61
58/** 62/**
@@ -162,7 +166,7 @@ void second_overflow(void)
162 tick_length -= MAX_TICKADJ_SCALED; 166 tick_length -= MAX_TICKADJ_SCALED;
163 } else { 167 } else {
164 tick_length += (s64)(time_adjust * NSEC_PER_USEC / 168 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
165 HZ) << TICK_LENGTH_SHIFT; 169 NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT;
166 time_adjust = 0; 170 time_adjust = 0;
167 } 171 }
168 } 172 }
@@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc)
239 result = -EINVAL; 243 result = -EINVAL;
240 goto leave; 244 goto leave;
241 } 245 }
242 time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); 246 time_freq = ((s64)txc->freq * NSEC_PER_USEC)
247 >> (SHIFT_USEC - SHIFT_NSEC);
243 } 248 }
244 249
245 if (txc->modes & ADJ_MAXERROR) { 250 if (txc->modes & ADJ_MAXERROR) {
@@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc)
309 freq_adj += time_freq; 314 freq_adj += time_freq;
310 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); 315 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
311 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); 316 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
312 time_offset = (time_offset / HZ) << SHIFT_UPDATE; 317 time_offset = (time_offset / NTP_INTERVAL_FREQ)
318 << SHIFT_UPDATE;
313 } /* STA_PLL */ 319 } /* STA_PLL */
314 } /* txc->modes & ADJ_OFFSET */ 320 } /* txc->modes & ADJ_OFFSET */
315 if (txc->modes & ADJ_TICK) 321 if (txc->modes & ADJ_TICK)
@@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
324 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 330 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
325 txc->offset = save_adjust; 331 txc->offset = save_adjust;
326 else 332 else
327 txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; 333 txc->offset = shift_right(time_offset, SHIFT_UPDATE)
328 txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); 334 * NTP_INTERVAL_FREQ / 1000;
335 txc->freq = (time_freq / NSEC_PER_USEC)
336 << (SHIFT_USEC - SHIFT_NSEC);
329 txc->maxerror = time_maxerror; 337 txc->maxerror = time_maxerror;
330 txc->esterror = time_esterror; 338 txc->esterror = time_esterror;
331 txc->status = time_status; 339 txc->status = time_status;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 000000000000..12b3efeb9f6f
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,480 @@
1/*
2 * linux/kernel/time/tick-broadcast.c
3 *
4 * This file contains functions which emulate a local clock-event
5 * device via a broadcast event source.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Broadcast support for broken x86 hardware, where the local apic
27 * timer stops in C3 state.
28 */
29
30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock);
33
34/*
35 * Debugging: see timer_list.c
36 */
37struct tick_device *tick_get_broadcast_device(void)
38{
39 return &tick_broadcast_device;
40}
41
42cpumask_t *tick_get_broadcast_mask(void)
43{
44 return &tick_broadcast_mask;
45}
46
47/*
48 * Start the device in periodic mode
49 */
50static void tick_broadcast_start_periodic(struct clock_event_device *bc)
51{
52 if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN)
53 tick_setup_periodic(bc, 1);
54}
55
56/*
57 * Check, if the device can be utilized as broadcast device:
58 */
59int tick_check_broadcast_device(struct clock_event_device *dev)
60{
61 if (tick_broadcast_device.evtdev ||
62 (dev->features & CLOCK_EVT_FEAT_C3STOP))
63 return 0;
64
65 clockevents_exchange_device(NULL, dev);
66 tick_broadcast_device.evtdev = dev;
67 if (!cpus_empty(tick_broadcast_mask))
68 tick_broadcast_start_periodic(dev);
69 return 1;
70}
71
72/*
73 * Check, if the device is the broadcast device
74 */
75int tick_is_broadcast_device(struct clock_event_device *dev)
76{
77 return (dev && tick_broadcast_device.evtdev == dev);
78}
79
80/*
 81 * Check, if the device is dysfunctional and a placeholder, which
82 * needs to be handled by the broadcast device.
83 */
84int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
85{
86 unsigned long flags;
87 int ret = 0;
88
89 spin_lock_irqsave(&tick_broadcast_lock, flags);
90
91 /*
92 * Devices might be registered with both periodic and oneshot
93 * mode disabled. This signals, that the device needs to be
94 * operated from the broadcast device and is a placeholder for
95 * the cpu local device.
96 */
97 if (!tick_device_is_functional(dev)) {
98 dev->event_handler = tick_handle_periodic;
99 cpu_set(cpu, tick_broadcast_mask);
100 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
101 ret = 1;
102 }
103
104 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
105 return ret;
106}
107
108/*
109 * Broadcast the event to the cpus, which are set in the mask
110 */
111int tick_do_broadcast(cpumask_t mask)
112{
113 int ret = 0, cpu = smp_processor_id();
114 struct tick_device *td;
115
116 /*
117 * Check, if the current cpu is in the mask
118 */
119 if (cpu_isset(cpu, mask)) {
120 cpu_clear(cpu, mask);
121 td = &per_cpu(tick_cpu_device, cpu);
122 td->evtdev->event_handler(td->evtdev);
123 ret = 1;
124 }
125
126 if (!cpus_empty(mask)) {
127 /*
128 * It might be necessary to actually check whether the devices
129 * have different broadcast functions. For now, just use the
130 * one of the first device. This works as long as we have this
131 * misfeature only on x86 (lapic)
132 */
133 cpu = first_cpu(mask);
134 td = &per_cpu(tick_cpu_device, cpu);
135 td->evtdev->broadcast(mask);
136 ret = 1;
137 }
138 return ret;
139}
140
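tick_do_broadcast() services the current CPU directly and hands the rest of the mask to one broadcast call. A userspace sketch of that mask handling (CPU numbers are invented; the bit-scan uses a GCC/Clang builtin):

    /* Userspace sketch of the mask handling above: the local CPU is
     * serviced directly, the remaining CPUs via one broadcast call.
     * The mask and CPU numbers are invented for illustration. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long mask = (1UL << 0) | (1UL << 2) | (1UL << 3); /* cpus 0,2,3 */
        int this_cpu = 2;                                          /* assumed */

        if (mask & (1UL << this_cpu)) {
            mask &= ~(1UL << this_cpu);
            printf("cpu %d: run local event handler directly\n", this_cpu);
        }

        if (mask) {
            /* the kernel uses the broadcast() method of the first remaining cpu */
            int first = __builtin_ctzl(mask);     /* GCC/Clang builtin */
            printf("cpu %d sends broadcast IPI for mask 0x%lx\n", first, mask);
        }
        return 0;
    }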
141/*
142 * Periodic broadcast:
143 * - invoke the broadcast handlers
144 */
145static void tick_do_periodic_broadcast(void)
146{
147 cpumask_t mask;
148
149 spin_lock(&tick_broadcast_lock);
150
151 cpus_and(mask, cpu_online_map, tick_broadcast_mask);
152 tick_do_broadcast(mask);
153
154 spin_unlock(&tick_broadcast_lock);
155}
156
157/*
158 * Event handler for periodic broadcast ticks
159 */
160static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
161{
162 dev->next_event.tv64 = KTIME_MAX;
163
164 tick_do_periodic_broadcast();
165
166 /*
167 * The device is in periodic mode. No reprogramming necessary:
168 */
169 if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
170 return;
171
172 /*
173 * Setup the next period for devices, which do not have
174 * periodic mode:
175 */
176 for (;;) {
177 ktime_t next = ktime_add(dev->next_event, tick_period);
178
179 if (!clockevents_program_event(dev, next, ktime_get()))
180 return;
181 tick_do_periodic_broadcast();
182 }
183}
184
185/*
186 * Powerstate information: The system enters/leaves a state, where
187 * affected devices might stop
188 */
189static void tick_do_broadcast_on_off(void *why)
190{
191 struct clock_event_device *bc, *dev;
192 struct tick_device *td;
193 unsigned long flags, *reason = why;
194 int cpu;
195
196 spin_lock_irqsave(&tick_broadcast_lock, flags);
197
198 cpu = smp_processor_id();
199 td = &per_cpu(tick_cpu_device, cpu);
200 dev = td->evtdev;
201 bc = tick_broadcast_device.evtdev;
202
203 /*
204 * Is the device in broadcast mode forever or is it not
205 * affected by the powerstate ?
206 */
207 if (!dev || !tick_device_is_functional(dev) ||
208 !(dev->features & CLOCK_EVT_FEAT_C3STOP))
209 goto out;
210
211 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) {
212 if (!cpu_isset(cpu, tick_broadcast_mask)) {
213 cpu_set(cpu, tick_broadcast_mask);
214 if (td->mode == TICKDEV_MODE_PERIODIC)
215 clockevents_set_mode(dev,
216 CLOCK_EVT_MODE_SHUTDOWN);
217 }
218 } else {
219 if (cpu_isset(cpu, tick_broadcast_mask)) {
220 cpu_clear(cpu, tick_broadcast_mask);
221 if (td->mode == TICKDEV_MODE_PERIODIC)
222 tick_setup_periodic(dev, 0);
223 }
224 }
225
226 if (cpus_empty(tick_broadcast_mask))
227 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
228 else {
229 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
230 tick_broadcast_start_periodic(bc);
231 else
232 tick_broadcast_setup_oneshot(bc);
233 }
234out:
235 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
236}
237
238/*
239 * Powerstate information: The system enters/leaves a state, where
240 * affected devices might stop.
241 */
242void tick_broadcast_on_off(unsigned long reason, int *oncpu)
243{
244 int cpu = get_cpu();
245
246 if (cpu == *oncpu)
247 tick_do_broadcast_on_off(&reason);
248 else
249 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
250 &reason, 1, 1);
251 put_cpu();
252}
253
254/*
255 * Set the periodic handler depending on broadcast on/off
256 */
257void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
258{
259 if (!broadcast)
260 dev->event_handler = tick_handle_periodic;
261 else
262 dev->event_handler = tick_handle_periodic_broadcast;
263}
264
265/*
266 * Remove a CPU from broadcasting
267 */
268void tick_shutdown_broadcast(unsigned int *cpup)
269{
270 struct clock_event_device *bc;
271 unsigned long flags;
272 unsigned int cpu = *cpup;
273
274 spin_lock_irqsave(&tick_broadcast_lock, flags);
275
276 bc = tick_broadcast_device.evtdev;
277 cpu_clear(cpu, tick_broadcast_mask);
278
279 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
280 if (bc && cpus_empty(tick_broadcast_mask))
281 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
282 }
283
284 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
285}
286
287#ifdef CONFIG_TICK_ONESHOT
288
289static cpumask_t tick_broadcast_oneshot_mask;
290
291/*
292 * Debugging: see timer_list.c
293 */
294cpumask_t *tick_get_broadcast_oneshot_mask(void)
295{
296 return &tick_broadcast_oneshot_mask;
297}
298
299static int tick_broadcast_set_event(ktime_t expires, int force)
300{
301 struct clock_event_device *bc = tick_broadcast_device.evtdev;
302 ktime_t now = ktime_get();
303 int res;
304
305 for(;;) {
306 res = clockevents_program_event(bc, expires, now);
307 if (!res || !force)
308 return res;
309 now = ktime_get();
310 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
311 }
312}
313
314/*
315 * Reprogram the broadcast device:
316 *
317 * Called with tick_broadcast_lock held and interrupts disabled.
318 */
319static int tick_broadcast_reprogram(void)
320{
321 ktime_t expires = { .tv64 = KTIME_MAX };
322 struct tick_device *td;
323 int cpu;
324
325 /*
326 * Find the event which expires next:
327 */
328 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
329 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
330 td = &per_cpu(tick_cpu_device, cpu);
331 if (td->evtdev->next_event.tv64 < expires.tv64)
332 expires = td->evtdev->next_event;
333 }
334
335 if (expires.tv64 == KTIME_MAX)
336 return 0;
337
338 return tick_broadcast_set_event(expires, 0);
339}
340
341/*
342 * Handle oneshot mode broadcasting
343 */
344static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
345{
346 struct tick_device *td;
347 cpumask_t mask;
348 ktime_t now;
349 int cpu;
350
351 spin_lock(&tick_broadcast_lock);
352again:
353 dev->next_event.tv64 = KTIME_MAX;
354 mask = CPU_MASK_NONE;
355 now = ktime_get();
356 /* Find all expired events */
357 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
358 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
359 td = &per_cpu(tick_cpu_device, cpu);
360 if (td->evtdev->next_event.tv64 <= now.tv64)
361 cpu_set(cpu, mask);
362 }
363
364 /*
365 * Wakeup the cpus which have an expired event. The broadcast
366 * device is reprogrammed in the return from idle code.
367 */
368 if (!tick_do_broadcast(mask)) {
369 /*
370 * The global event did not expire any CPU local
371 * events. This happens in dyntick mode, as the
372 * maximum PIT delta is quite small.
373 */
374 if (tick_broadcast_reprogram())
375 goto again;
376 }
377 spin_unlock(&tick_broadcast_lock);
378}
379
380/*
381 * Powerstate information: The system enters/leaves a state, where
382 * affected devices might stop
383 */
384void tick_broadcast_oneshot_control(unsigned long reason)
385{
386 struct clock_event_device *bc, *dev;
387 struct tick_device *td;
388 unsigned long flags;
389 int cpu;
390
391 spin_lock_irqsave(&tick_broadcast_lock, flags);
392
393 /*
394 * Periodic mode does not care about the enter/exit of power
395 * states
396 */
397 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
398 goto out;
399
400 bc = tick_broadcast_device.evtdev;
401 cpu = smp_processor_id();
402 td = &per_cpu(tick_cpu_device, cpu);
403 dev = td->evtdev;
404
405 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
406 goto out;
407
408 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
409 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
410 cpu_set(cpu, tick_broadcast_oneshot_mask);
411 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
412 if (dev->next_event.tv64 < bc->next_event.tv64)
413 tick_broadcast_set_event(dev->next_event, 1);
414 }
415 } else {
416 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
417 cpu_clear(cpu, tick_broadcast_oneshot_mask);
418 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
419 if (dev->next_event.tv64 != KTIME_MAX)
420 tick_program_event(dev->next_event, 1);
421 }
422 }
423
424out:
425 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
426}
427
428/**
429 * tick_broadcast_setup_highres - setup the broadcast device for highres
430 */
431void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
432{
433 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) {
434 bc->event_handler = tick_handle_oneshot_broadcast;
435 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
436 bc->next_event.tv64 = KTIME_MAX;
437 }
438}
439
440/*
441 * Select oneshot operating mode for the broadcast device
442 */
443void tick_broadcast_switch_to_oneshot(void)
444{
445 struct clock_event_device *bc;
446 unsigned long flags;
447
448 spin_lock_irqsave(&tick_broadcast_lock, flags);
449
450 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
451 bc = tick_broadcast_device.evtdev;
452 if (bc)
453 tick_broadcast_setup_oneshot(bc);
454 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
455}
456
457
458/*
459 * Remove a dead CPU from broadcasting
460 */
461void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
462{
463 struct clock_event_device *bc;
464 unsigned long flags;
465 unsigned int cpu = *cpup;
466
467 spin_lock_irqsave(&tick_broadcast_lock, flags);
468
469 bc = tick_broadcast_device.evtdev;
470 cpu_clear(cpu, tick_broadcast_oneshot_mask);
471
472 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
473 if (bc && cpus_empty(tick_broadcast_oneshot_mask))
474 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
475 }
476
477 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
478}
479
480#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
new file mode 100644
index 000000000000..4500e347f1bb
--- /dev/null
+++ b/kernel/time/tick-common.c
@@ -0,0 +1,346 @@
1/*
2 * linux/kernel/time/tick-common.c
3 *
4 * This file contains the base functions to manage periodic tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Tick devices
27 */
28DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
29/*
30 * Tick next event: keeps track of the tick time
31 */
32ktime_t tick_next_period;
33ktime_t tick_period;
34static int tick_do_timer_cpu = -1;
35DEFINE_SPINLOCK(tick_device_lock);
36
37/*
38 * Debugging: see timer_list.c
39 */
40struct tick_device *tick_get_device(int cpu)
41{
42 return &per_cpu(tick_cpu_device, cpu);
43}
44
45/**
46 * tick_is_oneshot_available - check for a oneshot capable event device
47 */
48int tick_is_oneshot_available(void)
49{
50 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
51
52 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
53}
54
55/*
56 * Periodic tick
57 */
58static void tick_periodic(int cpu)
59{
60 if (tick_do_timer_cpu == cpu) {
61 write_seqlock(&xtime_lock);
62
63 /* Keep track of the next tick event */
64 tick_next_period = ktime_add(tick_next_period, tick_period);
65
66 do_timer(1);
67 write_sequnlock(&xtime_lock);
68 }
69
70 update_process_times(user_mode(get_irq_regs()));
71 profile_tick(CPU_PROFILING);
72}
73
74/*
75 * Event handler for periodic ticks
76 */
77void tick_handle_periodic(struct clock_event_device *dev)
78{
79 int cpu = smp_processor_id();
80
81 tick_periodic(cpu);
82
83 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
84 return;
85 /*
86 * Setup the next period for devices, which do not have
87 * periodic mode:
88 */
89 for (;;) {
90 ktime_t next = ktime_add(dev->next_event, tick_period);
91
92 if (!clockevents_program_event(dev, next, ktime_get()))
93 return;
94 tick_periodic(cpu);
95 }
96}
97
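For devices without a true periodic mode, the loop above keeps adding one tick period to the expiry until programming succeeds, calling tick_periodic() for every period that was missed. A standalone sketch of that catch-up logic with invented timestamps:

    /* Userspace sketch of the catch-up loop above; all times are invented. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t period_ns = 4000000;        /* 4 ms tick, i.e. HZ=250 (assumed) */
        int64_t now_ns    = 1000000000;     /* "current" time, assumed */
        int64_t next_ns   = 990000000;      /* last programmed event, in the past */
        int missed = 0;

        for (;;) {
            next_ns += period_ns;
            if (next_ns > now_ns) {         /* programming succeeds: event is ahead */
                printf("programmed next tick at %lld (caught up %d missed ticks)\n",
                       (long long)next_ns, missed);
                break;
            }
            missed++;                       /* corresponds to the extra tick_periodic() */
        }
        return 0;
    }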
98/*
99 * Setup the device for a periodic tick
100 */
101void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
102{
103 tick_set_periodic_handler(dev, broadcast);
104
105 /* Broadcast setup ? */
106 if (!tick_device_is_functional(dev))
107 return;
108
109 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
110 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
111 } else {
112 unsigned long seq;
113 ktime_t next;
114
115 do {
116 seq = read_seqbegin(&xtime_lock);
117 next = tick_next_period;
118 } while (read_seqretry(&xtime_lock, seq));
119
120 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
121
122 for (;;) {
123 if (!clockevents_program_event(dev, next, ktime_get()))
124 return;
125 next = ktime_add(next, tick_period);
126 }
127 }
128}
129
130/*
131 * Setup the tick device
132 */
133static void tick_setup_device(struct tick_device *td,
134 struct clock_event_device *newdev, int cpu,
135 cpumask_t cpumask)
136{
137 ktime_t next_event;
138 void (*handler)(struct clock_event_device *) = NULL;
139
140 /*
141 * First device setup ?
142 */
143 if (!td->evtdev) {
144 /*
145 * If no cpu took the do_timer update, assign it to
146 * this cpu:
147 */
148 if (tick_do_timer_cpu == -1) {
149 tick_do_timer_cpu = cpu;
150 tick_next_period = ktime_get();
151 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
152 }
153
154 /*
155 * Startup in periodic mode first.
156 */
157 td->mode = TICKDEV_MODE_PERIODIC;
158 } else {
159 handler = td->evtdev->event_handler;
160 next_event = td->evtdev->next_event;
161 }
162
163 td->evtdev = newdev;
164
165 /*
166 * When the device is not per cpu, pin the interrupt to the
167 * current cpu:
168 */
169 if (!cpus_equal(newdev->cpumask, cpumask))
170 irq_set_affinity(newdev->irq, cpumask);
171
172 /*
173 * When global broadcasting is active, check if the current
174 * device is registered as a placeholder for broadcast mode.
175 * This allows us to handle this x86 misfeature in a generic
176 * way.
177 */
178 if (tick_device_uses_broadcast(newdev, cpu))
179 return;
180
181 if (td->mode == TICKDEV_MODE_PERIODIC)
182 tick_setup_periodic(newdev, 0);
183 else
184 tick_setup_oneshot(newdev, handler, next_event);
185}
186
187/*
188 * Check, if the new registered device should be used.
189 */
190static int tick_check_new_device(struct clock_event_device *newdev)
191{
192 struct clock_event_device *curdev;
193 struct tick_device *td;
194 int cpu, ret = NOTIFY_OK;
195 unsigned long flags;
196 cpumask_t cpumask;
197
198 spin_lock_irqsave(&tick_device_lock, flags);
199
200 cpu = smp_processor_id();
201 if (!cpu_isset(cpu, newdev->cpumask))
202 goto out;
203
204 td = &per_cpu(tick_cpu_device, cpu);
205 curdev = td->evtdev;
206 cpumask = cpumask_of_cpu(cpu);
207
208 /* cpu local device ? */
209 if (!cpus_equal(newdev->cpumask, cpumask)) {
210
211 /*
212 * If the cpu affinity of the device interrupt can not
213 * be set, ignore it.
214 */
215 if (!irq_can_set_affinity(newdev->irq))
216 goto out_bc;
217
218 /*
219 * If we have a cpu local device already, do not replace it
220 * by a non cpu local device
221 */
222 if (curdev && cpus_equal(curdev->cpumask, cpumask))
223 goto out_bc;
224 }
225
226 /*
227 * If we have an active device, then check the rating and the oneshot
228 * feature.
229 */
230 if (curdev) {
231 /*
232 * Prefer one shot capable devices !
233 */
234 if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
235 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
236 goto out_bc;
237 /*
238 * Check the rating
239 */
240 if (curdev->rating >= newdev->rating)
241 goto out_bc;
242 }
243
244 /*
 245 * Replace the existing device, if any, by the new
246 * device. If the current device is the broadcast device, do
247 * not give it back to the clockevents layer !
248 */
249 if (tick_is_broadcast_device(curdev)) {
250 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
251 curdev = NULL;
252 }
253 clockevents_exchange_device(curdev, newdev);
254 tick_setup_device(td, newdev, cpu, cpumask);
255 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
256 tick_oneshot_notify();
257
258 spin_unlock_irqrestore(&tick_device_lock, flags);
259 return NOTIFY_STOP;
260
261out_bc:
262 /*
263 * Can the new device be used as a broadcast device ?
264 */
265 if (tick_check_broadcast_device(newdev))
266 ret = NOTIFY_STOP;
267out:
268 spin_unlock_irqrestore(&tick_device_lock, flags);
269
270 return ret;
271}
272
273/*
274 * Shutdown an event device on a given cpu:
275 *
 276 * This is called on a live CPU, when a CPU is dead. So we cannot
277 * access the hardware device itself.
278 * We just set the mode and remove it from the lists.
279 */
280static void tick_shutdown(unsigned int *cpup)
281{
282 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
283 struct clock_event_device *dev = td->evtdev;
284 unsigned long flags;
285
286 spin_lock_irqsave(&tick_device_lock, flags);
287 td->mode = TICKDEV_MODE_PERIODIC;
288 if (dev) {
289 /*
 290 * Prevent the clock events layer from calling the
 291 * set mode function!
292 */
293 dev->mode = CLOCK_EVT_MODE_UNUSED;
294 clockevents_exchange_device(dev, NULL);
295 td->evtdev = NULL;
296 }
297 spin_unlock_irqrestore(&tick_device_lock, flags);
298}
299
300/*
301 * Notification about clock event devices
302 */
303static int tick_notify(struct notifier_block *nb, unsigned long reason,
304 void *dev)
305{
306 switch (reason) {
307
308 case CLOCK_EVT_NOTIFY_ADD:
309 return tick_check_new_device(dev);
310
311 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
312 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
313 tick_broadcast_on_off(reason, dev);
314 break;
315
316 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
317 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
318 tick_broadcast_oneshot_control(reason);
319 break;
320
321 case CLOCK_EVT_NOTIFY_CPU_DEAD:
322 tick_shutdown_broadcast_oneshot(dev);
323 tick_shutdown_broadcast(dev);
324 tick_shutdown(dev);
325 break;
326
327 default:
328 break;
329 }
330
331 return NOTIFY_OK;
332}
333
334static struct notifier_block tick_notifier = {
335 .notifier_call = tick_notify,
336};
337
338/**
339 * tick_init - initialize the tick control
340 *
341 * Register the notifier with the clockevents framework
342 */
343void __init tick_init(void)
344{
345 clockevents_register_notifier(&tick_notifier);
346}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
new file mode 100644
index 000000000000..54861a0f29ff
--- /dev/null
+++ b/kernel/time/tick-internal.h
@@ -0,0 +1,110 @@
1/*
2 * tick internal variable and functions used by low/high res code
3 */
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period;
7extern ktime_t tick_period;
8
9extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
10extern void tick_handle_periodic(struct clock_event_device *dev);
11
12/*
13 * NO_HZ / high resolution timer shared code
14 */
15#ifdef CONFIG_TICK_ONESHOT
16extern void tick_setup_oneshot(struct clock_event_device *newdev,
17 void (*handler)(struct clock_event_device *),
18 ktime_t nextevt);
19extern int tick_program_event(ktime_t expires, int force);
20extern void tick_oneshot_notify(void);
21extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
22
23# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
24extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
25extern void tick_broadcast_oneshot_control(unsigned long reason);
26extern void tick_broadcast_switch_to_oneshot(void);
27extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
28# else /* BROADCAST */
29static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
30{
31 BUG();
32}
33static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
34static inline void tick_broadcast_switch_to_oneshot(void) { }
35static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
36# endif /* !BROADCAST */
37
38#else /* !ONESHOT */
39static inline
40void tick_setup_oneshot(struct clock_event_device *newdev,
41 void (*handler)(struct clock_event_device *),
42 ktime_t nextevt)
43{
44 BUG();
45}
46static inline int tick_program_event(ktime_t expires, int force)
47{
48 return 0;
49}
50static inline void tick_oneshot_notify(void) { }
51static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
52{
53 BUG();
54}
55static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
56static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
57#endif /* !TICK_ONESHOT */
58
59/*
60 * Broadcasting support
61 */
62#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
63extern int tick_do_broadcast(cpumask_t mask);
64
65extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
66extern int tick_check_broadcast_device(struct clock_event_device *dev);
67extern int tick_is_broadcast_device(struct clock_event_device *dev);
68extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
69extern void tick_shutdown_broadcast(unsigned int *cpup);
70
71extern void
72tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
73
74#else /* !BROADCAST */
75
76static inline int tick_check_broadcast_device(struct clock_event_device *dev)
77{
78 return 0;
79}
80
81static inline int tick_is_broadcast_device(struct clock_event_device *dev)
82{
83 return 0;
84}
85static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
86 int cpu)
87{
88 return 0;
89}
90static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
91static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
92static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
93
94/*
95 * Set the periodic handler in non broadcast mode
96 */
97static inline void tick_set_periodic_handler(struct clock_event_device *dev,
98 int broadcast)
99{
100 dev->event_handler = tick_handle_periodic;
101}
102#endif /* !BROADCAST */
103
104/*
105 * Check, if the device is functional or a dummy for broadcast
106 */
107static inline int tick_device_is_functional(struct clock_event_device *dev)
108{
109 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
110}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
new file mode 100644
index 000000000000..2e8b7ff863cc
--- /dev/null
+++ b/kernel/time/tick-oneshot.c
@@ -0,0 +1,84 @@
1/*
2 * linux/kernel/time/tick-oneshot.c
3 *
4 * This file contains functions which manage high resolution tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/**
 26 * tick_program_event - program the per-cpu tick device for the next event
27 */
28int tick_program_event(ktime_t expires, int force)
29{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get();
32
33 while (1) {
34 int ret = clockevents_program_event(dev, expires, now);
35
36 if (!ret || !force)
37 return ret;
38 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
40 }
41}
42
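The force path above retries with now + min_delta_ns whenever the requested expiry has already passed. A small userspace sketch of the same retry loop, with a stubbed-out programming function and assumed values:

    /* Userspace sketch of the "force" retry loop above; values are invented
     * and program_event() is a stand-in, not a real kernel call. */
    #include <stdio.h>
    #include <stdint.h>

    static int program_event(int64_t expires, int64_t now)
    {
        return (expires <= now) ? -1 : 0;   /* -ETIME stand-in */
    }

    int main(void)
    {
        int64_t now = 1000000000, expires = 999000000;  /* 1 ms in the past (assumed) */
        int64_t min_delta_ns = 12571;                   /* assumed device minimum */
        int force = 1;

        for (;;) {
            int ret = program_event(expires, now);
            if (!ret || !force) {
                printf("programmed for %lld (ret=%d)\n", (long long)expires, ret);
                break;
            }
            now += 2000;                    /* pretend a little time passes */
            expires = now + min_delta_ns;   /* retry just beyond the minimum */
        }
        return 0;
    }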
43/**
44 * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
45 */
46void tick_setup_oneshot(struct clock_event_device *newdev,
47 void (*handler)(struct clock_event_device *),
48 ktime_t next_event)
49{
50 newdev->event_handler = handler;
51 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
52 clockevents_program_event(newdev, next_event, ktime_get());
53}
54
55/**
56 * tick_switch_to_oneshot - switch to oneshot mode
57 */
58int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
59{
60 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
61 struct clock_event_device *dev = td->evtdev;
62
63 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
64 !tick_device_is_functional(dev))
65 return -EINVAL;
66
67 td->mode = TICKDEV_MODE_ONESHOT;
68 dev->event_handler = handler;
69 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
70 tick_broadcast_switch_to_oneshot();
71 return 0;
72}
73
74#ifdef CONFIG_HIGH_RES_TIMERS
75/**
76 * tick_init_highres - switch to high resolution mode
77 *
78 * Called with interrupts disabled.
79 */
80int tick_init_highres(void)
81{
82 return tick_switch_to_oneshot(hrtimer_interrupt);
83}
84#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 000000000000..512a4a906467
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,565 @@
1/*
2 * linux/kernel/time/tick-sched.c
3 *
4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
7 *
8 * No idle tick implementation for low and high resolution timers
9 *
10 * Started by: Thomas Gleixner and Ingo Molnar
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/interrupt.h>
18#include <linux/kernel_stat.h>
19#include <linux/percpu.h>
20#include <linux/profile.h>
21#include <linux/sched.h>
22#include <linux/tick.h>
23
24#include "tick-internal.h"
25
26/*
27 * Per cpu nohz control structure
28 */
29static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
30
31/*
32 * The time, when the last jiffy update happened. Protected by xtime_lock.
33 */
34static ktime_t last_jiffies_update;
35
36struct tick_sched *tick_get_tick_sched(int cpu)
37{
38 return &per_cpu(tick_cpu_sched, cpu);
39}
40
41/*
42 * Must be called with interrupts disabled !
43 */
44static void tick_do_update_jiffies64(ktime_t now)
45{
46 unsigned long ticks = 0;
47 ktime_t delta;
48
 49 /* Reevaluate with xtime_lock held */
50 write_seqlock(&xtime_lock);
51
52 delta = ktime_sub(now, last_jiffies_update);
53 if (delta.tv64 >= tick_period.tv64) {
54
55 delta = ktime_sub(delta, tick_period);
56 last_jiffies_update = ktime_add(last_jiffies_update,
57 tick_period);
58
59 /* Slow path for long timeouts */
60 if (unlikely(delta.tv64 >= tick_period.tv64)) {
61 s64 incr = ktime_to_ns(tick_period);
62
63 ticks = ktime_divns(delta, incr);
64
65 last_jiffies_update = ktime_add_ns(last_jiffies_update,
66 incr * ticks);
67 }
68 do_timer(++ticks);
69 }
70 write_sequnlock(&xtime_lock);
71}
72
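The slow path above computes how many whole tick periods elapsed since the last jiffies update and advances jiffies by that amount in one go. A userspace sketch of the same arithmetic with assumed numbers:

    /* Userspace sketch of the jiffies catch-up arithmetic above; the tick
     * period and elapsed time are assumed example values. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t period = 4000000;            /* 4 ms tick period (assumed) */
        int64_t delta  = 25500000;           /* 25.5 ms since last update (assumed) */
        uint64_t ticks = 0;

        if (delta >= period) {
            delta -= period;                 /* fast path: one tick */
            if (delta >= period)             /* slow path: long idle sleep */
                ticks = delta / period;
            /* jiffies advances by ticks + 1, mirroring do_timer(++ticks) */
            printf("advance jiffies by %llu\n", (unsigned long long)(ticks + 1));
        }
        return 0;
    }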
73/*
 74 * Initialize and return the jiffies update.
75 */
76static ktime_t tick_init_jiffy_update(void)
77{
78 ktime_t period;
79
80 write_seqlock(&xtime_lock);
81 /* Did we start the jiffies update yet ? */
82 if (last_jiffies_update.tv64 == 0)
83 last_jiffies_update = tick_next_period;
84 period = last_jiffies_update;
85 write_sequnlock(&xtime_lock);
86 return period;
87}
88
89/*
90 * NOHZ - aka dynamic tick functionality
91 */
92#ifdef CONFIG_NO_HZ
93/*
94 * NO HZ enabled ?
95 */
96static int tick_nohz_enabled __read_mostly = 1;
97
98/*
99 * Enable / Disable tickless mode
100 */
101static int __init setup_tick_nohz(char *str)
102{
103 if (!strcmp(str, "off"))
104 tick_nohz_enabled = 0;
105 else if (!strcmp(str, "on"))
106 tick_nohz_enabled = 1;
107 else
108 return 0;
109 return 1;
110}
111
112__setup("nohz=", setup_tick_nohz);
113
114/**
115 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
116 *
117 * Called from interrupt entry when the CPU was idle
118 *
119 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
120 * must be updated. Otherwise an interrupt handler could use a stale jiffy
121 * value. We do this unconditionally on any cpu, as we don't know whether the
122 * cpu, which has the update task assigned is in a long sleep.
123 */
124void tick_nohz_update_jiffies(void)
125{
126 int cpu = smp_processor_id();
127 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
128 unsigned long flags;
129 ktime_t now;
130
131 if (!ts->tick_stopped)
132 return;
133
134 cpu_clear(cpu, nohz_cpu_mask);
135 now = ktime_get();
136
137 local_irq_save(flags);
138 tick_do_update_jiffies64(now);
139 local_irq_restore(flags);
140}
141
142/**
143 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
144 *
 145 * When the next event is more than a tick into the future, stop the idle tick.
146 * Called either from the idle loop or from irq_exit() when an idle period was
147 * just interrupted by an interrupt which did not cause a reschedule.
148 */
149void tick_nohz_stop_sched_tick(void)
150{
151 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
152 struct tick_sched *ts;
153 ktime_t last_update, expires, now, delta;
154 int cpu;
155
156 local_irq_save(flags);
157
158 cpu = smp_processor_id();
159 ts = &per_cpu(tick_cpu_sched, cpu);
160
161 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
162 goto end;
163
164 if (need_resched())
165 goto end;
166
167 cpu = smp_processor_id();
168 if (unlikely(local_softirq_pending()))
169 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
170 local_softirq_pending());
171
172 now = ktime_get();
173 /*
174 * When called from irq_exit we need to account the idle sleep time
175 * correctly.
176 */
177 if (ts->tick_stopped) {
178 delta = ktime_sub(now, ts->idle_entrytime);
179 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
180 }
181
182 ts->idle_entrytime = now;
183 ts->idle_calls++;
184
185 /* Read jiffies and the time when jiffies were updated last */
186 do {
187 seq = read_seqbegin(&xtime_lock);
188 last_update = last_jiffies_update;
189 last_jiffies = jiffies;
190 } while (read_seqretry(&xtime_lock, seq));
191
192 /* Get the next timer wheel timer */
193 next_jiffies = get_next_timer_interrupt(last_jiffies);
194 delta_jiffies = next_jiffies - last_jiffies;
195
196 if (rcu_needs_cpu(cpu))
197 delta_jiffies = 1;
198 /*
 199 * Do not stop the tick if we are only one jiffy off
 200 * or if the cpu is required for RCU
201 */
202 if (!ts->tick_stopped && delta_jiffies == 1)
203 goto out;
204
 205 /* Schedule the tick, if we are at least one jiffy off */
206 if ((long)delta_jiffies >= 1) {
207
208 if (delta_jiffies > 1)
209 cpu_set(cpu, nohz_cpu_mask);
210 /*
 211 * tick_nohz_stop_sched_tick can be called several times before
 212 * tick_nohz_restart_sched_tick is called. This happens when
213 * interrupts arrive which do not cause a reschedule. In the
214 * first call we save the current tick time, so we can restart
215 * the scheduler tick in nohz_restart_sched_tick.
216 */
217 if (!ts->tick_stopped) {
218 ts->idle_tick = ts->sched_timer.expires;
219 ts->tick_stopped = 1;
220 ts->idle_jiffies = last_jiffies;
221 }
222 /*
223 * calculate the expiry time for the next timer wheel
224 * timer
225 */
226 expires = ktime_add_ns(last_update, tick_period.tv64 *
227 delta_jiffies);
228 ts->idle_expires = expires;
229 ts->idle_sleeps++;
230
231 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
232 hrtimer_start(&ts->sched_timer, expires,
233 HRTIMER_MODE_ABS);
234 /* Check, if the timer was already in the past */
235 if (hrtimer_active(&ts->sched_timer))
236 goto out;
 237 } else if (!tick_program_event(expires, 0))
238 goto out;
239 /*
240 * We are past the event already. So we crossed a
 241 * jiffy boundary. Update jiffies and raise the
242 * softirq.
243 */
244 tick_do_update_jiffies64(ktime_get());
245 cpu_clear(cpu, nohz_cpu_mask);
246 }
247 raise_softirq_irqoff(TIMER_SOFTIRQ);
248out:
249 ts->next_jiffies = next_jiffies;
250 ts->last_jiffies = last_jiffies;
251end:
252 local_irq_restore(flags);
253}
254
255/**
 256 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
257 *
258 * Restart the idle tick when the CPU is woken up from idle
259 */
260void tick_nohz_restart_sched_tick(void)
261{
262 int cpu = smp_processor_id();
263 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
264 unsigned long ticks;
265 ktime_t now, delta;
266
267 if (!ts->tick_stopped)
268 return;
269
270 /* Update jiffies first */
271 now = ktime_get();
272
273 local_irq_disable();
274 tick_do_update_jiffies64(now);
275 cpu_clear(cpu, nohz_cpu_mask);
276
277 /* Account the idle time */
278 delta = ktime_sub(now, ts->idle_entrytime);
279 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
280
281 /*
 282 * We stopped the tick in idle. update_process_times() would miss the
 283 * time we slept, as it only does a single tick of accounting.
 284 * Make sure that this time is accounted to idle!
285 */
286 ticks = jiffies - ts->idle_jiffies;
287 /*
288 * We might be one off. Do not randomly account a huge number of ticks!
289 */
290 if (ticks && ticks < LONG_MAX) {
291 add_preempt_count(HARDIRQ_OFFSET);
292 account_system_time(current, HARDIRQ_OFFSET,
293 jiffies_to_cputime(ticks));
294 sub_preempt_count(HARDIRQ_OFFSET);
295 }
296
297 /*
298 * Cancel the scheduled timer and restore the tick
299 */
300 ts->tick_stopped = 0;
301 hrtimer_cancel(&ts->sched_timer);
302 ts->sched_timer.expires = ts->idle_tick;
303
304 while (1) {
305 /* Forward the time to expire in the future */
306 hrtimer_forward(&ts->sched_timer, now, tick_period);
307
308 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
309 hrtimer_start(&ts->sched_timer,
310 ts->sched_timer.expires,
311 HRTIMER_MODE_ABS);
312 /* Check, if the timer was already in the past */
313 if (hrtimer_active(&ts->sched_timer))
314 break;
315 } else {
316 if (!tick_program_event(ts->sched_timer.expires, 0))
317 break;
318 }
319 /* Update jiffies and reread time */
320 tick_do_update_jiffies64(now);
321 now = ktime_get();
322 }
323 local_irq_enable();
324}
325
326static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
327{
328 hrtimer_forward(&ts->sched_timer, now, tick_period);
329 return tick_program_event(ts->sched_timer.expires, 0);
330}
331
332/*
333 * The nohz low res interrupt handler
334 */
335static void tick_nohz_handler(struct clock_event_device *dev)
336{
337 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
338 struct pt_regs *regs = get_irq_regs();
339 ktime_t now = ktime_get();
340
341 dev->next_event.tv64 = KTIME_MAX;
342
343 /* Check, if the jiffies need an update */
344 tick_do_update_jiffies64(now);
345
346 /*
347 * When we are idle and the tick is stopped, we have to touch
348 * the watchdog as we might not schedule for a really long
349 * time. This happens on complete idle SMP systems while
350 * waiting on the login prompt. We also increment the "start
351 * of idle" jiffy stamp so the idle accounting adjustment we
 352 * do when we go busy again does not account too many ticks.
353 */
354 if (ts->tick_stopped) {
355 touch_softlockup_watchdog();
356 ts->idle_jiffies++;
357 }
358
359 update_process_times(user_mode(regs));
360 profile_tick(CPU_PROFILING);
361
362 /* Do not restart, when we are in the idle loop */
363 if (ts->tick_stopped)
364 return;
365
366 while (tick_nohz_reprogram(ts, now)) {
367 now = ktime_get();
368 tick_do_update_jiffies64(now);
369 }
370}
371
372/**
373 * tick_nohz_switch_to_nohz - switch to nohz mode
374 */
375static void tick_nohz_switch_to_nohz(void)
376{
377 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
378 ktime_t next;
379
380 if (!tick_nohz_enabled)
381 return;
382
383 local_irq_disable();
384 if (tick_switch_to_oneshot(tick_nohz_handler)) {
385 local_irq_enable();
386 return;
387 }
388
389 ts->nohz_mode = NOHZ_MODE_LOWRES;
390
391 /*
392 * Recycle the hrtimer in ts, so we can share the
393 * hrtimer_forward with the highres code.
394 */
395 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
396 /* Get the next period */
397 next = tick_init_jiffy_update();
398
399 for (;;) {
400 ts->sched_timer.expires = next;
401 if (!tick_program_event(next, 0))
402 break;
403 next = ktime_add(next, tick_period);
404 }
405 local_irq_enable();
406
 407 printk(KERN_INFO "Switched to NOHZ mode on CPU #%d\n",
408 smp_processor_id());
409}
410
411#else
412
413static inline void tick_nohz_switch_to_nohz(void) { }
414
415#endif /* NO_HZ */
416
417/*
418 * High resolution timer specific code
419 */
420#ifdef CONFIG_HIGH_RES_TIMERS
421/*
422 * We rearm the timer until we get disabled by the idle code
423 * Called with interrupts disabled and timer->base->cpu_base->lock held.
424 */
425static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
426{
427 struct tick_sched *ts =
428 container_of(timer, struct tick_sched, sched_timer);
429 struct hrtimer_cpu_base *base = timer->base->cpu_base;
430 struct pt_regs *regs = get_irq_regs();
431 ktime_t now = ktime_get();
432
433 /* Check, if the jiffies need an update */
434 tick_do_update_jiffies64(now);
435
436 /*
437 * Do not call, when we are not in irq context and have
438 * no valid regs pointer
439 */
440 if (regs) {
441 /*
442 * When we are idle and the tick is stopped, we have to touch
443 * the watchdog as we might not schedule for a really long
444 * time. This happens on complete idle SMP systems while
445 * waiting on the login prompt. We also increment the "start of
446 * idle" jiffy stamp so the idle accounting adjustment we do
 447 * when we go busy again does not account too many ticks.
448 */
449 if (ts->tick_stopped) {
450 touch_softlockup_watchdog();
451 ts->idle_jiffies++;
452 }
453 /*
454 * update_process_times() might take tasklist_lock, hence
455 * drop the base lock. sched-tick hrtimers are per-CPU and
456 * never accessible by userspace APIs, so this is safe to do.
457 */
458 spin_unlock(&base->lock);
459 update_process_times(user_mode(regs));
460 profile_tick(CPU_PROFILING);
461 spin_lock(&base->lock);
462 }
463
464 /* Do not restart, when we are in the idle loop */
465 if (ts->tick_stopped)
466 return HRTIMER_NORESTART;
467
468 hrtimer_forward(timer, now, tick_period);
469
470 return HRTIMER_RESTART;
471}
472
473/**
474 * tick_setup_sched_timer - setup the tick emulation timer
475 */
476void tick_setup_sched_timer(void)
477{
478 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
479 ktime_t now = ktime_get();
480
481 /*
482 * Emulate tick processing via per-CPU hrtimers:
483 */
484 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
485 ts->sched_timer.function = tick_sched_timer;
486 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
487
488 /* Get the next period */
489 ts->sched_timer.expires = tick_init_jiffy_update();
490
491 for (;;) {
492 hrtimer_forward(&ts->sched_timer, now, tick_period);
493 hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
494 HRTIMER_MODE_ABS);
495 /* Check, if the timer was already in the past */
496 if (hrtimer_active(&ts->sched_timer))
497 break;
498 now = ktime_get();
499 }
500
501#ifdef CONFIG_NO_HZ
502 if (tick_nohz_enabled)
503 ts->nohz_mode = NOHZ_MODE_HIGHRES;
504#endif
505}
506
507void tick_cancel_sched_timer(int cpu)
508{
509 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
510
511 if (ts->sched_timer.base)
512 hrtimer_cancel(&ts->sched_timer);
513 ts->tick_stopped = 0;
514 ts->nohz_mode = NOHZ_MODE_INACTIVE;
515}
516#endif /* HIGH_RES_TIMERS */
517
518/**
519 * Async notification about clocksource changes
520 */
521void tick_clock_notify(void)
522{
523 int cpu;
524
525 for_each_possible_cpu(cpu)
526 set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
527}
528
529/*
530 * Async notification about clock event changes
531 */
532void tick_oneshot_notify(void)
533{
534 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
535
536 set_bit(0, &ts->check_clocks);
537}
538
539/**
 540 * Check whether a change happened which makes oneshot mode possible.
 541 *
 542 * Called cyclically from the hrtimer softirq (driven by the timer
 543 * softirq). allow_nohz signals that we can switch into low-res nohz
 544 * mode, because high resolution timers are disabled (either at
 545 * compile time or at runtime).
546 */
547int tick_check_oneshot_change(int allow_nohz)
548{
549 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
550
551 if (!test_and_clear_bit(0, &ts->check_clocks))
552 return 0;
553
554 if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
555 return 0;
556
557 if (!timekeeping_is_continuous() || !tick_is_oneshot_available())
558 return 0;
559
560 if (!allow_nohz)
561 return 1;
562
563 tick_nohz_switch_to_nohz();
564 return 0;
565}
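
The two public entry points above are meant to be driven from the architecture idle loop: tick_nohz_stop_sched_tick() runs before the CPU halts, tick_nohz_restart_sched_tick() runs when the CPU goes busy again, and tick_nohz_update_jiffies() is called on interrupt entry while the tick is stopped. A minimal sketch of such an idle loop follows; it is illustrative only and not part of this patch, and arch_safe_halt() is a placeholder for the architecture's real "sleep until the next interrupt" primitive.

#include <linux/sched.h>
#include <linux/tick.h>

/* Hypothetical arch helper: halt the CPU with interrupts enabled. */
extern void arch_safe_halt(void);

void cpu_idle(void)
{
	for (;;) {
		/* May switch off the periodic tick while we stay idle */
		tick_nohz_stop_sched_tick();

		while (!need_resched())
			arch_safe_halt();

		/* Account the time we slept and rearm the tick */
		tick_nohz_restart_sched_tick();

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
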
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
new file mode 100644
index 000000000000..f82c635c3d5c
--- /dev/null
+++ b/kernel/time/timer_list.c
@@ -0,0 +1,287 @@
1/*
2 * kernel/time/timer_list.c
3 *
4 * List pending timers
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/module.h>
15#include <linux/spinlock.h>
16#include <linux/sched.h>
17#include <linux/seq_file.h>
18#include <linux/kallsyms.h>
19#include <linux/tick.h>
20
21#include <asm/uaccess.h>
22
23typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
24
25DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26
27/*
28 * This allows printing both to /proc/timer_list and
29 * to the console (on SysRq-Q):
30 */
31#define SEQ_printf(m, x...) \
32 do { \
33 if (m) \
34 seq_printf(m, x); \
35 else \
36 printk(x); \
37 } while (0)
38
39static void print_name_offset(struct seq_file *m, void *sym)
40{
41 unsigned long addr = (unsigned long)sym;
42 char namebuf[KSYM_NAME_LEN+1];
43 unsigned long size, offset;
44 const char *sym_name;
45 char *modname;
46
47 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
48 if (sym_name)
49 SEQ_printf(m, "%s", sym_name);
50 else
51 SEQ_printf(m, "<%p>", sym);
52}
53
54static void
55print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
56{
57#ifdef CONFIG_TIMER_STATS
58 char tmp[TASK_COMM_LEN + 1];
59#endif
60 SEQ_printf(m, " #%d: ", idx);
61 print_name_offset(m, timer);
62 SEQ_printf(m, ", ");
63 print_name_offset(m, timer->function);
64 SEQ_printf(m, ", S:%02lx", timer->state);
65#ifdef CONFIG_TIMER_STATS
66 SEQ_printf(m, ", ");
67 print_name_offset(m, timer->start_site);
68 memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
69 tmp[TASK_COMM_LEN] = 0;
70 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
71#endif
72 SEQ_printf(m, "\n");
73 SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n",
74 (unsigned long long)ktime_to_ns(timer->expires),
75 (unsigned long long)(ktime_to_ns(timer->expires) - now));
76}
77
78static void
79print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
80 u64 now)
81{
82 struct hrtimer *timer, tmp;
83 unsigned long next = 0, i;
84 struct rb_node *curr;
85 unsigned long flags;
86
87next_one:
88 i = 0;
89 spin_lock_irqsave(&base->cpu_base->lock, flags);
90
91 curr = base->first;
92 /*
93 * Crude but we have to do this O(N*N) thing, because
94 * we have to unlock the base when printing:
95 */
96 while (curr && i < next) {
97 curr = rb_next(curr);
98 i++;
99 }
100
101 if (curr) {
102
103 timer = rb_entry(curr, struct hrtimer, node);
104 tmp = *timer;
105 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
106
107 print_timer(m, &tmp, i, now);
108 next++;
109 goto next_one;
110 }
111 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
112}
113
114static void
115print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
116{
117 SEQ_printf(m, " .index: %d\n",
118 base->index);
119 SEQ_printf(m, " .resolution: %Ld nsecs\n",
120 (unsigned long long)ktime_to_ns(base->resolution));
121 SEQ_printf(m, " .get_time: ");
122 print_name_offset(m, base->get_time);
123 SEQ_printf(m, "\n");
124#ifdef CONFIG_HIGH_RES_TIMERS
125 SEQ_printf(m, " .offset: %Ld nsecs\n",
126 ktime_to_ns(base->offset));
127#endif
128 SEQ_printf(m, "active timers:\n");
129 print_active_timers(m, base, now);
130}
131
132static void print_cpu(struct seq_file *m, int cpu, u64 now)
133{
134 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
135 int i;
136
137 SEQ_printf(m, "\ncpu: %d\n", cpu);
138 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
139 SEQ_printf(m, " clock %d:\n", i);
140 print_base(m, cpu_base->clock_base + i, now);
141 }
142#define P(x) \
143 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x))
144#define P_ns(x) \
145 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
146 (u64)(ktime_to_ns(cpu_base->x)))
147
148#ifdef CONFIG_HIGH_RES_TIMERS
149 P_ns(expires_next);
150 P(hres_active);
151 P(nr_events);
152#endif
153#undef P
154#undef P_ns
155
156#ifdef CONFIG_TICK_ONESHOT
157# define P(x) \
158 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x))
159# define P_ns(x) \
160 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
161 (u64)(ktime_to_ns(ts->x)))
162 {
163 struct tick_sched *ts = tick_get_tick_sched(cpu);
164 P(nohz_mode);
165 P_ns(idle_tick);
166 P(tick_stopped);
167 P(idle_jiffies);
168 P(idle_calls);
169 P(idle_sleeps);
170 P_ns(idle_entrytime);
171 P_ns(idle_sleeptime);
172 P(last_jiffies);
173 P(next_jiffies);
174 P_ns(idle_expires);
175 SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies);
176 }
177#endif
178
179#undef P
180#undef P_ns
181}
182
183#ifdef CONFIG_GENERIC_CLOCKEVENTS
184static void
185print_tickdevice(struct seq_file *m, struct tick_device *td)
186{
187 struct clock_event_device *dev = td->evtdev;
188
189 SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode);
190
191 SEQ_printf(m, "Clock Event Device: ");
192 if (!dev) {
193 SEQ_printf(m, "<NULL>\n");
194 return;
195 }
196 SEQ_printf(m, "%s\n", dev->name);
197 SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns);
198 SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns);
199 SEQ_printf(m, " mult: %ld\n", dev->mult);
200 SEQ_printf(m, " shift: %d\n", dev->shift);
201 SEQ_printf(m, " mode: %d\n", dev->mode);
202 SEQ_printf(m, " next_event: %Ld nsecs\n",
203 (unsigned long long) ktime_to_ns(dev->next_event));
204
205 SEQ_printf(m, " set_next_event: ");
206 print_name_offset(m, dev->set_next_event);
207 SEQ_printf(m, "\n");
208
209 SEQ_printf(m, " set_mode: ");
210 print_name_offset(m, dev->set_mode);
211 SEQ_printf(m, "\n");
212
213 SEQ_printf(m, " event_handler: ");
214 print_name_offset(m, dev->event_handler);
215 SEQ_printf(m, "\n");
216}
217
218static void timer_list_show_tickdevices(struct seq_file *m)
219{
220 int cpu;
221
222#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
223 print_tickdevice(m, tick_get_broadcast_device());
224 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
225 tick_get_broadcast_mask()->bits[0]);
226#ifdef CONFIG_TICK_ONESHOT
227 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
228 tick_get_broadcast_oneshot_mask()->bits[0]);
229#endif
230 SEQ_printf(m, "\n");
231#endif
232 for_each_online_cpu(cpu)
233 print_tickdevice(m, tick_get_device(cpu));
234 SEQ_printf(m, "\n");
235}
236#else
237static void timer_list_show_tickdevices(struct seq_file *m) { }
238#endif
239
240static int timer_list_show(struct seq_file *m, void *v)
241{
242 u64 now = ktime_to_ns(ktime_get());
243 int cpu;
244
245 SEQ_printf(m, "Timer List Version: v0.3\n");
246 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
247 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
248
249 for_each_online_cpu(cpu)
250 print_cpu(m, cpu, now);
251
252 SEQ_printf(m, "\n");
253 timer_list_show_tickdevices(m);
254
255 return 0;
256}
257
258void sysrq_timer_list_show(void)
259{
260 timer_list_show(NULL, NULL);
261}
262
263static int timer_list_open(struct inode *inode, struct file *filp)
264{
265 return single_open(filp, timer_list_show, NULL);
266}
267
268static struct file_operations timer_list_fops = {
269 .open = timer_list_open,
270 .read = seq_read,
271 .llseek = seq_lseek,
272 .release = seq_release,
273};
274
275static int __init init_timer_list_procfs(void)
276{
277 struct proc_dir_entry *pe;
278
279 pe = create_proc_entry("timer_list", 0644, NULL);
280 if (!pe)
281 return -ENOMEM;
282
283 pe->proc_fops = &timer_list_fops;
284
285 return 0;
286}
287__initcall(init_timer_list_procfs);
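
timer_list_show() backs both /proc/timer_list and the SysRq-Q dump, so the same output can be consumed interactively or from scripts. As a rough illustration of how userspace might parse it, the sketch below counts the per-timer lines that print_timer() emits with a " #<idx>: " prefix, skipping the " # expires at" continuation lines; it assumes nothing beyond the format strings visible above.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/timer_list", "r");
	char line[512];
	unsigned long timers = 0;

	if (!f) {
		perror("/proc/timer_list");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* print_timer() emits " #<idx>: <timer>, <function>, ..." */
		if (!strncmp(line, " #", 2) && isdigit((unsigned char)line[2]))
			timers++;
	}
	fclose(f);
	printf("%lu pending hrtimers listed\n", timers);
	return 0;
}
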
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644
index 000000000000..1bc4882e28e0
--- /dev/null
+++ b/kernel/time/timer_stats.c
@@ -0,0 +1,411 @@
1/*
2 * kernel/time/timer_stats.c
3 *
4 * Collect timer usage statistics.
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
 9 * timer_stats is based on timer_top, a similar facility which was part of
 10 * Con Kolivas' dyntick patch set. It was developed by Daniel Petrini at the
11 * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
12 * on dynamic allocation of the statistics entries and linear search based
13 * lookup combined with a global lock, rather than the static array, hash
14 * and per-CPU locking which is used by timer_stats. It was written for the
 15 * pre-hrtimer kernel code and therefore did not take hrtimers into account.
16 * Nevertheless it provided the base for the timer_stats implementation and
17 * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
18 * for this effort.
19 *
20 * timer_top.c is
21 * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
22 * Written by Daniel Petrini <d.pensator@gmail.com>
23 * timer_top.c was released under the GNU General Public License version 2
24 *
25 * We export the addresses and counting of timer functions being called,
26 * the pid and cmdline from the owner process if applicable.
27 *
28 * Start/stop data collection:
 29 * # echo 1 >/proc/timer_stats        (echo 0 to stop)
30 *
31 * Display the information collected so far:
32 * # cat /proc/timer_stats
33 *
34 * This program is free software; you can redistribute it and/or modify
35 * it under the terms of the GNU General Public License version 2 as
36 * published by the Free Software Foundation.
37 */
38
39#include <linux/proc_fs.h>
40#include <linux/module.h>
41#include <linux/spinlock.h>
42#include <linux/sched.h>
43#include <linux/seq_file.h>
44#include <linux/kallsyms.h>
45
46#include <asm/uaccess.h>
47
48/*
49 * This is our basic unit of interest: a timer expiry event identified
50 * by the timer, its start/expire functions and the PID of the task that
51 * started the timer. We count the number of times an event happens:
52 */
53struct entry {
54 /*
55 * Hash list:
56 */
57 struct entry *next;
58
59 /*
60 * Hash keys:
61 */
62 void *timer;
63 void *start_func;
64 void *expire_func;
65 pid_t pid;
66
67 /*
68 * Number of timeout events:
69 */
70 unsigned long count;
71
72 /*
73 * We save the command-line string to preserve
74 * this information past task exit:
75 */
76 char comm[TASK_COMM_LEN + 1];
77
78} ____cacheline_aligned_in_smp;
79
80/*
81 * Spinlock protecting the tables - not taken during lookup:
82 */
83static DEFINE_SPINLOCK(table_lock);
84
85/*
86 * Per-CPU lookup locks for fast hash lookup:
87 */
88static DEFINE_PER_CPU(spinlock_t, lookup_lock);
89
90/*
91 * Mutex to serialize state changes with show-stats activities:
92 */
93static DEFINE_MUTEX(show_mutex);
94
95/*
96 * Collection status, active/inactive:
97 */
98static int __read_mostly active;
99
100/*
101 * Beginning/end timestamps of measurement:
102 */
103static ktime_t time_start, time_stop;
104
105/*
106 * tstat entry structs only get allocated while collection is
107 * active and never freed during that time - this simplifies
108 * things quite a bit.
109 *
110 * They get freed when a new collection period is started.
111 */
112#define MAX_ENTRIES_BITS 10
113#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
114
115static unsigned long nr_entries;
116static struct entry entries[MAX_ENTRIES];
117
118static atomic_t overflow_count;
119
120static void reset_entries(void)
121{
122 nr_entries = 0;
123 memset(entries, 0, sizeof(entries));
124 atomic_set(&overflow_count, 0);
125}
126
127static struct entry *alloc_entry(void)
128{
129 if (nr_entries >= MAX_ENTRIES)
130 return NULL;
131
132 return entries + nr_entries++;
133}
134
135/*
136 * The entries are in a hash-table, for fast lookup:
137 */
138#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
139#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
140#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
141
142#define __tstat_hashfn(entry) \
143 (((unsigned long)(entry)->timer ^ \
144 (unsigned long)(entry)->start_func ^ \
145 (unsigned long)(entry)->expire_func ^ \
146 (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
147
148#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
149
150static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
151
152static int match_entries(struct entry *entry1, struct entry *entry2)
153{
154 return entry1->timer == entry2->timer &&
155 entry1->start_func == entry2->start_func &&
156 entry1->expire_func == entry2->expire_func &&
157 entry1->pid == entry2->pid;
158}
159
160/*
161 * Look up whether an entry matching this item is present
162 * in the hash already. Must be called with irqs off and the
163 * lookup lock held:
164 */
165static struct entry *tstat_lookup(struct entry *entry, char *comm)
166{
167 struct entry **head, *curr, *prev;
168
169 head = tstat_hashentry(entry);
170 curr = *head;
171
172 /*
173 * The fastpath is when the entry is already hashed,
174 * we do this with the lookup lock held, but with the
175 * table lock not held:
176 */
177 while (curr) {
178 if (match_entries(curr, entry))
179 return curr;
180
181 curr = curr->next;
182 }
183 /*
184 * Slowpath: allocate, set up and link a new hash entry:
185 */
186 prev = NULL;
187 curr = *head;
188
189 spin_lock(&table_lock);
190 /*
191 * Make sure we have not raced with another CPU:
192 */
193 while (curr) {
194 if (match_entries(curr, entry))
195 goto out_unlock;
196
197 prev = curr;
198 curr = curr->next;
199 }
200
201 curr = alloc_entry();
202 if (curr) {
203 *curr = *entry;
204 curr->count = 0;
205 memcpy(curr->comm, comm, TASK_COMM_LEN);
206 if (prev)
207 prev->next = curr;
208 else
209 *head = curr;
210 curr->next = NULL;
211 }
212 out_unlock:
213 spin_unlock(&table_lock);
214
215 return curr;
216}
217
218/**
219 * timer_stats_update_stats - Update the statistics for a timer.
220 * @timer: pointer to either a timer_list or a hrtimer
221 * @pid: the pid of the task which set up the timer
222 * @startf: pointer to the function which did the timer setup
223 * @timerf: pointer to the timer callback function of the timer
224 * @comm: name of the process which set up the timer
225 *
226 * When the timer is already registered, then the event counter is
227 * incremented. Otherwise the timer is registered in a free slot.
228 */
229void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
230 void *timerf, char * comm)
231{
232 /*
 233 * It doesn't matter which lock we take:
234 */
235 spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
236 struct entry *entry, input;
237 unsigned long flags;
238
239 input.timer = timer;
240 input.start_func = startf;
241 input.expire_func = timerf;
242 input.pid = pid;
243
244 spin_lock_irqsave(lock, flags);
245 if (!active)
246 goto out_unlock;
247
248 entry = tstat_lookup(&input, comm);
249 if (likely(entry))
250 entry->count++;
251 else
252 atomic_inc(&overflow_count);
253
254 out_unlock:
255 spin_unlock_irqrestore(lock, flags);
256}
257
258static void print_name_offset(struct seq_file *m, unsigned long addr)
259{
260 char namebuf[KSYM_NAME_LEN+1];
261 unsigned long size, offset;
262 const char *sym_name;
263 char *modname;
264
265 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
266 if (sym_name)
267 seq_printf(m, "%s", sym_name);
268 else
269 seq_printf(m, "<%p>", (void *)addr);
270}
271
272static int tstats_show(struct seq_file *m, void *v)
273{
274 struct timespec period;
275 struct entry *entry;
276 unsigned long ms;
277 long events = 0;
278 ktime_t time;
279 int i;
280
281 mutex_lock(&show_mutex);
282 /*
283 * If still active then calculate up to now:
284 */
285 if (active)
286 time_stop = ktime_get();
287
288 time = ktime_sub(time_stop, time_start);
289
290 period = ktime_to_timespec(time);
291 ms = period.tv_nsec / 1000000;
292
293 seq_puts(m, "Timer Stats Version: v0.1\n");
294 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
295 if (atomic_read(&overflow_count))
296 seq_printf(m, "Overflow: %d entries\n",
297 atomic_read(&overflow_count));
298
299 for (i = 0; i < nr_entries; i++) {
300 entry = entries + i;
301 seq_printf(m, "%4lu, %5d %-16s ",
302 entry->count, entry->pid, entry->comm);
303
304 print_name_offset(m, (unsigned long)entry->start_func);
305 seq_puts(m, " (");
306 print_name_offset(m, (unsigned long)entry->expire_func);
307 seq_puts(m, ")\n");
308
309 events += entry->count;
310 }
311
312 ms += period.tv_sec * 1000;
313 if (!ms)
314 ms = 1;
315
316 if (events && period.tv_sec)
317 seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events,
318 events / period.tv_sec, events * 1000 / ms);
319 else
320 seq_printf(m, "%ld total events\n", events);
321
322 mutex_unlock(&show_mutex);
323
324 return 0;
325}
326
327/*
328 * After a state change, make sure all concurrent lookup/update
329 * activities have stopped:
330 */
331static void sync_access(void)
332{
333 unsigned long flags;
334 int cpu;
335
336 for_each_online_cpu(cpu) {
337 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
338 /* nothing */
339 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
340 }
341}
342
343static ssize_t tstats_write(struct file *file, const char __user *buf,
344 size_t count, loff_t *offs)
345{
346 char ctl[2];
347
348 if (count != 2 || *offs)
349 return -EINVAL;
350
351 if (copy_from_user(ctl, buf, count))
352 return -EFAULT;
353
354 mutex_lock(&show_mutex);
355 switch (ctl[0]) {
356 case '0':
357 if (active) {
358 active = 0;
359 time_stop = ktime_get();
360 sync_access();
361 }
362 break;
363 case '1':
364 if (!active) {
365 reset_entries();
366 time_start = ktime_get();
367 active = 1;
368 }
369 break;
370 default:
371 count = -EINVAL;
372 }
373 mutex_unlock(&show_mutex);
374
375 return count;
376}
377
378static int tstats_open(struct inode *inode, struct file *filp)
379{
380 return single_open(filp, tstats_show, NULL);
381}
382
383static struct file_operations tstats_fops = {
384 .open = tstats_open,
385 .read = seq_read,
386 .write = tstats_write,
387 .llseek = seq_lseek,
388 .release = seq_release,
389};
390
391void __init init_timer_stats(void)
392{
393 int cpu;
394
395 for_each_possible_cpu(cpu)
396 spin_lock_init(&per_cpu(lookup_lock, cpu));
397}
398
399static int __init init_tstats_procfs(void)
400{
401 struct proc_dir_entry *pe;
402
403 pe = create_proc_entry("timer_stats", 0644, NULL);
404 if (!pe)
405 return -ENOMEM;
406
407 pe->proc_fops = &tstats_fops;
408
409 return 0;
410}
411__initcall(init_tstats_procfs);
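
tstats_write() accepts exactly two bytes: '1' starts a fresh sample period (reset_entries() plus a new time_start), '0' stops it, and reading the file afterwards prints the collected table. A userspace sketch of that protocol, shown only as an illustration of the interface added above:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Write the two-byte control string that tstats_write() requires. */
static int tstats_ctl(const char *cmd)
{
	int fd = open("/proc/timer_stats", O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;
	ret = write(fd, cmd, 2);
	close(fd);
	return ret == 2 ? 0 : -1;
}

int main(void)
{
	char line[256];
	FILE *f;

	if (tstats_ctl("1\n"))		/* start collection */
		return 1;
	sleep(10);			/* sample period */
	tstats_ctl("0\n");		/* stop collection */

	f = fopen("/proc/timer_stats", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* dump the table */
	fclose(f);
	return 0;
}
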
diff --git a/kernel/timer.c b/kernel/timer.c
index 8533c3796082..cb1b86a9c52f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -34,6 +34,8 @@
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/tick.h>
38#include <linux/kallsyms.h>
37 39
38#include <asm/uaccess.h> 40#include <asm/uaccess.h>
39#include <asm/unistd.h> 41#include <asm/unistd.h>
@@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
262 list_add_tail(&timer->entry, vec); 264 list_add_tail(&timer->entry, vec);
263} 265}
264 266
267#ifdef CONFIG_TIMER_STATS
268void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
269{
270 if (timer->start_site)
271 return;
272
273 timer->start_site = addr;
274 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
275 timer->start_pid = current->pid;
276}
277#endif
278
265/** 279/**
266 * init_timer - initialize a timer. 280 * init_timer - initialize a timer.
267 * @timer: the timer to be initialized 281 * @timer: the timer to be initialized
@@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer)
273{ 287{
274 timer->entry.next = NULL; 288 timer->entry.next = NULL;
275 timer->base = __raw_get_cpu_var(tvec_bases); 289 timer->base = __raw_get_cpu_var(tvec_bases);
290#ifdef CONFIG_TIMER_STATS
291 timer->start_site = NULL;
292 timer->start_pid = -1;
293 memset(timer->start_comm, 0, TASK_COMM_LEN);
294#endif
276} 295}
277EXPORT_SYMBOL(init_timer); 296EXPORT_SYMBOL(init_timer);
278 297
279static inline void detach_timer(struct timer_list *timer, 298static inline void detach_timer(struct timer_list *timer,
280 int clear_pending) 299 int clear_pending)
281{ 300{
282 struct list_head *entry = &timer->entry; 301 struct list_head *entry = &timer->entry;
283 302
@@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
324 unsigned long flags; 343 unsigned long flags;
325 int ret = 0; 344 int ret = 0;
326 345
346 timer_stats_timer_set_start_info(timer);
327 BUG_ON(!timer->function); 347 BUG_ON(!timer->function);
328 348
329 base = lock_timer_base(timer, &flags); 349 base = lock_timer_base(timer, &flags);
@@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
374 tvec_base_t *base = per_cpu(tvec_bases, cpu); 394 tvec_base_t *base = per_cpu(tvec_bases, cpu);
375 unsigned long flags; 395 unsigned long flags;
376 396
397 timer_stats_timer_set_start_info(timer);
377 BUG_ON(timer_pending(timer) || !timer->function); 398 BUG_ON(timer_pending(timer) || !timer->function);
378 spin_lock_irqsave(&base->lock, flags); 399 spin_lock_irqsave(&base->lock, flags);
379 timer->base = base; 400 timer->base = base;
@@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
406{ 427{
407 BUG_ON(!timer->function); 428 BUG_ON(!timer->function);
408 429
430 timer_stats_timer_set_start_info(timer);
409 /* 431 /*
410 * This is a common optimization triggered by the 432 * This is a common optimization triggered by the
411 * networking code - if the timer is re-modified 433 * networking code - if the timer is re-modified
@@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer)
436 unsigned long flags; 458 unsigned long flags;
437 int ret = 0; 459 int ret = 0;
438 460
461 timer_stats_timer_clear_start_info(timer);
439 if (timer_pending(timer)) { 462 if (timer_pending(timer)) {
440 base = lock_timer_base(timer, &flags); 463 base = lock_timer_base(timer, &flags);
441 if (timer_pending(timer)) { 464 if (timer_pending(timer)) {
@@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base)
569 fn = timer->function; 592 fn = timer->function;
570 data = timer->data; 593 data = timer->data;
571 594
595 timer_stats_account_timer(timer);
596
572 set_running_timer(base, timer); 597 set_running_timer(base, timer);
573 detach_timer(timer, 1); 598 detach_timer(timer, 1);
574 spin_unlock_irq(&base->lock); 599 spin_unlock_irq(&base->lock);
@@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base)
591 spin_unlock_irq(&base->lock); 616 spin_unlock_irq(&base->lock);
592} 617}
593 618
594#ifdef CONFIG_NO_IDLE_HZ 619#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
595/* 620/*
596 * Find out when the next timer event is due to happen. This 621 * Find out when the next timer event is due to happen. This
 597 * is used on S/390 to stop all activity when a cpu is idle. 622
 598 * This function needs to be called with interrupts disabled. 623
599 */ 624 */
600unsigned long next_timer_interrupt(void) 625static unsigned long __next_timer_interrupt(tvec_base_t *base)
601{ 626{
602 tvec_base_t *base; 627 unsigned long timer_jiffies = base->timer_jiffies;
603 struct list_head *list; 628 unsigned long expires = timer_jiffies + (LONG_MAX >> 1);
629 int index, slot, array, found = 0;
604 struct timer_list *nte; 630 struct timer_list *nte;
605 unsigned long expires;
606 unsigned long hr_expires = MAX_JIFFY_OFFSET;
607 ktime_t hr_delta;
608 tvec_t *varray[4]; 631 tvec_t *varray[4];
609 int i, j;
610
611 hr_delta = hrtimer_get_next_event();
612 if (hr_delta.tv64 != KTIME_MAX) {
613 struct timespec tsdelta;
614 tsdelta = ktime_to_timespec(hr_delta);
615 hr_expires = timespec_to_jiffies(&tsdelta);
616 if (hr_expires < 3)
617 return hr_expires + jiffies;
618 }
619 hr_expires += jiffies;
620
621 base = __get_cpu_var(tvec_bases);
622 spin_lock(&base->lock);
623 expires = base->timer_jiffies + (LONG_MAX >> 1);
624 list = NULL;
625 632
626 /* Look for timer events in tv1. */ 633 /* Look for timer events in tv1. */
627 j = base->timer_jiffies & TVR_MASK; 634 index = slot = timer_jiffies & TVR_MASK;
628 do { 635 do {
629 list_for_each_entry(nte, base->tv1.vec + j, entry) { 636 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
637 found = 1;
630 expires = nte->expires; 638 expires = nte->expires;
631 if (j < (base->timer_jiffies & TVR_MASK)) 639 /* Look at the cascade bucket(s)? */
632 list = base->tv2.vec + (INDEX(0)); 640 if (!index || slot < index)
633 goto found; 641 goto cascade;
642 return expires;
634 } 643 }
635 j = (j + 1) & TVR_MASK; 644 slot = (slot + 1) & TVR_MASK;
636 } while (j != (base->timer_jiffies & TVR_MASK)); 645 } while (slot != index);
646
647cascade:
648 /* Calculate the next cascade event */
649 if (index)
650 timer_jiffies += TVR_SIZE - index;
651 timer_jiffies >>= TVR_BITS;
637 652
638 /* Check tv2-tv5. */ 653 /* Check tv2-tv5. */
639 varray[0] = &base->tv2; 654 varray[0] = &base->tv2;
640 varray[1] = &base->tv3; 655 varray[1] = &base->tv3;
641 varray[2] = &base->tv4; 656 varray[2] = &base->tv4;
642 varray[3] = &base->tv5; 657 varray[3] = &base->tv5;
643 for (i = 0; i < 4; i++) { 658
644 j = INDEX(i); 659 for (array = 0; array < 4; array++) {
660 tvec_t *varp = varray[array];
661
662 index = slot = timer_jiffies & TVN_MASK;
645 do { 663 do {
646 if (list_empty(varray[i]->vec + j)) { 664 list_for_each_entry(nte, varp->vec + slot, entry) {
647 j = (j + 1) & TVN_MASK; 665 found = 1;
648 continue;
649 }
650 list_for_each_entry(nte, varray[i]->vec + j, entry)
651 if (time_before(nte->expires, expires)) 666 if (time_before(nte->expires, expires))
652 expires = nte->expires; 667 expires = nte->expires;
653 if (j < (INDEX(i)) && i < 3) 668 }
654 list = varray[i + 1]->vec + (INDEX(i + 1)); 669 /*
655 goto found; 670 * Do we still search for the first timer or are
656 } while (j != (INDEX(i))); 671 * we looking up the cascade buckets ?
657 } 672 */
658found: 673 if (found) {
659 if (list) { 674 /* Look at the cascade bucket(s)? */
660 /* 675 if (!index || slot < index)
661 * The search wrapped. We need to look at the next list 676 break;
662 * from next tv element that would cascade into tv element 677 return expires;
663 * where we found the timer element. 678 }
664 */ 679 slot = (slot + 1) & TVN_MASK;
665 list_for_each_entry(nte, list, entry) { 680 } while (slot != index);
666 if (time_before(nte->expires, expires)) 681
667 expires = nte->expires; 682 if (index)
668 } 683 timer_jiffies += TVN_SIZE - index;
684 timer_jiffies >>= TVN_BITS;
669 } 685 }
670 spin_unlock(&base->lock); 686 return expires;
687}
671 688
672 /* 689/*
673 * It can happen that other CPUs service timer IRQs and increment 690 * Check, if the next hrtimer event is before the next timer wheel
674 * jiffies, but we have not yet got a local timer tick to process 691 * event:
675 * the timer wheels. In that case, the expiry time can be before 692 */
676 * jiffies, but since the high-resolution timer here is relative to 693static unsigned long cmp_next_hrtimer_event(unsigned long now,
677 * jiffies, the default expression when high-resolution timers are 694 unsigned long expires)
678 * not active, 695{
679 * 696 ktime_t hr_delta = hrtimer_get_next_event();
680 * time_before(MAX_JIFFY_OFFSET + jiffies, expires) 697 struct timespec tsdelta;
681 * 698
682 * would falsely evaluate to true. If that is the case, just 699 if (hr_delta.tv64 == KTIME_MAX)
683 * return jiffies so that we can immediately fire the local timer 700 return expires;
684 */
685 if (time_before(expires, jiffies))
686 return jiffies;
687 701
688 if (time_before(hr_expires, expires)) 702 if (hr_delta.tv64 <= TICK_NSEC)
689 return hr_expires; 703 return now;
690 704
705 tsdelta = ktime_to_timespec(hr_delta);
706 now += timespec_to_jiffies(&tsdelta);
707 if (time_before(now, expires))
708 return now;
691 return expires; 709 return expires;
692} 710}
711
712/**
 713 * get_next_timer_interrupt - return the jiffy of the next pending timer
714 */
715unsigned long get_next_timer_interrupt(unsigned long now)
716{
717 tvec_base_t *base = __get_cpu_var(tvec_bases);
718 unsigned long expires;
719
720 spin_lock(&base->lock);
721 expires = __next_timer_interrupt(base);
722 spin_unlock(&base->lock);
723
724 if (time_before_eq(expires, now))
725 return now;
726
727 return cmp_next_hrtimer_event(now, expires);
728}
729
730#ifdef CONFIG_NO_IDLE_HZ
731unsigned long next_timer_interrupt(void)
732{
733 return get_next_timer_interrupt(jiffies);
734}
735#endif
736
693#endif 737#endif
694 738
695/******************************************************************/ 739/******************************************************************/
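
The subtle part of the rework above is the slot arithmetic with which __next_timer_interrupt() decides when to stop searching the current wheel level and move on to the cascade bucket of the next one. The standalone model below reproduces only that arithmetic; it assumes the default (non-CONFIG_BASE_SMALL) values TVR_BITS=8 and TVN_BITS=6, and unlike the real code it does not walk the timer lists hanging off each slot.

#include <stdio.h>

#define TVR_BITS 8			/* default, !CONFIG_BASE_SMALL */
#define TVN_BITS 6
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_MASK (TVR_SIZE - 1)
#define TVN_MASK (TVN_SIZE - 1)

int main(void)
{
	unsigned long timer_jiffies = 0x12345;	/* arbitrary example */
	unsigned long index;
	int array;

	index = timer_jiffies & TVR_MASK;
	printf("tv1 search starts at slot %lu\n", index);

	/* Advance to the jiffy at which tv2 next cascades into tv1 */
	if (index)
		timer_jiffies += TVR_SIZE - index;
	timer_jiffies >>= TVR_BITS;

	for (array = 0; array < 4; array++) {
		index = timer_jiffies & TVN_MASK;
		printf("tv%d search starts at slot %lu\n", array + 2, index);
		if (index)
			timer_jiffies += TVN_SIZE - index;
		timer_jiffies >>= TVN_BITS;
	}
	return 0;
}
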
@@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday);
832 * 876 *
833 * Accumulates current time interval and initializes new clocksource 877 * Accumulates current time interval and initializes new clocksource
834 */ 878 */
835static int change_clocksource(void) 879static void change_clocksource(void)
836{ 880{
837 struct clocksource *new; 881 struct clocksource *new;
838 cycle_t now; 882 cycle_t now;
839 u64 nsec; 883 u64 nsec;
884
840 new = clocksource_get_next(); 885 new = clocksource_get_next();
841 if (clock != new) { 886
842 now = clocksource_read(new); 887 if (clock == new)
843 nsec = __get_nsec_offset(); 888 return;
844 timespec_add_ns(&xtime, nsec); 889
845 890 now = clocksource_read(new);
846 clock = new; 891 nsec = __get_nsec_offset();
847 clock->cycle_last = now; 892 timespec_add_ns(&xtime, nsec);
848 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 893
849 clock->name); 894 clock = new;
850 return 1; 895 clock->cycle_last = now;
851 } else if (clock->update_callback) { 896
852 return clock->update_callback(); 897 clock->error = 0;
853 } 898 clock->xtime_nsec = 0;
854 return 0; 899 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
900
901 tick_clock_notify();
902
903 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
904 clock->name);
855} 905}
856#else 906#else
857static inline int change_clocksource(void) 907static inline void change_clocksource(void) { }
858{
859 return 0;
860}
861#endif 908#endif
862 909
863/** 910/**
@@ -871,33 +918,56 @@ int timekeeping_is_continuous(void)
871 do { 918 do {
872 seq = read_seqbegin(&xtime_lock); 919 seq = read_seqbegin(&xtime_lock);
873 920
874 ret = clock->is_continuous; 921 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
875 922
876 } while (read_seqretry(&xtime_lock, seq)); 923 } while (read_seqretry(&xtime_lock, seq));
877 924
878 return ret; 925 return ret;
879} 926}
880 927
928/**
929 * read_persistent_clock - Return time in seconds from the persistent clock.
930 *
931 * Weak dummy function for arches that do not yet support it.
932 * Returns seconds from epoch using the battery backed persistent clock.
933 * Returns zero if unsupported.
934 *
935 * XXX - Do be sure to remove it once all arches implement it.
936 */
937unsigned long __attribute__((weak)) read_persistent_clock(void)
938{
939 return 0;
940}
941
881/* 942/*
882 * timekeeping_init - Initializes the clocksource and common timekeeping values 943 * timekeeping_init - Initializes the clocksource and common timekeeping values
883 */ 944 */
884void __init timekeeping_init(void) 945void __init timekeeping_init(void)
885{ 946{
886 unsigned long flags; 947 unsigned long flags;
948 unsigned long sec = read_persistent_clock();
887 949
888 write_seqlock_irqsave(&xtime_lock, flags); 950 write_seqlock_irqsave(&xtime_lock, flags);
889 951
890 ntp_clear(); 952 ntp_clear();
891 953
892 clock = clocksource_get_next(); 954 clock = clocksource_get_next();
893 clocksource_calculate_interval(clock, tick_nsec); 955 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
894 clock->cycle_last = clocksource_read(clock); 956 clock->cycle_last = clocksource_read(clock);
895 957
958 xtime.tv_sec = sec;
959 xtime.tv_nsec = 0;
960 set_normalized_timespec(&wall_to_monotonic,
961 -xtime.tv_sec, -xtime.tv_nsec);
962
896 write_sequnlock_irqrestore(&xtime_lock, flags); 963 write_sequnlock_irqrestore(&xtime_lock, flags);
897} 964}
898 965
899 966/* flag for if timekeeping is suspended */
900static int timekeeping_suspended; 967static int timekeeping_suspended;
968/* time in seconds when suspend began */
969static unsigned long timekeeping_suspend_time;
970
901/** 971/**
902 * timekeeping_resume - Resumes the generic timekeeping subsystem. 972 * timekeeping_resume - Resumes the generic timekeeping subsystem.
903 * @dev: unused 973 * @dev: unused
@@ -909,13 +979,26 @@ static int timekeeping_suspended;
909static int timekeeping_resume(struct sys_device *dev) 979static int timekeeping_resume(struct sys_device *dev)
910{ 980{
911 unsigned long flags; 981 unsigned long flags;
982 unsigned long now = read_persistent_clock();
912 983
913 write_seqlock_irqsave(&xtime_lock, flags); 984 write_seqlock_irqsave(&xtime_lock, flags);
914 /* restart the last cycle value */ 985
986 if (now && (now > timekeeping_suspend_time)) {
987 unsigned long sleep_length = now - timekeeping_suspend_time;
988
989 xtime.tv_sec += sleep_length;
990 wall_to_monotonic.tv_sec -= sleep_length;
991 }
992 /* re-base the last cycle value */
915 clock->cycle_last = clocksource_read(clock); 993 clock->cycle_last = clocksource_read(clock);
916 clock->error = 0; 994 clock->error = 0;
917 timekeeping_suspended = 0; 995 timekeeping_suspended = 0;
918 write_sequnlock_irqrestore(&xtime_lock, flags); 996 write_sequnlock_irqrestore(&xtime_lock, flags);
997
998 touch_softlockup_watchdog();
999 /* Resume hrtimers */
1000 clock_was_set();
1001
919 return 0; 1002 return 0;
920} 1003}
921 1004
@@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
925 1008
926 write_seqlock_irqsave(&xtime_lock, flags); 1009 write_seqlock_irqsave(&xtime_lock, flags);
927 timekeeping_suspended = 1; 1010 timekeeping_suspended = 1;
1011 timekeeping_suspend_time = read_persistent_clock();
928 write_sequnlock_irqrestore(&xtime_lock, flags); 1012 write_sequnlock_irqrestore(&xtime_lock, flags);
929 return 0; 1013 return 0;
930} 1014}
@@ -1089,11 +1173,8 @@ static void update_wall_time(void)
1089 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 1173 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1090 1174
1091 /* check to see if there is a new clocksource to use */ 1175 /* check to see if there is a new clocksource to use */
1092 if (change_clocksource()) { 1176 change_clocksource();
1093 clock->error = 0; 1177 update_vsyscall(&xtime, clock);
1094 clock->xtime_nsec = 0;
1095 clocksource_calculate_interval(clock, tick_nsec);
1096 }
1097} 1178}
1098 1179
1099/* 1180/*
@@ -1162,11 +1243,9 @@ static inline void calc_load(unsigned long ticks)
1162 * This read-write spinlock protects us from races in SMP while 1243 * This read-write spinlock protects us from races in SMP while
1163 * playing with xtime and avenrun. 1244 * playing with xtime and avenrun.
1164 */ 1245 */
1165#ifndef ARCH_HAVE_XTIME_LOCK 1246__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1166__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1167 1247
1168EXPORT_SYMBOL(xtime_lock); 1248EXPORT_SYMBOL(xtime_lock);
1169#endif
1170 1249
1171/* 1250/*
1172 * This function runs timers and the timer-tq in bottom half context. 1251 * This function runs timers and the timer-tq in bottom half context.
@@ -1175,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h)
1175{ 1254{
1176 tvec_base_t *base = __get_cpu_var(tvec_bases); 1255 tvec_base_t *base = __get_cpu_var(tvec_bases);
1177 1256
1178 hrtimer_run_queues(); 1257 hrtimer_run_queues();
1258
1179 if (time_after_eq(jiffies, base->timer_jiffies)) 1259 if (time_after_eq(jiffies, base->timer_jiffies))
1180 __run_timers(base); 1260 __run_timers(base);
1181} 1261}
@@ -1621,6 +1701,8 @@ void __init init_timers(void)
1621 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1701 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1622 (void *)(long)smp_processor_id()); 1702 (void *)(long)smp_processor_id());
1623 1703
1704 init_timer_stats();
1705
1624 BUG_ON(err == NOTIFY_BAD); 1706 BUG_ON(err == NOTIFY_BAD);
1625 register_cpu_notifier(&timers_nb); 1707 register_cpu_notifier(&timers_nb);
1626 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1708 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index baacc3691415..658f638c402c 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -22,8 +22,6 @@
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24 24
25
26#define USEC_PER_TICK (USEC_PER_SEC/HZ)
27/* 25/*
28 * fill in basic accounting fields 26 * fill in basic accounting fields
29 */ 27 */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 020d1fff57dc..b6fa5e63085d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
218} 218}
219EXPORT_SYMBOL_GPL(queue_work); 219EXPORT_SYMBOL_GPL(queue_work);
220 220
221static void delayed_work_timer_fn(unsigned long __data) 221void delayed_work_timer_fn(unsigned long __data)
222{ 222{
223 struct delayed_work *dwork = (struct delayed_work *)__data; 223 struct delayed_work *dwork = (struct delayed_work *)__data;
224 struct workqueue_struct *wq = get_wq_data(&dwork->work); 224 struct workqueue_struct *wq = get_wq_data(&dwork->work);
@@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
245 struct timer_list *timer = &dwork->timer; 245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work; 246 struct work_struct *work = &dwork->work;
247 247
248 timer_stats_timer_set_start_info(timer);
248 if (delay == 0) 249 if (delay == 0)
249 return queue_work(wq, work); 250 return queue_work(wq, work);
250 251
@@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work);
593 * After waiting for a given time this puts a job in the kernel-global 594 * After waiting for a given time this puts a job in the kernel-global
594 * workqueue. 595 * workqueue.
595 */ 596 */
596int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) 597int fastcall schedule_delayed_work(struct delayed_work *dwork,
598 unsigned long delay)
597{ 599{
600 timer_stats_timer_set_start_info(&dwork->timer);
598 return queue_delayed_work(keventd_wq, dwork, delay); 601 return queue_delayed_work(keventd_wq, dwork, delay);
599} 602}
600EXPORT_SYMBOL(schedule_delayed_work); 603EXPORT_SYMBOL(schedule_delayed_work);