Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile               |    1
-rw-r--r--  kernel/audit.c                |  216
-rw-r--r--  kernel/auditfilter.c          |   11
-rw-r--r--  kernel/auditsc.c              |   40
-rw-r--r--  kernel/capability.c           |    8
-rw-r--r--  kernel/compat.c               |   66
-rw-r--r--  kernel/cpu.c                  |    2
-rw-r--r--  kernel/cpuset.c               |    4
-rw-r--r--  kernel/exit.c                 |   67
-rw-r--r--  kernel/fork.c                 |    6
-rw-r--r--  kernel/futex.c                |    2
-rw-r--r--  kernel/hrtimer.c              |  830
-rw-r--r--  kernel/irq/Makefile           |    2
-rw-r--r--  kernel/irq/chip.c             |   51
-rw-r--r--  kernel/irq/devres.c           |   88
-rw-r--r--  kernel/irq/manage.c           |   81
-rw-r--r--  kernel/irq/migration.c        |    9
-rw-r--r--  kernel/irq/proc.c             |   25
-rw-r--r--  kernel/itimer.c               |   18
-rw-r--r--  kernel/kfifo.c                |   10
-rw-r--r--  kernel/kmod.c                 |   44
-rw-r--r--  kernel/kprobes.c              |  113
-rw-r--r--  kernel/kthread.c              |    6
-rw-r--r--  kernel/lockdep.c              |   46
-rw-r--r--  kernel/lockdep_proc.c         |   44
-rw-r--r--  kernel/module.c               |   91
-rw-r--r--  kernel/mutex-debug.c          |    1
-rw-r--r--  kernel/panic.c                |    6
-rw-r--r--  kernel/params.c               |   48
-rw-r--r--  kernel/posix-cpu-timers.c     |   15
-rw-r--r--  kernel/posix-timers.c         |   18
-rw-r--r--  kernel/power/Kconfig          |   26
-rw-r--r--  kernel/power/disk.c           |  115
-rw-r--r--  kernel/power/main.c           |   43
-rw-r--r--  kernel/power/snapshot.c       |    4
-rw-r--r--  kernel/power/swsusp.c         |    5
-rw-r--r--  kernel/power/user.c           |  155
-rw-r--r--  kernel/printk.c               |   20
-rw-r--r--  kernel/profile.c              |    1
-rw-r--r--  kernel/relay.c                |  192
-rw-r--r--  kernel/resource.c             |   63
-rw-r--r--  kernel/rtmutex.c              |    2
-rw-r--r--  kernel/sched.c                |   34
-rw-r--r--  kernel/signal.c               |  110
-rw-r--r--  kernel/softirq.c              |   19
-rw-r--r--  kernel/sys.c                  |   51
-rw-r--r--  kernel/sysctl.c               |  676
-rw-r--r--  kernel/time.c                 |  254
-rw-r--r--  kernel/time/Kconfig           |   25
-rw-r--r--  kernel/time/Makefile          |    9
-rw-r--r--  kernel/time/clockevents.c     |  345
-rw-r--r--  kernel/time/clocksource.c     |  247
-rw-r--r--  kernel/time/jiffies.c         |    1
-rw-r--r--  kernel/time/ntp.c             |   30
-rw-r--r--  kernel/time/tick-broadcast.c  |  480
-rw-r--r--  kernel/time/tick-common.c     |  347
-rw-r--r--  kernel/time/tick-internal.h   |  110
-rw-r--r--  kernel/time/tick-oneshot.c    |   84
-rw-r--r--  kernel/time/tick-sched.c      |  567
-rw-r--r--  kernel/time/timer_list.c      |  287
-rw-r--r--  kernel/time/timer_stats.c     |  411
-rw-r--r--  kernel/timer.c                |  374
-rw-r--r--  kernel/tsacct.c               |    2
-rw-r--r--  kernel/utsname_sysctl.c       |  146
-rw-r--r--  kernel/workqueue.c            |   13
65 files changed, 5782 insertions(+), 1435 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 14f4d45e0ae9..ac6b27abb1ad 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
47obj-$(CONFIG_SECCOMP) += seccomp.o 47obj-$(CONFIG_SECCOMP) += seccomp.o
48obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 48obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
49obj-$(CONFIG_RELAY) += relay.o 49obj-$(CONFIG_RELAY) += relay.o
50obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
50obj-$(CONFIG_UTS_NS) += utsname.o 51obj-$(CONFIG_UTS_NS) += utsname.o
51obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 52obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
52obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 53obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
diff --git a/kernel/audit.c b/kernel/audit.c
index d9b690ac684b..76c9a11b72d6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2,7 +2,7 @@
2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. 2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
3 * System-call specific features have moved to auditsc.c 3 * System-call specific features have moved to auditsc.c
4 * 4 *
5 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 5 * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
6 * All Rights Reserved. 6 * All Rights Reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
@@ -65,7 +65,9 @@
65 * (Initialization happens after skb_init is called.) */ 65 * (Initialization happens after skb_init is called.) */
66static int audit_initialized; 66static int audit_initialized;
67 67
68/* No syscall auditing will take place unless audit_enabled != 0. */ 68/* 0 - no auditing
69 * 1 - auditing enabled
70 * 2 - auditing enabled and configuration is locked/unchangeable. */
69int audit_enabled; 71int audit_enabled;
70 72
71/* Default state when kernel boots without any parameters. */ 73/* Default state when kernel boots without any parameters. */
@@ -239,102 +241,150 @@ void audit_log_lost(const char *message)
239 241
240static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) 242static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
241{ 243{
242 int old = audit_rate_limit; 244 int res, rc = 0, old = audit_rate_limit;
245
246 /* check if we are locked */
247 if (audit_enabled == 2)
248 res = 0;
249 else
250 res = 1;
243 251
244 if (sid) { 252 if (sid) {
245 char *ctx = NULL; 253 char *ctx = NULL;
246 u32 len; 254 u32 len;
247 int rc; 255 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
248 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
249 return rc;
250 else
251 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 256 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
252 "audit_rate_limit=%d old=%d by auid=%u subj=%s", 257 "audit_rate_limit=%d old=%d by auid=%u"
253 limit, old, loginuid, ctx); 258 " subj=%s res=%d",
254 kfree(ctx); 259 limit, old, loginuid, ctx, res);
255 } else 260 kfree(ctx);
256 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 261 } else
257 "audit_rate_limit=%d old=%d by auid=%u", 262 res = 0; /* Something weird, deny request */
258 limit, old, loginuid); 263 }
259 audit_rate_limit = limit; 264 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
260 return 0; 265 "audit_rate_limit=%d old=%d by auid=%u res=%d",
266 limit, old, loginuid, res);
267
268 /* If we are allowed, make the change */
269 if (res == 1)
270 audit_rate_limit = limit;
271 /* Not allowed, update reason */
272 else if (rc == 0)
273 rc = -EPERM;
274 return rc;
261} 275}
262 276
263static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) 277static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
264{ 278{
265 int old = audit_backlog_limit; 279 int res, rc = 0, old = audit_backlog_limit;
280
281 /* check if we are locked */
282 if (audit_enabled == 2)
283 res = 0;
284 else
285 res = 1;
266 286
267 if (sid) { 287 if (sid) {
268 char *ctx = NULL; 288 char *ctx = NULL;
269 u32 len; 289 u32 len;
270 int rc; 290 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
271 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
272 return rc;
273 else
274 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 291 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
275 "audit_backlog_limit=%d old=%d by auid=%u subj=%s", 292 "audit_backlog_limit=%d old=%d by auid=%u"
276 limit, old, loginuid, ctx); 293 " subj=%s res=%d",
277 kfree(ctx); 294 limit, old, loginuid, ctx, res);
278 } else 295 kfree(ctx);
279 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 296 } else
280 "audit_backlog_limit=%d old=%d by auid=%u", 297 res = 0; /* Something weird, deny request */
281 limit, old, loginuid); 298 }
282 audit_backlog_limit = limit; 299 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
283 return 0; 300 "audit_backlog_limit=%d old=%d by auid=%u res=%d",
301 limit, old, loginuid, res);
302
303 /* If we are allowed, make the change */
304 if (res == 1)
305 audit_backlog_limit = limit;
306 /* Not allowed, update reason */
307 else if (rc == 0)
308 rc = -EPERM;
309 return rc;
284} 310}
285 311
286static int audit_set_enabled(int state, uid_t loginuid, u32 sid) 312static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
287{ 313{
288 int old = audit_enabled; 314 int res, rc = 0, old = audit_enabled;
289 315
290 if (state != 0 && state != 1) 316 if (state < 0 || state > 2)
291 return -EINVAL; 317 return -EINVAL;
292 318
319 /* check if we are locked */
320 if (audit_enabled == 2)
321 res = 0;
322 else
323 res = 1;
324
293 if (sid) { 325 if (sid) {
294 char *ctx = NULL; 326 char *ctx = NULL;
295 u32 len; 327 u32 len;
296 int rc; 328 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
297 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
298 return rc;
299 else
300 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 329 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
301 "audit_enabled=%d old=%d by auid=%u subj=%s", 330 "audit_enabled=%d old=%d by auid=%u"
302 state, old, loginuid, ctx); 331 " subj=%s res=%d",
303 kfree(ctx); 332 state, old, loginuid, ctx, res);
304 } else 333 kfree(ctx);
305 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 334 } else
306 "audit_enabled=%d old=%d by auid=%u", 335 res = 0; /* Something weird, deny request */
307 state, old, loginuid); 336 }
308 audit_enabled = state; 337 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
309 return 0; 338 "audit_enabled=%d old=%d by auid=%u res=%d",
339 state, old, loginuid, res);
340
341 /* If we are allowed, make the change */
342 if (res == 1)
343 audit_enabled = state;
344 /* Not allowed, update reason */
345 else if (rc == 0)
346 rc = -EPERM;
347 return rc;
310} 348}
311 349
312static int audit_set_failure(int state, uid_t loginuid, u32 sid) 350static int audit_set_failure(int state, uid_t loginuid, u32 sid)
313{ 351{
314 int old = audit_failure; 352 int res, rc = 0, old = audit_failure;
315 353
316 if (state != AUDIT_FAIL_SILENT 354 if (state != AUDIT_FAIL_SILENT
317 && state != AUDIT_FAIL_PRINTK 355 && state != AUDIT_FAIL_PRINTK
318 && state != AUDIT_FAIL_PANIC) 356 && state != AUDIT_FAIL_PANIC)
319 return -EINVAL; 357 return -EINVAL;
320 358
359 /* check if we are locked */
360 if (audit_enabled == 2)
361 res = 0;
362 else
363 res = 1;
364
321 if (sid) { 365 if (sid) {
322 char *ctx = NULL; 366 char *ctx = NULL;
323 u32 len; 367 u32 len;
324 int rc; 368 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
325 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
326 return rc;
327 else
328 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 369 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
329 "audit_failure=%d old=%d by auid=%u subj=%s", 370 "audit_failure=%d old=%d by auid=%u"
330 state, old, loginuid, ctx); 371 " subj=%s res=%d",
331 kfree(ctx); 372 state, old, loginuid, ctx, res);
332 } else 373 kfree(ctx);
333 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 374 } else
334 "audit_failure=%d old=%d by auid=%u", 375 res = 0; /* Something weird, deny request */
335 state, old, loginuid); 376 }
336 audit_failure = state; 377 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
337 return 0; 378 "audit_failure=%d old=%d by auid=%u res=%d",
379 state, old, loginuid, res);
380
381 /* If we are allowed, make the change */
382 if (res == 1)
383 audit_failure = state;
384 /* Not allowed, update reason */
385 else if (rc == 0)
386 rc = -EPERM;
387 return rc;
338} 388}
339 389
340static int kauditd_thread(void *dummy) 390static int kauditd_thread(void *dummy)
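The four setters changed above (rate limit, backlog limit, enabled, failure) now share one shape: decide res from whether the configuration is locked (audit_enabled == 2), always emit an AUDIT_CONFIG_CHANGE record carrying res=, and only then either apply the value or return -EPERM. A minimal userspace sketch of that decision flow; the names and the AUDIT_LOCKED constant below are illustrative, not kernel API:

/*
 * Illustrative userspace model of the decision flow shared by
 * audit_set_rate_limit(), audit_set_backlog_limit(), audit_set_enabled()
 * and audit_set_failure() in this patch.
 */
#include <stdio.h>
#include <errno.h>

#define AUDIT_LOCKED 2          /* audit_enabled == 2: config locked */

static int audit_enabled = AUDIT_LOCKED;
static int audit_rate_limit = 100;

/* Returns 0 on success, -EPERM when the configuration is locked. */
static int set_rate_limit(int limit, unsigned int loginuid)
{
	int res = (audit_enabled == AUDIT_LOCKED) ? 0 : 1;
	int old = audit_rate_limit;

	/* The kernel always emits a CONFIG_CHANGE record, even on denial. */
	printf("audit_rate_limit=%d old=%d by auid=%u res=%d\n",
	       limit, old, loginuid, res);

	if (res == 1) {
		audit_rate_limit = limit;
		return 0;
	}
	return -EPERM;
}

int main(void)
{
	int rc = set_rate_limit(50, 1000);

	printf("rc=%d rate_limit=%d\n", rc, audit_rate_limit);
	return 0;
}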
@@ -599,6 +649,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
599 case AUDIT_DEL: 649 case AUDIT_DEL:
600 if (nlmsg_len(nlh) < sizeof(struct audit_rule)) 650 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
601 return -EINVAL; 651 return -EINVAL;
652 if (audit_enabled == 2) {
653 ab = audit_log_start(NULL, GFP_KERNEL,
654 AUDIT_CONFIG_CHANGE);
655 if (ab) {
656 audit_log_format(ab,
657 "pid=%d uid=%u auid=%u",
658 pid, uid, loginuid);
659 if (sid) {
660 if (selinux_sid_to_string(
661 sid, &ctx, &len)) {
662 audit_log_format(ab,
663 " ssid=%u", sid);
664 /* Maybe call audit_panic? */
665 } else
666 audit_log_format(ab,
667 " subj=%s", ctx);
668 kfree(ctx);
669 }
670 audit_log_format(ab, " audit_enabled=%d res=0",
671 audit_enabled);
672 audit_log_end(ab);
673 }
674 return -EPERM;
675 }
602 /* fallthrough */ 676 /* fallthrough */
603 case AUDIT_LIST: 677 case AUDIT_LIST:
604 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 678 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
@@ -609,6 +683,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
609 case AUDIT_DEL_RULE: 683 case AUDIT_DEL_RULE:
610 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) 684 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
611 return -EINVAL; 685 return -EINVAL;
686 if (audit_enabled == 2) {
687 ab = audit_log_start(NULL, GFP_KERNEL,
688 AUDIT_CONFIG_CHANGE);
689 if (ab) {
690 audit_log_format(ab,
691 "pid=%d uid=%u auid=%u",
692 pid, uid, loginuid);
693 if (sid) {
694 if (selinux_sid_to_string(
695 sid, &ctx, &len)) {
696 audit_log_format(ab,
697 " ssid=%u", sid);
698 /* Maybe call audit_panic? */
699 } else
700 audit_log_format(ab,
701 " subj=%s", ctx);
702 kfree(ctx);
703 }
704 audit_log_format(ab, " audit_enabled=%d res=0",
705 audit_enabled);
706 audit_log_end(ab);
707 }
708 return -EPERM;
709 }
612 /* fallthrough */ 710 /* fallthrough */
613 case AUDIT_LIST_RULES: 711 case AUDIT_LIST_RULES:
614 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 712 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9c8c23227c7f..3749193aed8c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -937,9 +937,10 @@ static void audit_update_watch(struct audit_parent *parent,
937 } 937 }
938 938
939 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 939 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
940 audit_log_format(ab, "audit updated rules specifying path="); 940 audit_log_format(ab, "op=updated rules specifying path=");
941 audit_log_untrustedstring(ab, owatch->path); 941 audit_log_untrustedstring(ab, owatch->path);
942 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); 942 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
943 audit_log_format(ab, " list=%d res=1", r->listnr);
943 audit_log_end(ab); 944 audit_log_end(ab);
944 945
945 audit_remove_watch(owatch); 946 audit_remove_watch(owatch);
@@ -969,14 +970,14 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
969 e = container_of(r, struct audit_entry, rule); 970 e = container_of(r, struct audit_entry, rule);
970 971
971 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 972 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
972 audit_log_format(ab, "audit implicitly removed rule path="); 973 audit_log_format(ab, "op=remove rule path=");
973 audit_log_untrustedstring(ab, w->path); 974 audit_log_untrustedstring(ab, w->path);
974 if (r->filterkey) { 975 if (r->filterkey) {
975 audit_log_format(ab, " key="); 976 audit_log_format(ab, " key=");
976 audit_log_untrustedstring(ab, r->filterkey); 977 audit_log_untrustedstring(ab, r->filterkey);
977 } else 978 } else
978 audit_log_format(ab, " key=(null)"); 979 audit_log_format(ab, " key=(null)");
979 audit_log_format(ab, " list=%d", r->listnr); 980 audit_log_format(ab, " list=%d res=1", r->listnr);
980 audit_log_end(ab); 981 audit_log_end(ab);
981 982
982 list_del(&r->rlist); 983 list_del(&r->rlist);
@@ -1410,7 +1411,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1410 audit_log_format(ab, " subj=%s", ctx); 1411 audit_log_format(ab, " subj=%s", ctx);
1411 kfree(ctx); 1412 kfree(ctx);
1412 } 1413 }
1413 audit_log_format(ab, " %s rule key=", action); 1414 audit_log_format(ab, " op=%s rule key=", action);
1414 if (rule->filterkey) 1415 if (rule->filterkey)
1415 audit_log_untrustedstring(ab, rule->filterkey); 1416 audit_log_untrustedstring(ab, rule->filterkey);
1416 else 1417 else
@@ -1601,8 +1602,8 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1601 1602
1602int audit_filter_user(struct netlink_skb_parms *cb, int type) 1603int audit_filter_user(struct netlink_skb_parms *cb, int type)
1603{ 1604{
1605 enum audit_state state = AUDIT_DISABLED;
1604 struct audit_entry *e; 1606 struct audit_entry *e;
1605 enum audit_state state;
1606 int ret = 1; 1607 int ret = 1;
1607 1608
1608 rcu_read_lock(); 1609 rcu_read_lock();
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 298897559ca4..359955800dd2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -170,6 +170,11 @@ struct audit_aux_data_sockaddr {
170 char a[0]; 170 char a[0];
171}; 171};
172 172
173struct audit_aux_data_fd_pair {
174 struct audit_aux_data d;
175 int fd[2];
176};
177
173struct audit_aux_data_path { 178struct audit_aux_data_path {
174 struct audit_aux_data d; 179 struct audit_aux_data d;
175 struct dentry *dentry; 180 struct dentry *dentry;
@@ -961,6 +966,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
961 audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); 966 audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
962 break; } 967 break; }
963 968
969 case AUDIT_FD_PAIR: {
970 struct audit_aux_data_fd_pair *axs = (void *)aux;
971 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
972 break; }
973
964 } 974 }
965 audit_log_end(ab); 975 audit_log_end(ab);
966 } 976 }
@@ -1815,6 +1825,36 @@ int audit_socketcall(int nargs, unsigned long *args)
1815} 1825}
1816 1826
1817/** 1827/**
1828 * __audit_fd_pair - record audit data for pipe and socketpair
1829 * @fd1: the first file descriptor
1830 * @fd2: the second file descriptor
1831 *
1832 * Returns 0 for success or NULL context or < 0 on error.
1833 */
1834int __audit_fd_pair(int fd1, int fd2)
1835{
1836 struct audit_context *context = current->audit_context;
1837 struct audit_aux_data_fd_pair *ax;
1838
1839 if (likely(!context)) {
1840 return 0;
1841 }
1842
1843 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
1844 if (!ax) {
1845 return -ENOMEM;
1846 }
1847
1848 ax->fd[0] = fd1;
1849 ax->fd[1] = fd2;
1850
1851 ax->d.type = AUDIT_FD_PAIR;
1852 ax->d.next = context->aux;
1853 context->aux = (void *)ax;
1854 return 0;
1855}
1856
1857/**
1818 * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto 1858 * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
1819 * @len: data length in user space 1859 * @len: data length in user space
1820 * @a: data address in kernel space 1860 * @a: data address in kernel space
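__audit_fd_pair() added above follows the usual auditsc aux-data pattern: allocate a type-specific record, fill it in, and push it onto the per-syscall context through the embedded header's next pointer. A self-contained sketch of that pattern with simplified stand-in types (not the kernel structures):

/*
 * Sketch of the aux-data pattern used by __audit_fd_pair() above.
 * The types here stand in for the kernel's audit_aux_data structures.
 */
#include <stdio.h>
#include <stdlib.h>

struct aux_data {                /* stands in for struct audit_aux_data */
	struct aux_data *next;
	int type;
};

struct aux_fd_pair {             /* stands in for audit_aux_data_fd_pair */
	struct aux_data d;
	int fd[2];
};

struct context {                 /* stands in for struct audit_context */
	struct aux_data *aux;
};

static int record_fd_pair(struct context *ctx, int fd1, int fd2)
{
	struct aux_fd_pair *ax = malloc(sizeof(*ax));

	if (!ax)
		return -1;              /* kernel returns -ENOMEM */
	ax->fd[0] = fd1;
	ax->fd[1] = fd2;
	ax->d.type = 1;                 /* kernel uses AUDIT_FD_PAIR */
	ax->d.next = ctx->aux;          /* push onto the aux list */
	ctx->aux = &ax->d;
	return 0;
}

int main(void)
{
	struct context ctx = { .aux = NULL };

	record_fd_pair(&ctx, 3, 4);
	printf("first aux record type=%d\n", ctx.aux->type);
	free(ctx.aux);
	return 0;
}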
diff --git a/kernel/capability.c b/kernel/capability.c
index edb845a6e84a..c8d3c7762034 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -92,15 +92,17 @@ out:
92 * cap_set_pg - set capabilities for all processes in a given process 92 * cap_set_pg - set capabilities for all processes in a given process
93 * group. We call this holding task_capability_lock and tasklist_lock. 93 * group. We call this holding task_capability_lock and tasklist_lock.
94 */ 94 */
95static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, 95static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
96 kernel_cap_t *inheritable, 96 kernel_cap_t *inheritable,
97 kernel_cap_t *permitted) 97 kernel_cap_t *permitted)
98{ 98{
99 struct task_struct *g, *target; 99 struct task_struct *g, *target;
100 int ret = -EPERM; 100 int ret = -EPERM;
101 int found = 0; 101 int found = 0;
102 struct pid *pgrp;
102 103
103 do_each_task_pid(pgrp, PIDTYPE_PGID, g) { 104 pgrp = find_pid(pgrp_nr);
105 do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
104 target = g; 106 target = g;
105 while_each_thread(g, target) { 107 while_each_thread(g, target) {
106 if (!security_capset_check(target, effective, 108 if (!security_capset_check(target, effective,
@@ -113,7 +115,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
113 } 115 }
114 found = 1; 116 found = 1;
115 } 117 }
116 } while_each_task_pid(pgrp, PIDTYPE_PGID, g); 118 } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
117 119
118 if (!found) 120 if (!found)
119 ret = 0; 121 ret = 0;
diff --git a/kernel/compat.c b/kernel/compat.c
index 6952dd057300..cebb4c28c039 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1016,3 +1016,69 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
1016 return sys_migrate_pages(pid, nr_bits + 1, old, new); 1016 return sys_migrate_pages(pid, nr_bits + 1, old, new);
1017} 1017}
1018#endif 1018#endif
1019
1020struct compat_sysinfo {
1021 s32 uptime;
1022 u32 loads[3];
1023 u32 totalram;
1024 u32 freeram;
1025 u32 sharedram;
1026 u32 bufferram;
1027 u32 totalswap;
1028 u32 freeswap;
1029 u16 procs;
1030 u16 pad;
1031 u32 totalhigh;
1032 u32 freehigh;
1033 u32 mem_unit;
1034 char _f[20-2*sizeof(u32)-sizeof(int)];
1035};
1036
1037asmlinkage long
1038compat_sys_sysinfo(struct compat_sysinfo __user *info)
1039{
1040 struct sysinfo s;
1041
1042 do_sysinfo(&s);
1043
1044 /* Check to see if any memory value is too large for 32-bit and scale
1045 * down if needed
1046 */
1047 if ((s.totalram >> 32) || (s.totalswap >> 32)) {
1048 int bitcount = 0;
1049
1050 while (s.mem_unit < PAGE_SIZE) {
1051 s.mem_unit <<= 1;
1052 bitcount++;
1053 }
1054
1055 s.totalram >>= bitcount;
1056 s.freeram >>= bitcount;
1057 s.sharedram >>= bitcount;
1058 s.bufferram >>= bitcount;
1059 s.totalswap >>= bitcount;
1060 s.freeswap >>= bitcount;
1061 s.totalhigh >>= bitcount;
1062 s.freehigh >>= bitcount;
1063 }
1064
1065 if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
1066 __put_user (s.uptime, &info->uptime) ||
1067 __put_user (s.loads[0], &info->loads[0]) ||
1068 __put_user (s.loads[1], &info->loads[1]) ||
1069 __put_user (s.loads[2], &info->loads[2]) ||
1070 __put_user (s.totalram, &info->totalram) ||
1071 __put_user (s.freeram, &info->freeram) ||
1072 __put_user (s.sharedram, &info->sharedram) ||
1073 __put_user (s.bufferram, &info->bufferram) ||
1074 __put_user (s.totalswap, &info->totalswap) ||
1075 __put_user (s.freeswap, &info->freeswap) ||
1076 __put_user (s.procs, &info->procs) ||
1077 __put_user (s.totalhigh, &info->totalhigh) ||
1078 __put_user (s.freehigh, &info->freehigh) ||
1079 __put_user (s.mem_unit, &info->mem_unit))
1080 return -EFAULT;
1081
1082 return 0;
1083}
1084
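When a counter does not fit in 32 bits, the new compat_sys_sysinfo() grows mem_unit until it reaches PAGE_SIZE and shifts every counter right by the same number of bits, so counter * mem_unit still describes the same amount of memory. A standalone arithmetic check of that transformation (the page size is assumed to be 4096 here):

/* Worked example of the mem_unit scaling in compat_sys_sysinfo() above. */
#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_PAGE_SIZE 4096ULL   /* assumption for this example */

int main(void)
{
	uint64_t totalram = 6ULL << 30;  /* 6 GiB expressed in 1-byte units */
	uint64_t mem_unit = 1;
	int bitcount = 0;

	if (totalram >> 32) {
		while (mem_unit < EXAMPLE_PAGE_SIZE) {
			mem_unit <<= 1;
			bitcount++;
		}
		totalram >>= bitcount;
	}

	/* 6 GiB / 4096 = 1572864 units: fits a 32-bit field, and
	 * totalram * mem_unit still reports the same number of bytes. */
	printf("mem_unit=%llu totalram=%llu (fits 32 bit: %s)\n",
	       (unsigned long long)mem_unit,
	       (unsigned long long)totalram,
	       (totalram >> 32) ? "no" : "yes");
	return 0;
}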
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7406fe6966f9..3d4206ada5c9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -309,6 +309,8 @@ void enable_nonboot_cpus(void)
309 mutex_lock(&cpu_add_remove_lock); 309 mutex_lock(&cpu_add_remove_lock);
310 cpu_hotplug_disabled = 0; 310 cpu_hotplug_disabled = 0;
311 mutex_unlock(&cpu_add_remove_lock); 311 mutex_unlock(&cpu_add_remove_lock);
312 if (cpus_empty(frozen_cpus))
313 return;
312 314
313 printk("Enabling non-boot CPUs ...\n"); 315 printk("Enabling non-boot CPUs ...\n");
314 for_each_cpu_mask(cpu, frozen_cpus) { 316 for_each_cpu_mask(cpu, frozen_cpus) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6b05dc69c959..f382b0f775e1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1540,7 +1540,7 @@ static const struct file_operations cpuset_file_operations = {
1540 .release = cpuset_file_release, 1540 .release = cpuset_file_release,
1541}; 1541};
1542 1542
1543static struct inode_operations cpuset_dir_inode_operations = { 1543static const struct inode_operations cpuset_dir_inode_operations = {
1544 .lookup = simple_lookup, 1544 .lookup = simple_lookup,
1545 .mkdir = cpuset_mkdir, 1545 .mkdir = cpuset_mkdir,
1546 .rmdir = cpuset_rmdir, 1546 .rmdir = cpuset_rmdir,
@@ -2656,7 +2656,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
2656 return single_open(file, proc_cpuset_show, pid); 2656 return single_open(file, proc_cpuset_show, pid);
2657} 2657}
2658 2658
2659struct file_operations proc_cpuset_operations = { 2659const struct file_operations proc_cpuset_operations = {
2660 .open = cpuset_open, 2660 .open = cpuset_open,
2661 .read = seq_read, 2661 .read = seq_read,
2662 .llseek = seq_lseek, 2662 .llseek = seq_lseek,
diff --git a/kernel/exit.c b/kernel/exit.c
index fec12eb12471..f132349c0325 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -185,21 +185,19 @@ repeat:
185 * This checks not only the pgrp, but falls back on the pid if no 185 * This checks not only the pgrp, but falls back on the pid if no
186 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly 186 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
187 * without this... 187 * without this...
188 *
189 * The caller must hold rcu lock or the tasklist lock.
188 */ 190 */
189int session_of_pgrp(int pgrp) 191struct pid *session_of_pgrp(struct pid *pgrp)
190{ 192{
191 struct task_struct *p; 193 struct task_struct *p;
192 int sid = 0; 194 struct pid *sid = NULL;
193
194 read_lock(&tasklist_lock);
195 195
196 p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); 196 p = pid_task(pgrp, PIDTYPE_PGID);
197 if (p == NULL) 197 if (p == NULL)
198 p = find_task_by_pid(pgrp); 198 p = pid_task(pgrp, PIDTYPE_PID);
199 if (p != NULL) 199 if (p != NULL)
200 sid = process_session(p); 200 sid = task_session(p);
201
202 read_unlock(&tasklist_lock);
203 201
204 return sid; 202 return sid;
205} 203}
@@ -212,53 +210,52 @@ int session_of_pgrp(int pgrp)
212 * 210 *
213 * "I ask you, have you ever known what it is to be an orphan?" 211 * "I ask you, have you ever known what it is to be an orphan?"
214 */ 212 */
215static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) 213static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
216{ 214{
217 struct task_struct *p; 215 struct task_struct *p;
218 int ret = 1; 216 int ret = 1;
219 217
220 do_each_task_pid(pgrp, PIDTYPE_PGID, p) { 218 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
221 if (p == ignored_task 219 if (p == ignored_task
222 || p->exit_state 220 || p->exit_state
223 || is_init(p->real_parent)) 221 || is_init(p->real_parent))
224 continue; 222 continue;
225 if (process_group(p->real_parent) != pgrp && 223 if (task_pgrp(p->real_parent) != pgrp &&
226 process_session(p->real_parent) == process_session(p)) { 224 task_session(p->real_parent) == task_session(p)) {
227 ret = 0; 225 ret = 0;
228 break; 226 break;
229 } 227 }
230 } while_each_task_pid(pgrp, PIDTYPE_PGID, p); 228 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
231 return ret; /* (sighing) "Often!" */ 229 return ret; /* (sighing) "Often!" */
232} 230}
233 231
234int is_orphaned_pgrp(int pgrp) 232int is_current_pgrp_orphaned(void)
235{ 233{
236 int retval; 234 int retval;
237 235
238 read_lock(&tasklist_lock); 236 read_lock(&tasklist_lock);
239 retval = will_become_orphaned_pgrp(pgrp, NULL); 237 retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
240 read_unlock(&tasklist_lock); 238 read_unlock(&tasklist_lock);
241 239
242 return retval; 240 return retval;
243} 241}
244 242
245static int has_stopped_jobs(int pgrp) 243static int has_stopped_jobs(struct pid *pgrp)
246{ 244{
247 int retval = 0; 245 int retval = 0;
248 struct task_struct *p; 246 struct task_struct *p;
249 247
250 do_each_task_pid(pgrp, PIDTYPE_PGID, p) { 248 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
251 if (p->state != TASK_STOPPED) 249 if (p->state != TASK_STOPPED)
252 continue; 250 continue;
253 retval = 1; 251 retval = 1;
254 break; 252 break;
255 } while_each_task_pid(pgrp, PIDTYPE_PGID, p); 253 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
256 return retval; 254 return retval;
257} 255}
258 256
259/** 257/**
260 * reparent_to_init - Reparent the calling kernel thread to the init task 258 * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to.
261 * of the pid space that the thread belongs to.
262 * 259 *
263 * If a kernel thread is launched as a result of a system call, or if 260 * If a kernel thread is launched as a result of a system call, or if
264 * it ever exits, it should generally reparent itself to init so that 261 * it ever exits, it should generally reparent itself to init so that
@@ -431,8 +428,10 @@ static void close_files(struct files_struct * files)
431 while (set) { 428 while (set) {
432 if (set & 1) { 429 if (set & 1) {
433 struct file * file = xchg(&fdt->fd[i], NULL); 430 struct file * file = xchg(&fdt->fd[i], NULL);
434 if (file) 431 if (file) {
435 filp_close(file, files); 432 filp_close(file, files);
433 cond_resched();
434 }
436 } 435 }
437 i++; 436 i++;
438 set >>= 1; 437 set >>= 1;
@@ -649,14 +648,14 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
649 * than we are, and it was the only connection 648 * than we are, and it was the only connection
650 * outside, so the child pgrp is now orphaned. 649 * outside, so the child pgrp is now orphaned.
651 */ 650 */
652 if ((process_group(p) != process_group(father)) && 651 if ((task_pgrp(p) != task_pgrp(father)) &&
653 (process_session(p) == process_session(father))) { 652 (task_session(p) == task_session(father))) {
654 int pgrp = process_group(p); 653 struct pid *pgrp = task_pgrp(p);
655 654
656 if (will_become_orphaned_pgrp(pgrp, NULL) && 655 if (will_become_orphaned_pgrp(pgrp, NULL) &&
657 has_stopped_jobs(pgrp)) { 656 has_stopped_jobs(pgrp)) {
658 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); 657 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
659 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); 658 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
660 } 659 }
661 } 660 }
662} 661}
@@ -736,6 +735,7 @@ static void exit_notify(struct task_struct *tsk)
736 int state; 735 int state;
737 struct task_struct *t; 736 struct task_struct *t;
738 struct list_head ptrace_dead, *_p, *_n; 737 struct list_head ptrace_dead, *_p, *_n;
738 struct pid *pgrp;
739 739
740 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) 740 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
741 && !thread_group_empty(tsk)) { 741 && !thread_group_empty(tsk)) {
@@ -788,12 +788,13 @@ static void exit_notify(struct task_struct *tsk)
788 788
789 t = tsk->real_parent; 789 t = tsk->real_parent;
790 790
791 if ((process_group(t) != process_group(tsk)) && 791 pgrp = task_pgrp(tsk);
792 (process_session(t) == process_session(tsk)) && 792 if ((task_pgrp(t) != pgrp) &&
793 will_become_orphaned_pgrp(process_group(tsk), tsk) && 793 (task_session(t) != task_session(tsk)) &&
794 has_stopped_jobs(process_group(tsk))) { 794 will_become_orphaned_pgrp(pgrp, tsk) &&
795 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); 795 has_stopped_jobs(pgrp)) {
796 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); 796 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
797 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
797 } 798 }
798 799
799 /* Let father know we died 800 /* Let father know we died
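The exit.c changes above switch the orphaned-process-group checks from pid_t values to struct pid pointers; the condition tested in will_become_orphaned_pgrp() is unchanged: a member whose parent sits in a different process group but the same session keeps the group attached to job control. A rough userspace illustration of that per-task test using the POSIX getters (the kernel walks every member of the group, this only inspects the caller):

/* Rough userspace illustration of the orphaned-pgrp condition in exit.c. */
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	pid_t parent = getppid();
	pid_t my_pgrp = getpgid(0);
	pid_t parent_pgrp = getpgid(parent);
	pid_t my_sid = getsid(0);
	pid_t parent_sid = getsid(parent);

	/* Per-task test from will_become_orphaned_pgrp(): a parent outside
	 * our pgrp but inside our session ties the group to job control. */
	int has_outside_link = (parent_pgrp != my_pgrp) &&
			       (parent_sid == my_sid);

	printf("pgrp=%d parent_pgrp=%d sid=%d parent_sid=%d -> %s\n",
	       (int)my_pgrp, (int)parent_pgrp, (int)my_sid, (int)parent_sid,
	       has_outside_link ? "group has an outside link"
				: "group would be orphaned without this link");
	return 0;
}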
diff --git a/kernel/fork.c b/kernel/fork.c
index d57118da73ff..d154cc786489 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
858 init_sigpending(&sig->shared_pending); 858 init_sigpending(&sig->shared_pending);
859 INIT_LIST_HEAD(&sig->posix_timers); 859 INIT_LIST_HEAD(&sig->posix_timers);
860 860
861 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); 861 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
862 sig->it_real_incr.tv64 = 0; 862 sig->it_real_incr.tv64 = 0;
863 sig->real_timer.function = it_real_fn; 863 sig->real_timer.function = it_real_fn;
864 sig->tsk = tsk; 864 sig->tsk = tsk;
@@ -869,7 +869,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
869 sig->it_prof_incr = cputime_zero; 869 sig->it_prof_incr = cputime_zero;
870 870
871 sig->leader = 0; /* session leadership doesn't inherit */ 871 sig->leader = 0; /* session leadership doesn't inherit */
872 sig->tty_old_pgrp = 0; 872 sig->tty_old_pgrp = NULL;
873 873
874 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 874 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
875 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 875 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
@@ -1038,10 +1038,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1038 p->utime = cputime_zero; 1038 p->utime = cputime_zero;
1039 p->stime = cputime_zero; 1039 p->stime = cputime_zero;
1040 p->sched_time = 0; 1040 p->sched_time = 0;
1041#ifdef CONFIG_TASK_XACCT
1041 p->rchar = 0; /* I/O counter: bytes read */ 1042 p->rchar = 0; /* I/O counter: bytes read */
1042 p->wchar = 0; /* I/O counter: bytes written */ 1043 p->wchar = 0; /* I/O counter: bytes written */
1043 p->syscr = 0; /* I/O counter: read syscalls */ 1044 p->syscr = 0; /* I/O counter: read syscalls */
1044 p->syscw = 0; /* I/O counter: write syscalls */ 1045 p->syscw = 0; /* I/O counter: write syscalls */
1046#endif
1045 task_io_accounting_init(p); 1047 task_io_accounting_init(p);
1046 acct_clear_integrals(p); 1048 acct_clear_integrals(p);
1047 1049
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a737de857d3..e749e7df14b1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1134 1134
1135 if (sec != MAX_SCHEDULE_TIMEOUT) { 1135 if (sec != MAX_SCHEDULE_TIMEOUT) {
1136 to = &timeout; 1136 to = &timeout;
1137 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); 1137 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
1138 hrtimer_init_sleeper(to, current); 1138 hrtimer_init_sleeper(to, current);
1139 to->timer.expires = ktime_set(sec, nsec); 1139 to->timer.expires = ktime_set(sec, nsec);
1140 } 1140 }
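The fork.c and futex.c hunks are mechanical renames from HRTIMER_ABS/HRTIMER_REL to HRTIMER_MODE_ABS/HRTIMER_MODE_REL, part of the hrtimer rework below. A minimal caller-side sketch of the renamed interface as kernel code (the example_* names are illustrative):

/* Caller-side sketch of the renamed hrtimer mode constants (kernel code). */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
	/* one-shot: do not rearm */
	return HRTIMER_NORESTART;
}

static void example_arm_timer(void)
{
	hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	example_timer.function = example_timer_fn;
	/* fire 500 ms from now, relative to the monotonic clock */
	hrtimer_start(&example_timer, ktime_set(0, 500 * 1000 * 1000),
		      HRTIMER_MODE_REL);
}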
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d0ba190dfeb6..476cb0c0b4a4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1,8 +1,9 @@
1/* 1/*
2 * linux/kernel/hrtimer.c 2 * linux/kernel/hrtimer.c
3 * 3 *
4 * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> 4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar 5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
6 * 7 *
7 * High-resolution kernel timers 8 * High-resolution kernel timers
8 * 9 *
@@ -31,12 +32,17 @@
31 */ 32 */
32 33
33#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/irq.h>
34#include <linux/module.h> 36#include <linux/module.h>
35#include <linux/percpu.h> 37#include <linux/percpu.h>
36#include <linux/hrtimer.h> 38#include <linux/hrtimer.h>
37#include <linux/notifier.h> 39#include <linux/notifier.h>
38#include <linux/syscalls.h> 40#include <linux/syscalls.h>
41#include <linux/kallsyms.h>
39#include <linux/interrupt.h> 42#include <linux/interrupt.h>
43#include <linux/tick.h>
44#include <linux/seq_file.h>
45#include <linux/err.h>
40 46
41#include <asm/uaccess.h> 47#include <asm/uaccess.h>
42 48
@@ -45,7 +51,7 @@
45 * 51 *
46 * returns the time in ktime_t format 52 * returns the time in ktime_t format
47 */ 53 */
48static ktime_t ktime_get(void) 54ktime_t ktime_get(void)
49{ 55{
50 struct timespec now; 56 struct timespec now;
51 57
@@ -59,7 +65,7 @@ static ktime_t ktime_get(void)
59 * 65 *
60 * returns the time in ktime_t format 66 * returns the time in ktime_t format
61 */ 67 */
62static ktime_t ktime_get_real(void) 68ktime_t ktime_get_real(void)
63{ 69{
64 struct timespec now; 70 struct timespec now;
65 71
@@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
79 * This ensures that we capture erroneous accesses to these clock ids 85 * This ensures that we capture erroneous accesses to these clock ids
80 * rather than moving them into the range of valid clock id's. 86 * rather than moving them into the range of valid clock id's.
81 */ 87 */
82 88DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
83#define MAX_HRTIMER_BASES 2
84
85static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
86{ 89{
90
91 .clock_base =
87 { 92 {
88 .index = CLOCK_REALTIME, 93 {
89 .get_time = &ktime_get_real, 94 .index = CLOCK_REALTIME,
90 .resolution = KTIME_REALTIME_RES, 95 .get_time = &ktime_get_real,
91 }, 96 .resolution = KTIME_LOW_RES,
92 { 97 },
93 .index = CLOCK_MONOTONIC, 98 {
94 .get_time = &ktime_get, 99 .index = CLOCK_MONOTONIC,
95 .resolution = KTIME_MONOTONIC_RES, 100 .get_time = &ktime_get,
96 }, 101 .resolution = KTIME_LOW_RES,
102 },
103 }
97}; 104};
98 105
99/** 106/**
@@ -102,7 +109,7 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
102 * 109 *
103 * The function calculates the monotonic clock from the realtime 110 * The function calculates the monotonic clock from the realtime
104 * clock and the wall_to_monotonic offset and stores the result 111 * clock and the wall_to_monotonic offset and stores the result
105 * in normalized timespec format in the variable pointed to by ts. 112 * in normalized timespec format in the variable pointed to by @ts.
106 */ 113 */
107void ktime_get_ts(struct timespec *ts) 114void ktime_get_ts(struct timespec *ts)
108{ 115{
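ktime_get_ts() builds CLOCK_MONOTONIC time by adding the wall_to_monotonic offset to the realtime clock, so the two clocks differ by an offset that only moves when the wall clock is set. The same relationship is visible from userspace (illustrative only; link with -lrt on older glibc):

/* Observe the realtime/monotonic offset that ktime_get_ts() maintains. */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec real, mono;

	clock_gettime(CLOCK_REALTIME, &real);
	clock_gettime(CLOCK_MONOTONIC, &mono);

	/* wall_to_monotonic is (roughly) monotonic - realtime in kernel terms */
	double offset = (mono.tv_sec - real.tv_sec) +
			(mono.tv_nsec - real.tv_nsec) / 1e9;

	printf("CLOCK_REALTIME - CLOCK_MONOTONIC offset: %.3f s\n", -offset);
	return 0;
}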
@@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
125 * Get the coarse grained time at the softirq based on xtime and 132 * Get the coarse grained time at the softirq based on xtime and
126 * wall_to_monotonic. 133 * wall_to_monotonic.
127 */ 134 */
128static void hrtimer_get_softirq_time(struct hrtimer_base *base) 135static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
129{ 136{
130 ktime_t xtim, tomono; 137 ktime_t xtim, tomono;
138 struct timespec xts;
131 unsigned long seq; 139 unsigned long seq;
132 140
133 do { 141 do {
134 seq = read_seqbegin(&xtime_lock); 142 seq = read_seqbegin(&xtime_lock);
135 xtim = timespec_to_ktime(xtime); 143#ifdef CONFIG_NO_HZ
136 tomono = timespec_to_ktime(wall_to_monotonic); 144 getnstimeofday(&xts);
137 145#else
146 xts = xtime;
147#endif
138 } while (read_seqretry(&xtime_lock, seq)); 148 } while (read_seqretry(&xtime_lock, seq));
139 149
140 base[CLOCK_REALTIME].softirq_time = xtim; 150 xtim = timespec_to_ktime(xts);
141 base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); 151 tomono = timespec_to_ktime(wall_to_monotonic);
152 base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
153 base->clock_base[CLOCK_MONOTONIC].softirq_time =
154 ktime_add(xtim, tomono);
155}
156
157/*
158 * Helper function to check, whether the timer is running the callback
159 * function
160 */
161static inline int hrtimer_callback_running(struct hrtimer *timer)
162{
163 return timer->state & HRTIMER_STATE_CALLBACK;
142} 164}
143 165
144/* 166/*
@@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
147 */ 169 */
148#ifdef CONFIG_SMP 170#ifdef CONFIG_SMP
149 171
150#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0)
151
152/* 172/*
153 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock 173 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
154 * means that all timers which are tied to this base via timer->base are 174 * means that all timers which are tied to this base via timer->base are
@@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
161 * possible to set timer->base = NULL and drop the lock: the timer remains 181 * possible to set timer->base = NULL and drop the lock: the timer remains
162 * locked. 182 * locked.
163 */ 183 */
164static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, 184static
165 unsigned long *flags) 185struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
186 unsigned long *flags)
166{ 187{
167 struct hrtimer_base *base; 188 struct hrtimer_clock_base *base;
168 189
169 for (;;) { 190 for (;;) {
170 base = timer->base; 191 base = timer->base;
171 if (likely(base != NULL)) { 192 if (likely(base != NULL)) {
172 spin_lock_irqsave(&base->lock, *flags); 193 spin_lock_irqsave(&base->cpu_base->lock, *flags);
173 if (likely(base == timer->base)) 194 if (likely(base == timer->base))
174 return base; 195 return base;
175 /* The timer has migrated to another CPU: */ 196 /* The timer has migrated to another CPU: */
176 spin_unlock_irqrestore(&base->lock, *flags); 197 spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
177 } 198 }
178 cpu_relax(); 199 cpu_relax();
179 } 200 }
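lock_hrtimer_base() above is a lock-and-revalidate loop: read timer->base, take that base's lock, and retry if the timer migrated to another CPU in the meantime (switch_hrtimer_base() sets timer->base to NULL before dropping the old lock). The same idiom reduced to plain pthreads; the structure names are illustrative:

/* The lock-and-revalidate idiom behind lock_hrtimer_base(), in pthreads. */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

struct base {
	pthread_mutex_t lock;
};

struct object {
	struct base *base;       /* may be switched by another thread */
};

static struct base *lock_object_base(struct object *obj)
{
	struct base *b;

	for (;;) {
		b = obj->base;
		if (b != NULL) {
			pthread_mutex_lock(&b->lock);
			if (b == obj->base)
				return b;        /* still the right base */
			pthread_mutex_unlock(&b->lock);  /* migrated: retry */
		}
		sched_yield();   /* cpu_relax() stand-in */
	}
}

int main(void)
{
	struct base b = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct object o = { .base = &b };
	struct base *locked = lock_object_base(&o);

	printf("locked base %p\n", (void *)locked);
	pthread_mutex_unlock(&locked->lock);
	return 0;
}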
@@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
182/* 203/*
183 * Switch the timer base to the current CPU when possible. 204 * Switch the timer base to the current CPU when possible.
184 */ 205 */
185static inline struct hrtimer_base * 206static inline struct hrtimer_clock_base *
186switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) 207switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
187{ 208{
188 struct hrtimer_base *new_base; 209 struct hrtimer_clock_base *new_base;
210 struct hrtimer_cpu_base *new_cpu_base;
189 211
190 new_base = &__get_cpu_var(hrtimer_bases)[base->index]; 212 new_cpu_base = &__get_cpu_var(hrtimer_bases);
213 new_base = &new_cpu_base->clock_base[base->index];
191 214
192 if (base != new_base) { 215 if (base != new_base) {
193 /* 216 /*
@@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
199 * completed. There is no conflict as we hold the lock until 222 * completed. There is no conflict as we hold the lock until
200 * the timer is enqueued. 223 * the timer is enqueued.
201 */ 224 */
202 if (unlikely(base->curr_timer == timer)) 225 if (unlikely(hrtimer_callback_running(timer)))
203 return base; 226 return base;
204 227
205 /* See the comment in lock_timer_base() */ 228 /* See the comment in lock_timer_base() */
206 timer->base = NULL; 229 timer->base = NULL;
207 spin_unlock(&base->lock); 230 spin_unlock(&base->cpu_base->lock);
208 spin_lock(&new_base->lock); 231 spin_lock(&new_base->cpu_base->lock);
209 timer->base = new_base; 232 timer->base = new_base;
210 } 233 }
211 return new_base; 234 return new_base;
@@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
213 236
214#else /* CONFIG_SMP */ 237#else /* CONFIG_SMP */
215 238
216#define set_curr_timer(b, t) do { } while (0) 239static inline struct hrtimer_clock_base *
217
218static inline struct hrtimer_base *
219lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 240lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
220{ 241{
221 struct hrtimer_base *base = timer->base; 242 struct hrtimer_clock_base *base = timer->base;
222 243
223 spin_lock_irqsave(&base->lock, *flags); 244 spin_lock_irqsave(&base->cpu_base->lock, *flags);
224 245
225 return base; 246 return base;
226} 247}
227 248
228#define switch_hrtimer_base(t, b) (b) 249# define switch_hrtimer_base(t, b) (b)
229 250
230#endif /* !CONFIG_SMP */ 251#endif /* !CONFIG_SMP */
231 252
@@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
256 277
257 return ktime_add(kt, tmp); 278 return ktime_add(kt, tmp);
258} 279}
259
260#else /* CONFIG_KTIME_SCALAR */
261
262# endif /* !CONFIG_KTIME_SCALAR */ 280# endif /* !CONFIG_KTIME_SCALAR */
263 281
264/* 282/*
265 * Divide a ktime value by a nanosecond value 283 * Divide a ktime value by a nanosecond value
266 */ 284 */
267static unsigned long ktime_divns(const ktime_t kt, s64 div) 285unsigned long ktime_divns(const ktime_t kt, s64 div)
268{ 286{
269 u64 dclc, inc, dns; 287 u64 dclc, inc, dns;
270 int sft = 0; 288 int sft = 0;
@@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div)
281 299
282 return (unsigned long) dclc; 300 return (unsigned long) dclc;
283} 301}
284
285#else /* BITS_PER_LONG < 64 */
286# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
287#endif /* BITS_PER_LONG >= 64 */ 302#endif /* BITS_PER_LONG >= 64 */
288 303
304/* High resolution timer related functions */
305#ifdef CONFIG_HIGH_RES_TIMERS
306
307/*
308 * High resolution timer enabled ?
309 */
310static int hrtimer_hres_enabled __read_mostly = 1;
311
312/*
313 * Enable / Disable high resolution mode
314 */
315static int __init setup_hrtimer_hres(char *str)
316{
317 if (!strcmp(str, "off"))
318 hrtimer_hres_enabled = 0;
319 else if (!strcmp(str, "on"))
320 hrtimer_hres_enabled = 1;
321 else
322 return 0;
323 return 1;
324}
325
326__setup("highres=", setup_hrtimer_hres);
327
328/*
329 * hrtimer_high_res_enabled - query, if the highres mode is enabled
330 */
331static inline int hrtimer_is_hres_enabled(void)
332{
333 return hrtimer_hres_enabled;
334}
335
336/*
337 * Is the high resolution mode active ?
338 */
339static inline int hrtimer_hres_active(void)
340{
341 return __get_cpu_var(hrtimer_bases).hres_active;
342}
343
344/*
345 * Reprogram the event source with checking both queues for the
346 * next event
347 * Called with interrupts disabled and base->lock held
348 */
349static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
350{
351 int i;
352 struct hrtimer_clock_base *base = cpu_base->clock_base;
353 ktime_t expires;
354
355 cpu_base->expires_next.tv64 = KTIME_MAX;
356
357 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
358 struct hrtimer *timer;
359
360 if (!base->first)
361 continue;
362 timer = rb_entry(base->first, struct hrtimer, node);
363 expires = ktime_sub(timer->expires, base->offset);
364 if (expires.tv64 < cpu_base->expires_next.tv64)
365 cpu_base->expires_next = expires;
366 }
367
368 if (cpu_base->expires_next.tv64 != KTIME_MAX)
369 tick_program_event(cpu_base->expires_next, 1);
370}
371
372/*
373 * Shared reprogramming for clock_realtime and clock_monotonic
374 *
375 * When a timer is enqueued and expires earlier than the already enqueued
376 * timers, we have to check, whether it expires earlier than the timer for
377 * which the clock event device was armed.
378 *
379 * Called with interrupts disabled and base->cpu_base.lock held
380 */
381static int hrtimer_reprogram(struct hrtimer *timer,
382 struct hrtimer_clock_base *base)
383{
384 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
385 ktime_t expires = ktime_sub(timer->expires, base->offset);
386 int res;
387
388 /*
389 * When the callback is running, we do not reprogram the clock event
390 * device. The timer callback is either running on a different CPU or
391 * the callback is executed in the hrtimer_interupt context. The
392 * reprogramming is handled either by the softirq, which called the
393 * callback or at the end of the hrtimer_interrupt.
394 */
395 if (hrtimer_callback_running(timer))
396 return 0;
397
398 if (expires.tv64 >= expires_next->tv64)
399 return 0;
400
401 /*
402 * Clockevents returns -ETIME, when the event was in the past.
403 */
404 res = tick_program_event(expires, 0);
405 if (!IS_ERR_VALUE(res))
406 *expires_next = expires;
407 return res;
408}
409
410
411/*
412 * Retrigger next event is called after clock was set
413 *
414 * Called with interrupts disabled via on_each_cpu()
415 */
416static void retrigger_next_event(void *arg)
417{
418 struct hrtimer_cpu_base *base;
419 struct timespec realtime_offset;
420 unsigned long seq;
421
422 if (!hrtimer_hres_active())
423 return;
424
425 do {
426 seq = read_seqbegin(&xtime_lock);
427 set_normalized_timespec(&realtime_offset,
428 -wall_to_monotonic.tv_sec,
429 -wall_to_monotonic.tv_nsec);
430 } while (read_seqretry(&xtime_lock, seq));
431
432 base = &__get_cpu_var(hrtimer_bases);
433
434 /* Adjust CLOCK_REALTIME offset */
435 spin_lock(&base->lock);
436 base->clock_base[CLOCK_REALTIME].offset =
437 timespec_to_ktime(realtime_offset);
438
439 hrtimer_force_reprogram(base);
440 spin_unlock(&base->lock);
441}
442
443/*
444 * Clock realtime was set
445 *
446 * Change the offset of the realtime clock vs. the monotonic
447 * clock.
448 *
449 * We might have to reprogram the high resolution timer interrupt. On
450 * SMP we call the architecture specific code to retrigger _all_ high
451 * resolution timer interrupts. On UP we just disable interrupts and
452 * call the high resolution interrupt code.
453 */
454void clock_was_set(void)
455{
456 /* Retrigger the CPU local events everywhere */
457 on_each_cpu(retrigger_next_event, NULL, 0, 1);
458}
459
460/*
461 * Check, whether the timer is on the callback pending list
462 */
463static inline int hrtimer_cb_pending(const struct hrtimer *timer)
464{
465 return timer->state & HRTIMER_STATE_PENDING;
466}
467
468/*
469 * Remove a timer from the callback pending list
470 */
471static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
472{
473 list_del_init(&timer->cb_entry);
474}
475
476/*
477 * Initialize the high resolution related parts of cpu_base
478 */
479static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
480{
481 base->expires_next.tv64 = KTIME_MAX;
482 base->hres_active = 0;
483 INIT_LIST_HEAD(&base->cb_pending);
484}
485
486/*
487 * Initialize the high resolution related parts of a hrtimer
488 */
489static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
490{
491 INIT_LIST_HEAD(&timer->cb_entry);
492}
493
494/*
495 * When High resolution timers are active, try to reprogram. Note, that in case
496 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
497 * check happens. The timer gets enqueued into the rbtree. The reprogramming
498 * and expiry check is done in the hrtimer_interrupt or in the softirq.
499 */
500static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
501 struct hrtimer_clock_base *base)
502{
503 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
504
505 /* Timer is expired, act upon the callback mode */
506 switch(timer->cb_mode) {
507 case HRTIMER_CB_IRQSAFE_NO_RESTART:
508 /*
509 * We can call the callback from here. No restart
510 * happens, so no danger of recursion
511 */
512 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
513 return 1;
514 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
515 /*
516 * This is solely for the sched tick emulation with
517 * dynamic tick support to ensure that we do not
518 * restart the tick right on the edge and end up with
519 * the tick timer in the softirq ! The calling site
520 * takes care of this.
521 */
522 return 1;
523 case HRTIMER_CB_IRQSAFE:
524 case HRTIMER_CB_SOFTIRQ:
525 /*
526 * Move everything else into the softirq pending list !
527 */
528 list_add_tail(&timer->cb_entry,
529 &base->cpu_base->cb_pending);
530 timer->state = HRTIMER_STATE_PENDING;
531 raise_softirq(HRTIMER_SOFTIRQ);
532 return 1;
533 default:
534 BUG();
535 }
536 }
537 return 0;
538}
539
540/*
541 * Switch to high resolution mode
542 */
543static void hrtimer_switch_to_hres(void)
544{
545 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
546 unsigned long flags;
547
548 if (base->hres_active)
549 return;
550
551 local_irq_save(flags);
552
553 if (tick_init_highres()) {
554 local_irq_restore(flags);
555 return;
556 }
557 base->hres_active = 1;
558 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
559 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
560
561 tick_setup_sched_timer();
562
563 /* "Retrigger" the interrupt to get things going */
564 retrigger_next_event(NULL);
565 local_irq_restore(flags);
566 printk(KERN_INFO "Switched to high resolution mode on CPU %d\n",
567 smp_processor_id());
568}
569
570#else
571
572static inline int hrtimer_hres_active(void) { return 0; }
573static inline int hrtimer_is_hres_enabled(void) { return 0; }
574static inline void hrtimer_switch_to_hres(void) { }
575static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
576static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
577 struct hrtimer_clock_base *base)
578{
579 return 0;
580}
581static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
582static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
583static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
584static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
585
586#endif /* CONFIG_HIGH_RES_TIMERS */
587
588#ifdef CONFIG_TIMER_STATS
589void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
590{
591 if (timer->start_site)
592 return;
593
594 timer->start_site = addr;
595 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
596 timer->start_pid = current->pid;
597}
598#endif
599
289/* 600/*
290 * Counterpart to lock_timer_base above: 601 * Counterpart to lock_timer_base above:
291 */ 602 */
292static inline 603static inline
293void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 604void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
294{ 605{
295 spin_unlock_irqrestore(&timer->base->lock, *flags); 606 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
296} 607}
297 608
298/** 609/**
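Among the additions above, hrtimer_force_reprogram() picks the next event by scanning the earliest timer of each clock base, translating its expiry into monotonic time via the base offset, and keeping the minimum for tick_program_event(). Stripped of the rbtree and clockevents plumbing, the selection is a simple min-scan; the types below are simplified stand-ins:

/* Min-scan at the heart of hrtimer_force_reprogram(), simplified. */
#include <stdio.h>
#include <stdint.h>

#define KTIME_MAX INT64_MAX
#define NR_BASES  2              /* CLOCK_REALTIME, CLOCK_MONOTONIC */

struct clock_base {
	int64_t first_expiry;    /* earliest timer, ns; KTIME_MAX if empty */
	int64_t offset;          /* offset of this clock vs. monotonic, ns */
};

static int64_t next_event(const struct clock_base *base)
{
	int64_t expires_next = KTIME_MAX;

	for (int i = 0; i < NR_BASES; i++) {
		if (base[i].first_expiry == KTIME_MAX)
			continue;            /* no timer queued on this base */
		int64_t expires = base[i].first_expiry - base[i].offset;
		if (expires < expires_next)
			expires_next = expires;
	}
	return expires_next;     /* what tick_program_event() would be armed with */
}

int main(void)
{
	struct clock_base bases[NR_BASES] = {
		{ .first_expiry = 2000000, .offset = 500000 },  /* realtime */
		{ .first_expiry = 1700000, .offset = 0 },       /* monotonic */
	};

	printf("next event at %lld ns (monotonic)\n",
	       (long long)next_event(bases));
	return 0;
}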
@@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
342 * The timer is inserted in expiry order. Insertion into the 653 * The timer is inserted in expiry order. Insertion into the
343 * red black tree is O(log(n)). Must hold the base lock. 654 * red black tree is O(log(n)). Must hold the base lock.
344 */ 655 */
345static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 656static void enqueue_hrtimer(struct hrtimer *timer,
657 struct hrtimer_clock_base *base, int reprogram)
346{ 658{
347 struct rb_node **link = &base->active.rb_node; 659 struct rb_node **link = &base->active.rb_node;
348 struct rb_node *parent = NULL; 660 struct rb_node *parent = NULL;
@@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
368 * Insert the timer to the rbtree and check whether it 680 * Insert the timer to the rbtree and check whether it
369 * replaces the first pending timer 681 * replaces the first pending timer
370 */ 682 */
371 rb_link_node(&timer->node, parent, link);
372 rb_insert_color(&timer->node, &base->active);
373
374 if (!base->first || timer->expires.tv64 < 683 if (!base->first || timer->expires.tv64 <
375 rb_entry(base->first, struct hrtimer, node)->expires.tv64) 684 rb_entry(base->first, struct hrtimer, node)->expires.tv64) {
685 /*
686 * Reprogram the clock event device. When the timer is already
687 * expired hrtimer_enqueue_reprogram has either called the
688 * callback or added it to the pending list and raised the
689 * softirq.
690 *
691 * This is a NOP for !HIGHRES
692 */
693 if (reprogram && hrtimer_enqueue_reprogram(timer, base))
694 return;
695
376 base->first = &timer->node; 696 base->first = &timer->node;
697 }
698
699 rb_link_node(&timer->node, parent, link);
700 rb_insert_color(&timer->node, &base->active);
701 /*
702 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
703 * state of a possibly running callback.
704 */
705 timer->state |= HRTIMER_STATE_ENQUEUED;
377} 706}
378 707
379/* 708/*
380 * __remove_hrtimer - internal function to remove a timer 709 * __remove_hrtimer - internal function to remove a timer
381 * 710 *
382 * Caller must hold the base lock. 711 * Caller must hold the base lock.
712 *
713 * High resolution timer mode reprograms the clock event device when the
714 * timer is the one which expires next. The caller can disable this by setting
715 * reprogram to zero. This is useful, when the context does a reprogramming
716 * anyway (e.g. timer interrupt)
383 */ 717 */
384static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 718static void __remove_hrtimer(struct hrtimer *timer,
719 struct hrtimer_clock_base *base,
720 unsigned long newstate, int reprogram)
385{ 721{
386 /* 722 /* High res. callback list. NOP for !HIGHRES */
387 * Remove the timer from the rbtree and replace the 723 if (hrtimer_cb_pending(timer))
388 * first entry pointer if necessary. 724 hrtimer_remove_cb_pending(timer);
389 */ 725 else {
390 if (base->first == &timer->node) 726 /*
391 base->first = rb_next(&timer->node); 727 * Remove the timer from the rbtree and replace the
392 rb_erase(&timer->node, &base->active); 728 * first entry pointer if necessary.
393 rb_set_parent(&timer->node, &timer->node); 729 */
730 if (base->first == &timer->node) {
731 base->first = rb_next(&timer->node);
732 /* Reprogram the clock event device. if enabled */
733 if (reprogram && hrtimer_hres_active())
734 hrtimer_force_reprogram(base->cpu_base);
735 }
736 rb_erase(&timer->node, &base->active);
737 }
738 timer->state = newstate;
394} 739}
395 740
396/* 741/*
397 * remove hrtimer, called with base lock held 742 * remove hrtimer, called with base lock held
398 */ 743 */
399static inline int 744static inline int
400remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 745remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
401{ 746{
402 if (hrtimer_active(timer)) { 747 if (hrtimer_is_queued(timer)) {
403 __remove_hrtimer(timer, base); 748 int reprogram;
749
750 /*
751 * Remove the timer and force reprogramming when high
752 * resolution mode is active and the timer is on the current
753 * CPU. If we remove a timer on another CPU, reprogramming is
754 * skipped. The interrupt event on this CPU is fired and
755 * reprogramming happens in the interrupt handler. This is a
756 * rare case and less expensive than a smp call.
757 */
758 timer_stats_hrtimer_clear_start_info(timer);
759 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
760 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
761 reprogram);
404 return 1; 762 return 1;
405 } 763 }
406 return 0; 764 return 0;
@@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
419int 777int
420hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 778hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
421{ 779{
422 struct hrtimer_base *base, *new_base; 780 struct hrtimer_clock_base *base, *new_base;
423 unsigned long flags; 781 unsigned long flags;
424 int ret; 782 int ret;
425 783
@@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
431 /* Switch the timer base, if necessary: */ 789 /* Switch the timer base, if necessary: */
432 new_base = switch_hrtimer_base(timer, base); 790 new_base = switch_hrtimer_base(timer, base);
433 791
434 if (mode == HRTIMER_REL) { 792 if (mode == HRTIMER_MODE_REL) {
435 tim = ktime_add(tim, new_base->get_time()); 793 tim = ktime_add(tim, new_base->get_time());
436 /* 794 /*
437 * CONFIG_TIME_LOW_RES is a temporary way for architectures 795 * CONFIG_TIME_LOW_RES is a temporary way for architectures
@@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
446 } 804 }
447 timer->expires = tim; 805 timer->expires = tim;
448 806
449 enqueue_hrtimer(timer, new_base); 807 timer_stats_hrtimer_set_start_info(timer);
808
809 enqueue_hrtimer(timer, new_base, base == new_base);
450 810
451 unlock_hrtimer_base(timer, &flags); 811 unlock_hrtimer_base(timer, &flags);
452 812
@@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
466 */ 826 */
467int hrtimer_try_to_cancel(struct hrtimer *timer) 827int hrtimer_try_to_cancel(struct hrtimer *timer)
468{ 828{
469 struct hrtimer_base *base; 829 struct hrtimer_clock_base *base;
470 unsigned long flags; 830 unsigned long flags;
471 int ret = -1; 831 int ret = -1;
472 832
473 base = lock_hrtimer_base(timer, &flags); 833 base = lock_hrtimer_base(timer, &flags);
474 834
475 if (base->curr_timer != timer) 835 if (!hrtimer_callback_running(timer))
476 ret = remove_hrtimer(timer, base); 836 ret = remove_hrtimer(timer, base);
477 837
478 unlock_hrtimer_base(timer, &flags); 838 unlock_hrtimer_base(timer, &flags);
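
With the curr_timer pointer gone, hrtimer_try_to_cancel() now only refuses to dequeue a timer whose callback is currently executing. A rough sketch of how a caller is expected to treat its return values (-1 callback running, 0 not queued, 1 dequeued); the example_* name is hypothetical and not part of this patch:

#include <linux/hrtimer.h>

static void example_stop_timer(struct hrtimer *timer)
{
        int ret = hrtimer_try_to_cancel(timer);

        if (ret < 0) {
                /*
                 * The callback is running right now on some CPU;
                 * hrtimer_cancel() loops until it has finished.
                 */
                hrtimer_cancel(timer);
        }
        /* ret == 0: was not queued, ret == 1: successfully dequeued */
}
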
@@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
508 */ 868 */
509ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 869ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
510{ 870{
511 struct hrtimer_base *base; 871 struct hrtimer_clock_base *base;
512 unsigned long flags; 872 unsigned long flags;
513 ktime_t rem; 873 ktime_t rem;
514 874
515 base = lock_hrtimer_base(timer, &flags); 875 base = lock_hrtimer_base(timer, &flags);
516 rem = ktime_sub(timer->expires, timer->base->get_time()); 876 rem = ktime_sub(timer->expires, base->get_time());
517 unlock_hrtimer_base(timer, &flags); 877 unlock_hrtimer_base(timer, &flags);
518 878
519 return rem; 879 return rem;
520} 880}
521EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 881EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
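
For reference, a minimal sketch of reading the remaining time back from driver code; example_time_left() is a hypothetical helper, not part of this patch:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct timespec example_time_left(struct hrtimer *timer)
{
        ktime_t rem = hrtimer_get_remaining(timer);

        /* A negative remainder means the timer has already expired. */
        if (rem.tv64 < 0)
                rem.tv64 = 0;
        return ktime_to_timespec(rem);
}
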
522 882
523#ifdef CONFIG_NO_IDLE_HZ 883#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
524/** 884/**
525 * hrtimer_get_next_event - get the time until next expiry event 885 * hrtimer_get_next_event - get the time until next expiry event
526 * 886 *
@@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
529 */ 889 */
530ktime_t hrtimer_get_next_event(void) 890ktime_t hrtimer_get_next_event(void)
531{ 891{
532 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 892 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
893 struct hrtimer_clock_base *base = cpu_base->clock_base;
533 ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; 894 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
534 unsigned long flags; 895 unsigned long flags;
535 int i; 896 int i;
536 897
537 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { 898 spin_lock_irqsave(&cpu_base->lock, flags);
538 struct hrtimer *timer;
539 899
540 spin_lock_irqsave(&base->lock, flags); 900 if (!hrtimer_hres_active()) {
541 if (!base->first) { 901 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
542 spin_unlock_irqrestore(&base->lock, flags); 902 struct hrtimer *timer;
543 continue; 903
904 if (!base->first)
905 continue;
906
907 timer = rb_entry(base->first, struct hrtimer, node);
908 delta.tv64 = timer->expires.tv64;
909 delta = ktime_sub(delta, base->get_time());
910 if (delta.tv64 < mindelta.tv64)
911 mindelta.tv64 = delta.tv64;
544 } 912 }
545 timer = rb_entry(base->first, struct hrtimer, node);
546 delta.tv64 = timer->expires.tv64;
547 spin_unlock_irqrestore(&base->lock, flags);
548 delta = ktime_sub(delta, base->get_time());
549 if (delta.tv64 < mindelta.tv64)
550 mindelta.tv64 = delta.tv64;
551 } 913 }
914
915 spin_unlock_irqrestore(&cpu_base->lock, flags);
916
552 if (mindelta.tv64 < 0) 917 if (mindelta.tv64 < 0)
553 mindelta.tv64 = 0; 918 mindelta.tv64 = 0;
554 return mindelta; 919 return mindelta;
@@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void)
564void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 929void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
565 enum hrtimer_mode mode) 930 enum hrtimer_mode mode)
566{ 931{
567 struct hrtimer_base *bases; 932 struct hrtimer_cpu_base *cpu_base;
568 933
569 memset(timer, 0, sizeof(struct hrtimer)); 934 memset(timer, 0, sizeof(struct hrtimer));
570 935
571 bases = __raw_get_cpu_var(hrtimer_bases); 936 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
572 937
573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 938 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
574 clock_id = CLOCK_MONOTONIC; 939 clock_id = CLOCK_MONOTONIC;
575 940
576 timer->base = &bases[clock_id]; 941 timer->base = &cpu_base->clock_base[clock_id];
577 rb_set_parent(&timer->node, &timer->node); 942 hrtimer_init_timer_hres(timer);
943
944#ifdef CONFIG_TIMER_STATS
945 timer->start_site = NULL;
946 timer->start_pid = -1;
947 memset(timer->start_comm, 0, TASK_COMM_LEN);
948#endif
578} 949}
579EXPORT_SYMBOL_GPL(hrtimer_init); 950EXPORT_SYMBOL_GPL(hrtimer_init);
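
With the HRTIMER_MODE_ABS/HRTIMER_MODE_REL names and the enum hrtimer_restart callback type, a typical user now looks roughly like the sketch below; the example_* identifiers and the 100 ms value are made up for illustration:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timeout(struct hrtimer *timer)
{
        /* One-shot: do the work here and do not re-arm. */
        return HRTIMER_NORESTART;
}

static void example_arm(void)
{
        hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        example_timer.function = example_timeout;
        /* Expire 100 ms from now on the monotonic clock. */
        hrtimer_start(&example_timer, ktime_set(0, 100 * 1000 * 1000),
                      HRTIMER_MODE_REL);
}
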
580 951
@@ -583,26 +954,164 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
583 * @which_clock: which clock to query 954 * @which_clock: which clock to query
584 * @tp: pointer to timespec variable to store the resolution 955 * @tp: pointer to timespec variable to store the resolution
585 * 956 *
586 * Store the resolution of the clock selected by which_clock in the 957 * Store the resolution of the clock selected by @which_clock in the
587 * variable pointed to by tp. 958 * variable pointed to by @tp.
588 */ 959 */
589int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 960int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
590{ 961{
591 struct hrtimer_base *bases; 962 struct hrtimer_cpu_base *cpu_base;
592 963
593 bases = __raw_get_cpu_var(hrtimer_bases); 964 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
594 *tp = ktime_to_timespec(bases[which_clock].resolution); 965 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
595 966
596 return 0; 967 return 0;
597} 968}
598EXPORT_SYMBOL_GPL(hrtimer_get_res); 969EXPORT_SYMBOL_GPL(hrtimer_get_res);
599 970
971#ifdef CONFIG_HIGH_RES_TIMERS
972
973/*
974 * High resolution timer interrupt
975 * Called with interrupts disabled
976 */
977void hrtimer_interrupt(struct clock_event_device *dev)
978{
979 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
980 struct hrtimer_clock_base *base;
981 ktime_t expires_next, now;
982 int i, raise = 0;
983
984 BUG_ON(!cpu_base->hres_active);
985 cpu_base->nr_events++;
986 dev->next_event.tv64 = KTIME_MAX;
987
988 retry:
989 now = ktime_get();
990
991 expires_next.tv64 = KTIME_MAX;
992
993 base = cpu_base->clock_base;
994
995 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
996 ktime_t basenow;
997 struct rb_node *node;
998
999 spin_lock(&cpu_base->lock);
1000
1001 basenow = ktime_add(now, base->offset);
1002
1003 while ((node = base->first)) {
1004 struct hrtimer *timer;
1005
1006 timer = rb_entry(node, struct hrtimer, node);
1007
1008 if (basenow.tv64 < timer->expires.tv64) {
1009 ktime_t expires;
1010
1011 expires = ktime_sub(timer->expires,
1012 base->offset);
1013 if (expires.tv64 < expires_next.tv64)
1014 expires_next = expires;
1015 break;
1016 }
1017
1018 /* Move softirq callbacks to the pending list */
1019 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1020 __remove_hrtimer(timer, base,
1021 HRTIMER_STATE_PENDING, 0);
1022 list_add_tail(&timer->cb_entry,
1023 &base->cpu_base->cb_pending);
1024 raise = 1;
1025 continue;
1026 }
1027
1028 __remove_hrtimer(timer, base,
1029 HRTIMER_STATE_CALLBACK, 0);
1030 timer_stats_account_hrtimer(timer);
1031
1032 /*
1033 * Note: We clear the CALLBACK bit after
1034 * enqueue_hrtimer to avoid reprogramming of
1035 * the event hardware. This happens at the end
1036 * of this function anyway.
1037 */
1038 if (timer->function(timer) != HRTIMER_NORESTART) {
1039 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1040 enqueue_hrtimer(timer, base, 0);
1041 }
1042 timer->state &= ~HRTIMER_STATE_CALLBACK;
1043 }
1044 spin_unlock(&cpu_base->lock);
1045 base++;
1046 }
1047
1048 cpu_base->expires_next = expires_next;
1049
1050 /* Reprogramming necessary ? */
1051 if (expires_next.tv64 != KTIME_MAX) {
1052 if (tick_program_event(expires_next, 0))
1053 goto retry;
1054 }
1055
1056 /* Raise softirq ? */
1057 if (raise)
1058 raise_softirq(HRTIMER_SOFTIRQ);
1059}
1060
1061static void run_hrtimer_softirq(struct softirq_action *h)
1062{
1063 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1064
1065 spin_lock_irq(&cpu_base->lock);
1066
1067 while (!list_empty(&cpu_base->cb_pending)) {
1068 enum hrtimer_restart (*fn)(struct hrtimer *);
1069 struct hrtimer *timer;
1070 int restart;
1071
1072 timer = list_entry(cpu_base->cb_pending.next,
1073 struct hrtimer, cb_entry);
1074
1075 timer_stats_account_hrtimer(timer);
1076
1077 fn = timer->function;
1078 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1079 spin_unlock_irq(&cpu_base->lock);
1080
1081 restart = fn(timer);
1082
1083 spin_lock_irq(&cpu_base->lock);
1084
1085 timer->state &= ~HRTIMER_STATE_CALLBACK;
1086 if (restart == HRTIMER_RESTART) {
1087 BUG_ON(hrtimer_active(timer));
1088 /*
1089 * Enqueue the timer, allow reprogramming of the event
1090 * device
1091 */
1092 enqueue_hrtimer(timer, timer->base, 1);
1093 } else if (hrtimer_active(timer)) {
1094 /*
1095 * If the timer was rearmed on another CPU, reprogram
1096 * the event device.
1097 */
1098 if (timer->base->first == &timer->node)
1099 hrtimer_reprogram(timer, timer->base);
1100 }
1101 }
1102 spin_unlock_irq(&cpu_base->lock);
1103}
1104
1105#endif /* CONFIG_HIGH_RES_TIMERS */
1106
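
hrtimer_interrupt() above re-enqueues any timer whose callback returns HRTIMER_RESTART, so a periodic user only has to forward its expiry before returning. A hedged sketch of such a callback (example_periodic() and the 1 ms interval are hypothetical):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static enum hrtimer_restart example_periodic(struct hrtimer *timer)
{
        /* Push the expiry forward past 'now' by 1 ms and run again. */
        hrtimer_forward(timer, timer->base->get_time(),
                        ktime_set(0, 1000 * 1000));
        return HRTIMER_RESTART;
}
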
600/* 1107/*
601 * Expire the per base hrtimer-queue: 1108 * Expire the per base hrtimer-queue:
602 */ 1109 */
603static inline void run_hrtimer_queue(struct hrtimer_base *base) 1110static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1111 int index)
604{ 1112{
605 struct rb_node *node; 1113 struct rb_node *node;
1114 struct hrtimer_clock_base *base = &cpu_base->clock_base[index];
606 1115
607 if (!base->first) 1116 if (!base->first)
608 return; 1117 return;
@@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
610 if (base->get_softirq_time) 1119 if (base->get_softirq_time)
611 base->softirq_time = base->get_softirq_time(); 1120 base->softirq_time = base->get_softirq_time();
612 1121
613 spin_lock_irq(&base->lock); 1122 spin_lock_irq(&cpu_base->lock);
614 1123
615 while ((node = base->first)) { 1124 while ((node = base->first)) {
616 struct hrtimer *timer; 1125 struct hrtimer *timer;
617 int (*fn)(struct hrtimer *); 1126 enum hrtimer_restart (*fn)(struct hrtimer *);
618 int restart; 1127 int restart;
619 1128
620 timer = rb_entry(node, struct hrtimer, node); 1129 timer = rb_entry(node, struct hrtimer, node);
621 if (base->softirq_time.tv64 <= timer->expires.tv64) 1130 if (base->softirq_time.tv64 <= timer->expires.tv64)
622 break; 1131 break;
623 1132
1133 timer_stats_account_hrtimer(timer);
1134
624 fn = timer->function; 1135 fn = timer->function;
625 set_curr_timer(base, timer); 1136 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
626 __remove_hrtimer(timer, base); 1137 spin_unlock_irq(&cpu_base->lock);
627 spin_unlock_irq(&base->lock);
628 1138
629 restart = fn(timer); 1139 restart = fn(timer);
630 1140
631 spin_lock_irq(&base->lock); 1141 spin_lock_irq(&cpu_base->lock);
632 1142
1143 timer->state &= ~HRTIMER_STATE_CALLBACK;
633 if (restart != HRTIMER_NORESTART) { 1144 if (restart != HRTIMER_NORESTART) {
634 BUG_ON(hrtimer_active(timer)); 1145 BUG_ON(hrtimer_active(timer));
635 enqueue_hrtimer(timer, base); 1146 enqueue_hrtimer(timer, base, 0);
636 } 1147 }
637 } 1148 }
638 set_curr_timer(base, NULL); 1149 spin_unlock_irq(&cpu_base->lock);
639 spin_unlock_irq(&base->lock);
640} 1150}
641 1151
642/* 1152/*
643 * Called from timer softirq every jiffy, expire hrtimers: 1153 * Called from timer softirq every jiffy, expire hrtimers:
1154 *
1155 * For HRT it's the fallback code to run the softirq in the timer
1156 * softirq context in case the hrtimer initialization failed or has
1157 * not been done yet.
644 */ 1158 */
645void hrtimer_run_queues(void) 1159void hrtimer_run_queues(void)
646{ 1160{
647 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 1161 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
648 int i; 1162 int i;
649 1163
650 hrtimer_get_softirq_time(base); 1164 if (hrtimer_hres_active())
1165 return;
1166
1167 /*
1168 * This _is_ ugly: We have to check in the softirq context,
1169 * whether we can switch to highres and / or nohz mode. The
1170 * clocksource switch happens in the timer interrupt with
1171 * xtime_lock held. Notification from there only sets the
1172 * check bit in the tick_oneshot code, otherwise we might
1173 * deadlock vs. xtime_lock.
1174 */
1175 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1176 hrtimer_switch_to_hres();
651 1177
652 for (i = 0; i < MAX_HRTIMER_BASES; i++) 1178 hrtimer_get_softirq_time(cpu_base);
653 run_hrtimer_queue(&base[i]); 1179
1180 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1181 run_hrtimer_queue(cpu_base, i);
654} 1182}
655 1183
656/* 1184/*
657 * Sleep related functions: 1185 * Sleep related functions:
658 */ 1186 */
659static int hrtimer_wakeup(struct hrtimer *timer) 1187static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
660{ 1188{
661 struct hrtimer_sleeper *t = 1189 struct hrtimer_sleeper *t =
662 container_of(timer, struct hrtimer_sleeper, timer); 1190 container_of(timer, struct hrtimer_sleeper, timer);
@@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
673{ 1201{
674 sl->timer.function = hrtimer_wakeup; 1202 sl->timer.function = hrtimer_wakeup;
675 sl->task = task; 1203 sl->task = task;
1204#ifdef CONFIG_HIGH_RES_TIMERS
1205 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
1206#endif
676} 1207}
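
hrtimer_init_sleeper() ties the timer to the sleeping task so hrtimer_wakeup() can clear t->task; the pattern do_nanosleep() follows boils down to the sketch below (example_sleep_until() is illustrative, not part of the patch):

#include <linux/hrtimer.h>
#include <linux/sched.h>

static void example_sleep_until(ktime_t expires)
{
        struct hrtimer_sleeper t;

        hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        t.timer.expires = expires;
        hrtimer_init_sleeper(&t, current);

        set_current_state(TASK_INTERRUPTIBLE);
        hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
        if (likely(t.task))
                schedule();
        hrtimer_cancel(&t.timer);
        __set_current_state(TASK_RUNNING);
}
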
677 1208
678static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1209static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
683 set_current_state(TASK_INTERRUPTIBLE); 1214 set_current_state(TASK_INTERRUPTIBLE);
684 hrtimer_start(&t->timer, t->timer.expires, mode); 1215 hrtimer_start(&t->timer, t->timer.expires, mode);
685 1216
686 schedule(); 1217 if (likely(t->task))
1218 schedule();
687 1219
688 hrtimer_cancel(&t->timer); 1220 hrtimer_cancel(&t->timer);
689 mode = HRTIMER_ABS; 1221 mode = HRTIMER_MODE_ABS;
690 1222
691 } while (t->task && !signal_pending(current)); 1223 } while (t->task && !signal_pending(current));
692 1224
@@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
702 1234
703 restart->fn = do_no_restart_syscall; 1235 restart->fn = do_no_restart_syscall;
704 1236
705 hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); 1237 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS);
706 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; 1238 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
707 1239
708 if (do_nanosleep(&t, HRTIMER_ABS)) 1240 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
709 return 0; 1241 return 0;
710 1242
711 rmtp = (struct timespec __user *) restart->arg1; 1243 rmtp = (struct timespec __user *) restart->arg1;
@@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
738 return 0; 1270 return 0;
739 1271
740 /* Absolute timers do not update the rmtp value and restart: */ 1272 /* Absolute timers do not update the rmtp value and restart: */
741 if (mode == HRTIMER_ABS) 1273 if (mode == HRTIMER_MODE_ABS)
742 return -ERESTARTNOHAND; 1274 return -ERESTARTNOHAND;
743 1275
744 if (rmtp) { 1276 if (rmtp) {
@@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
771 if (!timespec_valid(&tu)) 1303 if (!timespec_valid(&tu))
772 return -EINVAL; 1304 return -EINVAL;
773 1305
774 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); 1306 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
775} 1307}
776 1308
777/* 1309/*
@@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
779 */ 1311 */
780static void __devinit init_hrtimers_cpu(int cpu) 1312static void __devinit init_hrtimers_cpu(int cpu)
781{ 1313{
782 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); 1314 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
783 int i; 1315 int i;
784 1316
785 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { 1317 spin_lock_init(&cpu_base->lock);
786 spin_lock_init(&base->lock); 1318 lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key);
787 lockdep_set_class(&base->lock, &base->lock_key); 1319
788 } 1320 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1321 cpu_base->clock_base[i].cpu_base = cpu_base;
1322
1323 hrtimer_init_hres(cpu_base);
789} 1324}
790 1325
791#ifdef CONFIG_HOTPLUG_CPU 1326#ifdef CONFIG_HOTPLUG_CPU
792 1327
793static void migrate_hrtimer_list(struct hrtimer_base *old_base, 1328static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
794 struct hrtimer_base *new_base) 1329 struct hrtimer_clock_base *new_base)
795{ 1330{
796 struct hrtimer *timer; 1331 struct hrtimer *timer;
797 struct rb_node *node; 1332 struct rb_node *node;
798 1333
799 while ((node = rb_first(&old_base->active))) { 1334 while ((node = rb_first(&old_base->active))) {
800 timer = rb_entry(node, struct hrtimer, node); 1335 timer = rb_entry(node, struct hrtimer, node);
801 __remove_hrtimer(timer, old_base); 1336 BUG_ON(hrtimer_callback_running(timer));
1337 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
802 timer->base = new_base; 1338 timer->base = new_base;
803 enqueue_hrtimer(timer, new_base); 1339 /*
1340 * Enqueue the timer. Allow reprogramming of the event device
1341 */
1342 enqueue_hrtimer(timer, new_base, 1);
804 } 1343 }
805} 1344}
806 1345
807static void migrate_hrtimers(int cpu) 1346static void migrate_hrtimers(int cpu)
808{ 1347{
809 struct hrtimer_base *old_base, *new_base; 1348 struct hrtimer_cpu_base *old_base, *new_base;
810 int i; 1349 int i;
811 1350
812 BUG_ON(cpu_online(cpu)); 1351 BUG_ON(cpu_online(cpu));
813 old_base = per_cpu(hrtimer_bases, cpu); 1352 old_base = &per_cpu(hrtimer_bases, cpu);
814 new_base = get_cpu_var(hrtimer_bases); 1353 new_base = &get_cpu_var(hrtimer_bases);
815
816 local_irq_disable();
817 1354
818 for (i = 0; i < MAX_HRTIMER_BASES; i++) { 1355 tick_cancel_sched_timer(cpu);
819 1356
820 spin_lock(&new_base->lock); 1357 local_irq_disable();
821 spin_lock(&old_base->lock);
822
823 BUG_ON(old_base->curr_timer);
824 1358
825 migrate_hrtimer_list(old_base, new_base); 1359 spin_lock(&new_base->lock);
1360 spin_lock(&old_base->lock);
826 1361
827 spin_unlock(&old_base->lock); 1362 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
828 spin_unlock(&new_base->lock); 1363 migrate_hrtimer_list(&old_base->clock_base[i],
829 old_base++; 1364 &new_base->clock_base[i]);
830 new_base++;
831 } 1365 }
1366 spin_unlock(&old_base->lock);
1367 spin_unlock(&new_base->lock);
832 1368
833 local_irq_enable(); 1369 local_irq_enable();
834 put_cpu_var(hrtimer_bases); 1370 put_cpu_var(hrtimer_bases);
@@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
848 1384
849#ifdef CONFIG_HOTPLUG_CPU 1385#ifdef CONFIG_HOTPLUG_CPU
850 case CPU_DEAD: 1386 case CPU_DEAD:
1387 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
851 migrate_hrtimers(cpu); 1388 migrate_hrtimers(cpu);
852 break; 1389 break;
853#endif 1390#endif
@@ -868,5 +1405,8 @@ void __init hrtimers_init(void)
868 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1405 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
869 (void *)(long)smp_processor_id()); 1406 (void *)(long)smp_processor_id());
870 register_cpu_notifier(&hrtimers_nb); 1407 register_cpu_notifier(&hrtimers_nb);
1408#ifdef CONFIG_HIGH_RES_TIMERS
1409 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
1410#endif
871} 1411}
872 1412
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 1dab0ac3f797..681c52dbfe22 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,5 @@
1 1
2obj-y := handle.o manage.o spurious.o resend.o chip.o 2obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d27b25855743..0133f4f9e9f0 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -39,6 +39,7 @@ void dynamic_irq_init(unsigned int irq)
39 desc->chip = &no_irq_chip; 39 desc->chip = &no_irq_chip;
40 desc->handle_irq = handle_bad_irq; 40 desc->handle_irq = handle_bad_irq;
41 desc->depth = 1; 41 desc->depth = 1;
42 desc->msi_desc = NULL;
42 desc->handler_data = NULL; 43 desc->handler_data = NULL;
43 desc->chip_data = NULL; 44 desc->chip_data = NULL;
44 desc->action = NULL; 45 desc->action = NULL;
@@ -74,6 +75,9 @@ void dynamic_irq_cleanup(unsigned int irq)
74 WARN_ON(1); 75 WARN_ON(1);
75 return; 76 return;
76 } 77 }
78 desc->msi_desc = NULL;
79 desc->handler_data = NULL;
80 desc->chip_data = NULL;
77 desc->handle_irq = handle_bad_irq; 81 desc->handle_irq = handle_bad_irq;
78 desc->chip = &no_irq_chip; 82 desc->chip = &no_irq_chip;
79 spin_unlock_irqrestore(&desc->lock, flags); 83 spin_unlock_irqrestore(&desc->lock, flags);
@@ -162,6 +166,30 @@ int set_irq_data(unsigned int irq, void *data)
162EXPORT_SYMBOL(set_irq_data); 166EXPORT_SYMBOL(set_irq_data);
163 167
164/** 168/**
169 * set_irq_msi - set MSI descriptor data for an irq
170 * @irq: Interrupt number
171 * @entry: Pointer to MSI descriptor data
172 *
173 * Set the MSI descriptor entry for an irq
174 */
175int set_irq_msi(unsigned int irq, struct msi_desc *entry)
176{
177 struct irq_desc *desc;
178 unsigned long flags;
179
180 if (irq >= NR_IRQS) {
181 printk(KERN_ERR
182 "Trying to install msi data for IRQ%d\n", irq);
183 return -EINVAL;
184 }
185 desc = irq_desc + irq;
186 spin_lock_irqsave(&desc->lock, flags);
187 desc->msi_desc = entry;
188 spin_unlock_irqrestore(&desc->lock, flags);
189 return 0;
190}
191
192/**
165 * set_irq_chip_data - set irq chip data for an irq 193 * set_irq_chip_data - set irq chip data for an irq
166 * @irq: Interrupt number 194 * @irq: Interrupt number
167 * @data: Pointer to chip specific data 195 * @data: Pointer to chip specific data
@@ -202,10 +230,6 @@ static void default_enable(unsigned int irq)
202 */ 230 */
203static void default_disable(unsigned int irq) 231static void default_disable(unsigned int irq)
204{ 232{
205 struct irq_desc *desc = irq_desc + irq;
206
207 if (!(desc->status & IRQ_DELAYED_DISABLE))
208 desc->chip->mask(irq);
209} 233}
210 234
211/* 235/*
@@ -270,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
270 294
271 if (unlikely(desc->status & IRQ_INPROGRESS)) 295 if (unlikely(desc->status & IRQ_INPROGRESS))
272 goto out_unlock; 296 goto out_unlock;
273 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
274 kstat_cpu(cpu).irqs[irq]++; 297 kstat_cpu(cpu).irqs[irq]++;
275 298
276 action = desc->action; 299 action = desc->action;
277 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 300 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
301 if (desc->chip->mask)
302 desc->chip->mask(irq);
303 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
304 desc->status |= IRQ_PENDING;
278 goto out_unlock; 305 goto out_unlock;
306 }
279 307
308 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING);
280 desc->status |= IRQ_INPROGRESS; 309 desc->status |= IRQ_INPROGRESS;
281 spin_unlock(&desc->lock); 310 spin_unlock(&desc->lock);
282 311
@@ -368,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
368 397
369 /* 398 /*
370 * If its disabled or no action available 399 * If its disabled or no action available
371 * keep it masked and get out of here 400 * then mask it and get out of here:
372 */ 401 */
373 action = desc->action; 402 action = desc->action;
374 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 403 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
375 desc->status |= IRQ_PENDING; 404 desc->status |= IRQ_PENDING;
405 if (desc->chip->mask)
406 desc->chip->mask(irq);
376 goto out; 407 goto out;
377 } 408 }
378 409
@@ -534,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
534 565
535 /* Uninstall? */ 566 /* Uninstall? */
536 if (handle == handle_bad_irq) { 567 if (handle == handle_bad_irq) {
537 if (desc->chip != &no_irq_chip) { 568 if (desc->chip != &no_irq_chip)
538 desc->chip->mask(irq); 569 mask_ack_irq(desc, irq);
539 desc->chip->ack(irq);
540 }
541 desc->status |= IRQ_DISABLED; 570 desc->status |= IRQ_DISABLED;
542 desc->depth = 1; 571 desc->depth = 1;
543 } 572 }
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
new file mode 100644
index 000000000000..85a430da0fb6
--- /dev/null
+++ b/kernel/irq/devres.c
@@ -0,0 +1,88 @@
1#include <linux/module.h>
2#include <linux/interrupt.h>
3
4/*
5 * Device resource management aware IRQ request/free implementation.
6 */
7struct irq_devres {
8 unsigned int irq;
9 void *dev_id;
10};
11
12static void devm_irq_release(struct device *dev, void *res)
13{
14 struct irq_devres *this = res;
15
16 free_irq(this->irq, this->dev_id);
17}
18
19static int devm_irq_match(struct device *dev, void *res, void *data)
20{
21 struct irq_devres *this = res, *match = data;
22
23 return this->irq == match->irq && this->dev_id == match->dev_id;
24}
25
26/**
27 * devm_request_irq - allocate an interrupt line for a managed device
28 * @dev: device to request interrupt for
29 * @irq: Interrupt line to allocate
30 * @handler: Function to be called when the IRQ occurs
31 * @irqflags: Interrupt type flags
32 * @devname: An ascii name for the claiming device
33 * @dev_id: A cookie passed back to the handler function
34 *
35 * Except for the extra @dev argument, this function takes the
36 * same arguments and performs the same function as
37 * request_irq(). IRQs requested with this function will be
38 * automatically freed on driver detach.
39 *
40 * If an IRQ allocated with this function needs to be freed
41 * separately, devm_free_irq() must be used.
42 */
43int devm_request_irq(struct device *dev, unsigned int irq,
44 irq_handler_t handler, unsigned long irqflags,
45 const char *devname, void *dev_id)
46{
47 struct irq_devres *dr;
48 int rc;
49
50 dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
51 GFP_KERNEL);
52 if (!dr)
53 return -ENOMEM;
54
55 rc = request_irq(irq, handler, irqflags, devname, dev_id);
56 if (rc) {
57 kfree(dr);
58 return rc;
59 }
60
61 dr->irq = irq;
62 dr->dev_id = dev_id;
63 devres_add(dev, dr);
64
65 return 0;
66}
67EXPORT_SYMBOL(devm_request_irq);
68
69/**
70 * devm_free_irq - free an interrupt
71 * @dev: device to free interrupt for
72 * @irq: Interrupt line to free
73 * @dev_id: Device identity to free
74 *
75 * Except for the extra @dev argument, this function takes the
76 * same arguments and performs the same function as free_irq().
77 * This function, rather than free_irq(), should be used to manually
78 * free IRQs allocated with devm_request_irq().
79 */
80void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
81{
82 struct irq_devres match_data = { irq, dev_id };
83
84 free_irq(irq, dev_id);
85 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
86 &match_data));
87}
88EXPORT_SYMBOL(devm_free_irq);
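
A hedged sketch of how a driver's probe path would use the new managed request; the device, IRQ number and example_* names are hypothetical, and the matching free happens automatically on driver detach:

#include <linux/device.h>
#include <linux/interrupt.h>

static irqreturn_t example_isr(int irq, void *dev_id)
{
        /* ... acknowledge the (hypothetical) hardware ... */
        return IRQ_HANDLED;
}

static int example_probe_irq(struct device *dev, unsigned int irq, void *priv)
{
        /* Released by devm_irq_release() when the driver detaches. */
        return devm_request_irq(dev, irq, example_isr, IRQF_SHARED,
                                "example", priv);
}
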
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8b961adc3bd2..5597c157442a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq)
38} 38}
39EXPORT_SYMBOL(synchronize_irq); 39EXPORT_SYMBOL(synchronize_irq);
40 40
41/**
42 * irq_can_set_affinity - Check if the affinity of a given irq can be set
43 * @irq: Interrupt to check
44 *
45 */
46int irq_can_set_affinity(unsigned int irq)
47{
48 struct irq_desc *desc = irq_desc + irq;
49
50 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
51 !desc->chip->set_affinity)
52 return 0;
53
54 return 1;
55}
56
57/**
58 * irq_set_affinity - Set the irq affinity of a given irq
59 * @irq: Interrupt to set affinity
60 * @cpumask: cpumask to set the affinity to
61 *
62 */
63int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
64{
65 struct irq_desc *desc = irq_desc + irq;
66
67 if (!desc->chip->set_affinity)
68 return -EINVAL;
69
70 set_balance_irq_affinity(irq, cpumask);
71
72#ifdef CONFIG_GENERIC_PENDING_IRQ
73 set_pending_irq(irq, cpumask);
74#else
75 desc->affinity = cpumask;
76 desc->chip->set_affinity(irq, cpumask);
77#endif
78 return 0;
79}
80
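
These two helpers give in-kernel users the same steering ability the /proc interface below now delegates to. A rough sketch of pinning a hypothetical interrupt to CPU 0, assuming the prototypes are visible via <linux/interrupt.h>:

#include <linux/interrupt.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

static int example_pin_irq_to_cpu0(unsigned int irq)
{
        if (!irq_can_set_affinity(irq))
                return -ENOSYS;
        /* Route the interrupt to CPU 0 only. */
        return irq_set_affinity(irq, cpumask_of_cpu(0));
}
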
41#endif 81#endif
42 82
43/** 83/**
@@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
281 if (new->flags & IRQF_PERCPU) 321 if (new->flags & IRQF_PERCPU)
282 desc->status |= IRQ_PER_CPU; 322 desc->status |= IRQ_PER_CPU;
283#endif 323#endif
324 /* Exclude IRQ from balancing */
325 if (new->flags & IRQF_NOBALANCING)
326 desc->status |= IRQ_NO_BALANCING;
327
284 if (!shared) { 328 if (!shared) {
285 irq_chip_set_defaults(desc->chip); 329 irq_chip_set_defaults(desc->chip);
286 330
@@ -328,12 +372,14 @@ int setup_irq(unsigned int irq, struct irqaction *new)
328 return 0; 372 return 0;
329 373
330mismatch: 374mismatch:
375#ifdef CONFIG_DEBUG_SHIRQ
331 if (!(new->flags & IRQF_PROBE_SHARED)) { 376 if (!(new->flags & IRQF_PROBE_SHARED)) {
332 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); 377 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
333 if (old_name) 378 if (old_name)
334 printk(KERN_ERR "current handler: %s\n", old_name); 379 printk(KERN_ERR "current handler: %s\n", old_name);
335 dump_stack(); 380 dump_stack();
336 } 381 }
382#endif
337 spin_unlock_irqrestore(&desc->lock, flags); 383 spin_unlock_irqrestore(&desc->lock, flags);
338 return -EBUSY; 384 return -EBUSY;
339} 385}
@@ -357,6 +403,7 @@ void free_irq(unsigned int irq, void *dev_id)
357 struct irq_desc *desc; 403 struct irq_desc *desc;
358 struct irqaction **p; 404 struct irqaction **p;
359 unsigned long flags; 405 unsigned long flags;
406 irqreturn_t (*handler)(int, void *) = NULL;
360 407
361 WARN_ON(in_interrupt()); 408 WARN_ON(in_interrupt());
362 if (irq >= NR_IRQS) 409 if (irq >= NR_IRQS)
@@ -396,6 +443,8 @@ void free_irq(unsigned int irq, void *dev_id)
396 443
397 /* Make sure it's not being used on another CPU */ 444 /* Make sure it's not being used on another CPU */
398 synchronize_irq(irq); 445 synchronize_irq(irq);
446 if (action->flags & IRQF_SHARED)
447 handler = action->handler;
399 kfree(action); 448 kfree(action);
400 return; 449 return;
401 } 450 }
@@ -403,6 +452,17 @@ void free_irq(unsigned int irq, void *dev_id)
403 spin_unlock_irqrestore(&desc->lock, flags); 452 spin_unlock_irqrestore(&desc->lock, flags);
404 return; 453 return;
405 } 454 }
455#ifdef CONFIG_DEBUG_SHIRQ
456 if (handler) {
457 /*
458 * It's a shared IRQ -- the driver ought to be prepared for it
459 * to happen even now it's being freed, so let's make sure....
460 * We do this after actually deregistering it, to make sure that
461 * a 'real' IRQ doesn't run in parallel with our fake
462 */
463 handler(irq, dev_id);
464 }
465#endif
406} 466}
407EXPORT_SYMBOL(free_irq); 467EXPORT_SYMBOL(free_irq);
408 468
@@ -445,7 +505,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
445 /* 505 /*
446 * Lockdep wants atomic interrupt handlers: 506 * Lockdep wants atomic interrupt handlers:
447 */ 507 */
448 irqflags |= SA_INTERRUPT; 508 irqflags |= IRQF_DISABLED;
449#endif 509#endif
450 /* 510 /*
451 * Sanity-check: shared interrupts must pass in a real dev-ID, 511 * Sanity-check: shared interrupts must pass in a real dev-ID,
@@ -475,6 +535,25 @@ int request_irq(unsigned int irq, irq_handler_t handler,
475 535
476 select_smp_affinity(irq); 536 select_smp_affinity(irq);
477 537
538#ifdef CONFIG_DEBUG_SHIRQ
539 if (irqflags & IRQF_SHARED) {
540 /*
541 * It's a shared IRQ -- the driver ought to be prepared for it
542 * to happen immediately, so let's make sure....
543 * We do this before actually registering it, to make sure that
544 * a 'real' IRQ doesn't run in parallel with our fake
545 */
546 if (irqflags & IRQF_DISABLED) {
547 unsigned long flags;
548
549 local_irq_save(flags);
550 handler(irq, dev_id);
551 local_irq_restore(flags);
552 } else
553 handler(irq, dev_id);
554 }
555#endif
556
478 retval = setup_irq(irq, action); 557 retval = setup_irq(irq, action);
479 if (retval) 558 if (retval)
480 kfree(action); 559 kfree(action);
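
With CONFIG_DEBUG_SHIRQ a shared handler can now be called once at request time and once more while the line is being freed, so it must tolerate running while its device is quiescent. A sketch of the defensive pattern (struct example_dev and its ready flag are illustrative only):

#include <linux/interrupt.h>

struct example_dev {
        int ready;              /* set once the hardware is initialized */
};

static irqreturn_t example_shared_isr(int irq, void *dev_id)
{
        struct example_dev *ed = dev_id;

        /* DEBUG_SHIRQ may invoke us before/after the device is live. */
        if (!ed->ready)
                return IRQ_NONE;
        /* ... check the hardware; claim the interrupt if it was ours ... */
        return IRQ_HANDLED;
}
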
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 4baa3bbcd25a..77b7acc875c5 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -65,12 +65,11 @@ void move_native_irq(int irq)
65 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 65 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
66 return; 66 return;
67 67
68 if (likely(!(desc->status & IRQ_DISABLED))) 68 if (unlikely(desc->status & IRQ_DISABLED))
69 desc->chip->disable(irq); 69 return;
70 70
71 desc->chip->mask(irq);
71 move_masked_irq(irq); 72 move_masked_irq(irq);
72 73 desc->chip->unmask(irq);
73 if (likely(!(desc->status & IRQ_DISABLED)))
74 desc->chip->enable(irq);
75} 74}
76 75
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 61f5c717a8f5..2db91eb54ad8 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir;
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19#ifdef CONFIG_GENERIC_PENDING_IRQ
20void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
21{
22 set_balance_irq_affinity(irq, mask_val);
23
24 /*
25 * Save these away for later use. Re-progam when the
26 * interrupt is pending
27 */
28 set_pending_irq(irq, mask_val);
29}
30#else
31void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
32{
33 set_balance_irq_affinity(irq, mask_val);
34 irq_desc[irq].affinity = mask_val;
35 irq_desc[irq].chip->set_affinity(irq, mask_val);
36}
37#endif
38
39static int irq_affinity_read_proc(char *page, char **start, off_t off, 19static int irq_affinity_read_proc(char *page, char **start, off_t off,
40 int count, int *eof, void *data) 20 int count, int *eof, void *data)
41{ 21{
@@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
55 cpumask_t new_value, tmp; 35 cpumask_t new_value, tmp;
56 36
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 37 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
58 CHECK_IRQ_PER_CPU(irq_desc[irq].status)) 38 irq_balancing_disabled(irq))
59 return -EIO; 39 return -EIO;
60 40
61 err = cpumask_parse_user(buffer, count, new_value); 41 err = cpumask_parse_user(buffer, count, new_value);
@@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
73 code to set default SMP affinity. */ 53 code to set default SMP affinity. */
74 return select_smp_affinity(irq) ? -EINVAL : full_count; 54 return select_smp_affinity(irq) ? -EINVAL : full_count;
75 55
76 proc_set_irq_affinity(irq, new_value); 56 irq_set_affinity(irq, new_value);
77 57
78 return full_count; 58 return full_count;
79} 59}
@@ -136,7 +116,6 @@ void register_irq_proc(unsigned int irq)
136 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); 116 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
137 117
138 if (entry) { 118 if (entry) {
139 entry->nlink = 1;
140 entry->data = (void *)(long)irq; 119 entry->data = (void *)(long)irq;
141 entry->read_proc = irq_affinity_read_proc; 120 entry->read_proc = irq_affinity_read_proc;
142 entry->write_proc = irq_affinity_write_proc; 121 entry->write_proc = irq_affinity_write_proc;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 204ed7939e75..307c6a632ef6 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
128/* 128/*
129 * The timer is automagically restarted, when interval != 0 129 * The timer is automagically restarted, when interval != 0
130 */ 130 */
131int it_real_fn(struct hrtimer *timer) 131enum hrtimer_restart it_real_fn(struct hrtimer *timer)
132{ 132{
133 struct signal_struct *sig = 133 struct signal_struct *sig =
134 container_of(timer, struct signal_struct, real_timer); 134 container_of(timer, struct signal_struct, real_timer);
135 135
136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); 136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
137 137
138 if (sig->it_real_incr.tv64 != 0) {
139 hrtimer_forward(timer, timer->base->softirq_time,
140 sig->it_real_incr);
141 return HRTIMER_RESTART;
142 }
143 return HRTIMER_NORESTART; 138 return HRTIMER_NORESTART;
144} 139}
145 140
@@ -231,11 +226,14 @@ again:
231 spin_unlock_irq(&tsk->sighand->siglock); 226 spin_unlock_irq(&tsk->sighand->siglock);
232 goto again; 227 goto again;
233 } 228 }
234 tsk->signal->it_real_incr =
235 timeval_to_ktime(value->it_interval);
236 expires = timeval_to_ktime(value->it_value); 229 expires = timeval_to_ktime(value->it_value);
237 if (expires.tv64 != 0) 230 if (expires.tv64 != 0) {
238 hrtimer_start(timer, expires, HRTIMER_REL); 231 tsk->signal->it_real_incr =
232 timeval_to_ktime(value->it_interval);
233 hrtimer_start(timer, expires, HRTIMER_MODE_REL);
234 } else
235 tsk->signal->it_real_incr.tv64 = 0;
236
239 spin_unlock_irq(&tsk->sighand->siglock); 237 spin_unlock_irq(&tsk->sighand->siglock);
240 break; 238 break;
241 case ITIMER_VIRTUAL: 239 case ITIMER_VIRTUAL:
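
The rearranged code only latches it_real_incr when a non-zero it_value is armed, which matches the setitimer(2) contract. A hedged user-space sketch of arming a 1-second periodic ITIMER_REAL (purely illustrative, not taken from this patch):

#include <sys/time.h>

static int example_arm_itimer(void)
{
        struct itimerval v = {
                .it_value    = { .tv_sec = 1, .tv_usec = 0 },
                .it_interval = { .tv_sec = 1, .tv_usec = 0 },
        };

        /* First SIGALRM after 1s, then every 1s; a zero it_value disarms
         * the timer and the interval is ignored. */
        return setitimer(ITIMER_REAL, &v, NULL);
}
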
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 5d1d907378a2..cee419143fd4 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -32,8 +32,8 @@
32 * @gfp_mask: get_free_pages mask, passed to kmalloc() 32 * @gfp_mask: get_free_pages mask, passed to kmalloc()
33 * @lock: the lock to be used to protect the fifo buffer 33 * @lock: the lock to be used to protect the fifo buffer
34 * 34 *
35 * Do NOT pass the kfifo to kfifo_free() after use ! Simply free the 35 * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
36 * struct kfifo with kfree(). 36 * &struct kfifo with kfree().
37 */ 37 */
38struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 38struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
39 gfp_t gfp_mask, spinlock_t *lock) 39 gfp_t gfp_mask, spinlock_t *lock)
@@ -108,7 +108,7 @@ EXPORT_SYMBOL(kfifo_free);
108 * @buffer: the data to be added. 108 * @buffer: the data to be added.
109 * @len: the length of the data to be added. 109 * @len: the length of the data to be added.
110 * 110 *
111 * This function copies at most 'len' bytes from the 'buffer' into 111 * This function copies at most @len bytes from the @buffer into
112 * the FIFO depending on the free space, and returns the number of 112 * the FIFO depending on the free space, and returns the number of
113 * bytes copied. 113 * bytes copied.
114 * 114 *
@@ -155,8 +155,8 @@ EXPORT_SYMBOL(__kfifo_put);
155 * @buffer: where the data must be copied. 155 * @buffer: where the data must be copied.
156 * @len: the size of the destination buffer. 156 * @len: the size of the destination buffer.
157 * 157 *
158 * This function copies at most 'len' bytes from the FIFO into the 158 * This function copies at most @len bytes from the FIFO into the
159 * 'buffer' and returns the number of copied bytes. 159 * @buffer and returns the number of copied bytes.
160 * 160 *
161 * Note that with only one concurrent reader and one concurrent 161 * Note that with only one concurrent reader and one concurrent
162 * writer, you don't need extra locking to use these functions. 162 * writer, you don't need extra locking to use these functions.
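
For context, a rough sketch of the FIFO API these comments document, using the pre-allocating kfifo_alloc() variant; the sizes and example_* names are made up, and the spinlock only matters with concurrent producers and consumers:

#include <linux/kfifo.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/err.h>

static DEFINE_SPINLOCK(example_fifo_lock);

static int example_fifo_roundtrip(void)
{
        struct kfifo *fifo;
        unsigned char in[4] = "abc", out[4];
        unsigned int copied;

        /* The requested size is rounded up to a power of two. */
        fifo = kfifo_alloc(16, GFP_KERNEL, &example_fifo_lock);
        if (IS_ERR(fifo))
                return PTR_ERR(fifo);

        kfifo_put(fifo, in, sizeof(in));            /* at most @len bytes */
        copied = kfifo_get(fifo, out, sizeof(out)); /* bytes actually read */

        kfifo_free(fifo);
        return copied;
}
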
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3a7379aa31ca..796276141e51 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -217,7 +217,10 @@ static int wait_for_helper(void *data)
217 sub_info->retval = ret; 217 sub_info->retval = ret;
218 } 218 }
219 219
220 complete(sub_info->complete); 220 if (sub_info->wait < 0)
221 kfree(sub_info);
222 else
223 complete(sub_info->complete);
221 return 0; 224 return 0;
222} 225}
223 226
@@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work)
239 pid = kernel_thread(____call_usermodehelper, sub_info, 242 pid = kernel_thread(____call_usermodehelper, sub_info,
240 CLONE_VFORK | SIGCHLD); 243 CLONE_VFORK | SIGCHLD);
241 244
245 if (wait < 0)
246 return;
247
242 if (pid < 0) { 248 if (pid < 0) {
243 sub_info->retval = pid; 249 sub_info->retval = pid;
244 complete(sub_info->complete); 250 complete(sub_info->complete);
@@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work)
253 * @envp: null-terminated environment list 259 * @envp: null-terminated environment list
254 * @session_keyring: session keyring for process (NULL for an empty keyring) 260 * @session_keyring: session keyring for process (NULL for an empty keyring)
255 * @wait: wait for the application to finish and return status. 261 * @wait: wait for the application to finish and return status.
262 * when -1, don't wait at all, but you get no useful error back when
263 * the program couldn't be exec'ed. This makes it safe to call
264 * from interrupt context.
256 * 265 *
257 * Runs a user-space application. The application is started 266 * Runs a user-space application. The application is started
258 * asynchronously if wait is not set, and runs as a child of keventd. 267 * asynchronously if wait is not set, and runs as a child of keventd.
@@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
265 struct key *session_keyring, int wait) 274 struct key *session_keyring, int wait)
266{ 275{
267 DECLARE_COMPLETION_ONSTACK(done); 276 DECLARE_COMPLETION_ONSTACK(done);
268 struct subprocess_info sub_info = { 277 struct subprocess_info *sub_info;
269 .work = __WORK_INITIALIZER(sub_info.work, 278 int retval;
270 __call_usermodehelper),
271 .complete = &done,
272 .path = path,
273 .argv = argv,
274 .envp = envp,
275 .ring = session_keyring,
276 .wait = wait,
277 .retval = 0,
278 };
279 279
280 if (!khelper_wq) 280 if (!khelper_wq)
281 return -EBUSY; 281 return -EBUSY;
@@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
283 if (path[0] == '\0') 283 if (path[0] == '\0')
284 return 0; 284 return 0;
285 285
286 queue_work(khelper_wq, &sub_info.work); 286 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
287 if (!sub_info)
288 return -ENOMEM;
289
290 INIT_WORK(&sub_info->work, __call_usermodehelper);
291 sub_info->complete = &done;
292 sub_info->path = path;
293 sub_info->argv = argv;
294 sub_info->envp = envp;
295 sub_info->ring = session_keyring;
296 sub_info->wait = wait;
297
298 queue_work(khelper_wq, &sub_info->work);
299 if (wait < 0) /* task has freed sub_info */
300 return 0;
287 wait_for_completion(&done); 301 wait_for_completion(&done);
288 return sub_info.retval; 302 retval = sub_info->retval;
303 kfree(sub_info);
304 return retval;
289} 305}
290EXPORT_SYMBOL(call_usermodehelper_keys); 306EXPORT_SYMBOL(call_usermodehelper_keys);
291 307
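
The new wait < 0 mode never touches the on-stack completion, which is what makes it callable from atomic context. A hedged sketch of a fire-and-forget invocation (the helper path and arguments are invented):

#include <linux/kmod.h>

static void example_notify_userspace(void)
{
        char *argv[] = { "/sbin/example-helper", "event", NULL };
        char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

        /* wait == -1: do not wait; no exit status is reported back. */
        call_usermodehelper(argv[0], argv, envp, -1);
}
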
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6fcf8dd148d0..d25a9ada3f8e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -39,6 +39,8 @@
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
41#include <linux/freezer.h> 41#include <linux/freezer.h>
42#include <linux/seq_file.h>
43#include <linux/debugfs.h>
42#include <asm-generic/sections.h> 44#include <asm-generic/sections.h>
43#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
44#include <asm/errno.h> 46#include <asm/errno.h>
@@ -778,6 +780,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
778 return -ENOSYS; 780 return -ENOSYS;
779} 781}
780 782
783static int __kprobes pre_handler_kretprobe(struct kprobe *p,
784 struct pt_regs *regs)
785{
786 return 0;
787}
788
781#endif /* ARCH_SUPPORTS_KRETPROBES */ 789#endif /* ARCH_SUPPORTS_KRETPROBES */
782 790
783void __kprobes unregister_kretprobe(struct kretprobe *rp) 791void __kprobes unregister_kretprobe(struct kretprobe *rp)
@@ -815,7 +823,109 @@ static int __init init_kprobes(void)
815 return err; 823 return err;
816} 824}
817 825
818__initcall(init_kprobes); 826#ifdef CONFIG_DEBUG_FS
827static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
828 const char *sym, int offset, char *modname)
829{
830 char *kprobe_type;
831
832 if (p->pre_handler == pre_handler_kretprobe)
833 kprobe_type = "r";
834 else if (p->pre_handler == setjmp_pre_handler)
835 kprobe_type = "j";
836 else
837 kprobe_type = "k";
838 if (sym)
839 seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type,
840 sym, offset, (modname ? modname : " "));
841 else
842 seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr);
843}
844
845static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
846{
847 return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
848}
849
850static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
851{
852 (*pos)++;
853 if (*pos >= KPROBE_TABLE_SIZE)
854 return NULL;
855 return pos;
856}
857
858static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
859{
860 /* Nothing to do */
861}
862
863static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
864{
865 struct hlist_head *head;
866 struct hlist_node *node;
867 struct kprobe *p, *kp;
868 const char *sym = NULL;
869 unsigned int i = *(loff_t *) v;
870 unsigned long size, offset = 0;
871 char *modname, namebuf[128];
872
873 head = &kprobe_table[i];
874 preempt_disable();
875 hlist_for_each_entry_rcu(p, node, head, hlist) {
876 sym = kallsyms_lookup((unsigned long)p->addr, &size,
877 &offset, &modname, namebuf);
878 if (p->pre_handler == aggr_pre_handler) {
879 list_for_each_entry_rcu(kp, &p->list, list)
880 report_probe(pi, kp, sym, offset, modname);
881 } else
882 report_probe(pi, p, sym, offset, modname);
883 }
884 preempt_enable();
885 return 0;
886}
887
888static struct seq_operations kprobes_seq_ops = {
889 .start = kprobe_seq_start,
890 .next = kprobe_seq_next,
891 .stop = kprobe_seq_stop,
892 .show = show_kprobe_addr
893};
894
895static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
896{
897 return seq_open(filp, &kprobes_seq_ops);
898}
899
900static struct file_operations debugfs_kprobes_operations = {
901 .open = kprobes_open,
902 .read = seq_read,
903 .llseek = seq_lseek,
904 .release = seq_release,
905};
906
907static int __kprobes debugfs_kprobe_init(void)
908{
909 struct dentry *dir, *file;
910
911 dir = debugfs_create_dir("kprobes", NULL);
912 if (!dir)
913 return -ENOMEM;
914
915 file = debugfs_create_file("list", 0444, dir, 0,
916 &debugfs_kprobes_operations);
917 if (!file) {
918 debugfs_remove(dir);
919 return -ENOMEM;
920 }
921
922 return 0;
923}
924
925late_initcall(debugfs_kprobe_init);
926#endif /* CONFIG_DEBUG_FS */
927
928module_init(init_kprobes);
819 929
820EXPORT_SYMBOL_GPL(register_kprobe); 930EXPORT_SYMBOL_GPL(register_kprobe);
821EXPORT_SYMBOL_GPL(unregister_kprobe); 931EXPORT_SYMBOL_GPL(unregister_kprobe);
@@ -824,4 +934,3 @@ EXPORT_SYMBOL_GPL(unregister_jprobe);
824EXPORT_SYMBOL_GPL(jprobe_return); 934EXPORT_SYMBOL_GPL(jprobe_return);
825EXPORT_SYMBOL_GPL(register_kretprobe); 935EXPORT_SYMBOL_GPL(register_kretprobe);
826EXPORT_SYMBOL_GPL(unregister_kretprobe); 936EXPORT_SYMBOL_GPL(unregister_kretprobe);
827
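
The type letter printed by report_probe() reflects how the probe was registered: "k" for a plain kprobe, "j" for a jprobe, "r" for a kretprobe. A hedged sketch of a plain kprobe that would appear with type "k" in the new list; the do_fork target and example_* names are illustrative, and the .symbol_name field of struct kprobe is assumed to be available:

#include <linux/kprobes.h>
#include <linux/init.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
        /* Runs just before the probed instruction; 0 = continue normally. */
        return 0;
}

static struct kprobe example_kp = {
        .symbol_name = "do_fork",       /* hypothetical probe target */
        .pre_handler = example_pre,
};

static int __init example_kprobe_init(void)
{
        /* On success the probe shows up as "<addr> k do_fork+0x0". */
        return register_kprobe(&example_kp);
}
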
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1db8c72d0d38..87c50ccd1d4e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -50,7 +50,7 @@ static struct kthread_stop_info kthread_stop_info;
50/** 50/**
51 * kthread_should_stop - should this kthread return now? 51 * kthread_should_stop - should this kthread return now?
52 * 52 *
53 * When someone calls kthread_stop on your kthread, it will be woken 53 * When someone calls kthread_stop() on your kthread, it will be woken
54 * and this will return true. You should then return, and your return 54 * and this will return true. You should then return, and your return
55 * value will be passed through to kthread_stop(). 55 * value will be passed through to kthread_stop().
56 */ 56 */
@@ -143,7 +143,7 @@ static void keventd_create_kthread(struct work_struct *work)
143 * it. See also kthread_run(), kthread_create_on_cpu(). 143 * it. See also kthread_run(), kthread_create_on_cpu().
144 * 144 *
145 * When woken, the thread will run @threadfn() with @data as its 145 * When woken, the thread will run @threadfn() with @data as its
146 * argument. @threadfn can either call do_exit() directly if it is a 146 * argument. @threadfn() can either call do_exit() directly if it is a
147 * standalone thread for which no one will call kthread_stop(), or 147 * standalone thread for which no one will call kthread_stop(), or
148 * return when 'kthread_should_stop()' is true (which means 148 * return when 'kthread_should_stop()' is true (which means
149 * kthread_stop() has been called). The return value should be zero 149 * kthread_stop() has been called). The return value should be zero
@@ -192,7 +192,7 @@ EXPORT_SYMBOL(kthread_create);
192 * 192 *
193 * Description: This function is equivalent to set_cpus_allowed(), 193 * Description: This function is equivalent to set_cpus_allowed(),
194 * except that @cpu doesn't need to be online, and the thread must be 194 * except that @cpu doesn't need to be online, and the thread must be
195 * stopped (i.e., just returned from kthread_create(). 195 * stopped (i.e., just returned from kthread_create()).
196 */ 196 */
197void kthread_bind(struct task_struct *k, unsigned int cpu) 197void kthread_bind(struct task_struct *k, unsigned int cpu)
198{ 198{
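
A condensed sketch of the kthread_create()/kthread_stop() contract these comments describe; the example_* names and the one-second period are arbitrary:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *example_task;

static int example_thread_fn(void *data)
{
        while (!kthread_should_stop()) {
                /* ... periodic work ... */
                schedule_timeout_interruptible(HZ);
        }
        return 0;       /* handed back to kthread_stop() */
}

static int example_start(void)
{
        example_task = kthread_run(example_thread_fn, NULL, "example");
        return IS_ERR(example_task) ? PTR_ERR(example_task) : 0;
}

static void example_stop(void)
{
        kthread_stop(example_task);
}
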
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 509efd49540f..a08a17218dfa 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -70,6 +70,9 @@ static int graph_lock(void)
70 70
71static inline int graph_unlock(void) 71static inline int graph_unlock(void)
72{ 72{
73 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
74 return DEBUG_LOCKS_WARN_ON(1);
75
73 __raw_spin_unlock(&lockdep_lock); 76 __raw_spin_unlock(&lockdep_lock);
74 return 0; 77 return 0;
75} 78}
@@ -487,7 +490,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
487 * Add a new dependency to the head of the list: 490 * Add a new dependency to the head of the list:
488 */ 491 */
489static int add_lock_to_list(struct lock_class *class, struct lock_class *this, 492static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
490 struct list_head *head, unsigned long ip) 493 struct list_head *head, unsigned long ip, int distance)
491{ 494{
492 struct lock_list *entry; 495 struct lock_list *entry;
493 /* 496 /*
@@ -499,6 +502,7 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
499 return 0; 502 return 0;
500 503
501 entry->class = this; 504 entry->class = this;
505 entry->distance = distance;
502 if (!save_trace(&entry->trace)) 506 if (!save_trace(&entry->trace))
503 return 0; 507 return 0;
504 508
@@ -712,6 +716,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
712 struct lock_list *entry; 716 struct lock_list *entry;
713 int ret; 717 int ret;
714 718
719 if (!__raw_spin_is_locked(&lockdep_lock))
720 return DEBUG_LOCKS_WARN_ON(1);
721
715 if (depth > max_recursion_depth) 722 if (depth > max_recursion_depth)
716 max_recursion_depth = depth; 723 max_recursion_depth = depth;
717 if (depth >= RECURSION_LIMIT) 724 if (depth >= RECURSION_LIMIT)
@@ -900,7 +907,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
900 */ 907 */
901static int 908static int
902check_prev_add(struct task_struct *curr, struct held_lock *prev, 909check_prev_add(struct task_struct *curr, struct held_lock *prev,
903 struct held_lock *next) 910 struct held_lock *next, int distance)
904{ 911{
905 struct lock_list *entry; 912 struct lock_list *entry;
906 int ret; 913 int ret;
@@ -978,8 +985,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
978 * L2 added to its dependency list, due to the first chain.) 985 * L2 added to its dependency list, due to the first chain.)
979 */ 986 */
980 list_for_each_entry(entry, &prev->class->locks_after, entry) { 987 list_for_each_entry(entry, &prev->class->locks_after, entry) {
981 if (entry->class == next->class) 988 if (entry->class == next->class) {
989 if (distance == 1)
990 entry->distance = 1;
982 return 2; 991 return 2;
992 }
983 } 993 }
984 994
985 /* 995 /*
@@ -987,12 +997,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
987 * to the previous lock's dependency list: 997 * to the previous lock's dependency list:
988 */ 998 */
989 ret = add_lock_to_list(prev->class, next->class, 999 ret = add_lock_to_list(prev->class, next->class,
990 &prev->class->locks_after, next->acquire_ip); 1000 &prev->class->locks_after, next->acquire_ip, distance);
1001
991 if (!ret) 1002 if (!ret)
992 return 0; 1003 return 0;
993 1004
994 ret = add_lock_to_list(next->class, prev->class, 1005 ret = add_lock_to_list(next->class, prev->class,
995 &next->class->locks_before, next->acquire_ip); 1006 &next->class->locks_before, next->acquire_ip, distance);
996 if (!ret) 1007 if (!ret)
997 return 0; 1008 return 0;
998 1009
@@ -1040,13 +1051,14 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1040 goto out_bug; 1051 goto out_bug;
1041 1052
1042 for (;;) { 1053 for (;;) {
1054 int distance = curr->lockdep_depth - depth + 1;
1043 hlock = curr->held_locks + depth-1; 1055 hlock = curr->held_locks + depth-1;
1044 /* 1056 /*
1045 * Only non-recursive-read entries get new dependencies 1057 * Only non-recursive-read entries get new dependencies
1046 * added: 1058 * added:
1047 */ 1059 */
1048 if (hlock->read != 2) { 1060 if (hlock->read != 2) {
1049 if (!check_prev_add(curr, hlock, next)) 1061 if (!check_prev_add(curr, hlock, next, distance))
1050 return 0; 1062 return 0;
1051 /* 1063 /*
1052 * Stop after the first non-trylock entry, 1064 * Stop after the first non-trylock entry,
@@ -1293,7 +1305,8 @@ out_unlock_set:
1293 if (!subclass || force) 1305 if (!subclass || force)
1294 lock->class_cache = class; 1306 lock->class_cache = class;
1295 1307
1296 DEBUG_LOCKS_WARN_ON(class->subclass != subclass); 1308 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
1309 return NULL;
1297 1310
1298 return class; 1311 return class;
1299} 1312}
@@ -1308,7 +1321,8 @@ static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
1308 struct list_head *hash_head = chainhashentry(chain_key); 1321 struct list_head *hash_head = chainhashentry(chain_key);
1309 struct lock_chain *chain; 1322 struct lock_chain *chain;
1310 1323
1311 DEBUG_LOCKS_WARN_ON(!irqs_disabled()); 1324 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1325 return 0;
1312 /* 1326 /*
1313 * We can walk it lock-free, because entries only get added 1327 * We can walk it lock-free, because entries only get added
1314 * to the hash: 1328 * to the hash:
@@ -1394,7 +1408,9 @@ static void check_chain_key(struct task_struct *curr)
1394 return; 1408 return;
1395 } 1409 }
1396 id = hlock->class - lock_classes; 1410 id = hlock->class - lock_classes;
1397 DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS); 1411 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
1412 return;
1413
1398 if (prev_hlock && (prev_hlock->irq_context != 1414 if (prev_hlock && (prev_hlock->irq_context !=
1399 hlock->irq_context)) 1415 hlock->irq_context))
1400 chain_key = 0; 1416 chain_key = 0;
@@ -2205,15 +2221,24 @@ out_calc_hash:
2205 if (!check_prevs_add(curr, hlock)) 2221 if (!check_prevs_add(curr, hlock))
2206 return 0; 2222 return 0;
2207 graph_unlock(); 2223 graph_unlock();
2208 } 2224 } else
2225 /* after lookup_chain_cache(): */
2226 if (unlikely(!debug_locks))
2227 return 0;
2228
2209 curr->lockdep_depth++; 2229 curr->lockdep_depth++;
2210 check_chain_key(curr); 2230 check_chain_key(curr);
2231#ifdef CONFIG_DEBUG_LOCKDEP
2232 if (unlikely(!debug_locks))
2233 return 0;
2234#endif
2211 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 2235 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
2212 debug_locks_off(); 2236 debug_locks_off();
2213 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 2237 printk("BUG: MAX_LOCK_DEPTH too low!\n");
2214 printk("turning off the locking correctness validator.\n"); 2238 printk("turning off the locking correctness validator.\n");
2215 return 0; 2239 return 0;
2216 } 2240 }
2241
2217 if (unlikely(curr->lockdep_depth > max_lockdep_depth)) 2242 if (unlikely(curr->lockdep_depth > max_lockdep_depth))
2218 max_lockdep_depth = curr->lockdep_depth; 2243 max_lockdep_depth = curr->lockdep_depth;
2219 2244
@@ -2764,4 +2789,3 @@ void debug_show_held_locks(struct task_struct *task)
2764} 2789}
2765 2790
2766EXPORT_SYMBOL_GPL(debug_show_held_locks); 2791EXPORT_SYMBOL_GPL(debug_show_held_locks);
2767
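The lockdep.c hunks above make two related changes: each dependency entry now records a distance (how many held locks separate the two classes, so /proc/lockdep can later list only direct, distance-1 dependencies), and DEBUG_LOCKS_WARN_ON() is used as a guard whose return value the caller checks rather than as a fire-and-forget assertion. A minimal sketch of that guard pattern, with a hypothetical helper name:

/*
 * Sketch only: the DEBUG_LOCKS_WARN_ON() guard pattern used above.
 * The macro warns once, disables lock debugging and evaluates to true
 * when the condition fires, so the caller can bail out instead of
 * continuing on inconsistent state.
 */
#include <linux/debug_locks.h>
#include <linux/irqflags.h>

static int validate_chain_sketch(void)		/* hypothetical */
{
	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
		return 0;			/* give up cleanly */

	/* ... the real validation work would run here ... */
	return 1;
}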
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index b554b40a4aa6..58f35e586ee3 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -10,7 +10,6 @@
10 * Code for /proc/lockdep and /proc/lockdep_stats: 10 * Code for /proc/lockdep and /proc/lockdep_stats:
11 * 11 *
12 */ 12 */
13#include <linux/sched.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/proc_fs.h> 14#include <linux/proc_fs.h>
16#include <linux/seq_file.h> 15#include <linux/seq_file.h>
@@ -77,12 +76,29 @@ static unsigned long count_backward_deps(struct lock_class *class)
77 return ret; 76 return ret;
78} 77}
79 78
79static void print_name(struct seq_file *m, struct lock_class *class)
80{
81 char str[128];
82 const char *name = class->name;
83
84 if (!name) {
85 name = __get_key_name(class->key, str);
86 seq_printf(m, "%s", name);
87 } else{
88 seq_printf(m, "%s", name);
89 if (class->name_version > 1)
90 seq_printf(m, "#%d", class->name_version);
91 if (class->subclass)
92 seq_printf(m, "/%d", class->subclass);
93 }
94}
95
80static int l_show(struct seq_file *m, void *v) 96static int l_show(struct seq_file *m, void *v)
81{ 97{
82 unsigned long nr_forward_deps, nr_backward_deps; 98 unsigned long nr_forward_deps, nr_backward_deps;
83 struct lock_class *class = m->private; 99 struct lock_class *class = m->private;
84 char str[128], c1, c2, c3, c4; 100 struct lock_list *entry;
85 const char *name; 101 char c1, c2, c3, c4;
86 102
87 seq_printf(m, "%p", class->key); 103 seq_printf(m, "%p", class->key);
88#ifdef CONFIG_DEBUG_LOCKDEP 104#ifdef CONFIG_DEBUG_LOCKDEP
@@ -97,16 +113,16 @@ static int l_show(struct seq_file *m, void *v)
97 get_usage_chars(class, &c1, &c2, &c3, &c4); 113 get_usage_chars(class, &c1, &c2, &c3, &c4);
98 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); 114 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
99 115
100 name = class->name; 116 seq_printf(m, ": ");
101 if (!name) { 117 print_name(m, class);
102 name = __get_key_name(class->key, str); 118 seq_puts(m, "\n");
103 seq_printf(m, ": %s", name); 119
104 } else{ 120 list_for_each_entry(entry, &class->locks_after, entry) {
105 seq_printf(m, ": %s", name); 121 if (entry->distance == 1) {
106 if (class->name_version > 1) 122 seq_printf(m, " -> [%p] ", entry->class);
107 seq_printf(m, "#%d", class->name_version); 123 print_name(m, entry->class);
108 if (class->subclass) 124 seq_puts(m, "\n");
109 seq_printf(m, "/%d", class->subclass); 125 }
110 } 126 }
111 seq_puts(m, "\n"); 127 seq_puts(m, "\n");
112 128
@@ -227,7 +243,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
227 243
228 sum_forward_deps += count_forward_deps(class); 244 sum_forward_deps += count_forward_deps(class);
229 } 245 }
230#ifdef CONFIG_LOCKDEP_DEBUG 246#ifdef CONFIG_DEBUG_LOCKDEP
231 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 247 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
232#endif 248#endif
233 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 249 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
diff --git a/kernel/module.c b/kernel/module.c
index d0f2260a0210..f77e893e4620 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -537,6 +537,8 @@ static int already_uses(struct module *a, struct module *b)
537static int use_module(struct module *a, struct module *b) 537static int use_module(struct module *a, struct module *b)
538{ 538{
539 struct module_use *use; 539 struct module_use *use;
540 int no_warn;
541
540 if (b == NULL || already_uses(a, b)) return 1; 542 if (b == NULL || already_uses(a, b)) return 1;
541 543
542 if (!strong_try_module_get(b)) 544 if (!strong_try_module_get(b))
@@ -552,6 +554,7 @@ static int use_module(struct module *a, struct module *b)
552 554
553 use->module_which_uses = a; 555 use->module_which_uses = a;
554 list_add(&use->list, &b->modules_which_use_me); 556 list_add(&use->list, &b->modules_which_use_me);
557 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
555 return 1; 558 return 1;
556} 559}
557 560
@@ -569,6 +572,7 @@ static void module_unload_free(struct module *mod)
569 module_put(i); 572 module_put(i);
570 list_del(&use->list); 573 list_del(&use->list);
571 kfree(use); 574 kfree(use);
575 sysfs_remove_link(i->holders_dir, mod->name);
572 /* There can be at most one match. */ 576 /* There can be at most one match. */
573 break; 577 break;
574 } 578 }
@@ -1064,7 +1068,8 @@ static inline void remove_sect_attrs(struct module *mod)
1064} 1068}
1065#endif /* CONFIG_KALLSYMS */ 1069#endif /* CONFIG_KALLSYMS */
1066 1070
1067static int module_add_modinfo_attrs(struct module *mod) 1071#ifdef CONFIG_SYSFS
1072int module_add_modinfo_attrs(struct module *mod)
1068{ 1073{
1069 struct module_attribute *attr; 1074 struct module_attribute *attr;
1070 struct module_attribute *temp_attr; 1075 struct module_attribute *temp_attr;
@@ -1090,7 +1095,7 @@ static int module_add_modinfo_attrs(struct module *mod)
1090 return error; 1095 return error;
1091} 1096}
1092 1097
1093static void module_remove_modinfo_attrs(struct module *mod) 1098void module_remove_modinfo_attrs(struct module *mod)
1094{ 1099{
1095 struct module_attribute *attr; 1100 struct module_attribute *attr;
1096 int i; 1101 int i;
@@ -1105,10 +1110,10 @@ static void module_remove_modinfo_attrs(struct module *mod)
1105 } 1110 }
1106 kfree(mod->modinfo_attrs); 1111 kfree(mod->modinfo_attrs);
1107} 1112}
1113#endif
1108 1114
1109static int mod_sysfs_setup(struct module *mod, 1115#ifdef CONFIG_SYSFS
1110 struct kernel_param *kparam, 1116int mod_sysfs_init(struct module *mod)
1111 unsigned int num_params)
1112{ 1117{
1113 int err; 1118 int err;
1114 1119
@@ -1125,21 +1130,30 @@ static int mod_sysfs_setup(struct module *mod,
1125 kobj_set_kset_s(&mod->mkobj, module_subsys); 1130 kobj_set_kset_s(&mod->mkobj, module_subsys);
1126 mod->mkobj.mod = mod; 1131 mod->mkobj.mod = mod;
1127 1132
1128 /* delay uevent until full sysfs population */
1129 kobject_init(&mod->mkobj.kobj); 1133 kobject_init(&mod->mkobj.kobj);
1134
1135out:
1136 return err;
1137}
1138
1139int mod_sysfs_setup(struct module *mod,
1140 struct kernel_param *kparam,
1141 unsigned int num_params)
1142{
1143 int err;
1144
1145 /* delay uevent until full sysfs population */
1130 err = kobject_add(&mod->mkobj.kobj); 1146 err = kobject_add(&mod->mkobj.kobj);
1131 if (err) 1147 if (err)
1132 goto out; 1148 goto out;
1133 1149
1134 mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers"); 1150 mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders");
1135 if (!mod->drivers_dir) { 1151 if (!mod->holders_dir)
1136 err = -ENOMEM;
1137 goto out_unreg; 1152 goto out_unreg;
1138 }
1139 1153
1140 err = module_param_sysfs_setup(mod, kparam, num_params); 1154 err = module_param_sysfs_setup(mod, kparam, num_params);
1141 if (err) 1155 if (err)
1142 goto out_unreg_drivers; 1156 goto out_unreg_holders;
1143 1157
1144 err = module_add_modinfo_attrs(mod); 1158 err = module_add_modinfo_attrs(mod);
1145 if (err) 1159 if (err)
@@ -1150,21 +1164,22 @@ static int mod_sysfs_setup(struct module *mod,
1150 1164
1151out_unreg_param: 1165out_unreg_param:
1152 module_param_sysfs_remove(mod); 1166 module_param_sysfs_remove(mod);
1153out_unreg_drivers: 1167out_unreg_holders:
1154 kobject_unregister(mod->drivers_dir); 1168 kobject_unregister(mod->holders_dir);
1155out_unreg: 1169out_unreg:
1156 kobject_del(&mod->mkobj.kobj); 1170 kobject_del(&mod->mkobj.kobj);
1157 kobject_put(&mod->mkobj.kobj); 1171 kobject_put(&mod->mkobj.kobj);
1158out: 1172out:
1159 return err; 1173 return err;
1160} 1174}
1175#endif
1161 1176
1162static void mod_kobject_remove(struct module *mod) 1177static void mod_kobject_remove(struct module *mod)
1163{ 1178{
1164 module_remove_modinfo_attrs(mod); 1179 module_remove_modinfo_attrs(mod);
1165 module_param_sysfs_remove(mod); 1180 module_param_sysfs_remove(mod);
1166 kobject_unregister(mod->drivers_dir); 1181 kobject_unregister(mod->mkobj.drivers_dir);
1167 1182 kobject_unregister(mod->holders_dir);
1168 kobject_unregister(&mod->mkobj.kobj); 1183 kobject_unregister(&mod->mkobj.kobj);
1169} 1184}
1170 1185
@@ -1768,6 +1783,10 @@ static struct module *load_module(void __user *umod,
1768 /* Now we've moved module, initialize linked lists, etc. */ 1783 /* Now we've moved module, initialize linked lists, etc. */
1769 module_unload_init(mod); 1784 module_unload_init(mod);
1770 1785
1786 /* Initialize kobject, so we can reference it. */
1787 if (mod_sysfs_init(mod) != 0)
1788 goto cleanup;
1789
1771 /* Set up license info based on the info section */ 1790 /* Set up license info based on the info section */
1772 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1791 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1773 1792
@@ -2327,6 +2346,7 @@ void print_modules(void)
2327 printk("\n"); 2346 printk("\n");
2328} 2347}
2329 2348
2349#ifdef CONFIG_SYSFS
2330static char *make_driver_name(struct device_driver *drv) 2350static char *make_driver_name(struct device_driver *drv)
2331{ 2351{
2332 char *driver_name; 2352 char *driver_name;
@@ -2340,19 +2360,43 @@ static char *make_driver_name(struct device_driver *drv)
2340 return driver_name; 2360 return driver_name;
2341} 2361}
2342 2362
2363static void module_create_drivers_dir(struct module_kobject *mk)
2364{
2365 if (!mk || mk->drivers_dir)
2366 return;
2367
2368 mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers");
2369}
2370
2343void module_add_driver(struct module *mod, struct device_driver *drv) 2371void module_add_driver(struct module *mod, struct device_driver *drv)
2344{ 2372{
2345 char *driver_name; 2373 char *driver_name;
2346 int no_warn; 2374 int no_warn;
2375 struct module_kobject *mk = NULL;
2347 2376
2348 if (!mod || !drv) 2377 if (!drv)
2378 return;
2379
2380 if (mod)
2381 mk = &mod->mkobj;
2382 else if (drv->mod_name) {
2383 struct kobject *mkobj;
2384
2385 /* Lookup built-in module entry in /sys/modules */
2386 mkobj = kset_find_obj(&module_subsys.kset, drv->mod_name);
2387 if (mkobj)
2388 mk = container_of(mkobj, struct module_kobject, kobj);
2389 }
2390
2391 if (!mk)
2349 return; 2392 return;
2350 2393
2351 /* Don't check return codes; these calls are idempotent */ 2394 /* Don't check return codes; these calls are idempotent */
2352 no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); 2395 no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module");
2353 driver_name = make_driver_name(drv); 2396 driver_name = make_driver_name(drv);
2354 if (driver_name) { 2397 if (driver_name) {
2355 no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, 2398 module_create_drivers_dir(mk);
2399 no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj,
2356 driver_name); 2400 driver_name);
2357 kfree(driver_name); 2401 kfree(driver_name);
2358 } 2402 }
@@ -2367,16 +2411,23 @@ void module_remove_driver(struct device_driver *drv)
2367 return; 2411 return;
2368 2412
2369 sysfs_remove_link(&drv->kobj, "module"); 2413 sysfs_remove_link(&drv->kobj, "module");
2370 if (drv->owner && drv->owner->drivers_dir) { 2414 if (drv->owner && drv->owner->mkobj.drivers_dir) {
2371 driver_name = make_driver_name(drv); 2415 driver_name = make_driver_name(drv);
2372 if (driver_name) { 2416 if (driver_name) {
2373 sysfs_remove_link(drv->owner->drivers_dir, 2417 sysfs_remove_link(drv->owner->mkobj.drivers_dir,
2374 driver_name); 2418 driver_name);
2375 kfree(driver_name); 2419 kfree(driver_name);
2376 } 2420 }
2377 } 2421 }
2422 /*
2423 * Undo the additional reference we added in module_add_driver()
2424 * via kset_find_obj()
2425 */
2426 if (drv->mod_name)
2427 kobject_put(&drv->kobj);
2378} 2428}
2379EXPORT_SYMBOL(module_remove_driver); 2429EXPORT_SYMBOL(module_remove_driver);
2430#endif
2380 2431
2381#ifdef CONFIG_MODVERSIONS 2432#ifdef CONFIG_MODVERSIONS
2382/* Generate the signature for struct module here, too, for modversions. */ 2433/* Generate the signature for struct module here, too, for modversions. */
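The module.c changes track module users in sysfs: use_module() now links the using module into the target's holders/ directory, and module_add_driver() can attach a driver to a built-in module by looking up its /sys/module/<name> kobject by name. A hedged sketch of that lookup, assuming module_subsys is visible as in the hunk above:

/*
 * Sketch of the built-in module lookup used by module_add_driver():
 * find the named kobject in the module subsystem's kset and recover
 * the enclosing module_kobject.  kset_find_obj() takes a reference
 * that must be dropped later (module_remove_driver() above does so).
 */
#include <linux/kobject.h>
#include <linux/module.h>

static struct module_kobject *find_builtin_mk(const char *mod_name)
{
	struct kobject *kobj = kset_find_obj(&module_subsys.kset, mod_name);

	return kobj ? container_of(kobj, struct module_kobject, kobj) : NULL;
}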
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 841539d72c55..d17436cdea1b 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -13,7 +13,6 @@
13 * Released under the General Public License (GPL). 13 * Released under the General Public License (GPL).
14 */ 14 */
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <linux/sched.h>
17#include <linux/delay.h> 16#include <linux/delay.h>
18#include <linux/module.h> 17#include <linux/module.h>
19#include <linux/poison.h> 18#include <linux/poison.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index 525e365f7239..623d1828259a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -150,6 +150,7 @@ EXPORT_SYMBOL(panic);
150 * 'R' - User forced a module unload. 150 * 'R' - User forced a module unload.
151 * 'M' - Machine had a machine check experience. 151 * 'M' - Machine had a machine check experience.
152 * 'B' - System has hit bad_page. 152 * 'B' - System has hit bad_page.
153 * 'U' - Userspace-defined naughtiness.
153 * 154 *
154 * The string is overwritten by the next call to print_taint(). 155 * The string is overwritten by the next call to print_taint().
155 */ 156 */
@@ -158,13 +159,14 @@ const char *print_tainted(void)
158{ 159{
159 static char buf[20]; 160 static char buf[20];
160 if (tainted) { 161 if (tainted) {
161 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c", 162 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c",
162 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 163 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
163 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
164 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
165 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
166 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
167 tainted & TAINT_BAD_PAGE ? 'B' : ' '); 168 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
169 tainted & TAINT_USER ? 'U' : ' ');
168 } 170 }
169 else 171 else
170 snprintf(buf, sizeof(buf), "Not tainted"); 172 snprintf(buf, sizeof(buf), "Not tainted");
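print_tainted() above grows a seventh flag character, 'U', for the new TAINT_USER bit. Taint flags are plain bits in the global tainted mask; a caller would raise the new one roughly as below, assuming the add_taint() helper declared in <linux/kernel.h>:

/* Sketch: setting the new TAINT_USER bit so print_tainted() reports 'U'. */
#include <linux/kernel.h>

static void mark_user_taint(void)
{
	add_taint(TAINT_USER);
}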
diff --git a/kernel/params.c b/kernel/params.c
index 718945da8f58..e265b13195b1 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -389,6 +389,7 @@ struct module_param_attrs
389 struct param_attribute attrs[0]; 389 struct param_attribute attrs[0];
390}; 390};
391 391
392#ifdef CONFIG_SYSFS
392#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 393#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
393 394
394static ssize_t param_attr_show(struct module_attribute *mattr, 395static ssize_t param_attr_show(struct module_attribute *mattr,
@@ -424,6 +425,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
424 return len; 425 return len;
425 return err; 426 return err;
426} 427}
428#endif
427 429
428#ifdef CONFIG_MODULES 430#ifdef CONFIG_MODULES
429#define __modinit 431#define __modinit
@@ -431,6 +433,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
431#define __modinit __init 433#define __modinit __init
432#endif 434#endif
433 435
436#ifdef CONFIG_SYSFS
434/* 437/*
435 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME 438 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
436 * @mk: struct module_kobject (contains parent kobject) 439 * @mk: struct module_kobject (contains parent kobject)
@@ -498,9 +501,7 @@ param_sysfs_setup(struct module_kobject *mk,
498 return mp; 501 return mp;
499} 502}
500 503
501
502#ifdef CONFIG_MODULES 504#ifdef CONFIG_MODULES
503
504/* 505/*
505 * module_param_sysfs_setup - setup sysfs support for one module 506 * module_param_sysfs_setup - setup sysfs support for one module
506 * @mod: module 507 * @mod: module
@@ -561,14 +562,11 @@ static void __init kernel_param_sysfs_setup(const char *name,
561 mk->mod = THIS_MODULE; 562 mk->mod = THIS_MODULE;
562 kobj_set_kset_s(mk, module_subsys); 563 kobj_set_kset_s(mk, module_subsys);
563 kobject_set_name(&mk->kobj, name); 564 kobject_set_name(&mk->kobj, name);
564 ret = kobject_register(&mk->kobj); 565 kobject_init(&mk->kobj);
566 ret = kobject_add(&mk->kobj);
565 BUG_ON(ret < 0); 567 BUG_ON(ret < 0);
566 568 param_sysfs_setup(mk, kparam, num_params, name_skip);
567 /* no need to keep the kobject if no parameter is exported */ 569 kobject_uevent(&mk->kobj, KOBJ_ADD);
568 if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
569 kobject_unregister(&mk->kobj);
570 kfree(mk);
571 }
572} 570}
573 571
574/* 572/*
@@ -626,7 +624,6 @@ static void __init param_sysfs_builtin(void)
626 624
627 625
628/* module-related sysfs stuff */ 626/* module-related sysfs stuff */
629#ifdef CONFIG_SYSFS
630 627
631#define to_module_attr(n) container_of(n, struct module_attribute, attr); 628#define to_module_attr(n) container_of(n, struct module_attribute, attr);
632#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 629#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
@@ -674,19 +671,27 @@ static struct sysfs_ops module_sysfs_ops = {
674 .store = module_attr_store, 671 .store = module_attr_store,
675}; 672};
676 673
677#else 674static struct kobj_type module_ktype;
678static struct sysfs_ops module_sysfs_ops = { 675
679 .show = NULL, 676static int uevent_filter(struct kset *kset, struct kobject *kobj)
680 .store = NULL, 677{
678 struct kobj_type *ktype = get_ktype(kobj);
679
680 if (ktype == &module_ktype)
681 return 1;
682 return 0;
683}
684
685static struct kset_uevent_ops module_uevent_ops = {
686 .filter = uevent_filter,
681}; 687};
682#endif 688
689decl_subsys(module, &module_ktype, &module_uevent_ops);
683 690
684static struct kobj_type module_ktype = { 691static struct kobj_type module_ktype = {
685 .sysfs_ops = &module_sysfs_ops, 692 .sysfs_ops = &module_sysfs_ops,
686}; 693};
687 694
688decl_subsys(module, &module_ktype, NULL);
689
690/* 695/*
691 * param_sysfs_init - wrapper for built-in params support 696 * param_sysfs_init - wrapper for built-in params support
692 */ 697 */
@@ -707,6 +712,15 @@ static int __init param_sysfs_init(void)
707} 712}
708subsys_initcall(param_sysfs_init); 713subsys_initcall(param_sysfs_init);
709 714
715#else
716#if 0
717static struct sysfs_ops module_sysfs_ops = {
718 .show = NULL,
719 .store = NULL,
720};
721#endif
722#endif
723
710EXPORT_SYMBOL(param_set_byte); 724EXPORT_SYMBOL(param_set_byte);
711EXPORT_SYMBOL(param_get_byte); 725EXPORT_SYMBOL(param_get_byte);
712EXPORT_SYMBOL(param_set_short); 726EXPORT_SYMBOL(param_set_short);
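Besides the CONFIG_SYSFS carve-outs, params.c gives the module subsystem uevent ops whose filter only passes kobjects of module_ktype, so the auxiliary kobjects (parameter, holders and drivers directories) do not generate uevents of their own. The filter idiom in isolation, with hypothetical names:

/*
 * Sketch of the kset uevent filter idiom used for module_subsys above:
 * only kobjects of one particular ktype are allowed to emit uevents.
 */
#include <linux/kobject.h>

static struct kobj_type example_ktype;		/* hypothetical */

static int example_uevent_filter(struct kset *kset, struct kobject *kobj)
{
	return get_ktype(kobj) == &example_ktype;
}

static struct kset_uevent_ops example_uevent_ops = {
	.filter = example_uevent_filter,
};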
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 7c3e1e6dfb5b..657f77697415 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
304 * should be able to see it. 304 * should be able to see it.
305 */ 305 */
306 struct task_struct *p; 306 struct task_struct *p;
307 read_lock(&tasklist_lock); 307 rcu_read_lock();
308 p = find_task_by_pid(pid); 308 p = find_task_by_pid(pid);
309 if (p) { 309 if (p) {
310 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 if (CPUCLOCK_PERTHREAD(which_clock)) {
@@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
312 error = cpu_clock_sample(which_clock, 312 error = cpu_clock_sample(which_clock,
313 p, &rtn); 313 p, &rtn);
314 } 314 }
315 } else if (p->tgid == pid && p->signal) { 315 } else {
316 error = cpu_clock_sample_group(which_clock, 316 read_lock(&tasklist_lock);
317 p, &rtn); 317 if (p->tgid == pid && p->signal) {
318 error =
319 cpu_clock_sample_group(which_clock,
320 p, &rtn);
321 }
322 read_unlock(&tasklist_lock);
318 } 323 }
319 } 324 }
320 read_unlock(&tasklist_lock); 325 rcu_read_unlock();
321 } 326 }
322 327
323 if (error) 328 if (error)
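The posix-cpu-timers hunk narrows the locking: the task lookup itself is only RCU-protected, and tasklist_lock is taken just for the process-wide sample. The same shape in a stripped-down sketch (the sampling calls are stand-ins):

/*
 * Sketch: RCU-protected task lookup as in posix_cpu_clock_get() above.
 * The per-thread case runs entirely under rcu_read_lock(); only the
 * thread-group case still needs tasklist_lock.
 */
#include <linux/sched.h>
#include <linux/rcupdate.h>

static int clock_sample_sketch(pid_t pid, int per_thread)
{
	struct task_struct *p;
	int error = -EINVAL;

	rcu_read_lock();
	p = find_task_by_pid(pid);		/* stable while under RCU */
	if (p) {
		if (per_thread) {
			error = 0;		/* ... cpu_clock_sample(p) ... */
		} else {
			read_lock(&tasklist_lock);
			if (p->tgid == pid && p->signal)
				error = 0;	/* ... sample the group ... */
			read_unlock(&tasklist_lock);
		}
	}
	rcu_read_unlock();
	return error;
}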
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5fe87de10ff0..44318ca71978 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
145 struct itimerspec *, struct itimerspec *); 145 struct itimerspec *, struct itimerspec *);
146static int common_timer_del(struct k_itimer *timer); 146static int common_timer_del(struct k_itimer *timer);
147 147
148static int posix_timer_fn(struct hrtimer *data); 148static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
149 149
150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
151 151
@@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
334 334
335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
336 */ 336 */
337static int posix_timer_fn(struct hrtimer *timer) 337static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
338{ 338{
339 struct k_itimer *timr; 339 struct k_itimer *timr;
340 unsigned long flags; 340 unsigned long flags;
341 int si_private = 0; 341 int si_private = 0;
342 int ret = HRTIMER_NORESTART; 342 enum hrtimer_restart ret = HRTIMER_NORESTART;
343 343
344 timr = container_of(timer, struct k_itimer, it.real.timer); 344 timr = container_of(timer, struct k_itimer, it.real.timer);
345 spin_lock_irqsave(&timr->it_lock, flags); 345 spin_lock_irqsave(&timr->it_lock, flags);
@@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer)
356 if (timr->it.real.interval.tv64 != 0) { 356 if (timr->it.real.interval.tv64 != 0) {
357 timr->it_overrun += 357 timr->it_overrun +=
358 hrtimer_forward(timer, 358 hrtimer_forward(timer,
359 timer->base->softirq_time, 359 hrtimer_cb_get_time(timer),
360 timr->it.real.interval); 360 timr->it.real.interval);
361 ret = HRTIMER_RESTART; 361 ret = HRTIMER_RESTART;
362 ++timr->it_requeue_pending; 362 ++timr->it_requeue_pending;
@@ -399,10 +399,9 @@ EXPORT_SYMBOL_GPL(register_posix_clock);
399static struct k_itimer * alloc_posix_timer(void) 399static struct k_itimer * alloc_posix_timer(void)
400{ 400{
401 struct k_itimer *tmr; 401 struct k_itimer *tmr;
402 tmr = kmem_cache_alloc(posix_timers_cache, GFP_KERNEL); 402 tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
403 if (!tmr) 403 if (!tmr)
404 return tmr; 404 return tmr;
405 memset(tmr, 0, sizeof (struct k_itimer));
406 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { 405 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
407 kmem_cache_free(posix_timers_cache, tmr); 406 kmem_cache_free(posix_timers_cache, tmr);
408 tmr = NULL; 407 tmr = NULL;
@@ -723,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags,
723 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) 722 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
724 return 0; 723 return 0;
725 724
726 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; 725 mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
727 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 726 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
728 timr->it.real.timer.function = posix_timer_fn; 727 timr->it.real.timer.function = posix_timer_fn;
729 728
@@ -735,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags,
735 /* SIGEV_NONE timers are not queued ! See common_timer_get */ 734 /* SIGEV_NONE timers are not queued ! See common_timer_get */
736 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 735 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
737 /* Setup correct expiry time for relative timers */ 736 /* Setup correct expiry time for relative timers */
738 if (mode == HRTIMER_REL) 737 if (mode == HRTIMER_MODE_REL)
739 timer->expires = ktime_add(timer->expires, 738 timer->expires = ktime_add(timer->expires,
740 timer->base->get_time()); 739 timer->base->get_time());
741 return 0; 740 return 0;
@@ -951,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags,
951 struct timespec *tsave, struct timespec __user *rmtp) 950 struct timespec *tsave, struct timespec __user *rmtp)
952{ 951{
953 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 952 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
954 HRTIMER_ABS : HRTIMER_REL, which_clock); 953 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
954 which_clock);
955} 955}
956 956
957asmlinkage long 957asmlinkage long
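The posix-timers hunks follow the hrtimer API changes of this series: callbacks now return enum hrtimer_restart, the relative/absolute modes are spelled HRTIMER_MODE_REL/HRTIMER_MODE_ABS, and a rearming callback reads its base time via hrtimer_cb_get_time(). A sketch of a periodic hrtimer client written against that API (my_timer, my_period and the callback name are made up):

/*
 * Sketch of an hrtimer user written against the API as changed above:
 * HRTIMER_MODE_REL for a relative start, a callback returning
 * enum hrtimer_restart, and hrtimer_forward()/hrtimer_cb_get_time()
 * to rearm a periodic timer.
 */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer my_timer;			/* hypothetical */
static ktime_t my_period;

static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
	/* ... do the periodic work here ... */
	hrtimer_forward(timer, hrtimer_cb_get_time(timer), my_period);
	return HRTIMER_RESTART;
}

static void my_timer_start(void)
{
	my_period = ktime_set(1, 0);		/* 1 second */
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	my_timer.function = my_timer_fn;
	hrtimer_start(&my_timer, my_period, HRTIMER_MODE_REL);
}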
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ed296225dcd4..95f6657fff73 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -131,3 +131,29 @@ config SUSPEND_SMP
131 bool 131 bool
132 depends on HOTPLUG_CPU && X86 && PM 132 depends on HOTPLUG_CPU && X86 && PM
133 default y 133 default y
134
135config APM_EMULATION
136 tristate "Advanced Power Management Emulation"
137 depends on PM && SYS_SUPPORTS_APM_EMULATION
138 help
139 APM is a BIOS specification for saving power using several different
140 techniques. This is mostly useful for battery powered laptops with
141 APM compliant BIOSes. If you say Y here, the system time will be
142 reset after a RESUME operation, the /proc/apm device will provide
143 battery status information, and user-space programs will receive
144 notification of APM "events" (e.g. battery status change).
145
146 In order to use APM, you will need supporting software. For location
147 and more information, read <file:Documentation/pm.txt> and the
148 Battery Powered Linux mini-HOWTO, available from
149 <http://www.tldp.org/docs.html#howto>.
150
151 This driver does not spin down disk drives (see the hdparm(8)
152 manpage ("man 8 hdparm") for that), and it doesn't turn off
153 VESA-compliant "green" monitors.
154
155 Generally, if you don't have a battery in your machine, there isn't
156 much point in using this driver and you should say N. If you get
157 random kernel OOPSes or reboots that don't seem to be related to
158 anything, try disabling/enabling this option (or disabling/enabling
159 APM in your BIOS).
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 88fc5d7ac737..406b20adb27a 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -87,52 +87,24 @@ static inline void platform_finish(void)
87 } 87 }
88} 88}
89 89
90static void unprepare_processes(void)
91{
92 thaw_processes();
93 pm_restore_console();
94}
95
90static int prepare_processes(void) 96static int prepare_processes(void)
91{ 97{
92 int error = 0; 98 int error = 0;
93 99
94 pm_prepare_console(); 100 pm_prepare_console();
95
96 error = disable_nonboot_cpus();
97 if (error)
98 goto enable_cpus;
99
100 if (freeze_processes()) { 101 if (freeze_processes()) {
101 error = -EBUSY; 102 error = -EBUSY;
102 goto thaw; 103 unprepare_processes();
103 } 104 }
104
105 if (pm_disk_mode == PM_DISK_TESTPROC) {
106 printk("swsusp debug: Waiting for 5 seconds.\n");
107 mdelay(5000);
108 goto thaw;
109 }
110
111 error = platform_prepare();
112 if (error)
113 goto thaw;
114
115 /* Free memory before shutting down devices. */
116 if (!(error = swsusp_shrink_memory()))
117 return 0;
118
119 platform_finish();
120 thaw:
121 thaw_processes();
122 enable_cpus:
123 enable_nonboot_cpus();
124 pm_restore_console();
125 return error; 105 return error;
126} 106}
127 107
128static void unprepare_processes(void)
129{
130 platform_finish();
131 thaw_processes();
132 enable_nonboot_cpus();
133 pm_restore_console();
134}
135
136/** 108/**
137 * pm_suspend_disk - The granpappy of hibernation power management. 109 * pm_suspend_disk - The granpappy of hibernation power management.
138 * 110 *
@@ -150,29 +122,45 @@ int pm_suspend_disk(void)
150 if (error) 122 if (error)
151 return error; 123 return error;
152 124
153 if (pm_disk_mode == PM_DISK_TESTPROC) 125 if (pm_disk_mode == PM_DISK_TESTPROC) {
154 return 0; 126 printk("swsusp debug: Waiting for 5 seconds.\n");
127 mdelay(5000);
128 goto Thaw;
129 }
130 /* Free memory before shutting down devices. */
131 error = swsusp_shrink_memory();
132 if (error)
133 goto Thaw;
134
135 error = platform_prepare();
136 if (error)
137 goto Thaw;
155 138
156 suspend_console(); 139 suspend_console();
157 error = device_suspend(PMSG_FREEZE); 140 error = device_suspend(PMSG_FREEZE);
158 if (error) { 141 if (error) {
159 resume_console(); 142 printk(KERN_ERR "PM: Some devices failed to suspend\n");
160 printk("Some devices failed to suspend\n"); 143 goto Resume_devices;
161 goto Thaw;
162 } 144 }
145 error = disable_nonboot_cpus();
146 if (error)
147 goto Enable_cpus;
163 148
164 if (pm_disk_mode == PM_DISK_TEST) { 149 if (pm_disk_mode == PM_DISK_TEST) {
165 printk("swsusp debug: Waiting for 5 seconds.\n"); 150 printk("swsusp debug: Waiting for 5 seconds.\n");
166 mdelay(5000); 151 mdelay(5000);
167 goto Done; 152 goto Enable_cpus;
168 } 153 }
169 154
170 pr_debug("PM: snapshotting memory.\n"); 155 pr_debug("PM: snapshotting memory.\n");
171 in_suspend = 1; 156 in_suspend = 1;
172 if ((error = swsusp_suspend())) 157 error = swsusp_suspend();
173 goto Done; 158 if (error)
159 goto Enable_cpus;
174 160
175 if (in_suspend) { 161 if (in_suspend) {
162 enable_nonboot_cpus();
163 platform_finish();
176 device_resume(); 164 device_resume();
177 resume_console(); 165 resume_console();
178 pr_debug("PM: writing image.\n"); 166 pr_debug("PM: writing image.\n");
@@ -188,7 +176,10 @@ int pm_suspend_disk(void)
188 } 176 }
189 177
190 swsusp_free(); 178 swsusp_free();
191 Done: 179 Enable_cpus:
180 enable_nonboot_cpus();
181 Resume_devices:
182 platform_finish();
192 device_resume(); 183 device_resume();
193 resume_console(); 184 resume_console();
194 Thaw: 185 Thaw:
@@ -237,19 +228,28 @@ static int software_resume(void)
237 228
238 pr_debug("PM: Checking swsusp image.\n"); 229 pr_debug("PM: Checking swsusp image.\n");
239 230
240 if ((error = swsusp_check())) 231 error = swsusp_check();
232 if (error)
241 goto Done; 233 goto Done;
242 234
243 pr_debug("PM: Preparing processes for restore.\n"); 235 pr_debug("PM: Preparing processes for restore.\n");
244 236
245 if ((error = prepare_processes())) { 237 error = prepare_processes();
238 if (error) {
246 swsusp_close(); 239 swsusp_close();
247 goto Done; 240 goto Done;
248 } 241 }
249 242
243 error = platform_prepare();
244 if (error) {
245 swsusp_free();
246 goto Thaw;
247 }
248
250 pr_debug("PM: Reading swsusp image.\n"); 249 pr_debug("PM: Reading swsusp image.\n");
251 250
252 if ((error = swsusp_read())) { 251 error = swsusp_read();
252 if (error) {
253 swsusp_free(); 253 swsusp_free();
254 goto Thaw; 254 goto Thaw;
255 } 255 }
@@ -257,21 +257,22 @@ static int software_resume(void)
257 pr_debug("PM: Preparing devices for restore.\n"); 257 pr_debug("PM: Preparing devices for restore.\n");
258 258
259 suspend_console(); 259 suspend_console();
260 if ((error = device_suspend(PMSG_PRETHAW))) { 260 error = device_suspend(PMSG_PRETHAW);
261 resume_console(); 261 if (error)
262 printk("Some devices failed to suspend\n"); 262 goto Free;
263 swsusp_free();
264 goto Thaw;
265 }
266 263
267 mb(); 264 error = disable_nonboot_cpus();
265 if (!error)
266 swsusp_resume();
268 267
269 pr_debug("PM: Restoring saved image.\n"); 268 enable_nonboot_cpus();
270 swsusp_resume(); 269 Free:
271 pr_debug("PM: Restore failed, recovering.n"); 270 swsusp_free();
271 platform_finish();
272 device_resume(); 272 device_resume();
273 resume_console(); 273 resume_console();
274 Thaw: 274 Thaw:
275 printk(KERN_ERR "PM: Restore failed, recovering.\n");
275 unprepare_processes(); 276 unprepare_processes();
276 Done: 277 Done:
277 /* For success case, the suspend path will release the lock */ 278 /* For success case, the suspend path will release the lock */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index ff3a6182f5f0..a064dfd8877a 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -20,6 +20,7 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <linux/resume-trace.h> 21#include <linux/resume-trace.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/vmstat.h>
23 24
24#include "power.h" 25#include "power.h"
25 26
@@ -43,6 +44,11 @@ void pm_set_ops(struct pm_ops * ops)
43 mutex_unlock(&pm_mutex); 44 mutex_unlock(&pm_mutex);
44} 45}
45 46
47static inline void pm_finish(suspend_state_t state)
48{
49 if (pm_ops->finish)
50 pm_ops->finish(state);
51}
46 52
47/** 53/**
48 * suspend_prepare - Do prep work before entering low-power state. 54 * suspend_prepare - Do prep work before entering low-power state.
@@ -63,16 +69,13 @@ static int suspend_prepare(suspend_state_t state)
63 69
64 pm_prepare_console(); 70 pm_prepare_console();
65 71
66 error = disable_nonboot_cpus();
67 if (error)
68 goto Enable_cpu;
69
70 if (freeze_processes()) { 72 if (freeze_processes()) {
71 error = -EAGAIN; 73 error = -EAGAIN;
72 goto Thaw; 74 goto Thaw;
73 } 75 }
74 76
75 if ((free_pages = nr_free_pages()) < FREE_PAGE_NUMBER) { 77 if ((free_pages = global_page_state(NR_FREE_PAGES))
78 < FREE_PAGE_NUMBER) {
76 pr_debug("PM: free some memory\n"); 79 pr_debug("PM: free some memory\n");
77 shrink_all_memory(FREE_PAGE_NUMBER - free_pages); 80 shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
78 if (nr_free_pages() < FREE_PAGE_NUMBER) { 81 if (nr_free_pages() < FREE_PAGE_NUMBER) {
@@ -88,18 +91,22 @@ static int suspend_prepare(suspend_state_t state)
88 } 91 }
89 92
90 suspend_console(); 93 suspend_console();
91 if ((error = device_suspend(PMSG_SUSPEND))) { 94 error = device_suspend(PMSG_SUSPEND);
95 if (error) {
92 printk(KERN_ERR "Some devices failed to suspend\n"); 96 printk(KERN_ERR "Some devices failed to suspend\n");
93 goto Finish; 97 goto Resume_devices;
94 } 98 }
95 return 0; 99 error = disable_nonboot_cpus();
96 Finish: 100 if (!error)
97 if (pm_ops->finish) 101 return 0;
98 pm_ops->finish(state); 102
103 enable_nonboot_cpus();
104 Resume_devices:
105 pm_finish(state);
106 device_resume();
107 resume_console();
99 Thaw: 108 Thaw:
100 thaw_processes(); 109 thaw_processes();
101 Enable_cpu:
102 enable_nonboot_cpus();
103 pm_restore_console(); 110 pm_restore_console();
104 return error; 111 return error;
105} 112}
@@ -134,12 +141,11 @@ int suspend_enter(suspend_state_t state)
134 141
135static void suspend_finish(suspend_state_t state) 142static void suspend_finish(suspend_state_t state)
136{ 143{
144 enable_nonboot_cpus();
145 pm_finish(state);
137 device_resume(); 146 device_resume();
138 resume_console(); 147 resume_console();
139 thaw_processes(); 148 thaw_processes();
140 enable_nonboot_cpus();
141 if (pm_ops && pm_ops->finish)
142 pm_ops->finish(state);
143 pm_restore_console(); 149 pm_restore_console();
144} 150}
145 151
@@ -161,7 +167,10 @@ static inline int valid_state(suspend_state_t state)
161 if (state == PM_SUSPEND_DISK) 167 if (state == PM_SUSPEND_DISK)
162 return 1; 168 return 1;
163 169
164 if (pm_ops && pm_ops->valid && !pm_ops->valid(state)) 170 /* all other states need lowlevel support and need to be
171 * valid to the lowlevel implementation, no valid callback
172 * implies that all are valid. */
173 if (!pm_ops || (pm_ops->valid && !pm_ops->valid(state)))
165 return 0; 174 return 0;
166 return 1; 175 return 1;
167} 176}
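Both disk.c and main.c above converge on the same ordering: freeze processes, suspend devices, and only then take the non-boot CPUs down, unwinding in reverse order through goto labels on failure. The shape, compressed into one sketch (the actual low-power entry and platform hooks are elided; this mirrors the call order, it is not a buildable unit on its own):

/*
 * Sketch of the suspend ordering and goto unwinding the disk.c and
 * main.c hunks above settle on.
 */
#include <linux/device.h>
#include <linux/console.h>
#include <linux/freezer.h>
#include <linux/cpu.h>

static int suspend_sketch(void)
{
	int error;

	if (freeze_processes()) {
		error = -EAGAIN;
		goto Thaw;
	}
	suspend_console();
	error = device_suspend(PMSG_SUSPEND);
	if (error)
		goto Resume_devices;
	error = disable_nonboot_cpus();
	if (error)
		goto Enable_cpus;

	/* ... enter the low-power state here ... */

 Enable_cpus:
	enable_nonboot_cpus();
 Resume_devices:
	device_resume();
	resume_console();
 Thaw:
	thaw_processes();
	return error;
}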
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c024606221c4..fc53ad068128 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -591,7 +591,7 @@ static unsigned int count_free_highmem_pages(void)
591 591
592 for_each_zone(zone) 592 for_each_zone(zone)
593 if (populated_zone(zone) && is_highmem(zone)) 593 if (populated_zone(zone) && is_highmem(zone))
594 cnt += zone->free_pages; 594 cnt += zone_page_state(zone, NR_FREE_PAGES);
595 595
596 return cnt; 596 return cnt;
597} 597}
@@ -869,7 +869,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
869 for_each_zone(zone) { 869 for_each_zone(zone) {
870 meta += snapshot_additional_pages(zone); 870 meta += snapshot_additional_pages(zone);
871 if (!is_highmem(zone)) 871 if (!is_highmem(zone))
872 free += zone->free_pages; 872 free += zone_page_state(zone, NR_FREE_PAGES);
873 } 873 }
874 874
875 nr_pages += count_pages_for_highmem(nr_highmem); 875 nr_pages += count_pages_for_highmem(nr_highmem);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 31aa0390c777..7fb834397a0d 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -230,9 +230,10 @@ int swsusp_shrink_memory(void)
230 for_each_zone (zone) 230 for_each_zone (zone)
231 if (populated_zone(zone)) { 231 if (populated_zone(zone)) {
232 if (is_highmem(zone)) { 232 if (is_highmem(zone)) {
233 highmem_size -= zone->free_pages; 233 highmem_size -=
234 zone_page_state(zone, NR_FREE_PAGES);
234 } else { 235 } else {
235 tmp -= zone->free_pages; 236 tmp -= zone_page_state(zone, NR_FREE_PAGES);
236 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 237 tmp += zone->lowmem_reserve[ZONE_NORMAL];
237 tmp += snapshot_additional_pages(zone); 238 tmp += snapshot_additional_pages(zone);
238 } 239 }
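snapshot.c and swsusp.c stop reading zone->free_pages directly and use the per-zone vmstat counters instead: zone_page_state(zone, NR_FREE_PAGES), or global_page_state() for the system-wide total as in main.c above. A tiny sketch of counting free lowmem pages that way:

/*
 * Sketch: counting free non-highmem pages via the vmstat counters,
 * as the snapshot.c/swsusp.c hunks above now do.
 */
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/vmstat.h>

static unsigned long count_free_lowmem_pages(void)
{
	struct zone *zone;
	unsigned long free = 0;

	for_each_zone(zone)
		if (populated_zone(zone) && !is_highmem(zone))
			free += zone_page_state(zone, NR_FREE_PAGES);

	return free;
}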
diff --git a/kernel/power/user.c b/kernel/power/user.c
index f7b7a785a5c6..dd09efe7df54 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -37,6 +37,7 @@ static struct snapshot_data {
37 int mode; 37 int mode;
38 char frozen; 38 char frozen;
39 char ready; 39 char ready;
40 char platform_suspend;
40} snapshot_state; 41} snapshot_state;
41 42
42static atomic_t device_available = ATOMIC_INIT(1); 43static atomic_t device_available = ATOMIC_INIT(1);
@@ -66,6 +67,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
66 data->bitmap = NULL; 67 data->bitmap = NULL;
67 data->frozen = 0; 68 data->frozen = 0;
68 data->ready = 0; 69 data->ready = 0;
70 data->platform_suspend = 0;
69 71
70 return 0; 72 return 0;
71} 73}
@@ -122,6 +124,92 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
122 return res; 124 return res;
123} 125}
124 126
127static inline int platform_prepare(void)
128{
129 int error = 0;
130
131 if (pm_ops && pm_ops->prepare)
132 error = pm_ops->prepare(PM_SUSPEND_DISK);
133
134 return error;
135}
136
137static inline void platform_finish(void)
138{
139 if (pm_ops && pm_ops->finish)
140 pm_ops->finish(PM_SUSPEND_DISK);
141}
142
143static inline int snapshot_suspend(int platform_suspend)
144{
145 int error;
146
147 mutex_lock(&pm_mutex);
148 /* Free memory before shutting down devices. */
149 error = swsusp_shrink_memory();
150 if (error)
151 goto Finish;
152
153 if (platform_suspend) {
154 error = platform_prepare();
155 if (error)
156 goto Finish;
157 }
158 suspend_console();
159 error = device_suspend(PMSG_FREEZE);
160 if (error)
161 goto Resume_devices;
162
163 error = disable_nonboot_cpus();
164 if (!error) {
165 in_suspend = 1;
166 error = swsusp_suspend();
167 }
168 enable_nonboot_cpus();
169 Resume_devices:
170 if (platform_suspend)
171 platform_finish();
172
173 device_resume();
174 resume_console();
175 Finish:
176 mutex_unlock(&pm_mutex);
177 return error;
178}
179
180static inline int snapshot_restore(int platform_suspend)
181{
182 int error;
183
184 mutex_lock(&pm_mutex);
185 pm_prepare_console();
186 if (platform_suspend) {
187 error = platform_prepare();
188 if (error)
189 goto Finish;
190 }
191 suspend_console();
192 error = device_suspend(PMSG_PRETHAW);
193 if (error)
194 goto Resume_devices;
195
196 error = disable_nonboot_cpus();
197 if (!error)
198 error = swsusp_resume();
199
200 enable_nonboot_cpus();
201 Resume_devices:
202 if (platform_suspend)
203 platform_finish();
204
205 device_resume();
206 resume_console();
207 Finish:
208 pm_restore_console();
209 mutex_unlock(&pm_mutex);
210 return error;
211}
212
125static int snapshot_ioctl(struct inode *inode, struct file *filp, 213static int snapshot_ioctl(struct inode *inode, struct file *filp,
126 unsigned int cmd, unsigned long arg) 214 unsigned int cmd, unsigned long arg)
127{ 215{
@@ -145,14 +233,9 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
145 if (data->frozen) 233 if (data->frozen)
146 break; 234 break;
147 mutex_lock(&pm_mutex); 235 mutex_lock(&pm_mutex);
148 error = disable_nonboot_cpus(); 236 if (freeze_processes()) {
149 if (!error) { 237 thaw_processes();
150 error = freeze_processes(); 238 error = -EBUSY;
151 if (error) {
152 thaw_processes();
153 enable_nonboot_cpus();
154 error = -EBUSY;
155 }
156 } 239 }
157 mutex_unlock(&pm_mutex); 240 mutex_unlock(&pm_mutex);
158 if (!error) 241 if (!error)
@@ -164,7 +247,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
164 break; 247 break;
165 mutex_lock(&pm_mutex); 248 mutex_lock(&pm_mutex);
166 thaw_processes(); 249 thaw_processes();
167 enable_nonboot_cpus();
168 mutex_unlock(&pm_mutex); 250 mutex_unlock(&pm_mutex);
169 data->frozen = 0; 251 data->frozen = 0;
170 break; 252 break;
@@ -174,20 +256,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
174 error = -EPERM; 256 error = -EPERM;
175 break; 257 break;
176 } 258 }
177 mutex_lock(&pm_mutex); 259 error = snapshot_suspend(data->platform_suspend);
178 /* Free memory before shutting down devices. */
179 error = swsusp_shrink_memory();
180 if (!error) {
181 suspend_console();
182 error = device_suspend(PMSG_FREEZE);
183 if (!error) {
184 in_suspend = 1;
185 error = swsusp_suspend();
186 device_resume();
187 }
188 resume_console();
189 }
190 mutex_unlock(&pm_mutex);
191 if (!error) 260 if (!error)
192 error = put_user(in_suspend, (unsigned int __user *)arg); 261 error = put_user(in_suspend, (unsigned int __user *)arg);
193 if (!error) 262 if (!error)
@@ -201,17 +270,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
201 error = -EPERM; 270 error = -EPERM;
202 break; 271 break;
203 } 272 }
204 mutex_lock(&pm_mutex); 273 error = snapshot_restore(data->platform_suspend);
205 pm_prepare_console();
206 suspend_console();
207 error = device_suspend(PMSG_PRETHAW);
208 if (!error) {
209 error = swsusp_resume();
210 device_resume();
211 }
212 resume_console();
213 pm_restore_console();
214 mutex_unlock(&pm_mutex);
215 break; 274 break;
216 275
217 case SNAPSHOT_FREE: 276 case SNAPSHOT_FREE:
@@ -282,6 +341,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
282 break; 341 break;
283 342
284 case SNAPSHOT_S2RAM: 343 case SNAPSHOT_S2RAM:
344 if (!pm_ops) {
345 error = -ENOSYS;
346 break;
347 }
348
285 if (!data->frozen) { 349 if (!data->frozen) {
286 error = -EPERM; 350 error = -EPERM;
287 break; 351 break;
@@ -319,28 +383,35 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
319 break; 383 break;
320 384
321 case SNAPSHOT_PMOPS: 385 case SNAPSHOT_PMOPS:
386 error = -EINVAL;
387
322 switch (arg) { 388 switch (arg) {
323 389
324 case PMOPS_PREPARE: 390 case PMOPS_PREPARE:
325 if (pm_ops->prepare) { 391 if (pm_ops && pm_ops->enter) {
326 error = pm_ops->prepare(PM_SUSPEND_DISK); 392 data->platform_suspend = 1;
393 error = 0;
394 } else {
395 error = -ENOSYS;
327 } 396 }
328 break; 397 break;
329 398
330 case PMOPS_ENTER: 399 case PMOPS_ENTER:
331 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 400 if (data->platform_suspend) {
332 error = pm_ops->enter(PM_SUSPEND_DISK); 401 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
402 error = pm_ops->enter(PM_SUSPEND_DISK);
403 error = 0;
404 }
333 break; 405 break;
334 406
335 case PMOPS_FINISH: 407 case PMOPS_FINISH:
336 if (pm_ops && pm_ops->finish) { 408 if (data->platform_suspend)
337 pm_ops->finish(PM_SUSPEND_DISK); 409 error = 0;
338 } 410
339 break; 411 break;
340 412
341 default: 413 default:
342 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); 414 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
343 error = -EINVAL;
344 415
345 } 416 }
346 break; 417 break;
diff --git a/kernel/printk.c b/kernel/printk.c
index c770e1a4e882..4b47e59248df 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -54,7 +54,7 @@ int console_printk[4] = {
54}; 54};
55 55
56/* 56/*
57 * Low lever drivers may need that to know if they can schedule in 57 * Low level drivers may need that to know if they can schedule in
58 * their unblank() callback or not. So let's export it. 58 * their unblank() callback or not. So let's export it.
59 */ 59 */
60int oops_in_progress; 60int oops_in_progress;
@@ -483,7 +483,7 @@ static int have_callable_console(void)
483 * printk - print a kernel message 483 * printk - print a kernel message
484 * @fmt: format string 484 * @fmt: format string
485 * 485 *
486 * This is printk. It can be called from any context. We want it to work. 486 * This is printk(). It can be called from any context. We want it to work.
487 * 487 *
488 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 488 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
489 * call the console drivers. If we fail to get the semaphore we place the output 489 * call the console drivers. If we fail to get the semaphore we place the output
@@ -529,7 +529,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
529 zap_locks(); 529 zap_locks();
530 530
531 /* This stops the holder of console_sem just where we want him */ 531 /* This stops the holder of console_sem just where we want him */
532 local_irq_save(flags); 532 raw_local_irq_save(flags);
533 lockdep_off(); 533 lockdep_off();
534 spin_lock(&logbuf_lock); 534 spin_lock(&logbuf_lock);
535 printk_cpu = smp_processor_id(); 535 printk_cpu = smp_processor_id();
@@ -618,7 +618,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
618 up(&console_sem); 618 up(&console_sem);
619 } 619 }
620 lockdep_on(); 620 lockdep_on();
621 local_irq_restore(flags); 621 raw_local_irq_restore(flags);
622 } else { 622 } else {
623 /* 623 /*
624 * Someone else owns the drivers. We drop the spinlock, which 624 * Someone else owns the drivers. We drop the spinlock, which
@@ -628,7 +628,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
628 printk_cpu = UINT_MAX; 628 printk_cpu = UINT_MAX;
629 spin_unlock(&logbuf_lock); 629 spin_unlock(&logbuf_lock);
630 lockdep_on(); 630 lockdep_on();
631 local_irq_restore(flags); 631 raw_local_irq_restore(flags);
632 } 632 }
633 633
634 preempt_enable(); 634 preempt_enable();
@@ -783,6 +783,12 @@ int is_console_locked(void)
783 return console_locked; 783 return console_locked;
784} 784}
785 785
786void wake_up_klogd(void)
787{
788 if (!oops_in_progress && waitqueue_active(&log_wait))
789 wake_up_interruptible(&log_wait);
790}
791
786/** 792/**
787 * release_console_sem - unlock the console system 793 * release_console_sem - unlock the console system
788 * 794 *
@@ -825,8 +831,8 @@ void release_console_sem(void)
825 console_locked = 0; 831 console_locked = 0;
826 up(&console_sem); 832 up(&console_sem);
827 spin_unlock_irqrestore(&logbuf_lock, flags); 833 spin_unlock_irqrestore(&logbuf_lock, flags);
828 if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) 834 if (wake_klogd)
829 wake_up_interruptible(&log_wait); 835 wake_up_klogd();
830} 836}
831EXPORT_SYMBOL(release_console_sem); 837EXPORT_SYMBOL(release_console_sem);
832 838
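wake_up_klogd() above also captures a small idiom worth noting: check waitqueue_active() first so the common no-reader case never touches the waitqueue lock. Generic form, with a hypothetical waitqueue:

/* Sketch of the "check before waking" idiom used by wake_up_klogd(). */
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wait);	/* hypothetical */

static void poke_my_readers(void)
{
	if (waitqueue_active(&my_wait))
		wake_up_interruptible(&my_wait);
}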
diff --git a/kernel/profile.c b/kernel/profile.c
index d6579d511069..9bfadb248dd8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -449,7 +449,6 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
449 /* create /proc/irq/prof_cpu_mask */ 449 /* create /proc/irq/prof_cpu_mask */
450 if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) 450 if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
451 return; 451 return;
452 entry->nlink = 1;
453 entry->data = (void *)&prof_cpu_mask; 452 entry->data = (void *)&prof_cpu_mask;
454 entry->read_proc = prof_cpu_mask_read_proc; 453 entry->read_proc = prof_cpu_mask_read_proc;
455 entry->write_proc = prof_cpu_mask_write_proc; 454 entry->write_proc = prof_cpu_mask_write_proc;
diff --git a/kernel/relay.c b/kernel/relay.c
index 284e2e8b4eed..ef8a935710a2 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -7,6 +7,8 @@
7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) 7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
8 * 8 *
9 * Moved to kernel/relay.c by Paul Mundt, 2006. 9 * Moved to kernel/relay.c by Paul Mundt, 2006.
10 * November 2006 - CPU hotplug support by Mathieu Desnoyers
11 * (mathieu.desnoyers@polymtl.ca)
10 * 12 *
11 * This file is released under the GPL. 13 * This file is released under the GPL.
12 */ 14 */
@@ -18,6 +20,11 @@
18#include <linux/relay.h> 20#include <linux/relay.h>
19#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
20#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/cpu.h>
24
25/* list of open channels, for cpu hotplug */
26static DEFINE_MUTEX(relay_channels_mutex);
27static LIST_HEAD(relay_channels);
21 28
22/* 29/*
23 * close() vm_op implementation for relay file mapping. 30 * close() vm_op implementation for relay file mapping.
@@ -187,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
187 __free_page(buf->page_array[i]); 194 __free_page(buf->page_array[i]);
188 kfree(buf->page_array); 195 kfree(buf->page_array);
189 } 196 }
197 chan->buf[buf->cpu] = NULL;
190 kfree(buf->padding); 198 kfree(buf->padding);
191 kfree(buf); 199 kfree(buf);
192 kref_put(&chan->kref, relay_destroy_channel); 200 kref_put(&chan->kref, relay_destroy_channel);
@@ -320,7 +328,7 @@ static void wakeup_readers(struct work_struct *work)
320 * @buf: the channel buffer 328 * @buf: the channel buffer
321 * @init: 1 if this is a first-time initialization 329 * @init: 1 if this is a first-time initialization
322 * 330 *
323 * See relay_reset for description of effect. 331 * See relay_reset() for description of effect.
324 */ 332 */
325static void __relay_reset(struct rchan_buf *buf, unsigned int init) 333static void __relay_reset(struct rchan_buf *buf, unsigned int init)
326{ 334{
@@ -356,57 +364,75 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
356 * and restarting the channel in its initial state. The buffers 364 * and restarting the channel in its initial state. The buffers
357 * are not freed, so any mappings are still in effect. 365 * are not freed, so any mappings are still in effect.
358 * 366 *
359 * NOTE: Care should be taken that the channel isn't actually 367 * NOTE. Care should be taken that the channel isn't actually
360 * being used by anything when this call is made. 368 * being used by anything when this call is made.
361 */ 369 */
362void relay_reset(struct rchan *chan) 370void relay_reset(struct rchan *chan)
363{ 371{
364 unsigned int i; 372 unsigned int i;
365 struct rchan_buf *prev = NULL;
366 373
367 if (!chan) 374 if (!chan)
368 return; 375 return;
369 376
370 for (i = 0; i < NR_CPUS; i++) { 377 if (chan->is_global && chan->buf[0]) {
371 if (!chan->buf[i] || chan->buf[i] == prev) 378 __relay_reset(chan->buf[0], 0);
372 break; 379 return;
373 __relay_reset(chan->buf[i], 0);
374 prev = chan->buf[i];
375 } 380 }
381
382 mutex_lock(&relay_channels_mutex);
383 for_each_online_cpu(i)
384 if (chan->buf[i])
385 __relay_reset(chan->buf[i], 0);
386 mutex_unlock(&relay_channels_mutex);
376} 387}
377EXPORT_SYMBOL_GPL(relay_reset); 388EXPORT_SYMBOL_GPL(relay_reset);
378 389
379/* 390/*
380 * relay_open_buf - create a new relay channel buffer 391 * relay_open_buf - create a new relay channel buffer
381 * 392 *
382 * Internal - used by relay_open(). 393 * used by relay_open() and CPU hotplug.
383 */ 394 */
384static struct rchan_buf *relay_open_buf(struct rchan *chan, 395static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
385 const char *filename,
386 struct dentry *parent,
387 int *is_global)
388{ 396{
389 struct rchan_buf *buf; 397 struct rchan_buf *buf = NULL;
390 struct dentry *dentry; 398 struct dentry *dentry;
399 char *tmpname;
391 400
392 if (*is_global) 401 if (chan->is_global)
393 return chan->buf[0]; 402 return chan->buf[0];
394 403
404 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
405 if (!tmpname)
406 goto end;
407 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
408
395 buf = relay_create_buf(chan); 409 buf = relay_create_buf(chan);
396 if (!buf) 410 if (!buf)
397 return NULL; 411 goto free_name;
412
413 buf->cpu = cpu;
414 __relay_reset(buf, 1);
398 415
399 /* Create file in fs */ 416 /* Create file in fs */
400 dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR, 417 dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
401 buf, is_global); 418 buf, &chan->is_global);
402 if (!dentry) { 419 if (!dentry)
403 relay_destroy_buf(buf); 420 goto free_buf;
404 return NULL;
405 }
406 421
407 buf->dentry = dentry; 422 buf->dentry = dentry;
408 __relay_reset(buf, 1);
409 423
424 if(chan->is_global) {
425 chan->buf[0] = buf;
426 buf->cpu = 0;
427 }
428
429 goto free_name;
430
431free_buf:
432 relay_destroy_buf(buf);
433free_name:
434 kfree(tmpname);
435end:
410 return buf; 436 return buf;
411} 437}
412 438
@@ -448,31 +474,71 @@ static void setup_callbacks(struct rchan *chan,
448} 474}
449 475
450/** 476/**
477 *
478 * relay_hotcpu_callback - CPU hotplug callback
479 * @nb: notifier block
480 * @action: hotplug action to take
481 * @hcpu: CPU number
482 *
483 * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD)
484 */
485static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
486 unsigned long action,
487 void *hcpu)
488{
489 unsigned int hotcpu = (unsigned long)hcpu;
490 struct rchan *chan;
491
492 switch(action) {
493 case CPU_UP_PREPARE:
494 mutex_lock(&relay_channels_mutex);
495 list_for_each_entry(chan, &relay_channels, list) {
496 if (chan->buf[hotcpu])
497 continue;
498 chan->buf[hotcpu] = relay_open_buf(chan, hotcpu);
499 if(!chan->buf[hotcpu]) {
500 printk(KERN_ERR
501 "relay_hotcpu_callback: cpu %d buffer "
502 "creation failed\n", hotcpu);
503 mutex_unlock(&relay_channels_mutex);
504 return NOTIFY_BAD;
505 }
506 }
507 mutex_unlock(&relay_channels_mutex);
508 break;
509 case CPU_DEAD:
510 /* No need to flush the cpu : will be flushed upon
511 * final relay_flush() call. */
512 break;
513 }
514 return NOTIFY_OK;
515}
516
517/**
451 * relay_open - create a new relay channel 518 * relay_open - create a new relay channel
452 * @base_filename: base name of files to create 519 * @base_filename: base name of files to create
453 * @parent: dentry of parent directory, %NULL for root directory 520 * @parent: dentry of parent directory, %NULL for root directory
454 * @subbuf_size: size of sub-buffers 521 * @subbuf_size: size of sub-buffers
455 * @n_subbufs: number of sub-buffers 522 * @n_subbufs: number of sub-buffers
456 * @cb: client callback functions 523 * @cb: client callback functions
524 * @private_data: user-defined data
457 * 525 *
458 * Returns channel pointer if successful, %NULL otherwise. 526 * Returns channel pointer if successful, %NULL otherwise.
459 * 527 *
460 * Creates a channel buffer for each cpu using the sizes and 528 * Creates a channel buffer for each cpu using the sizes and
461 * attributes specified. The created channel buffer files 529 * attributes specified. The created channel buffer files
462 * will be named base_filename0...base_filenameN-1. File 530 * will be named base_filename0...base_filenameN-1. File
463 * permissions will be S_IRUSR. 531 * permissions will be %S_IRUSR.
464 */ 532 */
465struct rchan *relay_open(const char *base_filename, 533struct rchan *relay_open(const char *base_filename,
466 struct dentry *parent, 534 struct dentry *parent,
467 size_t subbuf_size, 535 size_t subbuf_size,
468 size_t n_subbufs, 536 size_t n_subbufs,
469 struct rchan_callbacks *cb) 537 struct rchan_callbacks *cb,
538 void *private_data)
470{ 539{
471 unsigned int i; 540 unsigned int i;
472 struct rchan *chan; 541 struct rchan *chan;
473 char *tmpname;
474 int is_global = 0;
475
476 if (!base_filename) 542 if (!base_filename)
477 return NULL; 543 return NULL;
478 544
@@ -487,38 +553,32 @@ struct rchan *relay_open(const char *base_filename,
487 chan->n_subbufs = n_subbufs; 553 chan->n_subbufs = n_subbufs;
488 chan->subbuf_size = subbuf_size; 554 chan->subbuf_size = subbuf_size;
489 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); 555 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
556 chan->parent = parent;
557 chan->private_data = private_data;
558 strlcpy(chan->base_filename, base_filename, NAME_MAX);
490 setup_callbacks(chan, cb); 559 setup_callbacks(chan, cb);
491 kref_init(&chan->kref); 560 kref_init(&chan->kref);
492 561
493 tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL); 562 mutex_lock(&relay_channels_mutex);
494 if (!tmpname)
495 goto free_chan;
496
497 for_each_online_cpu(i) { 563 for_each_online_cpu(i) {
498 sprintf(tmpname, "%s%d", base_filename, i); 564 chan->buf[i] = relay_open_buf(chan, i);
499 chan->buf[i] = relay_open_buf(chan, tmpname, parent,
500 &is_global);
501 if (!chan->buf[i]) 565 if (!chan->buf[i])
502 goto free_bufs; 566 goto free_bufs;
503
504 chan->buf[i]->cpu = i;
505 } 567 }
568 list_add(&chan->list, &relay_channels);
569 mutex_unlock(&relay_channels_mutex);
506 570
507 kfree(tmpname);
508 return chan; 571 return chan;
509 572
510free_bufs: 573free_bufs:
511 for (i = 0; i < NR_CPUS; i++) { 574 for_each_online_cpu(i) {
512 if (!chan->buf[i]) 575 if (!chan->buf[i])
513 break; 576 break;
514 relay_close_buf(chan->buf[i]); 577 relay_close_buf(chan->buf[i]);
515 if (is_global)
516 break;
517 } 578 }
518 kfree(tmpname);
519 579
520free_chan:
521 kref_put(&chan->kref, relay_destroy_channel); 580 kref_put(&chan->kref, relay_destroy_channel);
581 mutex_unlock(&relay_channels_mutex);
522 return NULL; 582 return NULL;
523} 583}
524EXPORT_SYMBOL_GPL(relay_open); 584EXPORT_SYMBOL_GPL(relay_open);
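
The relay_open() hunks above change the client-visible contract: the parent dentry and a new private_data cookie are stored in the channel at open time, and per-cpu file naming moves into relay_open_buf(). A minimal sketch of a client of the new interface follows, assuming the usual debugfs-backed buffer files; my_log_dir, my_private, the "cpu" base name and the 64 KB x 8 geometry are illustrative placeholders, while relay_file_operations is the file_operations exported at the bottom of this file.

#include <linux/relay.h>
#include <linux/debugfs.h>

/*
 * Hedged sketch: one buffer file per cpu, backed by debugfs.  The mode and
 * is_global arguments are exactly what relay_open_buf() passes in above.
 */
static struct dentry *my_create_buf_file(const char *filename,
					 struct dentry *parent, int mode,
					 struct rchan_buf *buf, int *is_global)
{
	/* leave *is_global at 0: one file per cpu, not a single shared buffer */
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}

static int my_remove_buf_file(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

static struct rchan_callbacks my_relay_callbacks = {
	.create_buf_file	= my_create_buf_file,
	.remove_buf_file	= my_remove_buf_file,
};

static struct rchan *my_open_channel(struct dentry *my_log_dir, void *my_private)
{
	/* parent dentry and client cookie are now passed once, at open time */
	return relay_open("cpu", my_log_dir, 64 * 1024, 8,
			  &my_relay_callbacks, my_private);
}
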
@@ -588,7 +648,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf);
588 * subbufs_consumed should be the number of sub-buffers newly consumed, 648 * subbufs_consumed should be the number of sub-buffers newly consumed,
589 * not the total consumed. 649 * not the total consumed.
590 * 650 *
591 * NOTE: Kernel clients don't need to call this function if the channel 651 * NOTE. Kernel clients don't need to call this function if the channel
592 * mode is 'overwrite'. 652 * mode is 'overwrite'.
593 */ 653 */
594void relay_subbufs_consumed(struct rchan *chan, 654void relay_subbufs_consumed(struct rchan *chan,
@@ -619,24 +679,26 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
619void relay_close(struct rchan *chan) 679void relay_close(struct rchan *chan)
620{ 680{
621 unsigned int i; 681 unsigned int i;
622 struct rchan_buf *prev = NULL;
623 682
624 if (!chan) 683 if (!chan)
625 return; 684 return;
626 685
627 for (i = 0; i < NR_CPUS; i++) { 686 mutex_lock(&relay_channels_mutex);
628 if (!chan->buf[i] || chan->buf[i] == prev) 687 if (chan->is_global && chan->buf[0])
629 break; 688 relay_close_buf(chan->buf[0]);
630 relay_close_buf(chan->buf[i]); 689 else
631 prev = chan->buf[i]; 690 for_each_possible_cpu(i)
632 } 691 if (chan->buf[i])
692 relay_close_buf(chan->buf[i]);
633 693
634 if (chan->last_toobig) 694 if (chan->last_toobig)
635 printk(KERN_WARNING "relay: one or more items not logged " 695 printk(KERN_WARNING "relay: one or more items not logged "
636 "[item size (%Zd) > sub-buffer size (%Zd)]\n", 696 "[item size (%Zd) > sub-buffer size (%Zd)]\n",
637 chan->last_toobig, chan->subbuf_size); 697 chan->last_toobig, chan->subbuf_size);
638 698
699 list_del(&chan->list);
639 kref_put(&chan->kref, relay_destroy_channel); 700 kref_put(&chan->kref, relay_destroy_channel);
701 mutex_unlock(&relay_channels_mutex);
640} 702}
641EXPORT_SYMBOL_GPL(relay_close); 703EXPORT_SYMBOL_GPL(relay_close);
642 704
@@ -649,17 +711,20 @@ EXPORT_SYMBOL_GPL(relay_close);
649void relay_flush(struct rchan *chan) 711void relay_flush(struct rchan *chan)
650{ 712{
651 unsigned int i; 713 unsigned int i;
652 struct rchan_buf *prev = NULL;
653 714
654 if (!chan) 715 if (!chan)
655 return; 716 return;
656 717
657 for (i = 0; i < NR_CPUS; i++) { 718 if (chan->is_global && chan->buf[0]) {
658 if (!chan->buf[i] || chan->buf[i] == prev) 719 relay_switch_subbuf(chan->buf[0], 0);
659 break; 720 return;
660 relay_switch_subbuf(chan->buf[i], 0);
661 prev = chan->buf[i];
662 } 721 }
722
723 mutex_lock(&relay_channels_mutex);
724 for_each_possible_cpu(i)
725 if (chan->buf[i])
726 relay_switch_subbuf(chan->buf[i], 0);
727 mutex_unlock(&relay_channels_mutex);
663} 728}
664EXPORT_SYMBOL_GPL(relay_flush); 729EXPORT_SYMBOL_GPL(relay_flush);
665 730
@@ -684,7 +749,7 @@ static int relay_file_open(struct inode *inode, struct file *filp)
684 * @filp: the file 749 * @filp: the file
685 * @vma: the vma describing what to map 750 * @vma: the vma describing what to map
686 * 751 *
687 * Calls upon relay_mmap_buf to map the file into user space. 752 * Calls upon relay_mmap_buf() to map the file into user space.
688 */ 753 */
689static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) 754static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
690{ 755{
@@ -826,7 +891,7 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
826 * @read_pos: file read position 891 * @read_pos: file read position
827 * @buf: relay channel buffer 892 * @buf: relay channel buffer
828 * 893 *
829 * If the read_pos is in the middle of padding, return the 894 * If the @read_pos is in the middle of padding, return the
830 * position of the first actually available byte, otherwise 895 * position of the first actually available byte, otherwise
831 * return the original value. 896 * return the original value.
832 */ 897 */
@@ -1022,3 +1087,12 @@ const struct file_operations relay_file_operations = {
1022 .sendfile = relay_file_sendfile, 1087 .sendfile = relay_file_sendfile,
1023}; 1088};
1024EXPORT_SYMBOL_GPL(relay_file_operations); 1089EXPORT_SYMBOL_GPL(relay_file_operations);
1090
1091static __init int relay_init(void)
1092{
1093
1094 hotcpu_notifier(relay_hotcpu_callback, 0);
1095 return 0;
1096}
1097
1098module_init(relay_init);
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b9a497419d9..bdb55a33f969 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h>
12#include <linux/errno.h> 11#include <linux/errno.h>
13#include <linux/ioport.h> 12#include <linux/ioport.h>
14#include <linux/init.h> 13#include <linux/init.h>
@@ -17,6 +16,7 @@
17#include <linux/fs.h> 16#include <linux/fs.h>
18#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
19#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <linux/device.h>
20#include <asm/io.h> 20#include <asm/io.h>
21 21
22 22
@@ -618,6 +618,67 @@ void __release_region(struct resource *parent, resource_size_t start,
618EXPORT_SYMBOL(__release_region); 618EXPORT_SYMBOL(__release_region);
619 619
620/* 620/*
621 * Managed region resource
622 */
623struct region_devres {
624 struct resource *parent;
625 resource_size_t start;
626 resource_size_t n;
627};
628
629static void devm_region_release(struct device *dev, void *res)
630{
631 struct region_devres *this = res;
632
633 __release_region(this->parent, this->start, this->n);
634}
635
636static int devm_region_match(struct device *dev, void *res, void *match_data)
637{
638 struct region_devres *this = res, *match = match_data;
639
640 return this->parent == match->parent &&
641 this->start == match->start && this->n == match->n;
642}
643
644struct resource * __devm_request_region(struct device *dev,
645 struct resource *parent, resource_size_t start,
646 resource_size_t n, const char *name)
647{
648 struct region_devres *dr = NULL;
649 struct resource *res;
650
651 dr = devres_alloc(devm_region_release, sizeof(struct region_devres),
652 GFP_KERNEL);
653 if (!dr)
654 return NULL;
655
656 dr->parent = parent;
657 dr->start = start;
658 dr->n = n;
659
660 res = __request_region(parent, start, n, name);
661 if (res)
662 devres_add(dev, dr);
663 else
664 devres_free(dr);
665
666 return res;
667}
668EXPORT_SYMBOL(__devm_request_region);
669
670void __devm_release_region(struct device *dev, struct resource *parent,
671 resource_size_t start, resource_size_t n)
672{
673 struct region_devres match_data = { parent, start, n };
674
675 __release_region(parent, start, n);
676 WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match,
677 &match_data));
678}
679EXPORT_SYMBOL(__devm_release_region);
680
681/*
621 * Called from init/main.c to reserve IO ports. 682 * Called from init/main.c to reserve IO ports.
622 */ 683 */
623#define MAXRESERVE 4 684#define MAXRESERVE 4
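
The managed-region helpers added here tie __request_region()/__release_region() to the devres lifetime of a device, so a region requested during probe is released automatically on driver detach or on a later probe failure. A hedged sketch of a consumer, assuming the devm_request_region() convenience wrapper in <linux/ioport.h> that supplies &ioport_resource; MY_IO_BASE, MY_IO_LEN and the "mydev" name are placeholders.

#include <linux/device.h>
#include <linux/ioport.h>

#define MY_IO_BASE	0x300	/* illustrative values only */
#define MY_IO_LEN	8

static int my_probe(struct device *dev)
{
	/* the region is tracked by devres; devm_region_release() runs
	 * __release_region() for us when the device goes away */
	if (!devm_request_region(dev, MY_IO_BASE, MY_IO_LEN, "mydev"))
		return -EBUSY;

	return 0;
}
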
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 4ab17da46fd8..180978cb2f75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
625 /* Setup the timer, when timeout != NULL */ 625 /* Setup the timer, when timeout != NULL */
626 if (unlikely(timeout)) 626 if (unlikely(timeout))
627 hrtimer_start(&timeout->timer, timeout->timer.expires, 627 hrtimer_start(&timeout->timer, timeout->timer.expires,
628 HRTIMER_ABS); 628 HRTIMER_MODE_ABS);
629 629
630 for (;;) { 630 for (;;) {
631 /* Try to acquire the lock: */ 631 /* Try to acquire the lock: */
diff --git a/kernel/sched.c b/kernel/sched.c
index cca93cc0dd7d..0dc757246d89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -57,6 +57,16 @@
57#include <asm/unistd.h> 57#include <asm/unistd.h>
58 58
59/* 59/*
60 * Scheduler clock - returns current time in nanosec units.
61 * This is default implementation.
62 * Architectures and sub-architectures can override this.
63 */
64unsigned long long __attribute__((weak)) sched_clock(void)
65{
66 return (unsigned long long)jiffies * (1000000000 / HZ);
67}
68
69/*
60 * Convert user-nice values [ -20 ... 0 ... 19 ] 70 * Convert user-nice values [ -20 ... 0 ... 19 ]
61 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 71 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
62 * and back. 72 * and back.
@@ -1843,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1843 struct mm_struct *mm = next->mm; 1853 struct mm_struct *mm = next->mm;
1844 struct mm_struct *oldmm = prev->active_mm; 1854 struct mm_struct *oldmm = prev->active_mm;
1845 1855
1856 /*
1857 * For paravirt, this is coupled with an exit in switch_to to
1858 * combine the page table reload and the switch backend into
1859 * one hypercall.
1860 */
1861 arch_enter_lazy_cpu_mode();
1862
1846 if (!mm) { 1863 if (!mm) {
1847 next->active_mm = oldmm; 1864 next->active_mm = oldmm;
1848 atomic_inc(&oldmm->mm_count); 1865 atomic_inc(&oldmm->mm_count);
@@ -2887,14 +2904,16 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2887static void update_load(struct rq *this_rq) 2904static void update_load(struct rq *this_rq)
2888{ 2905{
2889 unsigned long this_load; 2906 unsigned long this_load;
2890 int i, scale; 2907 unsigned int i, scale;
2891 2908
2892 this_load = this_rq->raw_weighted_load; 2909 this_load = this_rq->raw_weighted_load;
2893 2910
2894 /* Update our load: */ 2911 /* Update our load: */
2895 for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { 2912 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
2896 unsigned long old_load, new_load; 2913 unsigned long old_load, new_load;
2897 2914
2915 /* scale is effectively 1 << i now, and >> i divides by scale */
2916
2898 old_load = this_rq->cpu_load[i]; 2917 old_load = this_rq->cpu_load[i];
2899 new_load = this_load; 2918 new_load = this_load;
2900 /* 2919 /*
@@ -2904,7 +2923,7 @@ static void update_load(struct rq *this_rq)
2904 */ 2923 */
2905 if (new_load > old_load) 2924 if (new_load > old_load)
2906 new_load += scale-1; 2925 new_load += scale-1;
2907 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; 2926 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2908 } 2927 }
2909} 2928}
2910 2929
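
To see what replacing the division with a shift does: for i = 2, scale is 4, so the update is cpu_load[2] = (old_load*3 + new_load) >> 2. With old_load = 800 and new_load = 400 that gives (2400 + 400) >> 2 = 700, the same 3/4 to 1/4 exponential average the old divide produced. When the load is rising, the existing new_load += scale-1 line rounds the sum up first, so the shifted result never under-reports an increase. The switch from scale <<= 1 to scale += scale and from int to unsigned int does not change the values; scale still doubles each iteration and stays the power of two that makes >> i equivalent to / scale.
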
@@ -4193,13 +4212,12 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4193} 4212}
4194 4213
4195/** 4214/**
4196 * sched_setscheduler - change the scheduling policy and/or RT priority of 4215 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4197 * a thread.
4198 * @p: the task in question. 4216 * @p: the task in question.
4199 * @policy: new policy. 4217 * @policy: new policy.
4200 * @param: structure containing the new RT priority. 4218 * @param: structure containing the new RT priority.
4201 * 4219 *
4202 * NOTE: the task may be already dead 4220 * NOTE that the task may be already dead.
4203 */ 4221 */
4204int sched_setscheduler(struct task_struct *p, int policy, 4222int sched_setscheduler(struct task_struct *p, int policy,
4205 struct sched_param *param) 4223 struct sched_param *param)
@@ -4567,7 +4585,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4567/** 4585/**
4568 * sys_sched_yield - yield the current processor to other threads. 4586 * sys_sched_yield - yield the current processor to other threads.
4569 * 4587 *
4570 * this function yields the current CPU by moving the calling thread 4588 * This function yields the current CPU by moving the calling thread
4571 * to the expired array. If there are no other threads running on this 4589 * to the expired array. If there are no other threads running on this
4572 * CPU then this function will return. 4590 * CPU then this function will return.
4573 */ 4591 */
@@ -4694,7 +4712,7 @@ EXPORT_SYMBOL(cond_resched_softirq);
4694/** 4712/**
4695 * yield - yield the current processor to other threads. 4713 * yield - yield the current processor to other threads.
4696 * 4714 *
4697 * this is a shortcut for kernel-space yielding - it marks the 4715 * This is a shortcut for kernel-space yielding - it marks the
4698 * thread runnable and calls sys_sched_yield(). 4716 * thread runnable and calls sys_sched_yield().
4699 */ 4717 */
4700void __sched yield(void) 4718void __sched yield(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index 8a04869402fb..3670225ecbc0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
456int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 456int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
457{ 457{
458 int signr = __dequeue_signal(&tsk->pending, mask, info); 458 int signr = __dequeue_signal(&tsk->pending, mask, info);
459 if (!signr) 459 if (!signr) {
460 signr = __dequeue_signal(&tsk->signal->shared_pending, 460 signr = __dequeue_signal(&tsk->signal->shared_pending,
461 mask, info); 461 mask, info);
462 /*
463 * itimer signal ?
464 *
465 * itimers are process shared and we restart periodic
466 * itimers in the signal delivery path to prevent DoS
467 * attacks in the high resolution timer case. This is
468 * compliant with the old way of self restarting
469 * itimers, as the SIGALRM is a legacy signal and only
470 * queued once. Changing the restart behaviour to
471 * restart the timer in the signal dequeue path is
472 * reducing the timer noise on heavy loaded !highres
473 * systems too.
474 */
475 if (unlikely(signr == SIGALRM)) {
476 struct hrtimer *tmr = &tsk->signal->real_timer;
477
478 if (!hrtimer_is_queued(tmr) &&
479 tsk->signal->it_real_incr.tv64 != 0) {
480 hrtimer_forward(tmr, tmr->base->get_time(),
481 tsk->signal->it_real_incr);
482 hrtimer_restart(tmr);
483 }
484 }
485 }
462 recalc_sigpending_tsk(tsk); 486 recalc_sigpending_tsk(tsk);
463 if (signr && unlikely(sig_kernel_stop(signr))) { 487 if (signr && unlikely(sig_kernel_stop(signr))) {
464 /* 488 /*
465 * Set a marker that we have dequeued a stop signal. Our 489 * Set a marker that we have dequeued a stop signal. Our
466 * caller might release the siglock and then the pending 490 * caller might release the siglock and then the pending
467 * stop signal it is about to process is no longer in the 491 * stop signal it is about to process is no longer in the
468 * pending bitmasks, but must still be cleared by a SIGCONT 492 * pending bitmasks, but must still be cleared by a SIGCONT
469 * (and overruled by a SIGKILL). So those cases clear this 493 * (and overruled by a SIGKILL). So those cases clear this
470 * shared flag after we've set it. Note that this flag may 494 * shared flag after we've set it. Note that this flag may
471 * remain set after the signal we return is ignored or 495 * remain set after the signal we return is ignored or
472 * handled. That doesn't matter because its only purpose 496 * handled. That doesn't matter because its only purpose
473 * is to alert stop-signal processing code when another 497 * is to alert stop-signal processing code when another
474 * processor has come along and cleared the flag. 498 * processor has come along and cleared the flag.
475 */ 499 */
476 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 500 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
477 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 501 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
478 } 502 }
479 if ( signr && 503 if ( signr &&
480 ((info->si_code & __SI_MASK) == __SI_TIMER) && 504 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
481 info->si_sys_private){ 505 info->si_sys_private){
@@ -1096,42 +1120,21 @@ int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1096 return retval; 1120 return retval;
1097} 1121}
1098 1122
1099int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
1100{
1101 if (pgrp <= 0)
1102 return -EINVAL;
1103
1104 return __kill_pgrp_info(sig, info, find_pid(pgrp));
1105}
1106
1107int
1108kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
1109{
1110 int retval;
1111
1112 read_lock(&tasklist_lock);
1113 retval = __kill_pg_info(sig, info, pgrp);
1114 read_unlock(&tasklist_lock);
1115
1116 return retval;
1117}
1118
1119int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) 1123int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1120{ 1124{
1121 int error; 1125 int error;
1122 int acquired_tasklist_lock = 0;
1123 struct task_struct *p; 1126 struct task_struct *p;
1124 1127
1125 rcu_read_lock(); 1128 rcu_read_lock();
1126 if (unlikely(sig_needs_tasklist(sig))) { 1129 if (unlikely(sig_needs_tasklist(sig)))
1127 read_lock(&tasklist_lock); 1130 read_lock(&tasklist_lock);
1128 acquired_tasklist_lock = 1; 1131
1129 }
1130 p = pid_task(pid, PIDTYPE_PID); 1132 p = pid_task(pid, PIDTYPE_PID);
1131 error = -ESRCH; 1133 error = -ESRCH;
1132 if (p) 1134 if (p)
1133 error = group_send_sig_info(sig, info, p); 1135 error = group_send_sig_info(sig, info, p);
1134 if (unlikely(acquired_tasklist_lock)) 1136
1137 if (unlikely(sig_needs_tasklist(sig)))
1135 read_unlock(&tasklist_lock); 1138 read_unlock(&tasklist_lock);
1136 rcu_read_unlock(); 1139 rcu_read_unlock();
1137 return error; 1140 return error;
@@ -1193,8 +1196,10 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
1193 1196
1194static int kill_something_info(int sig, struct siginfo *info, int pid) 1197static int kill_something_info(int sig, struct siginfo *info, int pid)
1195{ 1198{
1199 int ret;
1200 rcu_read_lock();
1196 if (!pid) { 1201 if (!pid) {
1197 return kill_pg_info(sig, info, process_group(current)); 1202 ret = kill_pgrp_info(sig, info, task_pgrp(current));
1198 } else if (pid == -1) { 1203 } else if (pid == -1) {
1199 int retval = 0, count = 0; 1204 int retval = 0, count = 0;
1200 struct task_struct * p; 1205 struct task_struct * p;
@@ -1209,12 +1214,14 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1209 } 1214 }
1210 } 1215 }
1211 read_unlock(&tasklist_lock); 1216 read_unlock(&tasklist_lock);
1212 return count ? retval : -ESRCH; 1217 ret = count ? retval : -ESRCH;
1213 } else if (pid < 0) { 1218 } else if (pid < 0) {
1214 return kill_pg_info(sig, info, -pid); 1219 ret = kill_pgrp_info(sig, info, find_pid(-pid));
1215 } else { 1220 } else {
1216 return kill_proc_info(sig, info, pid); 1221 ret = kill_pid_info(sig, info, find_pid(pid));
1217 } 1222 }
1223 rcu_read_unlock();
1224 return ret;
1218} 1225}
1219 1226
1220/* 1227/*
@@ -1313,12 +1320,6 @@ int kill_pid(struct pid *pid, int sig, int priv)
1313EXPORT_SYMBOL(kill_pid); 1320EXPORT_SYMBOL(kill_pid);
1314 1321
1315int 1322int
1316kill_pg(pid_t pgrp, int sig, int priv)
1317{
1318 return kill_pg_info(sig, __si_special(priv), pgrp);
1319}
1320
1321int
1322kill_proc(pid_t pid, int sig, int priv) 1323kill_proc(pid_t pid, int sig, int priv)
1323{ 1324{
1324 return kill_proc_info(sig, __si_special(priv), pid); 1325 return kill_proc_info(sig, __si_special(priv), pid);
@@ -1907,7 +1908,7 @@ relock:
1907 1908
1908 /* signals can be posted during this window */ 1909 /* signals can be posted during this window */
1909 1910
1910 if (is_orphaned_pgrp(process_group(current))) 1911 if (is_current_pgrp_orphaned())
1911 goto relock; 1912 goto relock;
1912 1913
1913 spin_lock_irq(&current->sighand->siglock); 1914 spin_lock_irq(&current->sighand->siglock);
@@ -1957,7 +1958,6 @@ EXPORT_SYMBOL(recalc_sigpending);
1957EXPORT_SYMBOL_GPL(dequeue_signal); 1958EXPORT_SYMBOL_GPL(dequeue_signal);
1958EXPORT_SYMBOL(flush_signals); 1959EXPORT_SYMBOL(flush_signals);
1959EXPORT_SYMBOL(force_sig); 1960EXPORT_SYMBOL(force_sig);
1960EXPORT_SYMBOL(kill_pg);
1961EXPORT_SYMBOL(kill_proc); 1961EXPORT_SYMBOL(kill_proc);
1962EXPORT_SYMBOL(ptrace_notify); 1962EXPORT_SYMBOL(ptrace_notify);
1963EXPORT_SYMBOL(send_sig); 1963EXPORT_SYMBOL(send_sig);
@@ -2284,7 +2284,7 @@ static int do_tkill(int tgid, int pid, int sig)
2284 * @pid: the PID of the thread 2284 * @pid: the PID of the thread
2285 * @sig: signal to be sent 2285 * @sig: signal to be sent
2286 * 2286 *
2287 * This syscall also checks the tgid and returns -ESRCH even if the PID 2287 * This syscall also checks the @tgid and returns -ESRCH even if the PID
2288 * exists but it's not belonging to the target process anymore. This 2288 * exists but it's not belonging to the target process anymore. This
2289 * method solves the problem of threads exiting and PIDs getting reused. 2289 * method solves the problem of threads exiting and PIDs getting reused.
2290 */ 2290 */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 918e52df090e..8b75008e2bd8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -17,6 +17,7 @@
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/tick.h>
20 21
21#include <asm/irq.h> 22#include <asm/irq.h>
22/* 23/*
@@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq);
273 274
274#endif 275#endif
275 276
277/*
278 * Enter an interrupt context.
279 */
280void irq_enter(void)
281{
282 __irq_enter();
283#ifdef CONFIG_NO_HZ
284 if (idle_cpu(smp_processor_id()))
285 tick_nohz_update_jiffies();
286#endif
287}
288
276#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 289#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
277# define invoke_softirq() __do_softirq() 290# define invoke_softirq() __do_softirq()
278#else 291#else
@@ -289,6 +302,12 @@ void irq_exit(void)
289 sub_preempt_count(IRQ_EXIT_OFFSET); 302 sub_preempt_count(IRQ_EXIT_OFFSET);
290 if (!in_interrupt() && local_softirq_pending()) 303 if (!in_interrupt() && local_softirq_pending())
291 invoke_softirq(); 304 invoke_softirq();
305
306#ifdef CONFIG_NO_HZ
307 /* Make sure that timer wheel updates are propagated */
308 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
309 tick_nohz_stop_sched_tick();
310#endif
292 preempt_enable_no_resched(); 311 preempt_enable_no_resched();
293} 312}
294 313
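
Taken together, the two hooks keep jiffies sane on a tickless CPU: when a device interrupt arrives on a CPU that has been idle with its tick stopped, irq_enter() sees idle_cpu() and calls tick_nohz_update_jiffies() so that timers and softirqs run from this interrupt against an up-to-date jiffies; on the way out, if the CPU is still idle and nothing needs to reschedule, irq_exit() calls tick_nohz_stop_sched_tick() to program the next required wakeup instead of resuming the periodic tick.
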
diff --git a/kernel/sys.c b/kernel/sys.c
index 6e2101dec0fc..123b165080e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -215,7 +215,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
215 * This routine uses RCU to synchronize with changes to the chain. 215 * This routine uses RCU to synchronize with changes to the chain.
216 * 216 *
217 * If the return value of the notifier can be and'ed 217 * If the return value of the notifier can be and'ed
218 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain 218 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
219 * will return immediately, with the return value of 219 * will return immediately, with the return value of
220 * the notifier function which halted execution. 220 * the notifier function which halted execution.
221 * Otherwise the return value is the return value 221 * Otherwise the return value is the return value
@@ -313,7 +313,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
313 * run in a process context, so they are allowed to block. 313 * run in a process context, so they are allowed to block.
314 * 314 *
315 * If the return value of the notifier can be and'ed 315 * If the return value of the notifier can be and'ed
316 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain 316 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
317 * will return immediately, with the return value of 317 * will return immediately, with the return value of
318 * the notifier function which halted execution. 318 * the notifier function which halted execution.
319 * Otherwise the return value is the return value 319 * Otherwise the return value is the return value
@@ -393,7 +393,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
393 * All locking must be provided by the caller. 393 * All locking must be provided by the caller.
394 * 394 *
395 * If the return value of the notifier can be and'ed 395 * If the return value of the notifier can be and'ed
396 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain 396 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
397 * will return immediately, with the return value of 397 * will return immediately, with the return value of
398 * the notifier function which halted execution. 398 * the notifier function which halted execution.
399 * Otherwise the return value is the return value 399 * Otherwise the return value is the return value
@@ -487,7 +487,7 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
487 * run in a process context, so they are allowed to block. 487 * run in a process context, so they are allowed to block.
488 * 488 *
489 * If the return value of the notifier can be and'ed 489 * If the return value of the notifier can be and'ed
490 * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain 490 * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
491 * will return immediately, with the return value of 491 * will return immediately, with the return value of
492 * the notifier function which halted execution. 492 * the notifier function which halted execution.
493 * Otherwise the return value is the return value 493 * Otherwise the return value is the return value
@@ -538,7 +538,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
538 * Registers a function with the list of functions 538 * Registers a function with the list of functions
539 * to be called at reboot time. 539 * to be called at reboot time.
540 * 540 *
541 * Currently always returns zero, as blocking_notifier_chain_register 541 * Currently always returns zero, as blocking_notifier_chain_register()
542 * always returns zero. 542 * always returns zero.
543 */ 543 */
544 544
@@ -596,6 +596,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
596 struct task_struct *g, *p; 596 struct task_struct *g, *p;
597 struct user_struct *user; 597 struct user_struct *user;
598 int error = -EINVAL; 598 int error = -EINVAL;
599 struct pid *pgrp;
599 600
600 if (which > 2 || which < 0) 601 if (which > 2 || which < 0)
601 goto out; 602 goto out;
@@ -610,18 +611,21 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
610 read_lock(&tasklist_lock); 611 read_lock(&tasklist_lock);
611 switch (which) { 612 switch (which) {
612 case PRIO_PROCESS: 613 case PRIO_PROCESS:
613 if (!who) 614 if (who)
614 who = current->pid; 615 p = find_task_by_pid(who);
615 p = find_task_by_pid(who); 616 else
617 p = current;
616 if (p) 618 if (p)
617 error = set_one_prio(p, niceval, error); 619 error = set_one_prio(p, niceval, error);
618 break; 620 break;
619 case PRIO_PGRP: 621 case PRIO_PGRP:
620 if (!who) 622 if (who)
621 who = process_group(current); 623 pgrp = find_pid(who);
622 do_each_task_pid(who, PIDTYPE_PGID, p) { 624 else
625 pgrp = task_pgrp(current);
626 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
623 error = set_one_prio(p, niceval, error); 627 error = set_one_prio(p, niceval, error);
624 } while_each_task_pid(who, PIDTYPE_PGID, p); 628 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
625 break; 629 break;
626 case PRIO_USER: 630 case PRIO_USER:
627 user = current->user; 631 user = current->user;
@@ -656,6 +660,7 @@ asmlinkage long sys_getpriority(int which, int who)
656 struct task_struct *g, *p; 660 struct task_struct *g, *p;
657 struct user_struct *user; 661 struct user_struct *user;
658 long niceval, retval = -ESRCH; 662 long niceval, retval = -ESRCH;
663 struct pid *pgrp;
659 664
660 if (which > 2 || which < 0) 665 if (which > 2 || which < 0)
661 return -EINVAL; 666 return -EINVAL;
@@ -663,9 +668,10 @@ asmlinkage long sys_getpriority(int which, int who)
663 read_lock(&tasklist_lock); 668 read_lock(&tasklist_lock);
664 switch (which) { 669 switch (which) {
665 case PRIO_PROCESS: 670 case PRIO_PROCESS:
666 if (!who) 671 if (who)
667 who = current->pid; 672 p = find_task_by_pid(who);
668 p = find_task_by_pid(who); 673 else
674 p = current;
669 if (p) { 675 if (p) {
670 niceval = 20 - task_nice(p); 676 niceval = 20 - task_nice(p);
671 if (niceval > retval) 677 if (niceval > retval)
@@ -673,13 +679,15 @@ asmlinkage long sys_getpriority(int which, int who)
673 } 679 }
674 break; 680 break;
675 case PRIO_PGRP: 681 case PRIO_PGRP:
676 if (!who) 682 if (who)
677 who = process_group(current); 683 pgrp = find_pid(who);
678 do_each_task_pid(who, PIDTYPE_PGID, p) { 684 else
685 pgrp = task_pgrp(current);
686 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
679 niceval = 20 - task_nice(p); 687 niceval = 20 - task_nice(p);
680 if (niceval > retval) 688 if (niceval > retval)
681 retval = niceval; 689 retval = niceval;
682 } while_each_task_pid(who, PIDTYPE_PGID, p); 690 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
683 break; 691 break;
684 case PRIO_USER: 692 case PRIO_USER:
685 user = current->user; 693 user = current->user;
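
Both priority syscalls now resolve the process group to a struct pid once and walk its tasks with do_each_pid_task()/while_each_pid_task(), instead of re-hashing a pid_t inside do_each_task_pid(). The idiom, as a hedged sketch with a hypothetical visit() standing in for the per-task work done above:

static void for_each_task_in_pgrp(pid_t who)
{
	struct pid *pgrp;
	struct task_struct *p;

	read_lock(&tasklist_lock);
	pgrp = who ? find_pid(who) : task_pgrp(current);
	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		visit(p);	/* e.g. set_one_prio(p, niceval, error) above */
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
	read_unlock(&tasklist_lock);
}
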
@@ -1388,7 +1396,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1388 1396
1389 if (p->real_parent == group_leader) { 1397 if (p->real_parent == group_leader) {
1390 err = -EPERM; 1398 err = -EPERM;
1391 if (process_session(p) != process_session(group_leader)) 1399 if (task_session(p) != task_session(group_leader))
1392 goto out; 1400 goto out;
1393 err = -EACCES; 1401 err = -EACCES;
1394 if (p->did_exec) 1402 if (p->did_exec)
@@ -1407,7 +1415,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1407 struct task_struct *g = 1415 struct task_struct *g =
1408 find_task_by_pid_type(PIDTYPE_PGID, pgid); 1416 find_task_by_pid_type(PIDTYPE_PGID, pgid);
1409 1417
1410 if (!g || process_session(g) != process_session(group_leader)) 1418 if (!g || task_session(g) != task_session(group_leader))
1411 goto out; 1419 goto out;
1412 } 1420 }
1413 1421
@@ -1510,7 +1518,6 @@ asmlinkage long sys_setsid(void)
1510 1518
1511 spin_lock(&group_leader->sighand->siglock); 1519 spin_lock(&group_leader->sighand->siglock);
1512 group_leader->signal->tty = NULL; 1520 group_leader->signal->tty = NULL;
1513 group_leader->signal->tty_old_pgrp = 0;
1514 spin_unlock(&group_leader->sighand->siglock); 1521 spin_unlock(&group_leader->sighand->siglock);
1515 1522
1516 err = process_group(group_leader); 1523 err = process_group(group_leader);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 600b33358ded..3ca1d5ff0319 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -90,12 +90,6 @@ extern char modprobe_path[];
90#ifdef CONFIG_CHR_DEV_SG 90#ifdef CONFIG_CHR_DEV_SG
91extern int sg_big_buff; 91extern int sg_big_buff;
92#endif 92#endif
93#ifdef CONFIG_SYSVIPC
94static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
95 void __user *buffer, size_t *lenp, loff_t *ppos);
96static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
97 void __user *buffer, size_t *lenp, loff_t *ppos);
98#endif
99 93
100#ifdef __sparc__ 94#ifdef __sparc__
101extern char reboot_command []; 95extern char reboot_command [];
@@ -135,22 +129,12 @@ static int parse_table(int __user *, int, void __user *, size_t __user *,
135 void __user *, size_t, ctl_table *); 129 void __user *, size_t, ctl_table *);
136#endif 130#endif
137 131
138static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
139 void __user *buffer, size_t *lenp, loff_t *ppos);
140
141static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
142 void __user *oldval, size_t __user *oldlenp,
143 void __user *newval, size_t newlen);
144
145#ifdef CONFIG_SYSVIPC
146static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
147 void __user *oldval, size_t __user *oldlenp,
148 void __user *newval, size_t newlen);
149#endif
150 132
151#ifdef CONFIG_PROC_SYSCTL 133#ifdef CONFIG_PROC_SYSCTL
152static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 134static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
153 void __user *buffer, size_t *lenp, loff_t *ppos); 135 void __user *buffer, size_t *lenp, loff_t *ppos);
136static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp,
137 void __user *buffer, size_t *lenp, loff_t *ppos);
154#endif 138#endif
155 139
156static ctl_table root_table[]; 140static ctl_table root_table[];
@@ -174,59 +158,6 @@ extern ctl_table inotify_table[];
174int sysctl_legacy_va_layout; 158int sysctl_legacy_va_layout;
175#endif 159#endif
176 160
177static void *get_uts(ctl_table *table, int write)
178{
179 char *which = table->data;
180#ifdef CONFIG_UTS_NS
181 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
182 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
183#endif
184 if (!write)
185 down_read(&uts_sem);
186 else
187 down_write(&uts_sem);
188 return which;
189}
190
191static void put_uts(ctl_table *table, int write, void *which)
192{
193 if (!write)
194 up_read(&uts_sem);
195 else
196 up_write(&uts_sem);
197}
198
199#ifdef CONFIG_SYSVIPC
200static void *get_ipc(ctl_table *table, int write)
201{
202 char *which = table->data;
203 struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
204 which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
205 return which;
206}
207#else
208#define get_ipc(T,W) ((T)->data)
209#endif
210
211/* /proc declarations: */
212
213#ifdef CONFIG_PROC_SYSCTL
214
215static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
216static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
217static int proc_opensys(struct inode *, struct file *);
218
219const struct file_operations proc_sys_file_operations = {
220 .open = proc_opensys,
221 .read = proc_readsys,
222 .write = proc_writesys,
223};
224
225extern struct proc_dir_entry *proc_sys_root;
226
227static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *);
228static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
229#endif
230 161
231/* The default sysctl tables: */ 162/* The default sysctl tables: */
232 163
@@ -275,51 +206,6 @@ static ctl_table root_table[] = {
275 206
276static ctl_table kern_table[] = { 207static ctl_table kern_table[] = {
277 { 208 {
278 .ctl_name = KERN_OSTYPE,
279 .procname = "ostype",
280 .data = init_uts_ns.name.sysname,
281 .maxlen = sizeof(init_uts_ns.name.sysname),
282 .mode = 0444,
283 .proc_handler = &proc_do_uts_string,
284 .strategy = &sysctl_uts_string,
285 },
286 {
287 .ctl_name = KERN_OSRELEASE,
288 .procname = "osrelease",
289 .data = init_uts_ns.name.release,
290 .maxlen = sizeof(init_uts_ns.name.release),
291 .mode = 0444,
292 .proc_handler = &proc_do_uts_string,
293 .strategy = &sysctl_uts_string,
294 },
295 {
296 .ctl_name = KERN_VERSION,
297 .procname = "version",
298 .data = init_uts_ns.name.version,
299 .maxlen = sizeof(init_uts_ns.name.version),
300 .mode = 0444,
301 .proc_handler = &proc_do_uts_string,
302 .strategy = &sysctl_uts_string,
303 },
304 {
305 .ctl_name = KERN_NODENAME,
306 .procname = "hostname",
307 .data = init_uts_ns.name.nodename,
308 .maxlen = sizeof(init_uts_ns.name.nodename),
309 .mode = 0644,
310 .proc_handler = &proc_do_uts_string,
311 .strategy = &sysctl_uts_string,
312 },
313 {
314 .ctl_name = KERN_DOMAINNAME,
315 .procname = "domainname",
316 .data = init_uts_ns.name.domainname,
317 .maxlen = sizeof(init_uts_ns.name.domainname),
318 .mode = 0644,
319 .proc_handler = &proc_do_uts_string,
320 .strategy = &sysctl_uts_string,
321 },
322 {
323 .ctl_name = KERN_PANIC, 209 .ctl_name = KERN_PANIC,
324 .procname = "panic", 210 .procname = "panic",
325 .data = &panic_timeout, 211 .data = &panic_timeout,
@@ -344,14 +230,16 @@ static ctl_table kern_table[] = {
344 .proc_handler = &proc_dostring, 230 .proc_handler = &proc_dostring,
345 .strategy = &sysctl_string, 231 .strategy = &sysctl_string,
346 }, 232 },
233#ifdef CONFIG_PROC_SYSCTL
347 { 234 {
348 .ctl_name = KERN_TAINTED, 235 .ctl_name = KERN_TAINTED,
349 .procname = "tainted", 236 .procname = "tainted",
350 .data = &tainted, 237 .data = &tainted,
351 .maxlen = sizeof(int), 238 .maxlen = sizeof(int),
352 .mode = 0444, 239 .mode = 0644,
353 .proc_handler = &proc_dointvec, 240 .proc_handler = &proc_dointvec_taint,
354 }, 241 },
242#endif
355 { 243 {
356 .ctl_name = KERN_CAP_BSET, 244 .ctl_name = KERN_CAP_BSET,
357 .procname = "cap-bound", 245 .procname = "cap-bound",
@@ -473,71 +361,6 @@ static ctl_table kern_table[] = {
473 .proc_handler = &proc_dointvec, 361 .proc_handler = &proc_dointvec,
474 }, 362 },
475#endif 363#endif
476#ifdef CONFIG_SYSVIPC
477 {
478 .ctl_name = KERN_SHMMAX,
479 .procname = "shmmax",
480 .data = &init_ipc_ns.shm_ctlmax,
481 .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
482 .mode = 0644,
483 .proc_handler = &proc_ipc_doulongvec_minmax,
484 .strategy = sysctl_ipc_data,
485 },
486 {
487 .ctl_name = KERN_SHMALL,
488 .procname = "shmall",
489 .data = &init_ipc_ns.shm_ctlall,
490 .maxlen = sizeof (init_ipc_ns.shm_ctlall),
491 .mode = 0644,
492 .proc_handler = &proc_ipc_doulongvec_minmax,
493 .strategy = sysctl_ipc_data,
494 },
495 {
496 .ctl_name = KERN_SHMMNI,
497 .procname = "shmmni",
498 .data = &init_ipc_ns.shm_ctlmni,
499 .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
500 .mode = 0644,
501 .proc_handler = &proc_ipc_dointvec,
502 .strategy = sysctl_ipc_data,
503 },
504 {
505 .ctl_name = KERN_MSGMAX,
506 .procname = "msgmax",
507 .data = &init_ipc_ns.msg_ctlmax,
508 .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
509 .mode = 0644,
510 .proc_handler = &proc_ipc_dointvec,
511 .strategy = sysctl_ipc_data,
512 },
513 {
514 .ctl_name = KERN_MSGMNI,
515 .procname = "msgmni",
516 .data = &init_ipc_ns.msg_ctlmni,
517 .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
518 .mode = 0644,
519 .proc_handler = &proc_ipc_dointvec,
520 .strategy = sysctl_ipc_data,
521 },
522 {
523 .ctl_name = KERN_MSGMNB,
524 .procname = "msgmnb",
525 .data = &init_ipc_ns.msg_ctlmnb,
526 .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
527 .mode = 0644,
528 .proc_handler = &proc_ipc_dointvec,
529 .strategy = sysctl_ipc_data,
530 },
531 {
532 .ctl_name = KERN_SEM,
533 .procname = "sem",
534 .data = &init_ipc_ns.sem_ctls,
535 .maxlen = 4*sizeof (int),
536 .mode = 0644,
537 .proc_handler = &proc_ipc_dointvec,
538 .strategy = sysctl_ipc_data,
539 },
540#endif
541#ifdef CONFIG_MAGIC_SYSRQ 364#ifdef CONFIG_MAGIC_SYSRQ
542 { 365 {
543 .ctl_name = KERN_SYSRQ, 366 .ctl_name = KERN_SYSRQ,
@@ -1038,6 +861,12 @@ static ctl_table vm_table[] = {
1038 { .ctl_name = 0 } 861 { .ctl_name = 0 }
1039}; 862};
1040 863
864#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
865static ctl_table binfmt_misc_table[] = {
866 { .ctl_name = 0 }
867};
868#endif
869
1041static ctl_table fs_table[] = { 870static ctl_table fs_table[] = {
1042 { 871 {
1043 .ctl_name = FS_NRINODE, 872 .ctl_name = FS_NRINODE,
@@ -1161,6 +990,14 @@ static ctl_table fs_table[] = {
1161 .mode = 0644, 990 .mode = 0644,
1162 .proc_handler = &proc_dointvec, 991 .proc_handler = &proc_dointvec,
1163 }, 992 },
993#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
994 {
995 .ctl_name = CTL_UNNUMBERED,
996 .procname = "binfmt_misc",
997 .mode = 0555,
998 .child = binfmt_misc_table,
999 },
1000#endif
1164 { .ctl_name = 0 } 1001 { .ctl_name = 0 }
1165}; 1002};
1166 1003
@@ -1172,8 +1009,6 @@ static ctl_table dev_table[] = {
1172 { .ctl_name = 0 } 1009 { .ctl_name = 0 }
1173}; 1010};
1174 1011
1175extern void init_irq_proc (void);
1176
1177static DEFINE_SPINLOCK(sysctl_lock); 1012static DEFINE_SPINLOCK(sysctl_lock);
1178 1013
1179/* called under sysctl_lock */ 1014/* called under sysctl_lock */
@@ -1215,19 +1050,47 @@ static void start_unregistering(struct ctl_table_header *p)
1215 list_del_init(&p->ctl_entry); 1050 list_del_init(&p->ctl_entry);
1216} 1051}
1217 1052
1218void __init sysctl_init(void) 1053void sysctl_head_finish(struct ctl_table_header *head)
1219{ 1054{
1220#ifdef CONFIG_PROC_SYSCTL 1055 if (!head)
1221 register_proc_table(root_table, proc_sys_root, &root_table_header); 1056 return;
1222 init_irq_proc(); 1057 spin_lock(&sysctl_lock);
1223#endif 1058 unuse_table(head);
1059 spin_unlock(&sysctl_lock);
1060}
1061
1062struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1063{
1064 struct ctl_table_header *head;
1065 struct list_head *tmp;
1066 spin_lock(&sysctl_lock);
1067 if (prev) {
1068 tmp = &prev->ctl_entry;
1069 unuse_table(prev);
1070 goto next;
1071 }
1072 tmp = &root_table_header.ctl_entry;
1073 for (;;) {
1074 head = list_entry(tmp, struct ctl_table_header, ctl_entry);
1075
1076 if (!use_table(head))
1077 goto next;
1078 spin_unlock(&sysctl_lock);
1079 return head;
1080 next:
1081 tmp = tmp->next;
1082 if (tmp == &root_table_header.ctl_entry)
1083 break;
1084 }
1085 spin_unlock(&sysctl_lock);
1086 return NULL;
1224} 1087}
1225 1088
1226#ifdef CONFIG_SYSCTL_SYSCALL 1089#ifdef CONFIG_SYSCTL_SYSCALL
1227int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1090int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1228 void __user *newval, size_t newlen) 1091 void __user *newval, size_t newlen)
1229{ 1092{
1230 struct list_head *tmp; 1093 struct ctl_table_header *head;
1231 int error = -ENOTDIR; 1094 int error = -ENOTDIR;
1232 1095
1233 if (nlen <= 0 || nlen >= CTL_MAXNAME) 1096 if (nlen <= 0 || nlen >= CTL_MAXNAME)
@@ -1237,26 +1100,16 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1237 if (!oldlenp || get_user(old_len, oldlenp)) 1100 if (!oldlenp || get_user(old_len, oldlenp))
1238 return -EFAULT; 1101 return -EFAULT;
1239 } 1102 }
1240 spin_lock(&sysctl_lock);
1241 tmp = &root_table_header.ctl_entry;
1242 do {
1243 struct ctl_table_header *head =
1244 list_entry(tmp, struct ctl_table_header, ctl_entry);
1245
1246 if (!use_table(head))
1247 continue;
1248
1249 spin_unlock(&sysctl_lock);
1250 1103
1104 for (head = sysctl_head_next(NULL); head;
1105 head = sysctl_head_next(head)) {
1251 error = parse_table(name, nlen, oldval, oldlenp, 1106 error = parse_table(name, nlen, oldval, oldlenp,
1252 newval, newlen, head->ctl_table); 1107 newval, newlen, head->ctl_table);
1253 1108 if (error != -ENOTDIR) {
1254 spin_lock(&sysctl_lock); 1109 sysctl_head_finish(head);
1255 unuse_table(head);
1256 if (error != -ENOTDIR)
1257 break; 1110 break;
1258 } while ((tmp = tmp->next) != &root_table_header.ctl_entry); 1111 }
1259 spin_unlock(&sysctl_lock); 1112 }
1260 return error; 1113 return error;
1261} 1114}
1262 1115
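
do_sysctl() above is the first user of the new iterator pair, and the helpers are left non-static, presumably so other consumers such as the /proc/sys implementation can walk the registered headers with the same use-count discipline. The rule to note is that sysctl_head_next() drops the reference on the header it is handed, so a walk that stops early must call sysctl_head_finish() on the header it still holds. A hedged sketch, with a hypothetical match() predicate:

static void find_some_table(void)
{
	struct ctl_table_header *head;

	for (head = sysctl_head_next(NULL); head;
	     head = sysctl_head_next(head)) {
		if (match(head->ctl_table)) {
			/* the walk still holds a use count here: drop it */
			sysctl_head_finish(head);
			break;
		}
	}
}
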
@@ -1277,7 +1130,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
1277#endif /* CONFIG_SYSCTL_SYSCALL */ 1130#endif /* CONFIG_SYSCTL_SYSCALL */
1278 1131
1279/* 1132/*
1280 * ctl_perm does NOT grant the superuser all rights automatically, because 1133 * sysctl_perm does NOT grant the superuser all rights automatically, because
1281 * some sysctl variables are readonly even to root. 1134 * some sysctl variables are readonly even to root.
1282 */ 1135 */
1283 1136
@@ -1292,7 +1145,7 @@ static int test_perm(int mode, int op)
1292 return -EACCES; 1145 return -EACCES;
1293} 1146}
1294 1147
1295static inline int ctl_perm(ctl_table *table, int op) 1148int sysctl_perm(ctl_table *table, int op)
1296{ 1149{
1297 int error; 1150 int error;
1298 error = security_sysctl(table, op); 1151 error = security_sysctl(table, op);
@@ -1316,19 +1169,11 @@ repeat:
1316 for ( ; table->ctl_name || table->procname; table++) { 1169 for ( ; table->ctl_name || table->procname; table++) {
1317 if (!table->ctl_name) 1170 if (!table->ctl_name)
1318 continue; 1171 continue;
1319 if (n == table->ctl_name || table->ctl_name == CTL_ANY) { 1172 if (n == table->ctl_name) {
1320 int error; 1173 int error;
1321 if (table->child) { 1174 if (table->child) {
1322 if (ctl_perm(table, 001)) 1175 if (sysctl_perm(table, 001))
1323 return -EPERM; 1176 return -EPERM;
1324 if (table->strategy) {
1325 error = table->strategy(
1326 table, name, nlen,
1327 oldval, oldlenp,
1328 newval, newlen);
1329 if (error)
1330 return error;
1331 }
1332 name++; 1177 name++;
1333 nlen--; 1178 nlen--;
1334 table = table->child; 1179 table = table->child;
@@ -1356,7 +1201,7 @@ int do_sysctl_strategy (ctl_table *table,
1356 op |= 004; 1201 op |= 004;
1357 if (newval) 1202 if (newval)
1358 op |= 002; 1203 op |= 002;
1359 if (ctl_perm(table, op)) 1204 if (sysctl_perm(table, op))
1360 return -EPERM; 1205 return -EPERM;
1361 1206
1362 if (table->strategy) { 1207 if (table->strategy) {
@@ -1395,10 +1240,26 @@ int do_sysctl_strategy (ctl_table *table,
1395} 1240}
1396#endif /* CONFIG_SYSCTL_SYSCALL */ 1241#endif /* CONFIG_SYSCTL_SYSCALL */
1397 1242
1243static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1244{
1245 for (; table->ctl_name || table->procname; table++) {
1246 table->parent = parent;
1247 if (table->child)
1248 sysctl_set_parent(table, table->child);
1249 }
1250}
1251
1252static __init int sysctl_init(void)
1253{
1254 sysctl_set_parent(NULL, root_table);
1255 return 0;
1256}
1257
1258core_initcall(sysctl_init);
1259
1398/** 1260/**
1399 * register_sysctl_table - register a sysctl hierarchy 1261 * register_sysctl_table - register a sysctl hierarchy
1400 * @table: the top-level table structure 1262 * @table: the top-level table structure
1401 * @insert_at_head: whether the entry should be inserted in front or at the end
1402 * 1263 *
1403 * Register a sysctl table hierarchy. @table should be a filled in ctl_table 1264 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1404 * array. An entry with a ctl_name of 0 terminates the table. 1265 * array. An entry with a ctl_name of 0 terminates the table.
@@ -1464,8 +1325,7 @@ int do_sysctl_strategy (ctl_table *table,
1464 * This routine returns %NULL on a failure to register, and a pointer 1325 * This routine returns %NULL on a failure to register, and a pointer
1465 * to the table header on success. 1326 * to the table header on success.
1466 */ 1327 */
1467struct ctl_table_header *register_sysctl_table(ctl_table * table, 1328struct ctl_table_header *register_sysctl_table(ctl_table * table)
1468 int insert_at_head)
1469{ 1329{
1470 struct ctl_table_header *tmp; 1330 struct ctl_table_header *tmp;
1471 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); 1331 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
@@ -1475,15 +1335,10 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
1475 INIT_LIST_HEAD(&tmp->ctl_entry); 1335 INIT_LIST_HEAD(&tmp->ctl_entry);
1476 tmp->used = 0; 1336 tmp->used = 0;
1477 tmp->unregistering = NULL; 1337 tmp->unregistering = NULL;
1338 sysctl_set_parent(NULL, table);
1478 spin_lock(&sysctl_lock); 1339 spin_lock(&sysctl_lock);
1479 if (insert_at_head) 1340 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1480 list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
1481 else
1482 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1483 spin_unlock(&sysctl_lock); 1341 spin_unlock(&sysctl_lock);
1484#ifdef CONFIG_PROC_SYSCTL
1485 register_proc_table(table, proc_sys_root, tmp);
1486#endif
1487 return tmp; 1342 return tmp;
1488} 1343}
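
With insert_at_head gone and the /proc wiring moved out of this function, registration shrinks to a single call; register_sysctl_table() also fills in the parent pointers via sysctl_set_parent() before publishing the header. A hedged sketch of a caller under the new signature; the table contents, my_value and the saved header are illustrative, and the CTL_UNNUMBERED/procname-only style follows the binfmt_misc entry added earlier in this patch.

static int my_value;

static ctl_table my_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "my_value",
		.data		= &my_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table_header *my_header;

static int __init my_init(void)
{
	my_header = register_sysctl_table(my_table);	/* one argument now */
	return my_header ? 0 : -ENOMEM;
}

static void __exit my_exit(void)
{
	unregister_sysctl_table(my_header);
}
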
1489 1344
@@ -1499,9 +1354,6 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1499 might_sleep(); 1354 might_sleep();
1500 spin_lock(&sysctl_lock); 1355 spin_lock(&sysctl_lock);
1501 start_unregistering(header); 1356 start_unregistering(header);
1502#ifdef CONFIG_PROC_SYSCTL
1503 unregister_proc_table(header->ctl_table, proc_sys_root);
1504#endif
1505 spin_unlock(&sysctl_lock); 1357 spin_unlock(&sysctl_lock);
1506 kfree(header); 1358 kfree(header);
1507} 1359}
@@ -1525,155 +1377,6 @@ void unregister_sysctl_table(struct ctl_table_header * table)
1525 1377
1526#ifdef CONFIG_PROC_SYSCTL 1378#ifdef CONFIG_PROC_SYSCTL
1527 1379
1528/* Scan the sysctl entries in table and add them all into /proc */
1529static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
1530{
1531 struct proc_dir_entry *de;
1532 int len;
1533 mode_t mode;
1534
1535 for (; table->ctl_name || table->procname; table++) {
1536 /* Can't do anything without a proc name. */
1537 if (!table->procname)
1538 continue;
1539 /* Maybe we can't do anything with it... */
1540 if (!table->proc_handler && !table->child) {
1541 printk(KERN_WARNING "SYSCTL: Can't register %s\n",
1542 table->procname);
1543 continue;
1544 }
1545
1546 len = strlen(table->procname);
1547 mode = table->mode;
1548
1549 de = NULL;
1550 if (table->proc_handler)
1551 mode |= S_IFREG;
1552 else {
1553 mode |= S_IFDIR;
1554 for (de = root->subdir; de; de = de->next) {
1555 if (proc_match(len, table->procname, de))
1556 break;
1557 }
1558 /* If the subdir exists already, de is non-NULL */
1559 }
1560
1561 if (!de) {
1562 de = create_proc_entry(table->procname, mode, root);
1563 if (!de)
1564 continue;
1565 de->set = set;
1566 de->data = (void *) table;
1567 if (table->proc_handler)
1568 de->proc_fops = &proc_sys_file_operations;
1569 }
1570 table->de = de;
1571 if (de->mode & S_IFDIR)
1572 register_proc_table(table->child, de, set);
1573 }
1574}
1575
1576/*
1577 * Unregister a /proc sysctl table and any subdirectories.
1578 */
1579static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
1580{
1581 struct proc_dir_entry *de;
1582 for (; table->ctl_name || table->procname; table++) {
1583 if (!(de = table->de))
1584 continue;
1585 if (de->mode & S_IFDIR) {
1586 if (!table->child) {
1587 printk (KERN_ALERT "Help - malformed sysctl tree on free\n");
1588 continue;
1589 }
1590 unregister_proc_table(table->child, de);
1591
1592 /* Don't unregister directories which still have entries.. */
1593 if (de->subdir)
1594 continue;
1595 }
1596
1597 /*
1598 * In any case, mark the entry as goner; we'll keep it
1599 * around if it's busy, but we'll know to do nothing with
1600 * its fields. We are under sysctl_lock here.
1601 */
1602 de->data = NULL;
1603
1604 /* Don't unregister proc entries that are still being used.. */
1605 if (atomic_read(&de->count))
1606 continue;
1607
1608 table->de = NULL;
1609 remove_proc_entry(table->procname, root);
1610 }
1611}
1612
1613static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
1614 size_t count, loff_t *ppos)
1615{
1616 int op;
1617 struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
1618 struct ctl_table *table;
1619 size_t res;
1620 ssize_t error = -ENOTDIR;
1621
1622 spin_lock(&sysctl_lock);
1623 if (de && de->data && use_table(de->set)) {
1624 /*
1625 * at that point we know that sysctl was not unregistered
1626 * and won't be until we finish
1627 */
1628 spin_unlock(&sysctl_lock);
1629 table = (struct ctl_table *) de->data;
1630 if (!table || !table->proc_handler)
1631 goto out;
1632 error = -EPERM;
1633 op = (write ? 002 : 004);
1634 if (ctl_perm(table, op))
1635 goto out;
1636
1637 /* careful: calling conventions are nasty here */
1638 res = count;
1639 error = (*table->proc_handler)(table, write, file,
1640 buf, &res, ppos);
1641 if (!error)
1642 error = res;
1643 out:
1644 spin_lock(&sysctl_lock);
1645 unuse_table(de->set);
1646 }
1647 spin_unlock(&sysctl_lock);
1648 return error;
1649}
1650
1651static int proc_opensys(struct inode *inode, struct file *file)
1652{
1653 if (file->f_mode & FMODE_WRITE) {
1654 /*
1655 * sysctl entries that are not writable,
1656 * are _NOT_ writable, capabilities or not.
1657 */
1658 if (!(inode->i_mode & S_IWUSR))
1659 return -EPERM;
1660 }
1661
1662 return 0;
1663}
1664
1665static ssize_t proc_readsys(struct file * file, char __user * buf,
1666 size_t count, loff_t *ppos)
1667{
1668 return do_rw_proc(0, file, buf, count, ppos);
1669}
1670
1671static ssize_t proc_writesys(struct file * file, const char __user * buf,
1672 size_t count, loff_t *ppos)
1673{
1674 return do_rw_proc(1, file, (char __user *) buf, count, ppos);
1675}
1676
1677static int _proc_do_string(void* data, int maxlen, int write, 1380static int _proc_do_string(void* data, int maxlen, int write,
1678 struct file *filp, void __user *buffer, 1381 struct file *filp, void __user *buffer,
1679 size_t *lenp, loff_t *ppos) 1382 size_t *lenp, loff_t *ppos)
@@ -1681,13 +1384,12 @@ static int _proc_do_string(void* data, int maxlen, int write,
1681 size_t len; 1384 size_t len;
1682 char __user *p; 1385 char __user *p;
1683 char c; 1386 char c;
1684 1387
1685 if (!data || !maxlen || !*lenp || 1388 if (!data || !maxlen || !*lenp) {
1686 (*ppos && !write)) {
1687 *lenp = 0; 1389 *lenp = 0;
1688 return 0; 1390 return 0;
1689 } 1391 }
1690 1392
1691 if (write) { 1393 if (write) {
1692 len = 0; 1394 len = 0;
1693 p = buffer; 1395 p = buffer;
@@ -1708,6 +1410,15 @@ static int _proc_do_string(void* data, int maxlen, int write,
1708 len = strlen(data); 1410 len = strlen(data);
1709 if (len > maxlen) 1411 if (len > maxlen)
1710 len = maxlen; 1412 len = maxlen;
1413
1414 if (*ppos > len) {
1415 *lenp = 0;
1416 return 0;
1417 }
1418
1419 data += *ppos;
1420 len -= *ppos;
1421
1711 if (len > *lenp) 1422 if (len > *lenp)
1712 len = *lenp; 1423 len = *lenp;
1713 if (len) 1424 if (len)
@@ -1749,21 +1460,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
1749 buffer, lenp, ppos); 1460 buffer, lenp, ppos);
1750} 1461}
1751 1462
1752/*
1753 * Special case of dostring for the UTS structure. This has locks
1754 * to observe. Should this be in kernel/sys.c ????
1755 */
1756
1757static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1758 void __user *buffer, size_t *lenp, loff_t *ppos)
1759{
1760 int r;
1761 void *which;
1762 which = get_uts(table, write);
1763 r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos);
1764 put_uts(table, write, which);
1765 return r;
1766}
1767 1463
1768static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 1464static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1769 int *valp, 1465 int *valp,
@@ -1927,6 +1623,7 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp,
1927 1623
1928#define OP_SET 0 1624#define OP_SET 0
1929#define OP_AND 1 1625#define OP_AND 1
1626#define OP_OR 2
1930 1627
1931static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, 1628static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1932 int *valp, 1629 int *valp,
@@ -1938,6 +1635,7 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1938 switch(op) { 1635 switch(op) {
1939 case OP_SET: *valp = val; break; 1636 case OP_SET: *valp = val; break;
1940 case OP_AND: *valp &= val; break; 1637 case OP_AND: *valp &= val; break;
1638 case OP_OR: *valp |= val; break;
1941 } 1639 }
1942 } else { 1640 } else {
1943 int val = *valp; 1641 int val = *valp;
@@ -1961,7 +1659,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
1961{ 1659{
1962 int op; 1660 int op;
1963 1661
1964 if (!capable(CAP_SYS_MODULE)) { 1662 if (write && !capable(CAP_SYS_MODULE)) {
1965 return -EPERM; 1663 return -EPERM;
1966 } 1664 }
1967 1665
@@ -1970,6 +1668,22 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
1970 do_proc_dointvec_bset_conv,&op); 1668 do_proc_dointvec_bset_conv,&op);
1971} 1669}
1972 1670
1671/*
1672 * Taint values can only be increased
1673 */
1674static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp,
1675 void __user *buffer, size_t *lenp, loff_t *ppos)
1676{
1677 int op;
1678
1679 if (!capable(CAP_SYS_ADMIN))
1680 return -EPERM;
1681
1682 op = OP_OR;
1683 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1684 do_proc_dointvec_bset_conv,&op);
1685}
1686
1973struct do_proc_dointvec_minmax_conv_param { 1687struct do_proc_dointvec_minmax_conv_param {
1974 int *min; 1688 int *min;
1975 int *max; 1689 int *max;
@@ -2331,27 +2045,6 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2331 do_proc_dointvec_ms_jiffies_conv, NULL); 2045 do_proc_dointvec_ms_jiffies_conv, NULL);
2332} 2046}
2333 2047
2334#ifdef CONFIG_SYSVIPC
2335static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2336 void __user *buffer, size_t *lenp, loff_t *ppos)
2337{
2338 void *which;
2339 which = get_ipc(table, write);
2340 return __do_proc_dointvec(which, table, write, filp, buffer,
2341 lenp, ppos, NULL, NULL);
2342}
2343
2344static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2345 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
2346{
2347 void *which;
2348 which = get_ipc(table, write);
2349 return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
2350 lenp, ppos, 1l, 1l);
2351}
2352
2353#endif
2354
2355static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 2048static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
2356 void __user *buffer, size_t *lenp, loff_t *ppos) 2049 void __user *buffer, size_t *lenp, loff_t *ppos)
2357{ 2050{
@@ -2382,31 +2075,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
2382 return -ENOSYS; 2075 return -ENOSYS;
2383} 2076}
2384 2077
2385static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
2386 void __user *buffer, size_t *lenp, loff_t *ppos)
2387{
2388 return -ENOSYS;
2389}
2390
2391#ifdef CONFIG_SYSVIPC
2392static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
2393 void __user *buffer, size_t *lenp, loff_t *ppos)
2394{
2395 return -ENOSYS;
2396}
2397static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2398 void __user *buffer, size_t *lenp, loff_t *ppos)
2399{
2400 return -ENOSYS;
2401}
2402static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2403 struct file *filp, void __user *buffer,
2404 size_t *lenp, loff_t *ppos)
2405{
2406 return -ENOSYS;
2407}
2408#endif
2409
2410int proc_dointvec(ctl_table *table, int write, struct file *filp, 2078int proc_dointvec(ctl_table *table, int write, struct file *filp,
2411 void __user *buffer, size_t *lenp, loff_t *ppos) 2079 void __user *buffer, size_t *lenp, loff_t *ppos)
2412{ 2080{
@@ -2553,17 +2221,23 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2553 void __user *oldval, size_t __user *oldlenp, 2221 void __user *oldval, size_t __user *oldlenp,
2554 void __user *newval, size_t newlen) 2222 void __user *newval, size_t newlen)
2555{ 2223{
2556 if (oldval) { 2224 if (oldval && oldlenp) {
2557 size_t olen; 2225 size_t olen;
2558 if (oldlenp) { 2226
2559 if (get_user(olen, oldlenp)) 2227 if (get_user(olen, oldlenp))
2228 return -EFAULT;
2229 if (olen) {
2230 int val;
2231
2232 if (olen < sizeof(int))
2233 return -EINVAL;
2234
2235 val = *(int *)(table->data) / HZ;
2236 if (put_user(val, (int __user *)oldval))
2237 return -EFAULT;
2238 if (put_user(sizeof(int), oldlenp))
2560 return -EFAULT; 2239 return -EFAULT;
2561 if (olen!=sizeof(int))
2562 return -EINVAL;
2563 } 2240 }
2564 if (put_user(*(int *)(table->data)/HZ, (int __user *)oldval) ||
2565 (oldlenp && put_user(sizeof(int),oldlenp)))
2566 return -EFAULT;
2567 } 2241 }
2568 if (newval && newlen) { 2242 if (newval && newlen) {
2569 int new; 2243 int new;
@@ -2581,17 +2255,23 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2581 void __user *oldval, size_t __user *oldlenp, 2255 void __user *oldval, size_t __user *oldlenp,
2582 void __user *newval, size_t newlen) 2256 void __user *newval, size_t newlen)
2583{ 2257{
2584 if (oldval) { 2258 if (oldval && oldlenp) {
2585 size_t olen; 2259 size_t olen;
2586 if (oldlenp) { 2260
2587 if (get_user(olen, oldlenp)) 2261 if (get_user(olen, oldlenp))
2262 return -EFAULT;
2263 if (olen) {
2264 int val;
2265
2266 if (olen < sizeof(int))
2267 return -EINVAL;
2268
2269 val = jiffies_to_msecs(*(int *)(table->data));
2270 if (put_user(val, (int __user *)oldval))
2271 return -EFAULT;
2272 if (put_user(sizeof(int), oldlenp))
2588 return -EFAULT; 2273 return -EFAULT;
2589 if (olen!=sizeof(int))
2590 return -EINVAL;
2591 } 2274 }
2592 if (put_user(jiffies_to_msecs(*(int *)(table->data)), (int __user *)oldval) ||
2593 (oldlenp && put_user(sizeof(int),oldlenp)))
2594 return -EFAULT;
2595 } 2275 }
2596 if (newval && newlen) { 2276 if (newval && newlen) {
2597 int new; 2277 int new;
@@ -2605,62 +2285,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2605} 2285}
2606 2286
2607 2287
2608/* The generic string strategy routine: */
2609static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2610 void __user *oldval, size_t __user *oldlenp,
2611 void __user *newval, size_t newlen)
2612{
2613 struct ctl_table uts_table;
2614 int r, write;
2615 write = newval && newlen;
2616 memcpy(&uts_table, table, sizeof(uts_table));
2617 uts_table.data = get_uts(table, write);
2618 r = sysctl_string(&uts_table, name, nlen,
2619 oldval, oldlenp, newval, newlen);
2620 put_uts(table, write, uts_table.data);
2621 return r;
2622}
2623
2624#ifdef CONFIG_SYSVIPC
2625/* The generic sysctl ipc data routine. */
2626static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2627 void __user *oldval, size_t __user *oldlenp,
2628 void __user *newval, size_t newlen)
2629{
2630 size_t len;
2631 void *data;
2632
2633 /* Get out of I don't have a variable */
2634 if (!table->data || !table->maxlen)
2635 return -ENOTDIR;
2636
2637 data = get_ipc(table, 1);
2638 if (!data)
2639 return -ENOTDIR;
2640
2641 if (oldval && oldlenp) {
2642 if (get_user(len, oldlenp))
2643 return -EFAULT;
2644 if (len) {
2645 if (len > table->maxlen)
2646 len = table->maxlen;
2647 if (copy_to_user(oldval, data, len))
2648 return -EFAULT;
2649 if (put_user(len, oldlenp))
2650 return -EFAULT;
2651 }
2652 }
2653
2654 if (newval && newlen) {
2655 if (newlen > table->maxlen)
2656 newlen = table->maxlen;
2657
2658 if (copy_from_user(data, newval, newlen))
2659 return -EFAULT;
2660 }
2661 return 1;
2662}
2663#endif
2664 2288
2665#else /* CONFIG_SYSCTL_SYSCALL */ 2289#else /* CONFIG_SYSCTL_SYSCALL */
2666 2290
@@ -2726,18 +2350,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2726 return -ENOSYS; 2350 return -ENOSYS;
2727} 2351}
2728 2352
2729static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2730 void __user *oldval, size_t __user *oldlenp,
2731 void __user *newval, size_t newlen)
2732{
2733 return -ENOSYS;
2734}
2735static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2736 void __user *oldval, size_t __user *oldlenp,
2737 void __user *newval, size_t newlen)
2738{
2739 return -ENOSYS;
2740}
2741#endif /* CONFIG_SYSCTL_SYSCALL */ 2353#endif /* CONFIG_SYSCTL_SYSCALL */
2742 2354
2743/* 2355/*
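The rewritten sysctl_jiffies()/sysctl_ms_jiffies() strategy routines above only copy a value out when the caller supplied both oldval and oldlenp, validate the user-supplied length before anything is written, and convert the stored jiffies value on the way out. A minimal user-space sketch of that control flow (plain memcpy() standing in for get_user()/put_user(), with an assumed HZ of 250) is shown below; it is an illustration, not the kernel implementation:

#include <errno.h>
#include <stddef.h>
#include <string.h>

#define HZ 250	/* assumed tick rate, for illustration only */

/*
 * Sketch of the "read old value" path: nothing is copied unless the
 * caller passed both a buffer and a length, the length must be able to
 * hold an int, and the jiffies value is scaled to seconds before it is
 * copied back together with the number of bytes written.
 */
static int read_jiffies_param(const long *data, void *oldval, size_t *oldlenp)
{
	if (oldval && oldlenp) {
		size_t olen = *oldlenp;			/* get_user() analogue */

		if (olen) {
			int val;

			if (olen < sizeof(int))
				return -EINVAL;

			val = (int)(*data / HZ);
			memcpy(oldval, &val, sizeof(val));	/* put_user() analogue */
			*oldlenp = sizeof(val);
		}
	}
	return 0;
}

int main(void)
{
	long jiffies_stored = 2500;	/* a 10 second value, stored in jiffies */
	int out = 0;
	size_t len = sizeof(out);

	return read_jiffies_param(&jiffies_stored, &out, &len) == 0 && out == 10 ? 0 : 1;
}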
diff --git a/kernel/time.c b/kernel/time.c
index 0e017bff4c19..c6c80ea5d0ea 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec)
470 return tv; 470 return tv;
471} 471}
472 472
473/*
474 * Convert jiffies to milliseconds and back.
475 *
476 * Avoid unnecessary multiplications/divisions in the
477 * two most common HZ cases:
478 */
479unsigned int jiffies_to_msecs(const unsigned long j)
480{
481#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
482 return (MSEC_PER_SEC / HZ) * j;
483#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
484 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
485#else
486 return (j * MSEC_PER_SEC) / HZ;
487#endif
488}
489EXPORT_SYMBOL(jiffies_to_msecs);
490
491unsigned int jiffies_to_usecs(const unsigned long j)
492{
493#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
494 return (USEC_PER_SEC / HZ) * j;
495#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
496 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
497#else
498 return (j * USEC_PER_SEC) / HZ;
499#endif
500}
501EXPORT_SYMBOL(jiffies_to_usecs);
502
503/*
504 * When we convert to jiffies then we interpret incoming values
505 * the following way:
506 *
507 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
508 *
509 * - 'too large' values [that would result in larger than
510 * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
511 *
512 * - all other values are converted to jiffies by either multiplying
513 * the input value by a factor or dividing it with a factor
514 *
515 * We must also be careful about 32-bit overflows.
516 */
517unsigned long msecs_to_jiffies(const unsigned int m)
518{
519 /*
520 * Negative value, means infinite timeout:
521 */
522 if ((int)m < 0)
523 return MAX_JIFFY_OFFSET;
524
525#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
526 /*
527 * HZ is equal to or smaller than 1000, and 1000 is a nice
528 * round multiple of HZ, divide with the factor between them,
529 * but round upwards:
530 */
531 return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
532#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
533 /*
534 * HZ is larger than 1000, and HZ is a nice round multiple of
535 * 1000 - simply multiply with the factor between them.
536 *
537 * But first make sure the multiplication result cannot
538 * overflow:
539 */
540 if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
541 return MAX_JIFFY_OFFSET;
542
543 return m * (HZ / MSEC_PER_SEC);
544#else
545 /*
546 * Generic case - multiply, round and divide. But first
547 * check that if we are doing a net multiplication, that
548 * we wouldnt overflow:
549 */
550 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
551 return MAX_JIFFY_OFFSET;
552
553 return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC;
554#endif
555}
556EXPORT_SYMBOL(msecs_to_jiffies);
557
558unsigned long usecs_to_jiffies(const unsigned int u)
559{
560 if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
561 return MAX_JIFFY_OFFSET;
562#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
563 return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
564#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
565 return u * (HZ / USEC_PER_SEC);
566#else
567 return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC;
568#endif
569}
570EXPORT_SYMBOL(usecs_to_jiffies);
571
572/*
573 * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
574 * that a remainder subtract here would not do the right thing as the
 575 * resolution values don't fall on second boundaries. I.e. the line:
576 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
577 *
578 * Rather, we just shift the bits off the right.
579 *
580 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
581 * value to a scaled second value.
582 */
583unsigned long
584timespec_to_jiffies(const struct timespec *value)
585{
586 unsigned long sec = value->tv_sec;
587 long nsec = value->tv_nsec + TICK_NSEC - 1;
588
589 if (sec >= MAX_SEC_IN_JIFFIES){
590 sec = MAX_SEC_IN_JIFFIES;
591 nsec = 0;
592 }
593 return (((u64)sec * SEC_CONVERSION) +
594 (((u64)nsec * NSEC_CONVERSION) >>
595 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
596
597}
598EXPORT_SYMBOL(timespec_to_jiffies);
599
600void
601jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
602{
603 /*
604 * Convert jiffies to nanoseconds and separate with
605 * one divide.
606 */
607 u64 nsec = (u64)jiffies * TICK_NSEC;
608 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec);
609}
610EXPORT_SYMBOL(jiffies_to_timespec);
611
612/* Same for "timeval"
613 *
614 * Well, almost. The problem here is that the real system resolution is
 615 * in nanoseconds and the value being converted is in microseconds.
 616 * Also for some machines (those that use HZ = 1024, in particular),
617 * there is a LARGE error in the tick size in microseconds.
 618 *
619 * The solution we use is to do the rounding AFTER we convert the
620 * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
621 * Instruction wise, this should cost only an additional add with carry
622 * instruction above the way it was done above.
623 */
624unsigned long
625timeval_to_jiffies(const struct timeval *value)
626{
627 unsigned long sec = value->tv_sec;
628 long usec = value->tv_usec;
629
630 if (sec >= MAX_SEC_IN_JIFFIES){
631 sec = MAX_SEC_IN_JIFFIES;
632 usec = 0;
633 }
634 return (((u64)sec * SEC_CONVERSION) +
635 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
636 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
637}
638
639void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
640{
641 /*
642 * Convert jiffies to nanoseconds and separate with
643 * one divide.
644 */
645 u64 nsec = (u64)jiffies * TICK_NSEC;
646 long tv_usec;
647
648 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec);
649 tv_usec /= NSEC_PER_USEC;
650 value->tv_usec = tv_usec;
651}
652
653/*
654 * Convert jiffies/jiffies_64 to clock_t and back.
655 */
656clock_t jiffies_to_clock_t(long x)
657{
658#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
659 return x / (HZ / USER_HZ);
660#else
661 u64 tmp = (u64)x * TICK_NSEC;
662 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
663 return (long)tmp;
664#endif
665}
666EXPORT_SYMBOL(jiffies_to_clock_t);
667
668unsigned long clock_t_to_jiffies(unsigned long x)
669{
670#if (HZ % USER_HZ)==0
671 if (x >= ~0UL / (HZ / USER_HZ))
672 return ~0UL;
673 return x * (HZ / USER_HZ);
674#else
675 u64 jif;
676
677 /* Don't worry about loss of precision here .. */
678 if (x >= ~0UL / HZ * USER_HZ)
679 return ~0UL;
680
681 /* .. but do try to contain it here */
682 jif = x * (u64) HZ;
683 do_div(jif, USER_HZ);
684 return jif;
685#endif
686}
687EXPORT_SYMBOL(clock_t_to_jiffies);
688
689u64 jiffies_64_to_clock_t(u64 x)
690{
691#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
692 do_div(x, HZ / USER_HZ);
693#else
694 /*
695 * There are better ways that don't overflow early,
696 * but even this doesn't overflow in hundreds of years
697 * in 64 bits, so..
698 */
699 x *= TICK_NSEC;
700 do_div(x, (NSEC_PER_SEC / USER_HZ));
701#endif
702 return x;
703}
704
705EXPORT_SYMBOL(jiffies_64_to_clock_t);
706
707u64 nsec_to_clock_t(u64 x)
708{
709#if (NSEC_PER_SEC % USER_HZ) == 0
710 do_div(x, (NSEC_PER_SEC / USER_HZ));
711#elif (USER_HZ % 512) == 0
712 x *= USER_HZ/512;
713 do_div(x, (NSEC_PER_SEC / 512));
714#else
715 /*
716 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
717 * overflow after 64.99 years.
718 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
719 */
720 x *= 9;
721 do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
722 USER_HZ));
723#endif
724 return x;
725}
726
473#if (BITS_PER_LONG < 64) 727#if (BITS_PER_LONG < 64)
474u64 get_jiffies_64(void) 728u64 get_jiffies_64(void)
475{ 729{
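The conversion helpers added above collapse to a single multiply or divide whenever HZ and MSEC_PER_SEC (or USEC_PER_SEC) divide evenly, and they round timeouts up so a requested delay is never shortened. A stand-alone sketch of the common HZ <= 1000 branch, assuming HZ=250 (one jiffy is 4 ms), shows that rounding behaviour:

#include <stdio.h>

#define HZ		250	/* assumed tick rate for the example */
#define MSEC_PER_SEC	1000L

/* HZ <= 1000 and 1000 % HZ == 0, so both directions are a single factor */
static unsigned int jiffies_to_msecs(unsigned long j)
{
	return (MSEC_PER_SEC / HZ) * j;		/* 4 ms per jiffy */
}

static unsigned long msecs_to_jiffies(unsigned int m)
{
	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);	/* round up */
}

int main(void)
{
	/* a 1 ms request still costs a whole jiffy: timeouts never shrink */
	printf("1 ms -> %lu jiffies\n", msecs_to_jiffies(1));	/* 1 */
	printf("9 ms -> %lu jiffies\n", msecs_to_jiffies(9));	/* 3 */
	printf("3 jiffies -> %u ms\n", jiffies_to_msecs(3));	/* 12 */
	return 0;
}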
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
new file mode 100644
index 000000000000..f66351126544
--- /dev/null
+++ b/kernel/time/Kconfig
@@ -0,0 +1,25 @@
1#
2# Timer subsystem related configuration options
3#
4config TICK_ONESHOT
5 bool
6 default n
7
8config NO_HZ
9 bool "Tickless System (Dynamic Ticks)"
10 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
11 select TICK_ONESHOT
12 help
13 This option enables a tickless system: timer interrupts will
14 only trigger on an as-needed basis both when the system is
15 busy and when the system is idle.
16
17config HIGH_RES_TIMERS
18 bool "High Resolution Timer Support"
19 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
20 select TICK_ONESHOT
21 help
 22 This option enables high resolution timer support. If your
 23 hardware is not capable of high resolution timers, this option
 24 only increases the size of the kernel image.
25
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 61a3907d16fb..93bccba1f265 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1,8 @@
1obj-y += ntp.o clocksource.o jiffies.o 1obj-y += ntp.o clocksource.o jiffies.o timer_list.o
2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
7obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
8obj-$(CONFIG_TIMER_STATS) += timer_stats.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 000000000000..67932ea78c17
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,345 @@
1/*
2 * linux/kernel/time/clockevents.c
3 *
4 * This file contains functions which manage clock event devices.
5 *
6 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
7 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
8 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
9 *
10 * This code is licenced under the GPL version 2. For details see
11 * kernel-base/COPYING.
12 */
13
14#include <linux/clockchips.h>
15#include <linux/hrtimer.h>
16#include <linux/init.h>
17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/smp.h>
20#include <linux/sysdev.h>
21
22/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices);
24static LIST_HEAD(clockevents_released);
25
26/* Notification for clock events */
27static RAW_NOTIFIER_HEAD(clockevents_chain);
28
29/* Protection for the above */
30static DEFINE_SPINLOCK(clockevents_lock);
31
32/**
33 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
34 * @latch: value to convert
35 * @evt: pointer to clock event device descriptor
36 *
37 * Math helper, returns latch value converted to nanoseconds (bound checked)
38 */
39unsigned long clockevent_delta2ns(unsigned long latch,
40 struct clock_event_device *evt)
41{
42 u64 clc = ((u64) latch << evt->shift);
43
44 do_div(clc, evt->mult);
45 if (clc < 1000)
46 clc = 1000;
47 if (clc > LONG_MAX)
48 clc = LONG_MAX;
49
50 return (unsigned long) clc;
51}
52
53/**
54 * clockevents_set_mode - set the operating mode of a clock event device
55 * @dev: device to modify
56 * @mode: new mode
57 *
58 * Must be called with interrupts disabled !
59 */
60void clockevents_set_mode(struct clock_event_device *dev,
61 enum clock_event_mode mode)
62{
63 if (dev->mode != mode) {
64 dev->set_mode(mode, dev);
65 dev->mode = mode;
66 }
67}
68
69/**
70 * clockevents_program_event - Reprogram the clock event device.
71 * @expires: absolute expiry time (monotonic clock)
72 *
73 * Returns 0 on success, -ETIME when the event is in the past.
74 */
75int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
76 ktime_t now)
77{
78 unsigned long long clc;
79 int64_t delta;
80
81 delta = ktime_to_ns(ktime_sub(expires, now));
82
83 if (delta <= 0)
84 return -ETIME;
85
86 dev->next_event = expires;
87
88 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
89 return 0;
90
91 if (delta > dev->max_delta_ns)
92 delta = dev->max_delta_ns;
93 if (delta < dev->min_delta_ns)
94 delta = dev->min_delta_ns;
95
96 clc = delta * dev->mult;
97 clc >>= dev->shift;
98
99 return dev->set_next_event((unsigned long) clc, dev);
100}
101
102/**
103 * clockevents_register_notifier - register a clock events change listener
104 */
105int clockevents_register_notifier(struct notifier_block *nb)
106{
107 int ret;
108
109 spin_lock(&clockevents_lock);
110 ret = raw_notifier_chain_register(&clockevents_chain, nb);
111 spin_unlock(&clockevents_lock);
112
113 return ret;
114}
115
116/**
117 * clockevents_unregister_notifier - unregister a clock events change listener
118 */
119void clockevents_unregister_notifier(struct notifier_block *nb)
120{
121 spin_lock(&clockevents_lock);
122 raw_notifier_chain_unregister(&clockevents_chain, nb);
123 spin_unlock(&clockevents_lock);
124}
125
126/*
127 * Notify about a clock event change. Called with clockevents_lock
128 * held.
129 */
130static void clockevents_do_notify(unsigned long reason, void *dev)
131{
132 raw_notifier_call_chain(&clockevents_chain, reason, dev);
133}
134
135/*
 136 * Called after a notify add to make devices available which were
137 * released from the notifier call.
138 */
139static void clockevents_notify_released(void)
140{
141 struct clock_event_device *dev;
142
143 while (!list_empty(&clockevents_released)) {
144 dev = list_entry(clockevents_released.next,
145 struct clock_event_device, list);
146 list_del(&dev->list);
147 list_add(&dev->list, &clockevent_devices);
148 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
149 }
150}
151
152/**
153 * clockevents_register_device - register a clock event device
154 * @dev: device to register
155 */
156void clockevents_register_device(struct clock_event_device *dev)
157{
158 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
159
160 spin_lock(&clockevents_lock);
161
162 list_add(&dev->list, &clockevent_devices);
163 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
164 clockevents_notify_released();
165
166 spin_unlock(&clockevents_lock);
167}
168
169/*
170 * Noop handler when we shut down an event device
171 */
172static void clockevents_handle_noop(struct clock_event_device *dev)
173{
174}
175
176/**
177 * clockevents_exchange_device - release and request clock devices
178 * @old: device to release (can be NULL)
179 * @new: device to request (can be NULL)
180 *
181 * Called from the notifier chain. clockevents_lock is held already
182 */
183void clockevents_exchange_device(struct clock_event_device *old,
184 struct clock_event_device *new)
185{
186 unsigned long flags;
187
188 local_irq_save(flags);
189 /*
190 * Caller releases a clock event device. We queue it into the
191 * released list and do a notify add later.
192 */
193 if (old) {
194 old->event_handler = clockevents_handle_noop;
195 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
196 list_del(&old->list);
197 list_add(&old->list, &clockevents_released);
198 }
199
200 if (new) {
201 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
202 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
203 }
204 local_irq_restore(flags);
205}
206
207/**
208 * clockevents_request_device
209 */
210struct clock_event_device *clockevents_request_device(unsigned int features,
211 cpumask_t cpumask)
212{
213 struct clock_event_device *cur, *dev = NULL;
214 struct list_head *tmp;
215
216 spin_lock(&clockevents_lock);
217
218 list_for_each(tmp, &clockevent_devices) {
219 cur = list_entry(tmp, struct clock_event_device, list);
220
221 if ((cur->features & features) == features &&
222 cpus_equal(cpumask, cur->cpumask)) {
223 if (!dev || dev->rating < cur->rating)
224 dev = cur;
225 }
226 }
227
228 clockevents_exchange_device(NULL, dev);
229
230 spin_unlock(&clockevents_lock);
231
232 return dev;
233}
234
235/**
236 * clockevents_release_device
237 */
238void clockevents_release_device(struct clock_event_device *dev)
239{
240 spin_lock(&clockevents_lock);
241
242 clockevents_exchange_device(dev, NULL);
243 clockevents_notify_released();
244
245 spin_unlock(&clockevents_lock);
246}
247
248/**
249 * clockevents_notify - notification about relevant events
250 */
251void clockevents_notify(unsigned long reason, void *arg)
252{
253 spin_lock(&clockevents_lock);
254 clockevents_do_notify(reason, arg);
255
256 switch (reason) {
257 case CLOCK_EVT_NOTIFY_CPU_DEAD:
258 /*
259 * Unregister the clock event devices which were
260 * released from the users in the notify chain.
261 */
262 while (!list_empty(&clockevents_released)) {
263 struct clock_event_device *dev;
264
265 dev = list_entry(clockevents_released.next,
266 struct clock_event_device, list);
267 list_del(&dev->list);
268 }
269 break;
270 default:
271 break;
272 }
273 spin_unlock(&clockevents_lock);
274}
275EXPORT_SYMBOL_GPL(clockevents_notify);
276
277#ifdef CONFIG_SYSFS
278
279/**
280 * clockevents_show_registered - sysfs interface for listing clockevents
281 * @dev: unused
282 * @buf: char buffer to be filled with clock events list
283 *
284 * Provides sysfs interface for listing registered clock event devices
285 */
286static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf)
287{
288 struct list_head *tmp;
289 char *p = buf;
290 int cpu;
291
292 spin_lock(&clockevents_lock);
293
294 list_for_each(tmp, &clockevent_devices) {
295 struct clock_event_device *ce;
296
297 ce = list_entry(tmp, struct clock_event_device, list);
298 p += sprintf(p, "%-20s F:%04x M:%d", ce->name,
299 ce->features, ce->mode);
300 p += sprintf(p, " C:");
301 if (!cpus_equal(ce->cpumask, cpu_possible_map)) {
302 for_each_cpu_mask(cpu, ce->cpumask)
303 p += sprintf(p, " %d", cpu);
304 } else {
305 /*
306 * FIXME: Add the cpu which is handling this sucker
307 */
308 }
309 p += sprintf(p, "\n");
310 }
311
312 spin_unlock(&clockevents_lock);
313
314 return p - buf;
315}
316
317/*
318 * Sysfs setup bits:
319 */
320static SYSDEV_ATTR(registered, 0600,
321 clockevents_show_registered, NULL);
322
323static struct sysdev_class clockevents_sysclass = {
324 set_kset_name("clockevents"),
325};
326
327static struct sys_device clockevents_sys_device = {
328 .id = 0,
329 .cls = &clockevents_sysclass,
330};
331
332static int __init clockevents_sysfs_init(void)
333{
334 int error = sysdev_class_register(&clockevents_sysclass);
335
336 if (!error)
337 error = sysdev_register(&clockevents_sys_device);
338 if (!error)
339 error = sysdev_create_file(
340 &clockevents_sys_device,
341 &attr_registered);
342 return error;
343}
344device_initcall(clockevents_sysfs_init);
345#endif
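clockevent_delta2ns() and clockevents_program_event() above avoid a 64-bit division in the event-programming path by converting between nanoseconds and device ticks through a precomputed mult/shift pair. The sketch below assumes a 1 MHz timer (1000 ns per tick) and an arbitrarily chosen shift of 32; none of these numbers come from a real clock_event_device, they only illustrate the round trip and its downward rounding (the kernel code additionally clamps the result against min_delta_ns/max_delta_ns):

#include <stdint.h>
#include <stdio.h>

/*
 * Assumed example device: a 1 MHz timer, so one tick is 1000 ns.
 * mult and shift are chosen so that  ticks = ns * mult >> shift,
 * i.e. mult ~= (1 << shift) / ns_per_tick.
 */
#define EXAMPLE_SHIFT	32
#define EXAMPLE_MULT	((1ULL << EXAMPLE_SHIFT) / 1000)

/* nanoseconds -> device ticks, as in clockevents_program_event() */
static uint64_t ns_to_ticks(uint64_t delta_ns)
{
	return (delta_ns * EXAMPLE_MULT) >> EXAMPLE_SHIFT;
}

/* device ticks -> nanoseconds, as in clockevent_delta2ns() */
static uint64_t ticks_to_ns(uint64_t latch)
{
	return (latch << EXAMPLE_SHIFT) / EXAMPLE_MULT;
}

int main(void)
{
	uint64_t ns = 1000000;			/* ask for a 1 ms event */
	uint64_t ticks = ns_to_ticks(ns);

	printf("%llu ns -> %llu ticks -> %llu ns\n",
	       (unsigned long long)ns,
	       (unsigned long long)ticks,
	       (unsigned long long)ticks_to_ns(ticks));
	return 0;
}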
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 22504afc0d34..193a0793af95 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -28,6 +28,8 @@
28#include <linux/sysdev.h> 28#include <linux/sysdev.h>
29#include <linux/init.h> 29#include <linux/init.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h>
31 33
32/* XXX - Would like a better way for initializing curr_clocksource */ 34/* XXX - Would like a better way for initializing curr_clocksource */
33extern struct clocksource clocksource_jiffies; 35extern struct clocksource clocksource_jiffies;
@@ -47,6 +49,7 @@ extern struct clocksource clocksource_jiffies;
47 */ 49 */
48static struct clocksource *curr_clocksource = &clocksource_jiffies; 50static struct clocksource *curr_clocksource = &clocksource_jiffies;
49static struct clocksource *next_clocksource; 51static struct clocksource *next_clocksource;
52static struct clocksource *clocksource_override;
50static LIST_HEAD(clocksource_list); 53static LIST_HEAD(clocksource_list);
51static DEFINE_SPINLOCK(clocksource_lock); 54static DEFINE_SPINLOCK(clocksource_lock);
52static char override_name[32]; 55static char override_name[32];
@@ -61,9 +64,123 @@ static int __init clocksource_done_booting(void)
61 finished_booting = 1; 64 finished_booting = 1;
62 return 0; 65 return 0;
63} 66}
64
65late_initcall(clocksource_done_booting); 67late_initcall(clocksource_done_booting);
66 68
69#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
70static LIST_HEAD(watchdog_list);
71static struct clocksource *watchdog;
72static struct timer_list watchdog_timer;
73static DEFINE_SPINLOCK(watchdog_lock);
74static cycle_t watchdog_last;
75/*
 76 * Interval: 0.5sec Threshold: 0.0625s
77 */
78#define WATCHDOG_INTERVAL (HZ >> 1)
79#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4)
80
81static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
82{
83 if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD)
84 return;
85
86 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
87 cs->name, delta);
88 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
89 clocksource_change_rating(cs, 0);
90 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
91 list_del(&cs->wd_list);
92}
93
94static void clocksource_watchdog(unsigned long data)
95{
96 struct clocksource *cs, *tmp;
97 cycle_t csnow, wdnow;
98 int64_t wd_nsec, cs_nsec;
99
100 spin_lock(&watchdog_lock);
101
102 wdnow = watchdog->read();
103 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
104 watchdog_last = wdnow;
105
106 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
107 csnow = cs->read();
108 /* Initialized ? */
109 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
110 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
111 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
112 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
113 /*
114 * We just marked the clocksource as
115 * highres-capable, notify the rest of the
116 * system as well so that we transition
117 * into high-res mode:
118 */
119 tick_clock_notify();
120 }
121 cs->flags |= CLOCK_SOURCE_WATCHDOG;
122 cs->wd_last = csnow;
123 } else {
124 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
125 cs->wd_last = csnow;
126 /* Check the delta. Might remove from the list ! */
127 clocksource_ratewd(cs, cs_nsec - wd_nsec);
128 }
129 }
130
131 if (!list_empty(&watchdog_list)) {
132 __mod_timer(&watchdog_timer,
133 watchdog_timer.expires + WATCHDOG_INTERVAL);
134 }
135 spin_unlock(&watchdog_lock);
136}
137static void clocksource_check_watchdog(struct clocksource *cs)
138{
139 struct clocksource *cse;
140 unsigned long flags;
141
142 spin_lock_irqsave(&watchdog_lock, flags);
143 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
144 int started = !list_empty(&watchdog_list);
145
146 list_add(&cs->wd_list, &watchdog_list);
147 if (!started && watchdog) {
148 watchdog_last = watchdog->read();
149 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
150 add_timer(&watchdog_timer);
151 }
152 } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) {
153 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
154
155 if (!watchdog || cs->rating > watchdog->rating) {
156 if (watchdog)
157 del_timer(&watchdog_timer);
158 watchdog = cs;
159 init_timer(&watchdog_timer);
160 watchdog_timer.function = clocksource_watchdog;
161
162 /* Reset watchdog cycles */
163 list_for_each_entry(cse, &watchdog_list, wd_list)
164 cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
165 /* Start if list is not empty */
166 if (!list_empty(&watchdog_list)) {
167 watchdog_last = watchdog->read();
168 watchdog_timer.expires =
169 jiffies + WATCHDOG_INTERVAL;
170 add_timer(&watchdog_timer);
171 }
172 }
173 }
174 spin_unlock_irqrestore(&watchdog_lock, flags);
175}
176#else
177static void clocksource_check_watchdog(struct clocksource *cs)
178{
179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
180 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
181}
182#endif
183
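/*
 * Illustration only (not part of the patch): the watchdog above compares
 * how far a monitored clocksource and the watchdog clocksource advanced
 * over the same ~0.5 s interval and downgrades the clocksource when the
 * difference exceeds the 62.5 ms threshold.  The values below are made
 * up; this is a user-space sketch of the comparison, not kernel code.
 */
#include <stdint.h>
#include <stdio.h>

#define WATCHDOG_THRESHOLD_NS	(1000000000LL >> 4)	/* 0.0625 s */

static int clocksource_is_stable(int64_t cs_nsec, int64_t wd_nsec)
{
	int64_t delta = cs_nsec - wd_nsec;

	return delta > -WATCHDOG_THRESHOLD_NS && delta < WATCHDOG_THRESHOLD_NS;
}

int main(void)
{
	/* 0.5 s on the watchdog vs. a clocksource that only advanced 0.4 s */
	printf("stable: %d\n", clocksource_is_stable(400000000LL, 500000000LL));
	/* vs. one that tracked the watchdog closely */
	printf("stable: %d\n", clocksource_is_stable(499990000LL, 500000000LL));
	return 0;
}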
67/** 184/**
68 * clocksource_get_next - Returns the selected clocksource 185 * clocksource_get_next - Returns the selected clocksource
69 * 186 *
@@ -83,60 +200,54 @@ struct clocksource *clocksource_get_next(void)
83} 200}
84 201
85/** 202/**
86 * select_clocksource - Finds the best registered clocksource. 203 * select_clocksource - Selects the best registered clocksource.
87 * 204 *
88 * Private function. Must hold clocksource_lock when called. 205 * Private function. Must hold clocksource_lock when called.
89 * 206 *
90 * Looks through the list of registered clocksources, returning 207 * Select the clocksource with the best rating, or the clocksource,
91 * the one with the highest rating value. If there is a clocksource 208 * which is selected by userspace override.
92 * name that matches the override string, it returns that clocksource.
93 */ 209 */
94static struct clocksource *select_clocksource(void) 210static struct clocksource *select_clocksource(void)
95{ 211{
96 struct clocksource *best = NULL; 212 struct clocksource *next;
97 struct list_head *tmp;
98 213
99 list_for_each(tmp, &clocksource_list) { 214 if (list_empty(&clocksource_list))
100 struct clocksource *src; 215 return NULL;
101 216
102 src = list_entry(tmp, struct clocksource, list); 217 if (clocksource_override)
103 if (!best) 218 next = clocksource_override;
104 best = src; 219 else
105 220 next = list_entry(clocksource_list.next, struct clocksource,
106 /* check for override: */ 221 list);
107 if (strlen(src->name) == strlen(override_name) && 222
108 !strcmp(src->name, override_name)) { 223 if (next == curr_clocksource)
109 best = src; 224 return NULL;
110 break;
111 }
112 /* pick the highest rating: */
113 if (src->rating > best->rating)
114 best = src;
115 }
116 225
117 return best; 226 return next;
118} 227}
119 228
120/** 229/*
121 * is_registered_source - Checks if clocksource is registered 230 * Enqueue the clocksource sorted by rating
122 * @c: pointer to a clocksource
123 *
124 * Private helper function. Must hold clocksource_lock when called.
125 *
126 * Returns one if the clocksource is already registered, zero otherwise.
127 */ 231 */
128static int is_registered_source(struct clocksource *c) 232static int clocksource_enqueue(struct clocksource *c)
129{ 233{
130 int len = strlen(c->name); 234 struct list_head *tmp, *entry = &clocksource_list;
131 struct list_head *tmp;
132 235
133 list_for_each(tmp, &clocksource_list) { 236 list_for_each(tmp, &clocksource_list) {
134 struct clocksource *src; 237 struct clocksource *cs;
135 238
136 src = list_entry(tmp, struct clocksource, list); 239 cs = list_entry(tmp, struct clocksource, list);
137 if (strlen(src->name) == len && !strcmp(src->name, c->name)) 240 if (cs == c)
138 return 1; 241 return -EBUSY;
242 /* Keep track of the place, where to insert */
243 if (cs->rating >= c->rating)
244 entry = tmp;
139 } 245 }
246 list_add(&c->list, entry);
247
248 if (strlen(c->name) == strlen(override_name) &&
249 !strcmp(c->name, override_name))
250 clocksource_override = c;
140 251
141 return 0; 252 return 0;
142} 253}
@@ -149,42 +260,35 @@ static int is_registered_source(struct clocksource *c)
149 */ 260 */
150int clocksource_register(struct clocksource *c) 261int clocksource_register(struct clocksource *c)
151{ 262{
152 int ret = 0;
153 unsigned long flags; 263 unsigned long flags;
264 int ret;
154 265
155 spin_lock_irqsave(&clocksource_lock, flags); 266 spin_lock_irqsave(&clocksource_lock, flags);
156 /* check if clocksource is already registered */ 267 ret = clocksource_enqueue(c);
157 if (is_registered_source(c)) { 268 if (!ret)
158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name);
160 ret = -EBUSY;
161 } else {
162 /* register it */
163 list_add(&c->list, &clocksource_list);
164 /* scan the registered clocksources, and pick the best one */
165 next_clocksource = select_clocksource(); 269 next_clocksource = select_clocksource();
166 }
167 spin_unlock_irqrestore(&clocksource_lock, flags); 270 spin_unlock_irqrestore(&clocksource_lock, flags);
271 if (!ret)
272 clocksource_check_watchdog(c);
168 return ret; 273 return ret;
169} 274}
170EXPORT_SYMBOL(clocksource_register); 275EXPORT_SYMBOL(clocksource_register);
171 276
172/** 277/**
173 * clocksource_reselect - Rescan list for next clocksource 278 * clocksource_change_rating - Change the rating of a registered clocksource
174 * 279 *
175 * A quick helper function to be used if a clocksource changes its
176 * rating. Forces the clocksource list to be re-scanned for the best
177 * clocksource.
178 */ 280 */
179void clocksource_reselect(void) 281void clocksource_change_rating(struct clocksource *cs, int rating)
180{ 282{
181 unsigned long flags; 283 unsigned long flags;
182 284
183 spin_lock_irqsave(&clocksource_lock, flags); 285 spin_lock_irqsave(&clocksource_lock, flags);
286 list_del(&cs->list);
287 cs->rating = rating;
288 clocksource_enqueue(cs);
184 next_clocksource = select_clocksource(); 289 next_clocksource = select_clocksource();
185 spin_unlock_irqrestore(&clocksource_lock, flags); 290 spin_unlock_irqrestore(&clocksource_lock, flags);
186} 291}
187EXPORT_SYMBOL(clocksource_reselect);
188 292
189#ifdef CONFIG_SYSFS 293#ifdef CONFIG_SYSFS
190/** 294/**
@@ -220,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
220static ssize_t sysfs_override_clocksource(struct sys_device *dev, 324static ssize_t sysfs_override_clocksource(struct sys_device *dev,
221 const char *buf, size_t count) 325 const char *buf, size_t count)
222{ 326{
327 struct clocksource *ovr = NULL;
328 struct list_head *tmp;
223 size_t ret = count; 329 size_t ret = count;
330 int len;
331
224 /* strings from sysfs write are not 0 terminated! */ 332 /* strings from sysfs write are not 0 terminated! */
225 if (count >= sizeof(override_name)) 333 if (count >= sizeof(override_name))
226 return -EINVAL; 334 return -EINVAL;
@@ -228,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 228 /* strip off \n: */ 336 /* strip off \n: */
229 if (buf[count-1] == '\n') 337 if (buf[count-1] == '\n')
230 count--; 338 count--;
231 if (count < 1)
232 return -EINVAL;
233 339
234 spin_lock_irq(&clocksource_lock); 340 spin_lock_irq(&clocksource_lock);
235 341
236 /* copy the name given: */ 342 if (count > 0)
237 memcpy(override_name, buf, count); 343 memcpy(override_name, buf, count);
238 override_name[count] = 0; 344 override_name[count] = 0;
239 345
240 /* try to select it: */ 346 len = strlen(override_name);
241 next_clocksource = select_clocksource(); 347 if (len) {
348 ovr = clocksource_override;
349 /* try to select it: */
350 list_for_each(tmp, &clocksource_list) {
351 struct clocksource *cs;
352
353 cs = list_entry(tmp, struct clocksource, list);
354 if (strlen(cs->name) == len &&
355 !strcmp(cs->name, override_name))
356 ovr = cs;
357 }
358 }
359
360 /* Reselect, when the override name has changed */
361 if (ovr != clocksource_override) {
362 clocksource_override = ovr;
363 next_clocksource = select_clocksource();
364 }
242 365
243 spin_unlock_irq(&clocksource_lock); 366 spin_unlock_irq(&clocksource_lock);
244 367
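clocksource_enqueue() above keeps the registration list sorted by descending rating, so select_clocksource() can simply take the head of the list unless a user override is in effect. A stand-alone sketch of the same insertion rule, using a plain singly-linked list and illustrative names and ratings (the numbers are only examples):

#include <stdio.h>

/* Illustrative stand-in for struct clocksource: just a name and a rating */
struct cs {
	const char *name;
	int rating;
	struct cs *next;
};

/*
 * Insert sorted by descending rating, mirroring clocksource_enqueue():
 * the new entry goes after the last element whose rating is >= its own,
 * so the best-rated clocksource always ends up at the head of the list.
 */
static void enqueue(struct cs **head, struct cs *c)
{
	struct cs **pos = head;

	while (*pos && (*pos)->rating >= c->rating)
		pos = &(*pos)->next;
	c->next = *pos;
	*pos = c;
}

int main(void)
{
	static struct cs jiffies_cs = { "jiffies", 1,   NULL };
	static struct cs pit_cs     = { "pit",     110, NULL };
	static struct cs tsc_cs     = { "tsc",     300, NULL };
	struct cs *head = NULL, *c;

	enqueue(&head, &jiffies_cs);
	enqueue(&head, &pit_cs);
	enqueue(&head, &tsc_cs);

	/* the head is what select_clocksource() would pick without an override */
	for (c = head; c; c = c->next)
		printf("%s (%d)\n", c->name, c->rating);
	return 0;
}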
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a99b2a6e6a07..3be8da8fed7e 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = {
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66}; 65};
67 66
68static int __init init_jiffies_clocksource(void) 67static int __init init_jiffies_clocksource(void)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 3afeaa3a73f9..eb12509e00bd 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base;
24 24
25#define MAX_TICKADJ 500 /* microsecs */ 25#define MAX_TICKADJ 500 /* microsecs */
26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
27 TICK_LENGTH_SHIFT) / HZ) 27 TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ)
28 28
29/* 29/*
30 * phase-lock loop variables 30 * phase-lock loop variables
@@ -46,13 +46,17 @@ long time_adjust;
46 46
47static void ntp_update_frequency(void) 47static void ntp_update_frequency(void)
48{ 48{
49 tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; 49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
50 tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; 50 << TICK_LENGTH_SHIFT;
51 tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 51 second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
52 53
53 do_div(tick_length_base, HZ); 54 tick_length_base = second_length;
54 55
55 tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; 56 do_div(second_length, HZ);
57 tick_nsec = second_length >> TICK_LENGTH_SHIFT;
58
59 do_div(tick_length_base, NTP_INTERVAL_FREQ);
56} 60}
57 61
58/** 62/**
@@ -162,7 +166,7 @@ void second_overflow(void)
162 tick_length -= MAX_TICKADJ_SCALED; 166 tick_length -= MAX_TICKADJ_SCALED;
163 } else { 167 } else {
164 tick_length += (s64)(time_adjust * NSEC_PER_USEC / 168 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
165 HZ) << TICK_LENGTH_SHIFT; 169 NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT;
166 time_adjust = 0; 170 time_adjust = 0;
167 } 171 }
168 } 172 }
@@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc)
239 result = -EINVAL; 243 result = -EINVAL;
240 goto leave; 244 goto leave;
241 } 245 }
242 time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); 246 time_freq = ((s64)txc->freq * NSEC_PER_USEC)
247 >> (SHIFT_USEC - SHIFT_NSEC);
243 } 248 }
244 249
245 if (txc->modes & ADJ_MAXERROR) { 250 if (txc->modes & ADJ_MAXERROR) {
@@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc)
309 freq_adj += time_freq; 314 freq_adj += time_freq;
310 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); 315 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
311 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); 316 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
312 time_offset = (time_offset / HZ) << SHIFT_UPDATE; 317 time_offset = (time_offset / NTP_INTERVAL_FREQ)
318 << SHIFT_UPDATE;
313 } /* STA_PLL */ 319 } /* STA_PLL */
314 } /* txc->modes & ADJ_OFFSET */ 320 } /* txc->modes & ADJ_OFFSET */
315 if (txc->modes & ADJ_TICK) 321 if (txc->modes & ADJ_TICK)
@@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
324 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 330 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
325 txc->offset = save_adjust; 331 txc->offset = save_adjust;
326 else 332 else
327 txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; 333 txc->offset = shift_right(time_offset, SHIFT_UPDATE)
328 txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); 334 * NTP_INTERVAL_FREQ / 1000;
335 txc->freq = (time_freq / NSEC_PER_USEC)
336 << (SHIFT_USEC - SHIFT_NSEC);
329 txc->maxerror = time_maxerror; 337 txc->maxerror = time_maxerror;
330 txc->esterror = time_esterror; 338 txc->esterror = time_esterror;
331 txc->status = time_status; 339 txc->status = time_status;
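The rework of ntp_update_frequency() above first builds the length of one second as a fixed-point value (shifted left by TICK_LENGTH_SHIFT), derives tick_nsec from its per-HZ share, and only then scales the full-precision value down to one NTP interval. The arithmetic can be reproduced in user space; the numbers below (USER_HZ=100, tick_usec=10000, HZ=250, and NTP_INTERVAL_FREQ taken to be HZ) are assumptions for the example, not a statement about any particular configuration:

#include <stdint.h>
#include <stdio.h>

#define HZ			250
#define USER_HZ			100
#define NSEC_PER_USEC		1000ULL
#define TICK_LENGTH_SHIFT	32
#define NTP_INTERVAL_FREQ	HZ	/* assumed: one NTP interval per tick */

int main(void)
{
	uint64_t tick_usec = 10000;	/* 1/USER_HZ, in microseconds */

	/* length of one second in nanoseconds, as a fixed-point value */
	uint64_t second_length =
		(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;

	/* tick_nsec comes from the per-HZ share, rounded down ... */
	uint64_t tick_nsec = (second_length / HZ) >> TICK_LENGTH_SHIFT;

	/* ... while tick_length_base keeps the full fixed-point precision */
	uint64_t tick_length_base = second_length / NTP_INTERVAL_FREQ;

	printf("tick_nsec        = %llu ns\n", (unsigned long long)tick_nsec);
	printf("tick_length_base = %llu (ns << %d)\n",
	       (unsigned long long)tick_length_base, TICK_LENGTH_SHIFT);
	return 0;
}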
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 000000000000..12b3efeb9f6f
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,480 @@
1/*
2 * linux/kernel/time/tick-broadcast.c
3 *
4 * This file contains functions which emulate a local clock-event
5 * device via a broadcast event source.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Broadcast support for broken x86 hardware, where the local apic
27 * timer stops in C3 state.
28 */
29
30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock);
33
34/*
35 * Debugging: see timer_list.c
36 */
37struct tick_device *tick_get_broadcast_device(void)
38{
39 return &tick_broadcast_device;
40}
41
42cpumask_t *tick_get_broadcast_mask(void)
43{
44 return &tick_broadcast_mask;
45}
46
47/*
48 * Start the device in periodic mode
49 */
50static void tick_broadcast_start_periodic(struct clock_event_device *bc)
51{
52 if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN)
53 tick_setup_periodic(bc, 1);
54}
55
56/*
57 * Check, if the device can be utilized as broadcast device:
58 */
59int tick_check_broadcast_device(struct clock_event_device *dev)
60{
61 if (tick_broadcast_device.evtdev ||
62 (dev->features & CLOCK_EVT_FEAT_C3STOP))
63 return 0;
64
65 clockevents_exchange_device(NULL, dev);
66 tick_broadcast_device.evtdev = dev;
67 if (!cpus_empty(tick_broadcast_mask))
68 tick_broadcast_start_periodic(dev);
69 return 1;
70}
71
72/*
73 * Check, if the device is the broadcast device
74 */
75int tick_is_broadcast_device(struct clock_event_device *dev)
76{
77 return (dev && tick_broadcast_device.evtdev == dev);
78}
79
80/*
 81 * Check, if the device is dysfunctional and a placeholder, which
82 * needs to be handled by the broadcast device.
83 */
84int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
85{
86 unsigned long flags;
87 int ret = 0;
88
89 spin_lock_irqsave(&tick_broadcast_lock, flags);
90
91 /*
92 * Devices might be registered with both periodic and oneshot
93 * mode disabled. This signals, that the device needs to be
94 * operated from the broadcast device and is a placeholder for
95 * the cpu local device.
96 */
97 if (!tick_device_is_functional(dev)) {
98 dev->event_handler = tick_handle_periodic;
99 cpu_set(cpu, tick_broadcast_mask);
100 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
101 ret = 1;
102 }
103
104 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
105 return ret;
106}
107
108/*
109 * Broadcast the event to the cpus, which are set in the mask
110 */
111int tick_do_broadcast(cpumask_t mask)
112{
113 int ret = 0, cpu = smp_processor_id();
114 struct tick_device *td;
115
116 /*
117 * Check, if the current cpu is in the mask
118 */
119 if (cpu_isset(cpu, mask)) {
120 cpu_clear(cpu, mask);
121 td = &per_cpu(tick_cpu_device, cpu);
122 td->evtdev->event_handler(td->evtdev);
123 ret = 1;
124 }
125
126 if (!cpus_empty(mask)) {
127 /*
128 * It might be necessary to actually check whether the devices
129 * have different broadcast functions. For now, just use the
 130 * one from the first device. This works as long as we have this
131 * misfeature only on x86 (lapic)
132 */
133 cpu = first_cpu(mask);
134 td = &per_cpu(tick_cpu_device, cpu);
135 td->evtdev->broadcast(mask);
136 ret = 1;
137 }
138 return ret;
139}
140
141/*
142 * Periodic broadcast:
143 * - invoke the broadcast handlers
144 */
145static void tick_do_periodic_broadcast(void)
146{
147 cpumask_t mask;
148
149 spin_lock(&tick_broadcast_lock);
150
151 cpus_and(mask, cpu_online_map, tick_broadcast_mask);
152 tick_do_broadcast(mask);
153
154 spin_unlock(&tick_broadcast_lock);
155}
156
157/*
158 * Event handler for periodic broadcast ticks
159 */
160static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
161{
162 dev->next_event.tv64 = KTIME_MAX;
163
164 tick_do_periodic_broadcast();
165
166 /*
167 * The device is in periodic mode. No reprogramming necessary:
168 */
169 if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
170 return;
171
172 /*
173 * Setup the next period for devices, which do not have
174 * periodic mode:
175 */
176 for (;;) {
177 ktime_t next = ktime_add(dev->next_event, tick_period);
178
179 if (!clockevents_program_event(dev, next, ktime_get()))
180 return;
181 tick_do_periodic_broadcast();
182 }
183}
184
185/*
186 * Powerstate information: The system enters/leaves a state, where
187 * affected devices might stop
188 */
189static void tick_do_broadcast_on_off(void *why)
190{
191 struct clock_event_device *bc, *dev;
192 struct tick_device *td;
193 unsigned long flags, *reason = why;
194 int cpu;
195
196 spin_lock_irqsave(&tick_broadcast_lock, flags);
197
198 cpu = smp_processor_id();
199 td = &per_cpu(tick_cpu_device, cpu);
200 dev = td->evtdev;
201 bc = tick_broadcast_device.evtdev;
202
203 /*
204 * Is the device in broadcast mode forever or is it not
205 * affected by the powerstate ?
206 */
207 if (!dev || !tick_device_is_functional(dev) ||
208 !(dev->features & CLOCK_EVT_FEAT_C3STOP))
209 goto out;
210
211 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) {
212 if (!cpu_isset(cpu, tick_broadcast_mask)) {
213 cpu_set(cpu, tick_broadcast_mask);
214 if (td->mode == TICKDEV_MODE_PERIODIC)
215 clockevents_set_mode(dev,
216 CLOCK_EVT_MODE_SHUTDOWN);
217 }
218 } else {
219 if (cpu_isset(cpu, tick_broadcast_mask)) {
220 cpu_clear(cpu, tick_broadcast_mask);
221 if (td->mode == TICKDEV_MODE_PERIODIC)
222 tick_setup_periodic(dev, 0);
223 }
224 }
225
226 if (cpus_empty(tick_broadcast_mask))
227 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
228 else {
229 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
230 tick_broadcast_start_periodic(bc);
231 else
232 tick_broadcast_setup_oneshot(bc);
233 }
234out:
235 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
236}
237
238/*
239 * Powerstate information: The system enters/leaves a state, where
240 * affected devices might stop.
241 */
242void tick_broadcast_on_off(unsigned long reason, int *oncpu)
243{
244 int cpu = get_cpu();
245
246 if (cpu == *oncpu)
247 tick_do_broadcast_on_off(&reason);
248 else
249 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
250 &reason, 1, 1);
251 put_cpu();
252}
253
254/*
255 * Set the periodic handler depending on broadcast on/off
256 */
257void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
258{
259 if (!broadcast)
260 dev->event_handler = tick_handle_periodic;
261 else
262 dev->event_handler = tick_handle_periodic_broadcast;
263}
264
265/*
266 * Remove a CPU from broadcasting
267 */
268void tick_shutdown_broadcast(unsigned int *cpup)
269{
270 struct clock_event_device *bc;
271 unsigned long flags;
272 unsigned int cpu = *cpup;
273
274 spin_lock_irqsave(&tick_broadcast_lock, flags);
275
276 bc = tick_broadcast_device.evtdev;
277 cpu_clear(cpu, tick_broadcast_mask);
278
279 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
280 if (bc && cpus_empty(tick_broadcast_mask))
281 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
282 }
283
284 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
285}
286
287#ifdef CONFIG_TICK_ONESHOT
288
289static cpumask_t tick_broadcast_oneshot_mask;
290
291/*
292 * Debugging: see timer_list.c
293 */
294cpumask_t *tick_get_broadcast_oneshot_mask(void)
295{
296 return &tick_broadcast_oneshot_mask;
297}
298
299static int tick_broadcast_set_event(ktime_t expires, int force)
300{
301 struct clock_event_device *bc = tick_broadcast_device.evtdev;
302 ktime_t now = ktime_get();
303 int res;
304
305 for(;;) {
306 res = clockevents_program_event(bc, expires, now);
307 if (!res || !force)
308 return res;
309 now = ktime_get();
310 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
311 }
312}
313
314/*
315 * Reprogram the broadcast device:
316 *
317 * Called with tick_broadcast_lock held and interrupts disabled.
318 */
319static int tick_broadcast_reprogram(void)
320{
321 ktime_t expires = { .tv64 = KTIME_MAX };
322 struct tick_device *td;
323 int cpu;
324
325 /*
326 * Find the event which expires next:
327 */
328 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
329 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
330 td = &per_cpu(tick_cpu_device, cpu);
331 if (td->evtdev->next_event.tv64 < expires.tv64)
332 expires = td->evtdev->next_event;
333 }
334
335 if (expires.tv64 == KTIME_MAX)
336 return 0;
337
338 return tick_broadcast_set_event(expires, 0);
339}
340
341/*
342 * Handle oneshot mode broadcasting
343 */
344static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
345{
346 struct tick_device *td;
347 cpumask_t mask;
348 ktime_t now;
349 int cpu;
350
351 spin_lock(&tick_broadcast_lock);
352again:
353 dev->next_event.tv64 = KTIME_MAX;
354 mask = CPU_MASK_NONE;
355 now = ktime_get();
356 /* Find all expired events */
357 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
358 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
359 td = &per_cpu(tick_cpu_device, cpu);
360 if (td->evtdev->next_event.tv64 <= now.tv64)
361 cpu_set(cpu, mask);
362 }
363
364 /*
365 * Wakeup the cpus which have an expired event. The broadcast
366 * device is reprogrammed in the return from idle code.
367 */
368 if (!tick_do_broadcast(mask)) {
369 /*
370 * The global event did not expire any CPU local
371 * events. This happens in dyntick mode, as the
372 * maximum PIT delta is quite small.
373 */
374 if (tick_broadcast_reprogram())
375 goto again;
376 }
377 spin_unlock(&tick_broadcast_lock);
378}
379
380/*
381 * Powerstate information: The system enters/leaves a state, where
382 * affected devices might stop
383 */
384void tick_broadcast_oneshot_control(unsigned long reason)
385{
386 struct clock_event_device *bc, *dev;
387 struct tick_device *td;
388 unsigned long flags;
389 int cpu;
390
391 spin_lock_irqsave(&tick_broadcast_lock, flags);
392
393 /*
394 * Periodic mode does not care about the enter/exit of power
395 * states
396 */
397 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
398 goto out;
399
400 bc = tick_broadcast_device.evtdev;
401 cpu = smp_processor_id();
402 td = &per_cpu(tick_cpu_device, cpu);
403 dev = td->evtdev;
404
405 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
406 goto out;
407
408 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
409 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
410 cpu_set(cpu, tick_broadcast_oneshot_mask);
411 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
412 if (dev->next_event.tv64 < bc->next_event.tv64)
413 tick_broadcast_set_event(dev->next_event, 1);
414 }
415 } else {
416 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
417 cpu_clear(cpu, tick_broadcast_oneshot_mask);
418 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
419 if (dev->next_event.tv64 != KTIME_MAX)
420 tick_program_event(dev->next_event, 1);
421 }
422 }
423
424out:
425 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
426}
427
428/**
 429 * tick_broadcast_setup_oneshot - setup the broadcast device for oneshot mode
430 */
431void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
432{
433 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) {
434 bc->event_handler = tick_handle_oneshot_broadcast;
435 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
436 bc->next_event.tv64 = KTIME_MAX;
437 }
438}
439
440/*
441 * Select oneshot operating mode for the broadcast device
442 */
443void tick_broadcast_switch_to_oneshot(void)
444{
445 struct clock_event_device *bc;
446 unsigned long flags;
447
448 spin_lock_irqsave(&tick_broadcast_lock, flags);
449
450 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
451 bc = tick_broadcast_device.evtdev;
452 if (bc)
453 tick_broadcast_setup_oneshot(bc);
454 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
455}
456
457
458/*
459 * Remove a dead CPU from broadcasting
460 */
461void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
462{
463 struct clock_event_device *bc;
464 unsigned long flags;
465 unsigned int cpu = *cpup;
466
467 spin_lock_irqsave(&tick_broadcast_lock, flags);
468
469 bc = tick_broadcast_device.evtdev;
470 cpu_clear(cpu, tick_broadcast_oneshot_mask);
471
472 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
473 if (bc && cpus_empty(tick_broadcast_oneshot_mask))
474 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
475 }
476
477 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
478}
479
480#endif
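tick_do_broadcast() above services the current CPU directly when it is part of the mask and hands everything that remains to the broadcast callback of the first device left in the mask. The same split, sketched with a plain bitmask for eight CPUs and hypothetical stand-in callbacks (local_tick() and broadcast_ipi() do not exist in the kernel):

#include <stdio.h>

/* hypothetical stand-ins for the per-cpu event handler and the
 * broadcast hook of the clock event device */
static void local_tick(int cpu)		{ printf("local tick on cpu %d\n", cpu); }
static void broadcast_ipi(unsigned int mask) { printf("broadcast to mask 0x%02x\n", mask); }

/*
 * Same split as tick_do_broadcast(): handle the current CPU in place,
 * then send a single broadcast for whatever is left in the mask.
 */
static int do_broadcast(unsigned int mask, int this_cpu)
{
	int ret = 0;

	if (mask & (1u << this_cpu)) {
		mask &= ~(1u << this_cpu);	/* current CPU: call the handler directly */
		local_tick(this_cpu);
		ret = 1;
	}
	if (mask) {				/* everyone else: one broadcast */
		broadcast_ipi(mask);
		ret = 1;
	}
	return ret;
}

int main(void)
{
	/* CPUs 1, 2 and 3 wait for the broadcast; we are running on CPU 2 */
	do_broadcast(0x0e, 2);
	return 0;
}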
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
new file mode 100644
index 000000000000..0986a2bfab49
--- /dev/null
+++ b/kernel/time/tick-common.c
@@ -0,0 +1,347 @@
1/*
2 * linux/kernel/time/tick-common.c
3 *
4 * This file contains the base functions to manage periodic tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Tick devices
27 */
28DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
29/*
30 * Tick next event: keeps track of the tick time
31 */
32ktime_t tick_next_period;
33ktime_t tick_period;
34static int tick_do_timer_cpu = -1;
35DEFINE_SPINLOCK(tick_device_lock);
36
37/*
38 * Debugging: see timer_list.c
39 */
40struct tick_device *tick_get_device(int cpu)
41{
42 return &per_cpu(tick_cpu_device, cpu);
43}
44
45/**
46 * tick_is_oneshot_available - check for a oneshot capable event device
47 */
48int tick_is_oneshot_available(void)
49{
50 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
51
52 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
53}
54
55/*
56 * Periodic tick
57 */
58static void tick_periodic(int cpu)
59{
60 if (tick_do_timer_cpu == cpu) {
61 write_seqlock(&xtime_lock);
62
63 /* Keep track of the next tick event */
64 tick_next_period = ktime_add(tick_next_period, tick_period);
65
66 do_timer(1);
67 write_sequnlock(&xtime_lock);
68 }
69
70 update_process_times(user_mode(get_irq_regs()));
71 profile_tick(CPU_PROFILING);
72}
73
74/*
75 * Event handler for periodic ticks
76 */
77void tick_handle_periodic(struct clock_event_device *dev)
78{
79 int cpu = smp_processor_id();
80 ktime_t next;
81
82 tick_periodic(cpu);
83
84 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
85 return;
86 /*
87	 * Setup the next period for devices which do not have
88 * periodic mode:
89 */
90 next = ktime_add(dev->next_event, tick_period);
91 for (;;) {
92 if (!clockevents_program_event(dev, next, ktime_get()))
93 return;
94 tick_periodic(cpu);
95 next = ktime_add(next, tick_period);
96 }
97}
98
99/*
100 * Setup the device for a periodic tick
101 */
102void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
103{
104 tick_set_periodic_handler(dev, broadcast);
105
106 /* Broadcast setup ? */
107 if (!tick_device_is_functional(dev))
108 return;
109
110 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
111 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
112 } else {
113 unsigned long seq;
114 ktime_t next;
115
116 do {
117 seq = read_seqbegin(&xtime_lock);
118 next = tick_next_period;
119 } while (read_seqretry(&xtime_lock, seq));
120
121 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
122
123 for (;;) {
124 if (!clockevents_program_event(dev, next, ktime_get()))
125 return;
126 next = ktime_add(next, tick_period);
127 }
128 }
129}
130
131/*
132 * Setup the tick device
133 */
134static void tick_setup_device(struct tick_device *td,
135 struct clock_event_device *newdev, int cpu,
136 cpumask_t cpumask)
137{
138 ktime_t next_event;
139 void (*handler)(struct clock_event_device *) = NULL;
140
141 /*
142 * First device setup ?
143 */
144 if (!td->evtdev) {
145 /*
146 * If no cpu took the do_timer update, assign it to
147 * this cpu:
148 */
149 if (tick_do_timer_cpu == -1) {
150 tick_do_timer_cpu = cpu;
151 tick_next_period = ktime_get();
152 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
153 }
154
155 /*
156 * Startup in periodic mode first.
157 */
158 td->mode = TICKDEV_MODE_PERIODIC;
159 } else {
160 handler = td->evtdev->event_handler;
161 next_event = td->evtdev->next_event;
162 }
163
164 td->evtdev = newdev;
165
166 /*
167 * When the device is not per cpu, pin the interrupt to the
168 * current cpu:
169 */
170 if (!cpus_equal(newdev->cpumask, cpumask))
171 irq_set_affinity(newdev->irq, cpumask);
172
173 /*
174 * When global broadcasting is active, check if the current
175 * device is registered as a placeholder for broadcast mode.
176 * This allows us to handle this x86 misfeature in a generic
177 * way.
178 */
179 if (tick_device_uses_broadcast(newdev, cpu))
180 return;
181
182 if (td->mode == TICKDEV_MODE_PERIODIC)
183 tick_setup_periodic(newdev, 0);
184 else
185 tick_setup_oneshot(newdev, handler, next_event);
186}
187
188/*
189 * Check whether the newly registered device should be used.
190 */
191static int tick_check_new_device(struct clock_event_device *newdev)
192{
193 struct clock_event_device *curdev;
194 struct tick_device *td;
195 int cpu, ret = NOTIFY_OK;
196 unsigned long flags;
197 cpumask_t cpumask;
198
199 spin_lock_irqsave(&tick_device_lock, flags);
200
201 cpu = smp_processor_id();
202 if (!cpu_isset(cpu, newdev->cpumask))
203 goto out;
204
205 td = &per_cpu(tick_cpu_device, cpu);
206 curdev = td->evtdev;
207 cpumask = cpumask_of_cpu(cpu);
208
209 /* cpu local device ? */
210 if (!cpus_equal(newdev->cpumask, cpumask)) {
211
212 /*
213		 * If the cpu affinity of the device interrupt cannot
214 * be set, ignore it.
215 */
216 if (!irq_can_set_affinity(newdev->irq))
217 goto out_bc;
218
219 /*
220 * If we have a cpu local device already, do not replace it
221 * by a non cpu local device
222 */
223 if (curdev && cpus_equal(curdev->cpumask, cpumask))
224 goto out_bc;
225 }
226
227 /*
228 * If we have an active device, then check the rating and the oneshot
229 * feature.
230 */
231 if (curdev) {
232 /*
233 * Prefer one shot capable devices !
234 */
235 if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
236 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
237 goto out_bc;
238 /*
239 * Check the rating
240 */
241 if (curdev->rating >= newdev->rating)
242 goto out_bc;
243 }
244
245 /*
246	 * Replace the existing device, if any, by the new
247	 * device. If the current device is the broadcast device, do
248	 * not give it back to the clockevents layer!
249 */
250 if (tick_is_broadcast_device(curdev)) {
251 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
252 curdev = NULL;
253 }
254 clockevents_exchange_device(curdev, newdev);
255 tick_setup_device(td, newdev, cpu, cpumask);
256 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
257 tick_oneshot_notify();
258
259 spin_unlock_irqrestore(&tick_device_lock, flags);
260 return NOTIFY_STOP;
261
262out_bc:
263 /*
264 * Can the new device be used as a broadcast device ?
265 */
266 if (tick_check_broadcast_device(newdev))
267 ret = NOTIFY_STOP;
268out:
269 spin_unlock_irqrestore(&tick_device_lock, flags);
270
271 return ret;
272}
273
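
The selection logic above boils down to three rules: keep a per-cpu device over a global one, never trade away oneshot capability, and otherwise let the higher rating win. Below is a minimal userspace sketch of the last two rules; the struct, field names and sample devices are invented for illustration and are not part of the patch.

#include <stdio.h>

struct fake_clockevent {
	const char *name;
	int rating;		/* higher is better */
	int oneshot_capable;	/* models CLOCK_EVT_FEAT_ONESHOT */
};

/* Nonzero if 'cand' should replace 'cur', mirroring the checks above. */
static int prefer_new(const struct fake_clockevent *cur,
		      const struct fake_clockevent *cand)
{
	if (!cur)
		return 1;
	/* Never trade a oneshot capable device for a periodic-only one. */
	if (cur->oneshot_capable && !cand->oneshot_capable)
		return 0;
	/* Otherwise the rating decides; ties keep the current device. */
	return cand->rating > cur->rating;
}

int main(void)
{
	struct fake_clockevent pit  = { "pit",   100, 0 };
	struct fake_clockevent apic = { "lapic", 150, 1 };

	printf("replace pit with lapic: %d\n", prefer_new(&pit, &apic));	/* 1 */
	printf("replace lapic with pit: %d\n", prefer_new(&apic, &pit));	/* 0 */
	return 0;
}
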
274/*
275 * Shutdown an event device on a given cpu:
276 *
277 * This is called on a live CPU when another CPU is dead. So we cannot
278 * access the hardware device itself.
279 * We just set the mode and remove it from the lists.
280 */
281static void tick_shutdown(unsigned int *cpup)
282{
283 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
284 struct clock_event_device *dev = td->evtdev;
285 unsigned long flags;
286
287 spin_lock_irqsave(&tick_device_lock, flags);
288 td->mode = TICKDEV_MODE_PERIODIC;
289 if (dev) {
290 /*
291		 * Prevent the clock events layer from trying to call
292 * the set mode function!
293 */
294 dev->mode = CLOCK_EVT_MODE_UNUSED;
295 clockevents_exchange_device(dev, NULL);
296 td->evtdev = NULL;
297 }
298 spin_unlock_irqrestore(&tick_device_lock, flags);
299}
300
301/*
302 * Notification about clock event devices
303 */
304static int tick_notify(struct notifier_block *nb, unsigned long reason,
305 void *dev)
306{
307 switch (reason) {
308
309 case CLOCK_EVT_NOTIFY_ADD:
310 return tick_check_new_device(dev);
311
312 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
313 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
314 tick_broadcast_on_off(reason, dev);
315 break;
316
317 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
318 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
319 tick_broadcast_oneshot_control(reason);
320 break;
321
322 case CLOCK_EVT_NOTIFY_CPU_DEAD:
323 tick_shutdown_broadcast_oneshot(dev);
324 tick_shutdown_broadcast(dev);
325 tick_shutdown(dev);
326 break;
327
328 default:
329 break;
330 }
331
332 return NOTIFY_OK;
333}
334
335static struct notifier_block tick_notifier = {
336 .notifier_call = tick_notify,
337};
338
339/**
340 * tick_init - initialize the tick control
341 *
342 * Register the notifier with the clockevents framework
343 */
344void __init tick_init(void)
345{
346 clockevents_register_notifier(&tick_notifier);
347}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
new file mode 100644
index 000000000000..54861a0f29ff
--- /dev/null
+++ b/kernel/time/tick-internal.h
@@ -0,0 +1,110 @@
1/*
2 * tick internal variables and functions used by low/high res code
3 */
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period;
7extern ktime_t tick_period;
8
9extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
10extern void tick_handle_periodic(struct clock_event_device *dev);
11
12/*
13 * NO_HZ / high resolution timer shared code
14 */
15#ifdef CONFIG_TICK_ONESHOT
16extern void tick_setup_oneshot(struct clock_event_device *newdev,
17 void (*handler)(struct clock_event_device *),
18 ktime_t nextevt);
19extern int tick_program_event(ktime_t expires, int force);
20extern void tick_oneshot_notify(void);
21extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
22
23# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
24extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
25extern void tick_broadcast_oneshot_control(unsigned long reason);
26extern void tick_broadcast_switch_to_oneshot(void);
27extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
28# else /* BROADCAST */
29static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
30{
31 BUG();
32}
33static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
34static inline void tick_broadcast_switch_to_oneshot(void) { }
35static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
36# endif /* !BROADCAST */
37
38#else /* !ONESHOT */
39static inline
40void tick_setup_oneshot(struct clock_event_device *newdev,
41 void (*handler)(struct clock_event_device *),
42 ktime_t nextevt)
43{
44 BUG();
45}
46static inline int tick_program_event(ktime_t expires, int force)
47{
48 return 0;
49}
50static inline void tick_oneshot_notify(void) { }
51static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
52{
53 BUG();
54}
55static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
56static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
57#endif /* !TICK_ONESHOT */
58
59/*
60 * Broadcasting support
61 */
62#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
63extern int tick_do_broadcast(cpumask_t mask);
64
65extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
66extern int tick_check_broadcast_device(struct clock_event_device *dev);
67extern int tick_is_broadcast_device(struct clock_event_device *dev);
68extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
69extern void tick_shutdown_broadcast(unsigned int *cpup);
70
71extern void
72tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
73
74#else /* !BROADCAST */
75
76static inline int tick_check_broadcast_device(struct clock_event_device *dev)
77{
78 return 0;
79}
80
81static inline int tick_is_broadcast_device(struct clock_event_device *dev)
82{
83 return 0;
84}
85static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
86 int cpu)
87{
88 return 0;
89}
90static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
91static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
92static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
93
94/*
95 * Set the periodic handler in non broadcast mode
96 */
97static inline void tick_set_periodic_handler(struct clock_event_device *dev,
98 int broadcast)
99{
100 dev->event_handler = tick_handle_periodic;
101}
102#endif /* !BROADCAST */
103
104/*
105 * Check whether the device is functional or a dummy for broadcast
106 */
107static inline int tick_device_is_functional(struct clock_event_device *dev)
108{
109 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
110}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
new file mode 100644
index 000000000000..2e8b7ff863cc
--- /dev/null
+++ b/kernel/time/tick-oneshot.c
@@ -0,0 +1,84 @@
1/*
2 * linux/kernel/time/tick-oneshot.c
3 *
4 * This file contains functions which manage high resolution tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/**
26 * tick_program_event - program the per-cpu tick device for an expiry time
27 */
28int tick_program_event(ktime_t expires, int force)
29{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get();
32
33 while (1) {
34 int ret = clockevents_program_event(dev, expires, now);
35
36 if (!ret || !force)
37 return ret;
38 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
40 }
41}
42
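
The force path above retries by pushing the requested expiry min_delta_ns past the current time until the clockevents layer accepts it. A standalone model of that loop, using an invented device stub that rejects expiries in the past, looks roughly like this (illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define MIN_DELTA_NS 1000	/* assumed minimum programmable delta */

/* Device stub: programming fails when the expiry is not in the future. */
static int program_event(int64_t expires, int64_t now)
{
	return expires > now ? 0 : -1;
}

int main(void)
{
	int64_t now = 5000, expires = 4000;	/* the request lies in the past */

	while (program_event(expires, now)) {
		now += 10;			/* time moves on while we retry */
		expires = now + MIN_DELTA_NS;
	}
	printf("event finally programmed for %lld (now %lld)\n",
	       (long long)expires, (long long)now);
	return 0;
}
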
43/**
44 * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
45 */
46void tick_setup_oneshot(struct clock_event_device *newdev,
47 void (*handler)(struct clock_event_device *),
48 ktime_t next_event)
49{
50 newdev->event_handler = handler;
51 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
52 clockevents_program_event(newdev, next_event, ktime_get());
53}
54
55/**
56 * tick_switch_to_oneshot - switch to oneshot mode
57 */
58int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
59{
60 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
61 struct clock_event_device *dev = td->evtdev;
62
63 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
64 !tick_device_is_functional(dev))
65 return -EINVAL;
66
67 td->mode = TICKDEV_MODE_ONESHOT;
68 dev->event_handler = handler;
69 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
70 tick_broadcast_switch_to_oneshot();
71 return 0;
72}
73
74#ifdef CONFIG_HIGH_RES_TIMERS
75/**
76 * tick_init_highres - switch to high resolution mode
77 *
78 * Called with interrupts disabled.
79 */
80int tick_init_highres(void)
81{
82 return tick_switch_to_oneshot(hrtimer_interrupt);
83}
84#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 000000000000..51556b95f60f
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,567 @@
1/*
2 * linux/kernel/time/tick-sched.c
3 *
4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
7 *
8 * No idle tick implementation for low and high resolution timers
9 *
10 * Started by: Thomas Gleixner and Ingo Molnar
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/interrupt.h>
18#include <linux/kernel_stat.h>
19#include <linux/percpu.h>
20#include <linux/profile.h>
21#include <linux/sched.h>
22#include <linux/tick.h>
23
24#include <asm/irq_regs.h>
25
26#include "tick-internal.h"
27
28/*
29 * Per cpu nohz control structure
30 */
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32
33/*
34 * The time, when the last jiffy update happened. Protected by xtime_lock.
35 */
36static ktime_t last_jiffies_update;
37
38struct tick_sched *tick_get_tick_sched(int cpu)
39{
40 return &per_cpu(tick_cpu_sched, cpu);
41}
42
43/*
44 * Must be called with interrupts disabled !
45 */
46static void tick_do_update_jiffies64(ktime_t now)
47{
48 unsigned long ticks = 0;
49 ktime_t delta;
50
51	/* Reevaluate with xtime_lock held */
52 write_seqlock(&xtime_lock);
53
54 delta = ktime_sub(now, last_jiffies_update);
55 if (delta.tv64 >= tick_period.tv64) {
56
57 delta = ktime_sub(delta, tick_period);
58 last_jiffies_update = ktime_add(last_jiffies_update,
59 tick_period);
60
61 /* Slow path for long timeouts */
62 if (unlikely(delta.tv64 >= tick_period.tv64)) {
63 s64 incr = ktime_to_ns(tick_period);
64
65 ticks = ktime_divns(delta, incr);
66
67 last_jiffies_update = ktime_add_ns(last_jiffies_update,
68 incr * ticks);
69 }
70 do_timer(++ticks);
71 }
72 write_sequnlock(&xtime_lock);
73}
74
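
The arithmetic above can be replayed in plain C: one period is consumed on the fast path, and a long idle sleep is folded in with a single division instead of looping tick by tick. The sketch below uses 64-bit nanoseconds and assumes HZ=250 (a 4 ms tick); the names are illustrative, this is not kernel code.

#include <stdint.h>
#include <stdio.h>

#define TICK_NS 4000000LL		/* 4 ms tick period, assuming HZ = 250 */

static uint64_t jiffies64;
static int64_t last_update;		/* ns timestamp of the last jiffies update */

static void update_jiffies64(int64_t now)
{
	int64_t delta = now - last_update;
	uint64_t ticks = 0;

	if (delta < TICK_NS)
		return;

	/* Fast path: one period elapsed. */
	delta -= TICK_NS;
	last_update += TICK_NS;

	/* Slow path: the CPU slept through several further periods. */
	if (delta >= TICK_NS) {
		ticks = delta / TICK_NS;
		last_update += ticks * TICK_NS;
	}
	jiffies64 += ticks + 1;		/* mirrors do_timer(++ticks) */
}

int main(void)
{
	/* 25 ms of silence: expect 6 jiffies (1 fast path + 5 caught up). */
	update_jiffies64(25000000);
	printf("jiffies64=%llu last_update=%lld ns\n",
	       (unsigned long long)jiffies64, (long long)last_update);
	return 0;
}
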
75/*
76 * Initialize and retrieve the last jiffies update value.
77 */
78static ktime_t tick_init_jiffy_update(void)
79{
80 ktime_t period;
81
82 write_seqlock(&xtime_lock);
83 /* Did we start the jiffies update yet ? */
84 if (last_jiffies_update.tv64 == 0)
85 last_jiffies_update = tick_next_period;
86 period = last_jiffies_update;
87 write_sequnlock(&xtime_lock);
88 return period;
89}
90
91/*
92 * NOHZ - aka dynamic tick functionality
93 */
94#ifdef CONFIG_NO_HZ
95/*
96 * NO HZ enabled ?
97 */
98static int tick_nohz_enabled __read_mostly = 1;
99
100/*
101 * Enable / Disable tickless mode
102 */
103static int __init setup_tick_nohz(char *str)
104{
105 if (!strcmp(str, "off"))
106 tick_nohz_enabled = 0;
107 else if (!strcmp(str, "on"))
108 tick_nohz_enabled = 1;
109 else
110 return 0;
111 return 1;
112}
113
114__setup("nohz=", setup_tick_nohz);
115
116/**
117 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
118 *
119 * Called from interrupt entry when the CPU was idle
120 *
121 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
122 * must be updated. Otherwise an interrupt handler could use a stale jiffy
123 * value. We do this unconditionally on any cpu, as we don't know whether the
124 * cpu which has the update task assigned is in a long sleep.
125 */
126void tick_nohz_update_jiffies(void)
127{
128 int cpu = smp_processor_id();
129 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
130 unsigned long flags;
131 ktime_t now;
132
133 if (!ts->tick_stopped)
134 return;
135
136 cpu_clear(cpu, nohz_cpu_mask);
137 now = ktime_get();
138
139 local_irq_save(flags);
140 tick_do_update_jiffies64(now);
141 local_irq_restore(flags);
142}
143
144/**
145 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
146 *
147 * When the next event is more than a tick into the future, stop the idle tick.
148 * Called either from the idle loop or from irq_exit() when an idle period was
149 * just interrupted by an interrupt which did not cause a reschedule.
150 */
151void tick_nohz_stop_sched_tick(void)
152{
153 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
154 struct tick_sched *ts;
155 ktime_t last_update, expires, now, delta;
156 int cpu;
157
158 local_irq_save(flags);
159
160 cpu = smp_processor_id();
161 ts = &per_cpu(tick_cpu_sched, cpu);
162
163 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
164 goto end;
165
166 if (need_resched())
167 goto end;
168
169 cpu = smp_processor_id();
170 if (unlikely(local_softirq_pending()))
171 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
172 local_softirq_pending());
173
174 now = ktime_get();
175 /*
176 * When called from irq_exit we need to account the idle sleep time
177 * correctly.
178 */
179 if (ts->tick_stopped) {
180 delta = ktime_sub(now, ts->idle_entrytime);
181 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
182 }
183
184 ts->idle_entrytime = now;
185 ts->idle_calls++;
186
187 /* Read jiffies and the time when jiffies were updated last */
188 do {
189 seq = read_seqbegin(&xtime_lock);
190 last_update = last_jiffies_update;
191 last_jiffies = jiffies;
192 } while (read_seqretry(&xtime_lock, seq));
193
194 /* Get the next timer wheel timer */
195 next_jiffies = get_next_timer_interrupt(last_jiffies);
196 delta_jiffies = next_jiffies - last_jiffies;
197
198 if (rcu_needs_cpu(cpu))
199 delta_jiffies = 1;
200 /*
201	 * Do not stop the tick if we are only one jiffy off
202 * or if the cpu is required for rcu
203 */
204 if (!ts->tick_stopped && delta_jiffies == 1)
205 goto out;
206
207	/* Schedule the tick if we are at least one jiffy off */
208 if ((long)delta_jiffies >= 1) {
209
210 if (delta_jiffies > 1)
211 cpu_set(cpu, nohz_cpu_mask);
212 /*
213 * nohz_stop_sched_tick can be called several times before
214 * the nohz_restart_sched_tick is called. This happens when
215 * interrupts arrive which do not cause a reschedule. In the
216 * first call we save the current tick time, so we can restart
217 * the scheduler tick in nohz_restart_sched_tick.
218 */
219 if (!ts->tick_stopped) {
220 ts->idle_tick = ts->sched_timer.expires;
221 ts->tick_stopped = 1;
222 ts->idle_jiffies = last_jiffies;
223 }
224 /*
225 * calculate the expiry time for the next timer wheel
226 * timer
227 */
228 expires = ktime_add_ns(last_update, tick_period.tv64 *
229 delta_jiffies);
230 ts->idle_expires = expires;
231 ts->idle_sleeps++;
232
233 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
234 hrtimer_start(&ts->sched_timer, expires,
235 HRTIMER_MODE_ABS);
236 /* Check, if the timer was already in the past */
237 if (hrtimer_active(&ts->sched_timer))
238 goto out;
239		} else if (!tick_program_event(expires, 0))
240 goto out;
241 /*
242 * We are past the event already. So we crossed a
243		 * jiffy boundary. Update jiffies and raise the
244 * softirq.
245 */
246 tick_do_update_jiffies64(ktime_get());
247 cpu_clear(cpu, nohz_cpu_mask);
248 }
249 raise_softirq_irqoff(TIMER_SOFTIRQ);
250out:
251 ts->next_jiffies = next_jiffies;
252 ts->last_jiffies = last_jiffies;
253end:
254 local_irq_restore(flags);
255}
256
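
The key computation in the function above is the reprogramming target, expires = last_update + delta_jiffies * tick_period, i.e. the tick device is parked exactly where the next timer wheel timer (or RCU) needs attention. A worked example with made-up numbers, again assuming HZ = 250:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const int64_t tick_period_ns = 4000000;	/* 4 ms, assuming HZ = 250 */
	int64_t last_update = 1000000000;	/* ns of the last jiffies update */
	uint64_t last_jiffies = 250;
	uint64_t next_jiffies = 262;		/* next timer wheel expiry */
	uint64_t delta_jiffies = next_jiffies - last_jiffies;

	/* The tick may sleep for delta_jiffies periods, 48 ms in this case. */
	int64_t expires = last_update + (int64_t)delta_jiffies * tick_period_ns;

	printf("skip %llu jiffies, reprogram for %lld ns\n",
	       (unsigned long long)delta_jiffies, (long long)expires);
	return 0;
}
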
257/**
258 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
259 *
260 * Restart the idle tick when the CPU is woken up from idle
261 */
262void tick_nohz_restart_sched_tick(void)
263{
264 int cpu = smp_processor_id();
265 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
266 unsigned long ticks;
267 ktime_t now, delta;
268
269 if (!ts->tick_stopped)
270 return;
271
272 /* Update jiffies first */
273 now = ktime_get();
274
275 local_irq_disable();
276 tick_do_update_jiffies64(now);
277 cpu_clear(cpu, nohz_cpu_mask);
278
279 /* Account the idle time */
280 delta = ktime_sub(now, ts->idle_entrytime);
281 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
282
283 /*
284	 * We stopped the tick in idle. update_process_times() would miss the
285	 * time we slept, as it only accounts a single tick per call.
286	 * Enforce that this time is accounted to idle!
287 */
288 ticks = jiffies - ts->idle_jiffies;
289 /*
290	 * We might be off by one. Do not randomly account a huge number of ticks!
291 */
292 if (ticks && ticks < LONG_MAX) {
293 add_preempt_count(HARDIRQ_OFFSET);
294 account_system_time(current, HARDIRQ_OFFSET,
295 jiffies_to_cputime(ticks));
296 sub_preempt_count(HARDIRQ_OFFSET);
297 }
298
299 /*
300 * Cancel the scheduled timer and restore the tick
301 */
302 ts->tick_stopped = 0;
303 hrtimer_cancel(&ts->sched_timer);
304 ts->sched_timer.expires = ts->idle_tick;
305
306 while (1) {
307 /* Forward the time to expire in the future */
308 hrtimer_forward(&ts->sched_timer, now, tick_period);
309
310 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
311 hrtimer_start(&ts->sched_timer,
312 ts->sched_timer.expires,
313 HRTIMER_MODE_ABS);
314 /* Check, if the timer was already in the past */
315 if (hrtimer_active(&ts->sched_timer))
316 break;
317 } else {
318 if (!tick_program_event(ts->sched_timer.expires, 0))
319 break;
320 }
321 /* Update jiffies and reread time */
322 tick_do_update_jiffies64(now);
323 now = ktime_get();
324 }
325 local_irq_enable();
326}
327
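
Restarting the tick has to repay the accounting debt accumulated while it was stopped: everything between idle_jiffies and the current jiffies value is credited to idle in one batch, because update_process_times() only ever accounts a single tick. A trivial standalone model of that fixup, with invented values and not kernel code:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long idle_jiffies = 1000;	/* jiffies when the tick stopped */
	unsigned long jiffies_now = 1012;	/* jiffies after wakeup */
	unsigned long ticks = jiffies_now - idle_jiffies;
	unsigned long idle_accounted = 0;

	/* We might be off by one; never account a bogus huge number. */
	if (ticks && ticks < LONG_MAX)
		idle_accounted += ticks;	/* kernel: account_system_time() */

	printf("credited %lu ticks to idle\n", idle_accounted);
	return 0;
}
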
328static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
329{
330 hrtimer_forward(&ts->sched_timer, now, tick_period);
331 return tick_program_event(ts->sched_timer.expires, 0);
332}
333
334/*
335 * The nohz low res interrupt handler
336 */
337static void tick_nohz_handler(struct clock_event_device *dev)
338{
339 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
340 struct pt_regs *regs = get_irq_regs();
341 ktime_t now = ktime_get();
342
343 dev->next_event.tv64 = KTIME_MAX;
344
345 /* Check, if the jiffies need an update */
346 tick_do_update_jiffies64(now);
347
348 /*
349 * When we are idle and the tick is stopped, we have to touch
350 * the watchdog as we might not schedule for a really long
351 * time. This happens on complete idle SMP systems while
352 * waiting on the login prompt. We also increment the "start
353 * of idle" jiffy stamp so the idle accounting adjustment we
354	 * do when we go busy again does not account too many ticks.
355 */
356 if (ts->tick_stopped) {
357 touch_softlockup_watchdog();
358 ts->idle_jiffies++;
359 }
360
361 update_process_times(user_mode(regs));
362 profile_tick(CPU_PROFILING);
363
364 /* Do not restart, when we are in the idle loop */
365 if (ts->tick_stopped)
366 return;
367
368 while (tick_nohz_reprogram(ts, now)) {
369 now = ktime_get();
370 tick_do_update_jiffies64(now);
371 }
372}
373
374/**
375 * tick_nohz_switch_to_nohz - switch to nohz mode
376 */
377static void tick_nohz_switch_to_nohz(void)
378{
379 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
380 ktime_t next;
381
382 if (!tick_nohz_enabled)
383 return;
384
385 local_irq_disable();
386 if (tick_switch_to_oneshot(tick_nohz_handler)) {
387 local_irq_enable();
388 return;
389 }
390
391 ts->nohz_mode = NOHZ_MODE_LOWRES;
392
393 /*
394 * Recycle the hrtimer in ts, so we can share the
395 * hrtimer_forward with the highres code.
396 */
397 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
398 /* Get the next period */
399 next = tick_init_jiffy_update();
400
401 for (;;) {
402 ts->sched_timer.expires = next;
403 if (!tick_program_event(next, 0))
404 break;
405 next = ktime_add(next, tick_period);
406 }
407 local_irq_enable();
408
409 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
410 smp_processor_id());
411}
412
413#else
414
415static inline void tick_nohz_switch_to_nohz(void) { }
416
417#endif /* NO_HZ */
418
419/*
420 * High resolution timer specific code
421 */
422#ifdef CONFIG_HIGH_RES_TIMERS
423/*
424 * We rearm the timer until we get disabled by the idle code
425 * Called with interrupts disabled and timer->base->cpu_base->lock held.
426 */
427static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
428{
429 struct tick_sched *ts =
430 container_of(timer, struct tick_sched, sched_timer);
431 struct hrtimer_cpu_base *base = timer->base->cpu_base;
432 struct pt_regs *regs = get_irq_regs();
433 ktime_t now = ktime_get();
434
435 /* Check, if the jiffies need an update */
436 tick_do_update_jiffies64(now);
437
438 /*
439 * Do not call, when we are not in irq context and have
440 * no valid regs pointer
441 */
442 if (regs) {
443 /*
444 * When we are idle and the tick is stopped, we have to touch
445 * the watchdog as we might not schedule for a really long
446 * time. This happens on complete idle SMP systems while
447 * waiting on the login prompt. We also increment the "start of
448 * idle" jiffy stamp so the idle accounting adjustment we do
449		 * when we go busy again does not account too many ticks.
450 */
451 if (ts->tick_stopped) {
452 touch_softlockup_watchdog();
453 ts->idle_jiffies++;
454 }
455 /*
456 * update_process_times() might take tasklist_lock, hence
457 * drop the base lock. sched-tick hrtimers are per-CPU and
458 * never accessible by userspace APIs, so this is safe to do.
459 */
460 spin_unlock(&base->lock);
461 update_process_times(user_mode(regs));
462 profile_tick(CPU_PROFILING);
463 spin_lock(&base->lock);
464 }
465
466 /* Do not restart, when we are in the idle loop */
467 if (ts->tick_stopped)
468 return HRTIMER_NORESTART;
469
470 hrtimer_forward(timer, now, tick_period);
471
472 return HRTIMER_RESTART;
473}
474
475/**
476 * tick_setup_sched_timer - setup the tick emulation timer
477 */
478void tick_setup_sched_timer(void)
479{
480 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
481 ktime_t now = ktime_get();
482
483 /*
484 * Emulate tick processing via per-CPU hrtimers:
485 */
486 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
487 ts->sched_timer.function = tick_sched_timer;
488 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
489
490 /* Get the next period */
491 ts->sched_timer.expires = tick_init_jiffy_update();
492
493 for (;;) {
494 hrtimer_forward(&ts->sched_timer, now, tick_period);
495 hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
496 HRTIMER_MODE_ABS);
497 /* Check, if the timer was already in the past */
498 if (hrtimer_active(&ts->sched_timer))
499 break;
500 now = ktime_get();
501 }
502
503#ifdef CONFIG_NO_HZ
504 if (tick_nohz_enabled)
505 ts->nohz_mode = NOHZ_MODE_HIGHRES;
506#endif
507}
508
509void tick_cancel_sched_timer(int cpu)
510{
511 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
512
513 if (ts->sched_timer.base)
514 hrtimer_cancel(&ts->sched_timer);
515 ts->tick_stopped = 0;
516 ts->nohz_mode = NOHZ_MODE_INACTIVE;
517}
518#endif /* HIGH_RES_TIMERS */
519
520/**
521 * Async notification about clocksource changes
522 */
523void tick_clock_notify(void)
524{
525 int cpu;
526
527 for_each_possible_cpu(cpu)
528 set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
529}
530
531/*
532 * Async notification about clock event changes
533 */
534void tick_oneshot_notify(void)
535{
536 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
537
538 set_bit(0, &ts->check_clocks);
539}
540
541/**
542 * Check whether a change happened which makes oneshot mode possible.
543 *
544 * Called cyclically from the hrtimer softirq (driven by the timer
545 * softirq). allow_nohz signals that we can switch into low-res nohz
546 * mode, because high resolution timers are disabled (either at compile
547 * time or at runtime).
548 */
549int tick_check_oneshot_change(int allow_nohz)
550{
551 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
552
553 if (!test_and_clear_bit(0, &ts->check_clocks))
554 return 0;
555
556 if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
557 return 0;
558
559 if (!timekeeping_is_continuous() || !tick_is_oneshot_available())
560 return 0;
561
562 if (!allow_nohz)
563 return 1;
564
565 tick_nohz_switch_to_nohz();
566 return 0;
567}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
new file mode 100644
index 000000000000..f82c635c3d5c
--- /dev/null
+++ b/kernel/time/timer_list.c
@@ -0,0 +1,287 @@
1/*
2 * kernel/time/timer_list.c
3 *
4 * List pending timers
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/module.h>
15#include <linux/spinlock.h>
16#include <linux/sched.h>
17#include <linux/seq_file.h>
18#include <linux/kallsyms.h>
19#include <linux/tick.h>
20
21#include <asm/uaccess.h>
22
23typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
24
25DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26
27/*
28 * This allows printing both to /proc/timer_list and
29 * to the console (on SysRq-Q):
30 */
31#define SEQ_printf(m, x...) \
32 do { \
33 if (m) \
34 seq_printf(m, x); \
35 else \
36 printk(x); \
37 } while (0)
38
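
SEQ_printf() lets one set of print statements feed both the /proc seq_file and the console on SysRq-Q. The same dual-sink trick can be reproduced in userspace with a FILE pointer standing in for the seq_file; this is just an analogy, not the kernel macro:

#include <stdio.h>

#define DUAL_printf(fp, ...)				\
	do {						\
		if (fp)					\
			fprintf(fp, __VA_ARGS__);	\
		else					\
			printf(__VA_ARGS__);		\
	} while (0)

int main(void)
{
	DUAL_printf(NULL, "to the console: %d\n", 42);	/* like SysRq-Q */
	DUAL_printf(stderr, "to a stream: %d\n", 42);	/* like /proc output */
	return 0;
}
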
39static void print_name_offset(struct seq_file *m, void *sym)
40{
41 unsigned long addr = (unsigned long)sym;
42 char namebuf[KSYM_NAME_LEN+1];
43 unsigned long size, offset;
44 const char *sym_name;
45 char *modname;
46
47 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
48 if (sym_name)
49 SEQ_printf(m, "%s", sym_name);
50 else
51 SEQ_printf(m, "<%p>", sym);
52}
53
54static void
55print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
56{
57#ifdef CONFIG_TIMER_STATS
58 char tmp[TASK_COMM_LEN + 1];
59#endif
60 SEQ_printf(m, " #%d: ", idx);
61 print_name_offset(m, timer);
62 SEQ_printf(m, ", ");
63 print_name_offset(m, timer->function);
64 SEQ_printf(m, ", S:%02lx", timer->state);
65#ifdef CONFIG_TIMER_STATS
66 SEQ_printf(m, ", ");
67 print_name_offset(m, timer->start_site);
68 memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
69 tmp[TASK_COMM_LEN] = 0;
70 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
71#endif
72 SEQ_printf(m, "\n");
73 SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n",
74 (unsigned long long)ktime_to_ns(timer->expires),
75 (unsigned long long)(ktime_to_ns(timer->expires) - now));
76}
77
78static void
79print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
80 u64 now)
81{
82 struct hrtimer *timer, tmp;
83 unsigned long next = 0, i;
84 struct rb_node *curr;
85 unsigned long flags;
86
87next_one:
88 i = 0;
89 spin_lock_irqsave(&base->cpu_base->lock, flags);
90
91 curr = base->first;
92 /*
93 * Crude but we have to do this O(N*N) thing, because
94 * we have to unlock the base when printing:
95 */
96 while (curr && i < next) {
97 curr = rb_next(curr);
98 i++;
99 }
100
101 if (curr) {
102
103 timer = rb_entry(curr, struct hrtimer, node);
104 tmp = *timer;
105 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
106
107 print_timer(m, &tmp, i, now);
108 next++;
109 goto next_one;
110 }
111 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
112}
113
114static void
115print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
116{
117 SEQ_printf(m, " .index: %d\n",
118 base->index);
119 SEQ_printf(m, " .resolution: %Ld nsecs\n",
120 (unsigned long long)ktime_to_ns(base->resolution));
121 SEQ_printf(m, " .get_time: ");
122 print_name_offset(m, base->get_time);
123 SEQ_printf(m, "\n");
124#ifdef CONFIG_HIGH_RES_TIMERS
125 SEQ_printf(m, " .offset: %Ld nsecs\n",
126 ktime_to_ns(base->offset));
127#endif
128 SEQ_printf(m, "active timers:\n");
129 print_active_timers(m, base, now);
130}
131
132static void print_cpu(struct seq_file *m, int cpu, u64 now)
133{
134 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
135 int i;
136
137 SEQ_printf(m, "\ncpu: %d\n", cpu);
138 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
139 SEQ_printf(m, " clock %d:\n", i);
140 print_base(m, cpu_base->clock_base + i, now);
141 }
142#define P(x) \
143 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x))
144#define P_ns(x) \
145 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
146 (u64)(ktime_to_ns(cpu_base->x)))
147
148#ifdef CONFIG_HIGH_RES_TIMERS
149 P_ns(expires_next);
150 P(hres_active);
151 P(nr_events);
152#endif
153#undef P
154#undef P_ns
155
156#ifdef CONFIG_TICK_ONESHOT
157# define P(x) \
158 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x))
159# define P_ns(x) \
160 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
161 (u64)(ktime_to_ns(ts->x)))
162 {
163 struct tick_sched *ts = tick_get_tick_sched(cpu);
164 P(nohz_mode);
165 P_ns(idle_tick);
166 P(tick_stopped);
167 P(idle_jiffies);
168 P(idle_calls);
169 P(idle_sleeps);
170 P_ns(idle_entrytime);
171 P_ns(idle_sleeptime);
172 P(last_jiffies);
173 P(next_jiffies);
174 P_ns(idle_expires);
175 SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies);
176 }
177#endif
178
179#undef P
180#undef P_ns
181}
182
183#ifdef CONFIG_GENERIC_CLOCKEVENTS
184static void
185print_tickdevice(struct seq_file *m, struct tick_device *td)
186{
187 struct clock_event_device *dev = td->evtdev;
188
189 SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode);
190
191 SEQ_printf(m, "Clock Event Device: ");
192 if (!dev) {
193 SEQ_printf(m, "<NULL>\n");
194 return;
195 }
196 SEQ_printf(m, "%s\n", dev->name);
197 SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns);
198 SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns);
199 SEQ_printf(m, " mult: %ld\n", dev->mult);
200 SEQ_printf(m, " shift: %d\n", dev->shift);
201 SEQ_printf(m, " mode: %d\n", dev->mode);
202 SEQ_printf(m, " next_event: %Ld nsecs\n",
203 (unsigned long long) ktime_to_ns(dev->next_event));
204
205 SEQ_printf(m, " set_next_event: ");
206 print_name_offset(m, dev->set_next_event);
207 SEQ_printf(m, "\n");
208
209 SEQ_printf(m, " set_mode: ");
210 print_name_offset(m, dev->set_mode);
211 SEQ_printf(m, "\n");
212
213 SEQ_printf(m, " event_handler: ");
214 print_name_offset(m, dev->event_handler);
215 SEQ_printf(m, "\n");
216}
217
218static void timer_list_show_tickdevices(struct seq_file *m)
219{
220 int cpu;
221
222#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
223 print_tickdevice(m, tick_get_broadcast_device());
224 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
225 tick_get_broadcast_mask()->bits[0]);
226#ifdef CONFIG_TICK_ONESHOT
227 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
228 tick_get_broadcast_oneshot_mask()->bits[0]);
229#endif
230 SEQ_printf(m, "\n");
231#endif
232 for_each_online_cpu(cpu)
233 print_tickdevice(m, tick_get_device(cpu));
234 SEQ_printf(m, "\n");
235}
236#else
237static void timer_list_show_tickdevices(struct seq_file *m) { }
238#endif
239
240static int timer_list_show(struct seq_file *m, void *v)
241{
242 u64 now = ktime_to_ns(ktime_get());
243 int cpu;
244
245 SEQ_printf(m, "Timer List Version: v0.3\n");
246 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
247 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
248
249 for_each_online_cpu(cpu)
250 print_cpu(m, cpu, now);
251
252 SEQ_printf(m, "\n");
253 timer_list_show_tickdevices(m);
254
255 return 0;
256}
257
258void sysrq_timer_list_show(void)
259{
260 timer_list_show(NULL, NULL);
261}
262
263static int timer_list_open(struct inode *inode, struct file *filp)
264{
265 return single_open(filp, timer_list_show, NULL);
266}
267
268static struct file_operations timer_list_fops = {
269 .open = timer_list_open,
270 .read = seq_read,
271 .llseek = seq_lseek,
272 .release = seq_release,
273};
274
275static int __init init_timer_list_procfs(void)
276{
277 struct proc_dir_entry *pe;
278
279 pe = create_proc_entry("timer_list", 0644, NULL);
280 if (!pe)
281 return -ENOMEM;
282
283 pe->proc_fops = &timer_list_fops;
284
285 return 0;
286}
287__initcall(init_timer_list_procfs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644
index 000000000000..1bc4882e28e0
--- /dev/null
+++ b/kernel/time/timer_stats.c
@@ -0,0 +1,411 @@
1/*
2 * kernel/time/timer_stats.c
3 *
4 * Collect timer usage statistics.
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * timer_stats is based on timer_top, a similar functionality which was part of
10 * Con Kolivas' dyntick patch set. It was developed by Daniel Petrini at the
11 * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
12 * on dynamic allocation of the statistics entries and linear search based
13 * lookup combined with a global lock, rather than the static array, hash
14 * and per-CPU locking which is used by timer_stats. It was written for the
15 * pre-hrtimer kernel code and therefore did not take hrtimers into account.
16 * Nevertheless it provided the base for the timer_stats implementation and
17 * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
18 * for this effort.
19 *
20 * timer_top.c is
21 * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
22 * Written by Daniel Petrini <d.pensator@gmail.com>
23 * timer_top.c was released under the GNU General Public License version 2
24 *
25 * We export the addresses and counting of timer functions being called,
26 * the pid and cmdline from the owner process if applicable.
27 *
28 * Start/stop data collection:
29 * # echo 1[0] >/proc/timer_stats
30 *
31 * Display the information collected so far:
32 * # cat /proc/timer_stats
33 *
34 * This program is free software; you can redistribute it and/or modify
35 * it under the terms of the GNU General Public License version 2 as
36 * published by the Free Software Foundation.
37 */
38
39#include <linux/proc_fs.h>
40#include <linux/module.h>
41#include <linux/spinlock.h>
42#include <linux/sched.h>
43#include <linux/seq_file.h>
44#include <linux/kallsyms.h>
45
46#include <asm/uaccess.h>
47
48/*
49 * This is our basic unit of interest: a timer expiry event identified
50 * by the timer, its start/expire functions and the PID of the task that
51 * started the timer. We count the number of times an event happens:
52 */
53struct entry {
54 /*
55 * Hash list:
56 */
57 struct entry *next;
58
59 /*
60 * Hash keys:
61 */
62 void *timer;
63 void *start_func;
64 void *expire_func;
65 pid_t pid;
66
67 /*
68 * Number of timeout events:
69 */
70 unsigned long count;
71
72 /*
73 * We save the command-line string to preserve
74 * this information past task exit:
75 */
76 char comm[TASK_COMM_LEN + 1];
77
78} ____cacheline_aligned_in_smp;
79
80/*
81 * Spinlock protecting the tables - not taken during lookup:
82 */
83static DEFINE_SPINLOCK(table_lock);
84
85/*
86 * Per-CPU lookup locks for fast hash lookup:
87 */
88static DEFINE_PER_CPU(spinlock_t, lookup_lock);
89
90/*
91 * Mutex to serialize state changes with show-stats activities:
92 */
93static DEFINE_MUTEX(show_mutex);
94
95/*
96 * Collection status, active/inactive:
97 */
98static int __read_mostly active;
99
100/*
101 * Beginning/end timestamps of measurement:
102 */
103static ktime_t time_start, time_stop;
104
105/*
106 * tstat entry structs only get allocated while collection is
107 * active and never freed during that time - this simplifies
108 * things quite a bit.
109 *
110 * They get freed when a new collection period is started.
111 */
112#define MAX_ENTRIES_BITS 10
113#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
114
115static unsigned long nr_entries;
116static struct entry entries[MAX_ENTRIES];
117
118static atomic_t overflow_count;
119
120static void reset_entries(void)
121{
122 nr_entries = 0;
123 memset(entries, 0, sizeof(entries));
124 atomic_set(&overflow_count, 0);
125}
126
127static struct entry *alloc_entry(void)
128{
129 if (nr_entries >= MAX_ENTRIES)
130 return NULL;
131
132 return entries + nr_entries++;
133}
134
135/*
136 * The entries are in a hash-table, for fast lookup:
137 */
138#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
139#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
140#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
141
142#define __tstat_hashfn(entry) \
143 (((unsigned long)(entry)->timer ^ \
144 (unsigned long)(entry)->start_func ^ \
145 (unsigned long)(entry)->expire_func ^ \
146 (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
147
148#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
149
150static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
151
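
The bucket index is simply the XOR of the four hash keys masked down to TSTAT_HASH_SIZE (512 buckets with MAX_ENTRIES_BITS = 10). A standalone illustration with made-up pointer and pid values:

#include <stdio.h>

#define HASH_BITS 9			/* MAX_ENTRIES_BITS - 1 */
#define HASH_MASK ((1UL << HASH_BITS) - 1)

static unsigned long tstat_hash(unsigned long timer, unsigned long start_func,
				unsigned long expire_func, unsigned long pid)
{
	return (timer ^ start_func ^ expire_func ^ pid) & HASH_MASK;
}

int main(void)
{
	/* Same timer and functions, different owning pids: the pid alone
	 * usually moves the entry into a different bucket. */
	printf("bucket A: %lu\n",
	       tstat_hash(0xdeadbe00UL, 0x80123450UL, 0x80123670UL, 1234));
	printf("bucket B: %lu\n",
	       tstat_hash(0xdeadbe00UL, 0x80123450UL, 0x80123670UL, 1300));
	return 0;
}
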
152static int match_entries(struct entry *entry1, struct entry *entry2)
153{
154 return entry1->timer == entry2->timer &&
155 entry1->start_func == entry2->start_func &&
156 entry1->expire_func == entry2->expire_func &&
157 entry1->pid == entry2->pid;
158}
159
160/*
161 * Look up whether an entry matching this item is present
162 * in the hash already. Must be called with irqs off and the
163 * lookup lock held:
164 */
165static struct entry *tstat_lookup(struct entry *entry, char *comm)
166{
167 struct entry **head, *curr, *prev;
168
169 head = tstat_hashentry(entry);
170 curr = *head;
171
172 /*
173	 * The fastpath is when the entry is already hashed;
174 * we do this with the lookup lock held, but with the
175 * table lock not held:
176 */
177 while (curr) {
178 if (match_entries(curr, entry))
179 return curr;
180
181 curr = curr->next;
182 }
183 /*
184 * Slowpath: allocate, set up and link a new hash entry:
185 */
186 prev = NULL;
187 curr = *head;
188
189 spin_lock(&table_lock);
190 /*
191 * Make sure we have not raced with another CPU:
192 */
193 while (curr) {
194 if (match_entries(curr, entry))
195 goto out_unlock;
196
197 prev = curr;
198 curr = curr->next;
199 }
200
201 curr = alloc_entry();
202 if (curr) {
203 *curr = *entry;
204 curr->count = 0;
205 memcpy(curr->comm, comm, TASK_COMM_LEN);
206 if (prev)
207 prev->next = curr;
208 else
209 *head = curr;
210 curr->next = NULL;
211 }
212 out_unlock:
213 spin_unlock(&table_lock);
214
215 return curr;
216}
217
218/**
219 * timer_stats_update_stats - Update the statistics for a timer.
220 * @timer: pointer to either a timer_list or a hrtimer
221 * @pid: the pid of the task which set up the timer
222 * @startf: pointer to the function which did the timer setup
223 * @timerf: pointer to the timer callback function of the timer
224 * @comm: name of the process which set up the timer
225 *
226 * If the timer is already registered, the event counter is
227 * incremented. Otherwise the timer is registered in a free slot.
228 */
229void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
230 void *timerf, char * comm)
231{
232 /*
233	 * It doesn't matter which lock we take:
234 */
235 spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
236 struct entry *entry, input;
237 unsigned long flags;
238
239 input.timer = timer;
240 input.start_func = startf;
241 input.expire_func = timerf;
242 input.pid = pid;
243
244 spin_lock_irqsave(lock, flags);
245 if (!active)
246 goto out_unlock;
247
248 entry = tstat_lookup(&input, comm);
249 if (likely(entry))
250 entry->count++;
251 else
252 atomic_inc(&overflow_count);
253
254 out_unlock:
255 spin_unlock_irqrestore(lock, flags);
256}
257
258static void print_name_offset(struct seq_file *m, unsigned long addr)
259{
260 char namebuf[KSYM_NAME_LEN+1];
261 unsigned long size, offset;
262 const char *sym_name;
263 char *modname;
264
265 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
266 if (sym_name)
267 seq_printf(m, "%s", sym_name);
268 else
269 seq_printf(m, "<%p>", (void *)addr);
270}
271
272static int tstats_show(struct seq_file *m, void *v)
273{
274 struct timespec period;
275 struct entry *entry;
276 unsigned long ms;
277 long events = 0;
278 ktime_t time;
279 int i;
280
281 mutex_lock(&show_mutex);
282 /*
283 * If still active then calculate up to now:
284 */
285 if (active)
286 time_stop = ktime_get();
287
288 time = ktime_sub(time_stop, time_start);
289
290 period = ktime_to_timespec(time);
291 ms = period.tv_nsec / 1000000;
292
293 seq_puts(m, "Timer Stats Version: v0.1\n");
294 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
295 if (atomic_read(&overflow_count))
296 seq_printf(m, "Overflow: %d entries\n",
297 atomic_read(&overflow_count));
298
299 for (i = 0; i < nr_entries; i++) {
300 entry = entries + i;
301 seq_printf(m, "%4lu, %5d %-16s ",
302 entry->count, entry->pid, entry->comm);
303
304 print_name_offset(m, (unsigned long)entry->start_func);
305 seq_puts(m, " (");
306 print_name_offset(m, (unsigned long)entry->expire_func);
307 seq_puts(m, ")\n");
308
309 events += entry->count;
310 }
311
312 ms += period.tv_sec * 1000;
313 if (!ms)
314 ms = 1;
315
316 if (events && period.tv_sec)
317 seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events,
318 events / period.tv_sec, events * 1000 / ms);
319 else
320 seq_printf(m, "%ld total events\n", events);
321
322 mutex_unlock(&show_mutex);
323
324 return 0;
325}
326
327/*
328 * After a state change, make sure all concurrent lookup/update
329 * activities have stopped:
330 */
331static void sync_access(void)
332{
333 unsigned long flags;
334 int cpu;
335
336 for_each_online_cpu(cpu) {
337 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
338 /* nothing */
339 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
340 }
341}
342
343static ssize_t tstats_write(struct file *file, const char __user *buf,
344 size_t count, loff_t *offs)
345{
346 char ctl[2];
347
348 if (count != 2 || *offs)
349 return -EINVAL;
350
351 if (copy_from_user(ctl, buf, count))
352 return -EFAULT;
353
354 mutex_lock(&show_mutex);
355 switch (ctl[0]) {
356 case '0':
357 if (active) {
358 active = 0;
359 time_stop = ktime_get();
360 sync_access();
361 }
362 break;
363 case '1':
364 if (!active) {
365 reset_entries();
366 time_start = ktime_get();
367 active = 1;
368 }
369 break;
370 default:
371 count = -EINVAL;
372 }
373 mutex_unlock(&show_mutex);
374
375 return count;
376}
377
378static int tstats_open(struct inode *inode, struct file *filp)
379{
380 return single_open(filp, tstats_show, NULL);
381}
382
383static struct file_operations tstats_fops = {
384 .open = tstats_open,
385 .read = seq_read,
386 .write = tstats_write,
387 .llseek = seq_lseek,
388 .release = seq_release,
389};
390
391void __init init_timer_stats(void)
392{
393 int cpu;
394
395 for_each_possible_cpu(cpu)
396 spin_lock_init(&per_cpu(lookup_lock, cpu));
397}
398
399static int __init init_tstats_procfs(void)
400{
401 struct proc_dir_entry *pe;
402
403 pe = create_proc_entry("timer_stats", 0644, NULL);
404 if (!pe)
405 return -ENOMEM;
406
407 pe->proc_fops = &tstats_fops;
408
409 return 0;
410}
411__initcall(init_tstats_procfs);
diff --git a/kernel/timer.c b/kernel/timer.c
index c2a8ccfc2882..cb1b86a9c52f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -34,6 +34,8 @@
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/tick.h>
38#include <linux/kallsyms.h>
37 39
38#include <asm/uaccess.h> 40#include <asm/uaccess.h>
39#include <asm/unistd.h> 41#include <asm/unistd.h>
@@ -85,7 +87,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
85 * @j: the time in (absolute) jiffies that should be rounded 87 * @j: the time in (absolute) jiffies that should be rounded
86 * @cpu: the processor number on which the timeout will happen 88 * @cpu: the processor number on which the timeout will happen
87 * 89 *
88 * __round_jiffies rounds an absolute time in the future (in jiffies) 90 * __round_jiffies() rounds an absolute time in the future (in jiffies)
89 * up or down to (approximately) full seconds. This is useful for timers 91 * up or down to (approximately) full seconds. This is useful for timers
90 * for which the exact time they fire does not matter too much, as long as 92 * for which the exact time they fire does not matter too much, as long as
91 * they fire approximately every X seconds. 93 * they fire approximately every X seconds.
@@ -98,7 +100,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
98 * processors firing at the exact same time, which could lead 100 * processors firing at the exact same time, which could lead
99 * to lock contention or spurious cache line bouncing. 101 * to lock contention or spurious cache line bouncing.
100 * 102 *
101 * The return value is the rounded version of the "j" parameter. 103 * The return value is the rounded version of the @j parameter.
102 */ 104 */
103unsigned long __round_jiffies(unsigned long j, int cpu) 105unsigned long __round_jiffies(unsigned long j, int cpu)
104{ 106{
@@ -142,7 +144,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
142 * @j: the time in (relative) jiffies that should be rounded 144 * @j: the time in (relative) jiffies that should be rounded
143 * @cpu: the processor number on which the timeout will happen 145 * @cpu: the processor number on which the timeout will happen
144 * 146 *
145 * __round_jiffies_relative rounds a time delta in the future (in jiffies) 147 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
146 * up or down to (approximately) full seconds. This is useful for timers 148 * up or down to (approximately) full seconds. This is useful for timers
147 * for which the exact time they fire does not matter too much, as long as 149 * for which the exact time they fire does not matter too much, as long as
148 * they fire approximately every X seconds. 150 * they fire approximately every X seconds.
@@ -155,7 +157,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
155 * processors firing at the exact same time, which could lead 157 * processors firing at the exact same time, which could lead
156 * to lock contention or spurious cache line bouncing. 158 * to lock contention or spurious cache line bouncing.
157 * 159 *
158 * The return value is the rounded version of the "j" parameter. 160 * The return value is the rounded version of the @j parameter.
159 */ 161 */
160unsigned long __round_jiffies_relative(unsigned long j, int cpu) 162unsigned long __round_jiffies_relative(unsigned long j, int cpu)
161{ 163{
@@ -173,7 +175,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
173 * round_jiffies - function to round jiffies to a full second 175 * round_jiffies - function to round jiffies to a full second
174 * @j: the time in (absolute) jiffies that should be rounded 176 * @j: the time in (absolute) jiffies that should be rounded
175 * 177 *
176 * round_jiffies rounds an absolute time in the future (in jiffies) 178 * round_jiffies() rounds an absolute time in the future (in jiffies)
177 * up or down to (approximately) full seconds. This is useful for timers 179 * up or down to (approximately) full seconds. This is useful for timers
178 * for which the exact time they fire does not matter too much, as long as 180 * for which the exact time they fire does not matter too much, as long as
179 * they fire approximately every X seconds. 181 * they fire approximately every X seconds.
@@ -182,7 +184,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
182 * at the same time, rather than at various times spread out. The goal 184 * at the same time, rather than at various times spread out. The goal
183 * of this is to have the CPU wake up less, which saves power. 185 * of this is to have the CPU wake up less, which saves power.
184 * 186 *
185 * The return value is the rounded version of the "j" parameter. 187 * The return value is the rounded version of the @j parameter.
186 */ 188 */
187unsigned long round_jiffies(unsigned long j) 189unsigned long round_jiffies(unsigned long j)
188{ 190{
@@ -194,7 +196,7 @@ EXPORT_SYMBOL_GPL(round_jiffies);
194 * round_jiffies_relative - function to round jiffies to a full second 196 * round_jiffies_relative - function to round jiffies to a full second
195 * @j: the time in (relative) jiffies that should be rounded 197 * @j: the time in (relative) jiffies that should be rounded
196 * 198 *
197 * round_jiffies_relative rounds a time delta in the future (in jiffies) 199 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
198 * up or down to (approximately) full seconds. This is useful for timers 200 * up or down to (approximately) full seconds. This is useful for timers
199 * for which the exact time they fire does not matter too much, as long as 201 * for which the exact time they fire does not matter too much, as long as
200 * they fire approximately every X seconds. 202 * they fire approximately every X seconds.
@@ -203,7 +205,7 @@ EXPORT_SYMBOL_GPL(round_jiffies);
203 * at the same time, rather than at various times spread out. The goal 205 * at the same time, rather than at various times spread out. The goal
204 * of this is to have the CPU wake up less, which saves power. 206 * of this is to have the CPU wake up less, which saves power.
205 * 207 *
206 * The return value is the rounded version of the "j" parameter. 208 * The return value is the rounded version of the @j parameter.
207 */ 209 */
208unsigned long round_jiffies_relative(unsigned long j) 210unsigned long round_jiffies_relative(unsigned long j)
209{ 211{
@@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
262 list_add_tail(&timer->entry, vec); 264 list_add_tail(&timer->entry, vec);
263} 265}
264 266
267#ifdef CONFIG_TIMER_STATS
268void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
269{
270 if (timer->start_site)
271 return;
272
273 timer->start_site = addr;
274 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
275 timer->start_pid = current->pid;
276}
277#endif
278
265/** 279/**
266 * init_timer - initialize a timer. 280 * init_timer - initialize a timer.
267 * @timer: the timer to be initialized 281 * @timer: the timer to be initialized
@@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer)
273{ 287{
274 timer->entry.next = NULL; 288 timer->entry.next = NULL;
275 timer->base = __raw_get_cpu_var(tvec_bases); 289 timer->base = __raw_get_cpu_var(tvec_bases);
290#ifdef CONFIG_TIMER_STATS
291 timer->start_site = NULL;
292 timer->start_pid = -1;
293 memset(timer->start_comm, 0, TASK_COMM_LEN);
294#endif
276} 295}
277EXPORT_SYMBOL(init_timer); 296EXPORT_SYMBOL(init_timer);
278 297
279static inline void detach_timer(struct timer_list *timer, 298static inline void detach_timer(struct timer_list *timer,
280 int clear_pending) 299 int clear_pending)
281{ 300{
282 struct list_head *entry = &timer->entry; 301 struct list_head *entry = &timer->entry;
283 302
@@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
324 unsigned long flags; 343 unsigned long flags;
325 int ret = 0; 344 int ret = 0;
326 345
346 timer_stats_timer_set_start_info(timer);
327 BUG_ON(!timer->function); 347 BUG_ON(!timer->function);
328 348
329 base = lock_timer_base(timer, &flags); 349 base = lock_timer_base(timer, &flags);
@@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
374 tvec_base_t *base = per_cpu(tvec_bases, cpu); 394 tvec_base_t *base = per_cpu(tvec_bases, cpu);
375 unsigned long flags; 395 unsigned long flags;
376 396
397 timer_stats_timer_set_start_info(timer);
377 BUG_ON(timer_pending(timer) || !timer->function); 398 BUG_ON(timer_pending(timer) || !timer->function);
378 spin_lock_irqsave(&base->lock, flags); 399 spin_lock_irqsave(&base->lock, flags);
379 timer->base = base; 400 timer->base = base;
@@ -387,7 +408,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
387 * @timer: the timer to be modified 408 * @timer: the timer to be modified
388 * @expires: new timeout in jiffies 409 * @expires: new timeout in jiffies
389 * 410 *
390 * mod_timer is a more efficient way to update the expire field of an 411 * mod_timer() is a more efficient way to update the expire field of an
391 * active timer (if the timer is inactive it will be activated) 412 * active timer (if the timer is inactive it will be activated)
392 * 413 *
393 * mod_timer(timer, expires) is equivalent to: 414 * mod_timer(timer, expires) is equivalent to:
@@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
406{ 427{
407 BUG_ON(!timer->function); 428 BUG_ON(!timer->function);
408 429
430 timer_stats_timer_set_start_info(timer);
409 /* 431 /*
410 * This is a common optimization triggered by the 432 * This is a common optimization triggered by the
411 * networking code - if the timer is re-modified 433 * networking code - if the timer is re-modified
@@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer)
436 unsigned long flags; 458 unsigned long flags;
437 int ret = 0; 459 int ret = 0;
438 460
461 timer_stats_timer_clear_start_info(timer);
439 if (timer_pending(timer)) { 462 if (timer_pending(timer)) {
440 base = lock_timer_base(timer, &flags); 463 base = lock_timer_base(timer, &flags);
441 if (timer_pending(timer)) { 464 if (timer_pending(timer)) {
@@ -490,7 +513,7 @@ out:
490 * the timer it also makes sure the handler has finished executing on other 513 * the timer it also makes sure the handler has finished executing on other
491 * CPUs. 514 * CPUs.
492 * 515 *
493 * Synchronization rules: callers must prevent restarting of the timer, 516 * Synchronization rules: Callers must prevent restarting of the timer,
494 * otherwise this function is meaningless. It must not be called from 517 * otherwise this function is meaningless. It must not be called from
495 * interrupt contexts. The caller must not hold locks which would prevent 518 * interrupt contexts. The caller must not hold locks which would prevent
496 * completion of the timer's handler. The timer's handler must not call 519 * completion of the timer's handler. The timer's handler must not call
@@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base)
569 fn = timer->function; 592 fn = timer->function;
570 data = timer->data; 593 data = timer->data;
571 594
595 timer_stats_account_timer(timer);
596
572 set_running_timer(base, timer); 597 set_running_timer(base, timer);
573 detach_timer(timer, 1); 598 detach_timer(timer, 1);
574 spin_unlock_irq(&base->lock); 599 spin_unlock_irq(&base->lock);
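
Taken together, the hooks added above mean that with CONFIG_TIMER_STATS enabled every armed timer carries a record of who armed it and from where: timer_stats_timer_set_start_info() fills start_site/start_comm/start_pid when the timer is (re)armed, del_timer() clears the record, and __run_timers() accounts the expiry via timer_stats_account_timer(). A hedged illustration of what ends up in those fields for an ordinary timer user; all names below are hypothetical:

	#include <linux/timer.h>
	#include <linux/jiffies.h>

	static struct timer_list my_timer;		/* hypothetical timer */

	static void my_timeout(unsigned long data)
	{
		/* ... expiry work ... */
	}

	static void arm_example(void)
	{
		init_timer(&my_timer);		/* start_site = NULL, start_pid = -1 */
		my_timer.function = my_timeout;
		my_timer.expires = jiffies + HZ;
		add_timer(&my_timer);
		/*
		 * add_timer() goes through __mod_timer(), so at this point:
		 *   my_timer.start_site  - the address the timer was armed from
		 *   my_timer.start_comm  - comm of the arming task
		 *   my_timer.start_pid   - pid of the arming task
		 * A later del_timer(&my_timer) clears the record again.
		 */
	}
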
@@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base)
591 spin_unlock_irq(&base->lock); 616 spin_unlock_irq(&base->lock);
592} 617}
593 618
594#ifdef CONFIG_NO_IDLE_HZ 619#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
595/* 620/*
596 * Find out when the next timer event is due to happen. This 621 * Find out when the next timer event is due to happen. This
597 * is used on S/390 to stop all activity when a CPU is idle. 622 * is used on S/390 to stop all activity when a CPU is idle.
598 * This function needs to be called with interrupts disabled. 623 * This function needs to be called with interrupts disabled.
599 */ 624 */
600unsigned long next_timer_interrupt(void) 625static unsigned long __next_timer_interrupt(tvec_base_t *base)
601{ 626{
602 tvec_base_t *base; 627 unsigned long timer_jiffies = base->timer_jiffies;
603 struct list_head *list; 628 unsigned long expires = timer_jiffies + (LONG_MAX >> 1);
629 int index, slot, array, found = 0;
604 struct timer_list *nte; 630 struct timer_list *nte;
605 unsigned long expires;
606 unsigned long hr_expires = MAX_JIFFY_OFFSET;
607 ktime_t hr_delta;
608 tvec_t *varray[4]; 631 tvec_t *varray[4];
609 int i, j;
610
611 hr_delta = hrtimer_get_next_event();
612 if (hr_delta.tv64 != KTIME_MAX) {
613 struct timespec tsdelta;
614 tsdelta = ktime_to_timespec(hr_delta);
615 hr_expires = timespec_to_jiffies(&tsdelta);
616 if (hr_expires < 3)
617 return hr_expires + jiffies;
618 }
619 hr_expires += jiffies;
620
621 base = __get_cpu_var(tvec_bases);
622 spin_lock(&base->lock);
623 expires = base->timer_jiffies + (LONG_MAX >> 1);
624 list = NULL;
625 632
626 /* Look for timer events in tv1. */ 633 /* Look for timer events in tv1. */
627 j = base->timer_jiffies & TVR_MASK; 634 index = slot = timer_jiffies & TVR_MASK;
628 do { 635 do {
629 list_for_each_entry(nte, base->tv1.vec + j, entry) { 636 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
637 found = 1;
630 expires = nte->expires; 638 expires = nte->expires;
631 if (j < (base->timer_jiffies & TVR_MASK)) 639 /* Look at the cascade bucket(s)? */
632 list = base->tv2.vec + (INDEX(0)); 640 if (!index || slot < index)
633 goto found; 641 goto cascade;
642 return expires;
634 } 643 }
635 j = (j + 1) & TVR_MASK; 644 slot = (slot + 1) & TVR_MASK;
636 } while (j != (base->timer_jiffies & TVR_MASK)); 645 } while (slot != index);
646
647cascade:
648 /* Calculate the next cascade event */
649 if (index)
650 timer_jiffies += TVR_SIZE - index;
651 timer_jiffies >>= TVR_BITS;
637 652
638 /* Check tv2-tv5. */ 653 /* Check tv2-tv5. */
639 varray[0] = &base->tv2; 654 varray[0] = &base->tv2;
640 varray[1] = &base->tv3; 655 varray[1] = &base->tv3;
641 varray[2] = &base->tv4; 656 varray[2] = &base->tv4;
642 varray[3] = &base->tv5; 657 varray[3] = &base->tv5;
643 for (i = 0; i < 4; i++) { 658
644 j = INDEX(i); 659 for (array = 0; array < 4; array++) {
660 tvec_t *varp = varray[array];
661
662 index = slot = timer_jiffies & TVN_MASK;
645 do { 663 do {
646 if (list_empty(varray[i]->vec + j)) { 664 list_for_each_entry(nte, varp->vec + slot, entry) {
647 j = (j + 1) & TVN_MASK; 665 found = 1;
648 continue;
649 }
650 list_for_each_entry(nte, varray[i]->vec + j, entry)
651 if (time_before(nte->expires, expires)) 666 if (time_before(nte->expires, expires))
652 expires = nte->expires; 667 expires = nte->expires;
653 if (j < (INDEX(i)) && i < 3) 668 }
654 list = varray[i + 1]->vec + (INDEX(i + 1)); 669 /*
655 goto found; 670 * Do we still search for the first timer or are
656 } while (j != (INDEX(i))); 671 * we looking up the cascade buckets?
657 } 672 */
658found: 673 if (found) {
659 if (list) { 674 /* Look at the cascade bucket(s)? */
660 /* 675 if (!index || slot < index)
661 * The search wrapped. We need to look at the next list 676 break;
662 * from next tv element that would cascade into tv element 677 return expires;
663 * where we found the timer element. 678 }
664 */ 679 slot = (slot + 1) & TVN_MASK;
665 list_for_each_entry(nte, list, entry) { 680 } while (slot != index);
666 if (time_before(nte->expires, expires)) 681
667 expires = nte->expires; 682 if (index)
668 } 683 timer_jiffies += TVN_SIZE - index;
684 timer_jiffies >>= TVN_BITS;
669 } 685 }
670 spin_unlock(&base->lock); 686 return expires;
687}
671 688
672 /* 689/*
673 * It can happen that other CPUs service timer IRQs and increment 690 * Check, if the next hrtimer event is before the next timer wheel
674 * jiffies, but we have not yet got a local timer tick to process 691 * event:
675 * the timer wheels. In that case, the expiry time can be before 692 */
676 * jiffies, but since the high-resolution timer here is relative to 693static unsigned long cmp_next_hrtimer_event(unsigned long now,
677 * jiffies, the default expression when high-resolution timers are 694 unsigned long expires)
678 * not active, 695{
679 * 696 ktime_t hr_delta = hrtimer_get_next_event();
680 * time_before(MAX_JIFFY_OFFSET + jiffies, expires) 697 struct timespec tsdelta;
681 *
682 * would falsely evaluate to true. If that is the case, just
683 * return jiffies so that we can immediately fire the local timer
684 */
685 if (time_before(expires, jiffies))
686 return jiffies;
687 698
688 if (time_before(hr_expires, expires)) 699 if (hr_delta.tv64 == KTIME_MAX)
689 return hr_expires; 700 return expires;
690 701
702 if (hr_delta.tv64 <= TICK_NSEC)
703 return now;
704
705 tsdelta = ktime_to_timespec(hr_delta);
706 now += timespec_to_jiffies(&tsdelta);
707 if (time_before(now, expires))
708 return now;
691 return expires; 709 return expires;
692} 710}
711
712/**
713 * get_next_timer_interrupt - return the jiffy of the next pending timer
714 */
715unsigned long get_next_timer_interrupt(unsigned long now)
716{
717 tvec_base_t *base = __get_cpu_var(tvec_bases);
718 unsigned long expires;
719
720 spin_lock(&base->lock);
721 expires = __next_timer_interrupt(base);
722 spin_unlock(&base->lock);
723
724 if (time_before_eq(expires, now))
725 return now;
726
727 return cmp_next_hrtimer_event(now, expires);
728}
729
730#ifdef CONFIG_NO_IDLE_HZ
731unsigned long next_timer_interrupt(void)
732{
733 return get_next_timer_interrupt(jiffies);
734}
735#endif
736
693#endif 737#endif
694 738
695/******************************************************************/ 739/******************************************************************/
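
The rewritten __next_timer_interrupt() above walks the lowest wheel level (tv1) starting at the slot the wheel currently points to, wrapping around TVR_MASK, and only drops into the cascade handling and the higher levels (tv2-tv5) when the scan wraps or finds nothing; get_next_timer_interrupt() then clamps the result to "not before now" and against the next hrtimer event. A deliberately simplified, self-contained sketch of just the slot-scan order, with one hypothetical eight-slot level, written as plain userspace C so it can be compiled and run directly:

	#include <stdio.h>

	#define SLOTS	8			/* stand-in for TVR_SIZE */
	#define MASK	(SLOTS - 1)		/* stand-in for TVR_MASK */

	/* expiry stored per bucket, 0 meaning "empty"; purely illustrative data */
	static unsigned long bucket[SLOTS] = { 0, 0, 0, 0, 0, 133, 0, 0 };

	/* scan the level starting at the slot for the current jiffy, wrapping once */
	static unsigned long next_expiry(unsigned long timer_jiffies)
	{
		unsigned int index = timer_jiffies & MASK;
		unsigned int slot = index;

		do {
			if (bucket[slot])
				return bucket[slot];
			slot = (slot + 1) & MASK;
		} while (slot != index);

		return timer_jiffies + 1000;	/* nothing pending at this level */
	}

	int main(void)
	{
		/* wheel positioned at jiffy 128, i.e. the scan starts at slot 0 */
		printf("next expiry: %lu\n", next_expiry(128));
		return 0;
	}

The real function additionally notes when the scan wraps past slot 0 (slot < index), because buckets before the starting index will be refilled by a later cascade and the next wheel level therefore has to be consulted as well - that is what the cascade: label and the found/index bookkeeping above implement.
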
@@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday);
832 * 876 *
833 * Accumulates current time interval and initializes new clocksource 877 * Accumulates current time interval and initializes new clocksource
834 */ 878 */
835static int change_clocksource(void) 879static void change_clocksource(void)
836{ 880{
837 struct clocksource *new; 881 struct clocksource *new;
838 cycle_t now; 882 cycle_t now;
839 u64 nsec; 883 u64 nsec;
884
840 new = clocksource_get_next(); 885 new = clocksource_get_next();
841 if (clock != new) { 886
842 now = clocksource_read(new); 887 if (clock == new)
843 nsec = __get_nsec_offset(); 888 return;
844 timespec_add_ns(&xtime, nsec); 889
845 890 now = clocksource_read(new);
846 clock = new; 891 nsec = __get_nsec_offset();
847 clock->cycle_last = now; 892 timespec_add_ns(&xtime, nsec);
848 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 893
849 clock->name); 894 clock = new;
850 return 1; 895 clock->cycle_last = now;
851 } else if (clock->update_callback) { 896
852 return clock->update_callback(); 897 clock->error = 0;
853 } 898 clock->xtime_nsec = 0;
854 return 0; 899 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
900
901 tick_clock_notify();
902
903 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
904 clock->name);
855} 905}
856#else 906#else
857static inline int change_clocksource(void) 907static inline void change_clocksource(void) { }
858{
859 return 0;
860}
861#endif 908#endif
862 909
863/** 910/**
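
change_clocksource() now unconditionally installs whatever clocksource_get_next() hands back, resets the accumulated error and interval state, and informs the tick layer via tick_clock_notify(). For context, here is a hedged sketch of roughly how a clock driver of this era makes a counter eligible for that selection in the first place; the 32-bit counter, its read routine, rating and 20 MHz rate are all hypothetical:

	#include <linux/clocksource.h>

	static cycle_t example_counter_read(void)
	{
		/* on real hardware this would read a free-running counter register */
		return 0;
	}

	static struct clocksource example_clocksource = {
		.name	= "example",
		.rating	= 200,				/* mid-range preference */
		.read	= example_counter_read,
		.mask	= CLOCKSOURCE_MASK(32),
		.shift	= 20,
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,	/* candidate for high-res mode */
	};

	static int __init example_clocksource_init(void)
	{
		/* convert 20 MHz counter cycles to nanoseconds (mult/shift pair) */
		example_clocksource.mult =
			clocksource_hz2mult(20000000, example_clocksource.shift);
		return clocksource_register(&example_clocksource);
	}
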
@@ -871,33 +918,56 @@ int timekeeping_is_continuous(void)
871 do { 918 do {
872 seq = read_seqbegin(&xtime_lock); 919 seq = read_seqbegin(&xtime_lock);
873 920
874 ret = clock->is_continuous; 921 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
875 922
876 } while (read_seqretry(&xtime_lock, seq)); 923 } while (read_seqretry(&xtime_lock, seq));
877 924
878 return ret; 925 return ret;
879} 926}
880 927
928/**
929 * read_persistent_clock - Return time in seconds from the persistent clock.
930 *
931 * Weak dummy function for arches that do not yet support it.
932 * Returns seconds from epoch using the battery backed persistent clock.
933 * Returns zero if unsupported.
934 *
935 * XXX - Do be sure to remove it once all arches implement it.
936 */
937unsigned long __attribute__((weak)) read_persistent_clock(void)
938{
939 return 0;
940}
941
881/* 942/*
882 * timekeeping_init - Initializes the clocksource and common timekeeping values 943 * timekeeping_init - Initializes the clocksource and common timekeeping values
883 */ 944 */
884void __init timekeeping_init(void) 945void __init timekeeping_init(void)
885{ 946{
886 unsigned long flags; 947 unsigned long flags;
948 unsigned long sec = read_persistent_clock();
887 949
888 write_seqlock_irqsave(&xtime_lock, flags); 950 write_seqlock_irqsave(&xtime_lock, flags);
889 951
890 ntp_clear(); 952 ntp_clear();
891 953
892 clock = clocksource_get_next(); 954 clock = clocksource_get_next();
893 clocksource_calculate_interval(clock, tick_nsec); 955 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
894 clock->cycle_last = clocksource_read(clock); 956 clock->cycle_last = clocksource_read(clock);
895 957
958 xtime.tv_sec = sec;
959 xtime.tv_nsec = 0;
960 set_normalized_timespec(&wall_to_monotonic,
961 -xtime.tv_sec, -xtime.tv_nsec);
962
896 write_sequnlock_irqrestore(&xtime_lock, flags); 963 write_sequnlock_irqrestore(&xtime_lock, flags);
897} 964}
898 965
899 966/* flag indicating whether timekeeping is suspended */
900static int timekeeping_suspended; 967static int timekeeping_suspended;
968/* time in seconds when suspend began */
969static unsigned long timekeeping_suspend_time;
970
901/** 971/**
902 * timekeeping_resume - Resumes the generic timekeeping subsystem. 972 * timekeeping_resume - Resumes the generic timekeeping subsystem.
903 * @dev: unused 973 * @dev: unused
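
read_persistent_clock() above is only a weak stub so that timekeeping_init() and the suspend/resume paths build on every architecture; a platform with a battery-backed RTC is expected to override it. A hypothetical override under that assumption (the broken-down RTC values are placeholders for whatever the platform actually reads back):

	#include <linux/time.h>

	/* placeholders for values read from battery-backed RTC registers */
	static unsigned int rtc_year = 2007, rtc_mon = 2, rtc_day = 16;
	static unsigned int rtc_hour = 12, rtc_min = 0, rtc_sec = 0;

	unsigned long read_persistent_clock(void)
	{
		/* mktime() converts broken-down civil time to seconds since the epoch */
		return mktime(rtc_year, rtc_mon, rtc_day, rtc_hour, rtc_min, rtc_sec);
	}
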
@@ -909,13 +979,26 @@ static int timekeeping_suspended;
909static int timekeeping_resume(struct sys_device *dev) 979static int timekeeping_resume(struct sys_device *dev)
910{ 980{
911 unsigned long flags; 981 unsigned long flags;
982 unsigned long now = read_persistent_clock();
912 983
913 write_seqlock_irqsave(&xtime_lock, flags); 984 write_seqlock_irqsave(&xtime_lock, flags);
914 /* restart the last cycle value */ 985
986 if (now && (now > timekeeping_suspend_time)) {
987 unsigned long sleep_length = now - timekeeping_suspend_time;
988
989 xtime.tv_sec += sleep_length;
990 wall_to_monotonic.tv_sec -= sleep_length;
991 }
992 /* re-base the last cycle value */
915 clock->cycle_last = clocksource_read(clock); 993 clock->cycle_last = clocksource_read(clock);
916 clock->error = 0; 994 clock->error = 0;
917 timekeeping_suspended = 0; 995 timekeeping_suspended = 0;
918 write_sequnlock_irqrestore(&xtime_lock, flags); 996 write_sequnlock_irqrestore(&xtime_lock, flags);
997
998 touch_softlockup_watchdog();
999 /* Resume hrtimers */
1000 clock_was_set();
1001
919 return 0; 1002 return 0;
920} 1003}
921 1004
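
The resume path above adds the measured sleep length to xtime while subtracting it from wall_to_monotonic, so the wall clock jumps forward by the time spent suspended while the monotonic clock (conceptually xtime + wall_to_monotonic) is left untouched. A tiny self-contained arithmetic check of that invariant, using made-up numbers:

	#include <stdio.h>

	int main(void)
	{
		long xtime_sec = 1000000;		/* hypothetical wall time at suspend */
		long wall_to_monotonic_sec = -999000;	/* hypothetical offset */
		long sleep_length = 60;			/* persistent clock advanced 60 s */

		long monotonic_before = xtime_sec + wall_to_monotonic_sec;

		/* the adjustment timekeeping_resume() performs */
		xtime_sec += sleep_length;
		wall_to_monotonic_sec -= sleep_length;

		printf("wall advanced by %ld s, monotonic %ld -> %ld\n",
		       sleep_length, monotonic_before,
		       xtime_sec + wall_to_monotonic_sec);
		return 0;
	}
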
@@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
925 1008
926 write_seqlock_irqsave(&xtime_lock, flags); 1009 write_seqlock_irqsave(&xtime_lock, flags);
927 timekeeping_suspended = 1; 1010 timekeeping_suspended = 1;
1011 timekeeping_suspend_time = read_persistent_clock();
928 write_sequnlock_irqrestore(&xtime_lock, flags); 1012 write_sequnlock_irqrestore(&xtime_lock, flags);
929 return 0; 1013 return 0;
930} 1014}
@@ -1089,11 +1173,8 @@ static void update_wall_time(void)
1089 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 1173 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1090 1174
1091 /* check to see if there is a new clocksource to use */ 1175 /* check to see if there is a new clocksource to use */
1092 if (change_clocksource()) { 1176 change_clocksource();
1093 clock->error = 0; 1177 update_vsyscall(&xtime, clock);
1094 clock->xtime_nsec = 0;
1095 clocksource_calculate_interval(clock, tick_nsec);
1096 }
1097} 1178}
1098 1179
1099/* 1180/*
@@ -1162,11 +1243,9 @@ static inline void calc_load(unsigned long ticks)
1162 * This read-write spinlock protects us from races in SMP while 1243 * This read-write spinlock protects us from races in SMP while
1163 * playing with xtime and avenrun. 1244 * playing with xtime and avenrun.
1164 */ 1245 */
1165#ifndef ARCH_HAVE_XTIME_LOCK 1246__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1166__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1167 1247
1168EXPORT_SYMBOL(xtime_lock); 1248EXPORT_SYMBOL(xtime_lock);
1169#endif
1170 1249
1171/* 1250/*
1172 * This function runs timers and the timer-tq in bottom half context. 1251 * This function runs timers and the timer-tq in bottom half context.
@@ -1175,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h)
1175{ 1254{
1176 tvec_base_t *base = __get_cpu_var(tvec_bases); 1255 tvec_base_t *base = __get_cpu_var(tvec_bases);
1177 1256
1178 hrtimer_run_queues(); 1257 hrtimer_run_queues();
1258
1179 if (time_after_eq(jiffies, base->timer_jiffies)) 1259 if (time_after_eq(jiffies, base->timer_jiffies))
1180 __run_timers(base); 1260 __run_timers(base);
1181} 1261}
@@ -1392,17 +1472,16 @@ asmlinkage long sys_gettid(void)
1392} 1472}
1393 1473
1394/** 1474/**
1395 * sys_sysinfo - fill in sysinfo struct 1475 * do_sysinfo - fill in sysinfo struct
1396 * @info: pointer to buffer to fill 1476 * @info: pointer to buffer to fill
1397 */ 1477 */
1398asmlinkage long sys_sysinfo(struct sysinfo __user *info) 1478int do_sysinfo(struct sysinfo *info)
1399{ 1479{
1400 struct sysinfo val;
1401 unsigned long mem_total, sav_total; 1480 unsigned long mem_total, sav_total;
1402 unsigned int mem_unit, bitcount; 1481 unsigned int mem_unit, bitcount;
1403 unsigned long seq; 1482 unsigned long seq;
1404 1483
1405 memset((char *)&val, 0, sizeof(struct sysinfo)); 1484 memset(info, 0, sizeof(struct sysinfo));
1406 1485
1407 do { 1486 do {
1408 struct timespec tp; 1487 struct timespec tp;
@@ -1422,17 +1501,17 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1422 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; 1501 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1423 tp.tv_sec++; 1502 tp.tv_sec++;
1424 } 1503 }
1425 val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); 1504 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1426 1505
1427 val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); 1506 info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
1428 val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); 1507 info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1429 val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); 1508 info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1430 1509
1431 val.procs = nr_threads; 1510 info->procs = nr_threads;
1432 } while (read_seqretry(&xtime_lock, seq)); 1511 } while (read_seqretry(&xtime_lock, seq));
1433 1512
1434 si_meminfo(&val); 1513 si_meminfo(info);
1435 si_swapinfo(&val); 1514 si_swapinfo(info);
1436 1515
1437 /* 1516 /*
1438 * If the sum of all the available memory (i.e. ram + swap) 1517 * If the sum of all the available memory (i.e. ram + swap)
@@ -1443,11 +1522,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1443 * -Erik Andersen <andersee@debian.org> 1522 * -Erik Andersen <andersee@debian.org>
1444 */ 1523 */
1445 1524
1446 mem_total = val.totalram + val.totalswap; 1525 mem_total = info->totalram + info->totalswap;
1447 if (mem_total < val.totalram || mem_total < val.totalswap) 1526 if (mem_total < info->totalram || mem_total < info->totalswap)
1448 goto out; 1527 goto out;
1449 bitcount = 0; 1528 bitcount = 0;
1450 mem_unit = val.mem_unit; 1529 mem_unit = info->mem_unit;
1451 while (mem_unit > 1) { 1530 while (mem_unit > 1) {
1452 bitcount++; 1531 bitcount++;
1453 mem_unit >>= 1; 1532 mem_unit >>= 1;
@@ -1459,22 +1538,31 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1459 1538
1460 /* 1539 /*
1461 * If mem_total did not overflow, multiply all memory values by 1540 * If mem_total did not overflow, multiply all memory values by
1462 * val.mem_unit and set it to 1. This leaves things compatible 1541 * info->mem_unit and set it to 1. This leaves things compatible
1463 * with 2.2.x, and also retains compatibility with earlier 2.4.x 1542 * with 2.2.x, and also retains compatibility with earlier 2.4.x
1464 * kernels... 1543 * kernels...
1465 */ 1544 */
1466 1545
1467 val.mem_unit = 1; 1546 info->mem_unit = 1;
1468 val.totalram <<= bitcount; 1547 info->totalram <<= bitcount;
1469 val.freeram <<= bitcount; 1548 info->freeram <<= bitcount;
1470 val.sharedram <<= bitcount; 1549 info->sharedram <<= bitcount;
1471 val.bufferram <<= bitcount; 1550 info->bufferram <<= bitcount;
1472 val.totalswap <<= bitcount; 1551 info->totalswap <<= bitcount;
1473 val.freeswap <<= bitcount; 1552 info->freeswap <<= bitcount;
1474 val.totalhigh <<= bitcount; 1553 info->totalhigh <<= bitcount;
1475 val.freehigh <<= bitcount; 1554 info->freehigh <<= bitcount;
1555
1556out:
1557 return 0;
1558}
1559
1560asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1561{
1562 struct sysinfo val;
1563
1564 do_sysinfo(&val);
1476 1565
1477 out:
1478 if (copy_to_user(info, &val, sizeof(struct sysinfo))) 1566 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
1479 return -EFAULT; 1567 return -EFAULT;
1480 1568
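
After the refactoring, do_sysinfo() still normalizes the memory fields so that, when the totals fit without overflow, userspace sees mem_unit == 1 and plain byte counts; otherwise the fields stay expressed in units of mem_unit. A consumer should therefore always multiply by mem_unit instead of assuming bytes, as in this small userspace sketch:

	#include <stdio.h>
	#include <sys/sysinfo.h>

	int main(void)
	{
		struct sysinfo info;

		if (sysinfo(&info) != 0) {
			perror("sysinfo");
			return 1;
		}

		/* the totals are expressed in units of mem_unit (1 after normalization) */
		unsigned long long total_bytes =
			(unsigned long long)info.totalram * info.mem_unit;
		unsigned long long free_bytes =
			(unsigned long long)info.freeram * info.mem_unit;

		printf("RAM: %llu MiB total, %llu MiB free\n",
		       total_bytes >> 20, free_bytes >> 20);
		return 0;
	}
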
@@ -1613,6 +1701,8 @@ void __init init_timers(void)
1613 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1701 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1614 (void *)(long)smp_processor_id()); 1702 (void *)(long)smp_processor_id());
1615 1703
1704 init_timer_stats();
1705
1616 BUG_ON(err == NOTIFY_BAD); 1706 BUG_ON(err == NOTIFY_BAD);
1617 register_cpu_notifier(&timers_nb); 1707 register_cpu_notifier(&timers_nb);
1618 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1708 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
@@ -1624,7 +1714,7 @@ struct time_interpolator *time_interpolator __read_mostly;
1624static struct time_interpolator *time_interpolator_list __read_mostly; 1714static struct time_interpolator *time_interpolator_list __read_mostly;
1625static DEFINE_SPINLOCK(time_interpolator_lock); 1715static DEFINE_SPINLOCK(time_interpolator_lock);
1626 1716
1627static inline u64 time_interpolator_get_cycles(unsigned int src) 1717static inline cycles_t time_interpolator_get_cycles(unsigned int src)
1628{ 1718{
1629 unsigned long (*x)(void); 1719 unsigned long (*x)(void);
1630 1720
@@ -1650,8 +1740,8 @@ static inline u64 time_interpolator_get_counter(int writelock)
1650 1740
1651 if (time_interpolator->jitter) 1741 if (time_interpolator->jitter)
1652 { 1742 {
1653 u64 lcycle; 1743 cycles_t lcycle;
1654 u64 now; 1744 cycles_t now;
1655 1745
1656 do { 1746 do {
1657 lcycle = time_interpolator->last_cycle; 1747 lcycle = time_interpolator->last_cycle;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index baacc3691415..658f638c402c 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -22,8 +22,6 @@
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24 24
25
26#define USEC_PER_TICK (USEC_PER_SEC/HZ)
27/* 25/*
28 * fill in basic accounting fields 26 * fill in basic accounting fields
29 */ 27 */
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
new file mode 100644
index 000000000000..f22b9dbd2a9c
--- /dev/null
+++ b/kernel/utsname_sysctl.c
@@ -0,0 +1,146 @@
1/*
2 * Copyright (C) 2007
3 *
4 * Author: Eric Biederman <ebiederm@xmision.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the
9 * License.
10 */
11
12#include <linux/module.h>
13#include <linux/uts.h>
14#include <linux/utsname.h>
15#include <linux/version.h>
16#include <linux/sysctl.h>
17
18static void *get_uts(ctl_table *table, int write)
19{
20 char *which = table->data;
21#ifdef CONFIG_UTS_NS
22 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
23 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
24#endif
25 if (!write)
26 down_read(&uts_sem);
27 else
28 down_write(&uts_sem);
29 return which;
30}
31
32static void put_uts(ctl_table *table, int write, void *which)
33{
34 if (!write)
35 up_read(&uts_sem);
36 else
37 up_write(&uts_sem);
38}
39
40#ifdef CONFIG_PROC_FS
41/*
42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ????
44 */
45static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
46 void __user *buffer, size_t *lenp, loff_t *ppos)
47{
48 struct ctl_table uts_table;
49 int r;
50 memcpy(&uts_table, table, sizeof(uts_table));
51 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table, write, filp, buffer, lenp, ppos);
53 put_uts(table, write, uts_table.data);
54 return r;
55}
56#else
57#define proc_do_uts_string NULL
58#endif
59
60
61#ifdef CONFIG_SYSCTL_SYSCALL
62/* The generic string strategy routine: */
63static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen)
66{
67 struct ctl_table uts_table;
68 int r, write;
69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, name, nlen,
73 oldval, oldlenp, newval, newlen);
74 put_uts(table, write, uts_table.data);
75 return r;
76}
77#else
78#define sysctl_uts_string NULL
79#endif
80
81static struct ctl_table uts_kern_table[] = {
82 {
83 .ctl_name = KERN_OSTYPE,
84 .procname = "ostype",
85 .data = init_uts_ns.name.sysname,
86 .maxlen = sizeof(init_uts_ns.name.sysname),
87 .mode = 0444,
88 .proc_handler = proc_do_uts_string,
89 .strategy = sysctl_uts_string,
90 },
91 {
92 .ctl_name = KERN_OSRELEASE,
93 .procname = "osrelease",
94 .data = init_uts_ns.name.release,
95 .maxlen = sizeof(init_uts_ns.name.release),
96 .mode = 0444,
97 .proc_handler = proc_do_uts_string,
98 .strategy = sysctl_uts_string,
99 },
100 {
101 .ctl_name = KERN_VERSION,
102 .procname = "version",
103 .data = init_uts_ns.name.version,
104 .maxlen = sizeof(init_uts_ns.name.version),
105 .mode = 0444,
106 .proc_handler = proc_do_uts_string,
107 .strategy = sysctl_uts_string,
108 },
109 {
110 .ctl_name = KERN_NODENAME,
111 .procname = "hostname",
112 .data = init_uts_ns.name.nodename,
113 .maxlen = sizeof(init_uts_ns.name.nodename),
114 .mode = 0644,
115 .proc_handler = proc_do_uts_string,
116 .strategy = sysctl_uts_string,
117 },
118 {
119 .ctl_name = KERN_DOMAINNAME,
120 .procname = "domainname",
121 .data = init_uts_ns.name.domainname,
122 .maxlen = sizeof(init_uts_ns.name.domainname),
123 .mode = 0644,
124 .proc_handler = proc_do_uts_string,
125 .strategy = sysctl_uts_string,
126 },
127 {}
128};
129
130static struct ctl_table uts_root_table[] = {
131 {
132 .ctl_name = CTL_KERN,
133 .procname = "kernel",
134 .mode = 0555,
135 .child = uts_kern_table,
136 },
137 {}
138};
139
140static int __init utsname_sysctl_init(void)
141{
142 register_sysctl_table(uts_root_table);
143 return 0;
144}
145
146__initcall(utsname_sysctl_init);
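
The subtle part of the new file is get_uts(): every .data pointer in the table refers to a field of init_uts_ns, and with CONFIG_UTS_NS the function rebases that pointer onto the current task's uts_namespace by reusing the field's byte offset. A self-contained userspace sketch of the same pointer-rebasing idiom with a made-up structure:

	#include <stdio.h>

	struct ns {				/* stand-in for struct uts_namespace */
		char sysname[16];
		char nodename[16];
	};

	static struct ns init_ns = { "Linux", "init-host" };
	static struct ns task_ns = { "Linux", "task-host" };

	/* rebase a pointer to a field of init_ns onto the same field of another ns */
	static char *rebase(char *field_in_init, struct ns *current_ns)
	{
		return (field_in_init - (char *)&init_ns) + (char *)current_ns;
	}

	int main(void)
	{
		char *which = init_ns.nodename;		/* what the ctl_table's .data holds */

		printf("init view:   %s\n", which);
		printf("per-ns view: %s\n", rebase(which, &task_ns));
		return 0;
	}
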
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a3da07c5af28..b6fa5e63085d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
218} 218}
219EXPORT_SYMBOL_GPL(queue_work); 219EXPORT_SYMBOL_GPL(queue_work);
220 220
221static void delayed_work_timer_fn(unsigned long __data) 221void delayed_work_timer_fn(unsigned long __data)
222{ 222{
223 struct delayed_work *dwork = (struct delayed_work *)__data; 223 struct delayed_work *dwork = (struct delayed_work *)__data;
224 struct workqueue_struct *wq = get_wq_data(&dwork->work); 224 struct workqueue_struct *wq = get_wq_data(&dwork->work);
@@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
245 struct timer_list *timer = &dwork->timer; 245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work; 246 struct work_struct *work = &dwork->work;
247 247
248 timer_stats_timer_set_start_info(timer);
248 if (delay == 0) 249 if (delay == 0)
249 return queue_work(wq, work); 250 return queue_work(wq, work);
250 251
@@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work);
593 * After waiting for a given time this puts a job in the kernel-global 594 * After waiting for a given time this puts a job in the kernel-global
594 * workqueue. 595 * workqueue.
595 */ 596 */
596int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) 597int fastcall schedule_delayed_work(struct delayed_work *dwork,
598 unsigned long delay)
597{ 599{
600 timer_stats_timer_set_start_info(&dwork->timer);
598 return queue_delayed_work(keventd_wq, dwork, delay); 601 return queue_delayed_work(keventd_wq, dwork, delay);
599} 602}
600EXPORT_SYMBOL(schedule_delayed_work); 603EXPORT_SYMBOL(schedule_delayed_work);
@@ -656,8 +659,7 @@ void flush_scheduled_work(void)
656EXPORT_SYMBOL(flush_scheduled_work); 659EXPORT_SYMBOL(flush_scheduled_work);
657 660
658/** 661/**
659 * cancel_rearming_delayed_workqueue - reliably kill off a delayed 662 * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work.
660 * work whose handler rearms the delayed work.
661 * @wq: the controlling workqueue structure 663 * @wq: the controlling workqueue structure
662 * @dwork: the delayed work struct 664 * @dwork: the delayed work struct
663 */ 665 */
@@ -670,8 +672,7 @@ void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
670EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); 672EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
671 673
672/** 674/**
673 * cancel_rearming_delayed_work - reliably kill off a delayed keventd 675 * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work.
674 * work whose handler rearms the delayed work.
675 * @dwork: the delayed work struct 676 * @dwork: the delayed work struct
676 */ 677 */
677void cancel_rearming_delayed_work(struct delayed_work *dwork) 678void cancel_rearming_delayed_work(struct delayed_work *dwork)
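
Finally, the workqueue changes thread the timer-statistics tagging through queue_delayed_work() and schedule_delayed_work() and drop the static qualifier from delayed_work_timer_fn(). As a reminder of the caller-side API these functions serve, here is a hedged sketch under the era's delayed-work interface; the work item, its handler and the one-second delay are hypothetical:

	#include <linux/workqueue.h>
	#include <linux/jiffies.h>

	static void example_work_fn(struct work_struct *work)
	{
		/* deferred, sleepable work; runs from the kernel-global workqueue */
	}

	static DECLARE_DELAYED_WORK(example_dwork, example_work_fn);

	static void example_start(void)
	{
		/* run example_work_fn() roughly one second from now */
		schedule_delayed_work(&example_dwork, HZ);
	}

	static void example_stop(void)
	{
		/* reliably stop a delayed work whose handler may re-arm itself */
		cancel_rearming_delayed_work(&example_dwork);
	}
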