Diffstat (limited to 'kernel')
65 files changed, 5782 insertions, 1435 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 14f4d45e0ae9..ac6b27abb1ad 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
diff --git a/kernel/audit.c b/kernel/audit.c
index d9b690ac684b..76c9a11b72d6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2,7 +2,7 @@
  * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
  * System-call specific features have moved to auditsc.c
  *
- * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
@@ -65,7 +65,9 @@
  * (Initialization happens after skb_init is called.) */
 static int audit_initialized;
 
-/* No syscall auditing will take place unless audit_enabled != 0. */
+/* 0 - no auditing
+ * 1 - auditing enabled
+ * 2 - auditing enabled and configuration is locked/unchangeable. */
 int audit_enabled;
 
 /* Default state when kernel boots without any parameters. */
@@ -239,102 +241,150 @@ void audit_log_lost(const char *message)
 
 static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
 {
-    int old = audit_rate_limit;
+    int res, rc = 0, old = audit_rate_limit;
+
+    /* check if we are locked */
+    if (audit_enabled == 2)
+        res = 0;
+    else
+        res = 1;
 
     if (sid) {
         char *ctx = NULL;
         u32 len;
-        int rc;
-        if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
-            return rc;
-        else
-            audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-                "audit_rate_limit=%d old=%d by auid=%u subj=%s",
-                limit, old, loginuid, ctx);
-        kfree(ctx);
-    } else
-        audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-            "audit_rate_limit=%d old=%d by auid=%u",
-            limit, old, loginuid);
-    audit_rate_limit = limit;
-    return 0;
+        if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
+            audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+                "audit_rate_limit=%d old=%d by auid=%u"
+                " subj=%s res=%d",
+                limit, old, loginuid, ctx, res);
+            kfree(ctx);
+        } else
+            res = 0; /* Something weird, deny request */
+    }
+    audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+        "audit_rate_limit=%d old=%d by auid=%u res=%d",
+        limit, old, loginuid, res);
+
+    /* If we are allowed, make the change */
+    if (res == 1)
+        audit_rate_limit = limit;
+    /* Not allowed, update reason */
+    else if (rc == 0)
+        rc = -EPERM;
+    return rc;
 }
 
 static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
 {
-    int old = audit_backlog_limit;
+    int res, rc = 0, old = audit_backlog_limit;
+
+    /* check if we are locked */
+    if (audit_enabled == 2)
+        res = 0;
+    else
+        res = 1;
 
     if (sid) {
         char *ctx = NULL;
         u32 len;
-        int rc;
-        if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
-            return rc;
-        else
-            audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-                "audit_backlog_limit=%d old=%d by auid=%u subj=%s",
-                limit, old, loginuid, ctx);
-        kfree(ctx);
-    } else
-        audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-            "audit_backlog_limit=%d old=%d by auid=%u",
-            limit, old, loginuid);
-    audit_backlog_limit = limit;
-    return 0;
+        if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
+            audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+                "audit_backlog_limit=%d old=%d by auid=%u"
+                " subj=%s res=%d",
+                limit, old, loginuid, ctx, res);
+            kfree(ctx);
+        } else
+            res = 0; /* Something weird, deny request */
+    }
+    audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+        "audit_backlog_limit=%d old=%d by auid=%u res=%d",
+        limit, old, loginuid, res);
+
+    /* If we are allowed, make the change */
+    if (res == 1)
+        audit_backlog_limit = limit;
+    /* Not allowed, update reason */
+    else if (rc == 0)
+        rc = -EPERM;
+    return rc;
 }
 
 static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 {
-    int old = audit_enabled;
+    int res, rc = 0, old = audit_enabled;
 
-    if (state != 0 && state != 1)
+    if (state < 0 || state > 2)
         return -EINVAL;
 
+    /* check if we are locked */
+    if (audit_enabled == 2)
+        res = 0;
+    else
+        res = 1;
+
     if (sid) {
         char *ctx = NULL;
         u32 len;
-        int rc;
-        if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
-            return rc;
-        else
-            audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-                "audit_enabled=%d old=%d by auid=%u subj=%s",
-                state, old, loginuid, ctx);
-        kfree(ctx);
-    } else
-        audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-            "audit_enabled=%d old=%d by auid=%u",
-            state, old, loginuid);
-    audit_enabled = state;
-    return 0;
+        if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
+            audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+                "audit_enabled=%d old=%d by auid=%u"
+                " subj=%s res=%d",
+                state, old, loginuid, ctx, res);
+            kfree(ctx);
+        } else
+            res = 0; /* Something weird, deny request */
+    }
+    audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+        "audit_enabled=%d old=%d by auid=%u res=%d",
+        state, old, loginuid, res);
+
+    /* If we are allowed, make the change */
+    if (res == 1)
+        audit_enabled = state;
+    /* Not allowed, update reason */
+    else if (rc == 0)
+        rc = -EPERM;
+    return rc;
 }
 
 static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 {
-    int old = audit_failure;
+    int res, rc = 0, old = audit_failure;
 
     if (state != AUDIT_FAIL_SILENT
         && state != AUDIT_FAIL_PRINTK
         && state != AUDIT_FAIL_PANIC)
         return -EINVAL;
 
+    /* check if we are locked */
+    if (audit_enabled == 2)
+        res = 0;
+    else
+        res = 1;
+
     if (sid) {
         char *ctx = NULL;
         u32 len;
-        int rc;
-        if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
-            return rc;
-        else
-            audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-                "audit_failure=%d old=%d by auid=%u subj=%s",
-                state, old, loginuid, ctx);
-        kfree(ctx);
-    } else
-        audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-            "audit_failure=%d old=%d by auid=%u",
-            state, old, loginuid);
-    audit_failure = state;
-    return 0;
+        if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
+            audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+                "audit_failure=%d old=%d by auid=%u"
+                " subj=%s res=%d",
+                state, old, loginuid, ctx, res);
+            kfree(ctx);
+        } else
+            res = 0; /* Something weird, deny request */
+    }
+    audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+        "audit_failure=%d old=%d by auid=%u res=%d",
+        state, old, loginuid, res);
+
+    /* If we are allowed, make the change */
+    if (res == 1)
+        audit_failure = state;
+    /* Not allowed, update reason */
+    else if (rc == 0)
+        rc = -EPERM;
+    return rc;
 }
 
 static int kauditd_thread(void *dummy)
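Note: all four audit_set_*() helpers above follow one protocol: res records whether the change is permitted (0 once the configuration is locked with audit_enabled == 2), rc carries a real error from selinux_sid_to_string(), the attempt is always logged with res=%d, and only a denied but otherwise error-free request becomes -EPERM. A compact restatement of that decision logic, as an illustrative sketch (the helper names are hypothetical, not part of this patch):

    extern int audit_enabled;

    /* Sketch only: 1 if configuration changes are currently permitted. */
    static int audit_change_permitted(void)
    {
        return audit_enabled != 2;
    }

    /* Sketch only: the final return value derived from (res, rc) above. */
    static int audit_change_result(int res, int rc)
    {
        if (rc)                       /* a real error always wins */
            return rc;
        return res ? 0 : -EPERM;      /* denied by lockdown -> -EPERM */
    }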
@@ -599,6 +649,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
     case AUDIT_DEL:
         if (nlmsg_len(nlh) < sizeof(struct audit_rule))
             return -EINVAL;
+        if (audit_enabled == 2) {
+            ab = audit_log_start(NULL, GFP_KERNEL,
+                    AUDIT_CONFIG_CHANGE);
+            if (ab) {
+                audit_log_format(ab,
+                        "pid=%d uid=%u auid=%u",
+                        pid, uid, loginuid);
+                if (sid) {
+                    if (selinux_sid_to_string(
+                            sid, &ctx, &len)) {
+                        audit_log_format(ab,
+                                " ssid=%u", sid);
+                        /* Maybe call audit_panic? */
+                    } else
+                        audit_log_format(ab,
+                                " subj=%s", ctx);
+                    kfree(ctx);
+                }
+                audit_log_format(ab, " audit_enabled=%d res=0",
+                    audit_enabled);
+                audit_log_end(ab);
+            }
+            return -EPERM;
+        }
         /* fallthrough */
     case AUDIT_LIST:
         err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
@@ -609,6 +683,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
     case AUDIT_DEL_RULE:
         if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
             return -EINVAL;
+        if (audit_enabled == 2) {
+            ab = audit_log_start(NULL, GFP_KERNEL,
+                    AUDIT_CONFIG_CHANGE);
+            if (ab) {
+                audit_log_format(ab,
+                        "pid=%d uid=%u auid=%u",
+                        pid, uid, loginuid);
+                if (sid) {
+                    if (selinux_sid_to_string(
+                            sid, &ctx, &len)) {
+                        audit_log_format(ab,
+                                " ssid=%u", sid);
+                        /* Maybe call audit_panic? */
+                    } else
+                        audit_log_format(ab,
+                                " subj=%s", ctx);
+                    kfree(ctx);
+                }
+                audit_log_format(ab, " audit_enabled=%d res=0",
+                    audit_enabled);
+                audit_log_end(ab);
+            }
+            return -EPERM;
+        }
         /* fallthrough */
     case AUDIT_LIST_RULES:
         err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
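Note: the deny-and-log block is duplicated verbatim for the AUDIT_DEL and AUDIT_DEL_RULE cases. Once the configuration is locked (reached from userspace, for example, with auditctl -e 2), any rule change is logged with res=0 and refused with -EPERM until reboot. A sketch of the duplicated block factored into a single helper (hypothetical name, not part of this patch):

    /* Hypothetical helper: log a refused rule change and deny it. */
    static int audit_refuse_locked(int pid, uid_t uid, uid_t loginuid, u32 sid)
    {
        struct audit_buffer *ab;
        char *ctx = NULL;
        u32 len;

        ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (ab) {
            audit_log_format(ab, "pid=%d uid=%u auid=%u",
                     pid, uid, loginuid);
            if (sid) {
                if (selinux_sid_to_string(sid, &ctx, &len))
                    audit_log_format(ab, " ssid=%u", sid);
                else
                    audit_log_format(ab, " subj=%s", ctx);
                kfree(ctx);
            }
            audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
            audit_log_end(ab);
        }
        return -EPERM;
    }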
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9c8c23227c7f..3749193aed8c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -937,9 +937,10 @@ static void audit_update_watch(struct audit_parent *parent,
     }
 
     ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
-    audit_log_format(ab, "audit updated rules specifying path=");
+    audit_log_format(ab, "op=updated rules specifying path=");
     audit_log_untrustedstring(ab, owatch->path);
     audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
+    audit_log_format(ab, " list=%d res=1", r->listnr);
     audit_log_end(ab);
 
     audit_remove_watch(owatch);
@@ -969,14 +970,14 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
         e = container_of(r, struct audit_entry, rule);
 
         ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
-        audit_log_format(ab, "audit implicitly removed rule path=");
+        audit_log_format(ab, "op=remove rule path=");
         audit_log_untrustedstring(ab, w->path);
         if (r->filterkey) {
             audit_log_format(ab, " key=");
             audit_log_untrustedstring(ab, r->filterkey);
         } else
             audit_log_format(ab, " key=(null)");
-        audit_log_format(ab, " list=%d", r->listnr);
+        audit_log_format(ab, " list=%d res=1", r->listnr);
         audit_log_end(ab);
 
         list_del(&r->rlist);
@@ -1410,7 +1411,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
         audit_log_format(ab, " subj=%s", ctx);
         kfree(ctx);
     }
-    audit_log_format(ab, " %s rule key=", action);
+    audit_log_format(ab, " op=%s rule key=", action);
     if (rule->filterkey)
         audit_log_untrustedstring(ab, rule->filterkey);
     else
@@ -1601,8 +1602,8 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
 
 int audit_filter_user(struct netlink_skb_parms *cb, int type)
 {
+    enum audit_state state = AUDIT_DISABLED;
     struct audit_entry *e;
-    enum audit_state state;
     int ret = 1;
 
     rcu_read_lock();
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 298897559ca4..359955800dd2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -170,6 +170,11 @@ struct audit_aux_data_sockaddr {
     char a[0];
 };
 
+struct audit_aux_data_fd_pair {
+    struct audit_aux_data d;
+    int fd[2];
+};
+
 struct audit_aux_data_path {
     struct audit_aux_data d;
     struct dentry *dentry;
@@ -961,6 +966,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
         audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
         break; }
 
+    case AUDIT_FD_PAIR: {
+        struct audit_aux_data_fd_pair *axs = (void *)aux;
+        audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
+        break; }
+
     }
     audit_log_end(ab);
 }
@@ -1815,6 +1825,36 @@ int audit_socketcall(int nargs, unsigned long *args)
 }
 
 /**
+ * __audit_fd_pair - record audit data for pipe and socketpair
+ * @fd1: the first file descriptor
+ * @fd2: the second file descriptor
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_fd_pair(int fd1, int fd2)
+{
+    struct audit_context *context = current->audit_context;
+    struct audit_aux_data_fd_pair *ax;
+
+    if (likely(!context)) {
+        return 0;
+    }
+
+    ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+    if (!ax) {
+        return -ENOMEM;
+    }
+
+    ax->fd[0] = fd1;
+    ax->fd[1] = fd2;
+
+    ax->d.type = AUDIT_FD_PAIR;
+    ax->d.next = context->aux;
+    context->aux = (void *)ax;
+    return 0;
+}
+
+/**
  * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
  * @len: data length in user space
  * @a: data address in kernel space
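Note: call sites such as pipe() and socketpair() would reach __audit_fd_pair() through a header-side wrapper that skips the call when no audit context exists. That wrapper lives in include/linux/audit.h, outside this section, so the sketch below is an assumption about its usual shape rather than part of the patch:

    /* Assumed shape of the include/linux/audit.h wrapper (not shown here). */
    static inline int audit_fd_pair(int fd1, int fd2)
    {
        if (unlikely(current->audit_context))
            return __audit_fd_pair(fd1, fd2);
        return 0;
    }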
diff --git a/kernel/capability.c b/kernel/capability.c
index edb845a6e84a..c8d3c7762034 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -92,15 +92,17 @@ out:
  * cap_set_pg - set capabilities for all processes in a given process
  * group.  We call this holding task_capability_lock and tasklist_lock.
  */
-static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
+static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
                  kernel_cap_t *inheritable,
                  kernel_cap_t *permitted)
 {
     struct task_struct *g, *target;
     int ret = -EPERM;
     int found = 0;
+    struct pid *pgrp;
 
-    do_each_task_pid(pgrp, PIDTYPE_PGID, g) {
+    pgrp = find_pid(pgrp_nr);
+    do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
         target = g;
         while_each_thread(g, target) {
             if (!security_capset_check(target, effective,
@@ -113,7 +115,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
             }
             found = 1;
         }
-    } while_each_task_pid(pgrp, PIDTYPE_PGID, g);
+    } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
 
     if (!found)
         ret = 0;
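Note: cap_set_pg() now resolves the numeric process-group id to a struct pid once, then walks the attached tasks. My reading (an assumption, not stated in the hunk) is that find_pid() returns NULL when no such id exists and do_each_pid_task() then iterates nothing, so the existing "!found -> ret = 0" path keeps its old behavior:

    /* Illustrative shape of the conversion. */
    struct pid *pgrp = find_pid(pgrp_nr);   /* NULL if the id is unused */
    struct task_struct *g;

    do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
        /* visits each group member; body never runs for a NULL pgrp */
    } while_each_pid_task(pgrp, PIDTYPE_PGID, g);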
diff --git a/kernel/compat.c b/kernel/compat.c
index 6952dd057300..cebb4c28c039 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1016,3 +1016,69 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
     return sys_migrate_pages(pid, nr_bits + 1, old, new);
 }
 #endif
+
+struct compat_sysinfo {
+    s32 uptime;
+    u32 loads[3];
+    u32 totalram;
+    u32 freeram;
+    u32 sharedram;
+    u32 bufferram;
+    u32 totalswap;
+    u32 freeswap;
+    u16 procs;
+    u16 pad;
+    u32 totalhigh;
+    u32 freehigh;
+    u32 mem_unit;
+    char _f[20-2*sizeof(u32)-sizeof(int)];
+};
+
+asmlinkage long
+compat_sys_sysinfo(struct compat_sysinfo __user *info)
+{
+    struct sysinfo s;
+
+    do_sysinfo(&s);
+
+    /* Check to see if any memory value is too large for 32-bit and scale
+     * down if needed
+     */
+    if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+        int bitcount = 0;
+
+        while (s.mem_unit < PAGE_SIZE) {
+            s.mem_unit <<= 1;
+            bitcount++;
+        }
+
+        s.totalram >>= bitcount;
+        s.freeram >>= bitcount;
+        s.sharedram >>= bitcount;
+        s.bufferram >>= bitcount;
+        s.totalswap >>= bitcount;
+        s.freeswap >>= bitcount;
+        s.totalhigh >>= bitcount;
+        s.freehigh >>= bitcount;
+    }
+
+    if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
+        __put_user (s.uptime, &info->uptime) ||
+        __put_user (s.loads[0], &info->loads[0]) ||
+        __put_user (s.loads[1], &info->loads[1]) ||
+        __put_user (s.loads[2], &info->loads[2]) ||
+        __put_user (s.totalram, &info->totalram) ||
+        __put_user (s.freeram, &info->freeram) ||
+        __put_user (s.sharedram, &info->sharedram) ||
+        __put_user (s.bufferram, &info->bufferram) ||
+        __put_user (s.totalswap, &info->totalswap) ||
+        __put_user (s.freeswap, &info->freeswap) ||
+        __put_user (s.procs, &info->procs) ||
+        __put_user (s.totalhigh, &info->totalhigh) ||
+        __put_user (s.freehigh, &info->freehigh) ||
+        __put_user (s.mem_unit, &info->mem_unit))
+        return -EFAULT;
+
+    return 0;
+}
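Note: the scaling branch only triggers when a counter exceeds 32 bits with mem_unit == 1. It grows mem_unit up to PAGE_SIZE (doubling it bitcount times) while shifting every counter right by the same bitcount, so counter * mem_unit stays constant. A standalone userspace demo of the arithmetic (assumes a 4096-byte page, as on most ports):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t totalram = 8ULL << 30;   /* 8 GiB expressed in 1-byte units */
        uint32_t mem_unit = 1, page_size = 4096;
        int bitcount = 0;

        if (totalram >> 32) {             /* too large for a u32 */
            while (mem_unit < page_size) {
                mem_unit <<= 1;
                bitcount++;               /* ends at 12 for 4096 */
            }
            totalram >>= bitcount;        /* 2^33 >> 12 == 2^21 */
        }
        /* prints: mem_unit=4096 totalram=2097152 */
        printf("mem_unit=%u totalram=%llu\n", mem_unit,
               (unsigned long long)totalram);
        return 0;
    }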
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7406fe6966f9..3d4206ada5c9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -309,6 +309,8 @@ void enable_nonboot_cpus(void)
     mutex_lock(&cpu_add_remove_lock);
     cpu_hotplug_disabled = 0;
     mutex_unlock(&cpu_add_remove_lock);
+    if (cpus_empty(frozen_cpus))
+        return;
 
     printk("Enabling non-boot CPUs ...\n");
     for_each_cpu_mask(cpu, frozen_cpus) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6b05dc69c959..f382b0f775e1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1540,7 +1540,7 @@ static const struct file_operations cpuset_file_operations = {
     .release = cpuset_file_release,
 };
 
-static struct inode_operations cpuset_dir_inode_operations = {
+static const struct inode_operations cpuset_dir_inode_operations = {
     .lookup = simple_lookup,
     .mkdir = cpuset_mkdir,
     .rmdir = cpuset_rmdir,
@@ -2656,7 +2656,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
     return single_open(file, proc_cpuset_show, pid);
 }
 
-struct file_operations proc_cpuset_operations = {
+const struct file_operations proc_cpuset_operations = {
     .open = cpuset_open,
     .read = seq_read,
     .llseek = seq_lseek,
diff --git a/kernel/exit.c b/kernel/exit.c
index fec12eb12471..f132349c0325 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -185,21 +185,19 @@ repeat:
  * This checks not only the pgrp, but falls back on the pid if no
  * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
  * without this...
+ *
+ * The caller must hold rcu lock or the tasklist lock.
  */
-int session_of_pgrp(int pgrp)
+struct pid *session_of_pgrp(struct pid *pgrp)
 {
     struct task_struct *p;
-    int sid = 0;
-
-    read_lock(&tasklist_lock);
+    struct pid *sid = NULL;
 
-    p = find_task_by_pid_type(PIDTYPE_PGID, pgrp);
+    p = pid_task(pgrp, PIDTYPE_PGID);
     if (p == NULL)
-        p = find_task_by_pid(pgrp);
+        p = pid_task(pgrp, PIDTYPE_PID);
     if (p != NULL)
-        sid = process_session(p);
-
-    read_unlock(&tasklist_lock);
+        sid = task_session(p);
 
     return sid;
 }
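Note: session_of_pgrp() no longer takes tasklist_lock itself and now returns a struct pid rather than a numeric sid; per the new comment, locking moves to the caller. A hypothetical call site under the new rule:

    struct pid *sid;

    rcu_read_lock();
    sid = session_of_pgrp(pgrp);    /* may be NULL */
    if (sid) {
        /* use sid only while still inside the RCU read-side section */
    }
    rcu_read_unlock();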
@@ -212,53 +210,52 @@ int session_of_pgrp(int pgrp)
  *
  * "I ask you, have you ever known what it is to be an orphan?"
  */
-static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
+static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
 {
     struct task_struct *p;
     int ret = 1;
 
-    do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+    do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
         if (p == ignored_task
                 || p->exit_state
                 || is_init(p->real_parent))
             continue;
-        if (process_group(p->real_parent) != pgrp &&
-            process_session(p->real_parent) == process_session(p)) {
+        if (task_pgrp(p->real_parent) != pgrp &&
+            task_session(p->real_parent) == task_session(p)) {
             ret = 0;
             break;
         }
-    } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+    } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
     return ret; /* (sighing) "Often!" */
 }
 
-int is_orphaned_pgrp(int pgrp)
+int is_current_pgrp_orphaned(void)
 {
     int retval;
 
     read_lock(&tasklist_lock);
-    retval = will_become_orphaned_pgrp(pgrp, NULL);
+    retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
     read_unlock(&tasklist_lock);
 
     return retval;
 }
 
-static int has_stopped_jobs(int pgrp)
+static int has_stopped_jobs(struct pid *pgrp)
 {
     int retval = 0;
     struct task_struct *p;
 
-    do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+    do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
         if (p->state != TASK_STOPPED)
             continue;
         retval = 1;
         break;
-    } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+    } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
     return retval;
 }
 
 /**
- * reparent_to_init - Reparent the calling kernel thread to the init task
- * of the pid space that the thread belongs to.
+ * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to.
  *
  * If a kernel thread is launched as a result of a system call, or if
  * it ever exits, it should generally reparent itself to init so that
@@ -431,8 +428,10 @@ static void close_files(struct files_struct * files)
         while (set) {
             if (set & 1) {
                 struct file * file = xchg(&fdt->fd[i], NULL);
-                if (file)
+                if (file) {
                     filp_close(file, files);
+                    cond_resched();
+                }
             }
             i++;
             set >>= 1;
@@ -649,14 +648,14 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
      * than we are, and it was the only connection
      * outside, so the child pgrp is now orphaned.
      */
-    if ((process_group(p) != process_group(father)) &&
-        (process_session(p) == process_session(father))) {
-        int pgrp = process_group(p);
+    if ((task_pgrp(p) != task_pgrp(father)) &&
+        (task_session(p) == task_session(father))) {
+        struct pid *pgrp = task_pgrp(p);
 
         if (will_become_orphaned_pgrp(pgrp, NULL) &&
             has_stopped_jobs(pgrp)) {
-            __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
-            __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
+            __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+            __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
         }
     }
 }
@@ -736,6 +735,7 @@ static void exit_notify(struct task_struct *tsk)
     int state;
     struct task_struct *t;
     struct list_head ptrace_dead, *_p, *_n;
+    struct pid *pgrp;
 
     if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
         && !thread_group_empty(tsk)) {
@@ -788,12 +788,13 @@ static void exit_notify(struct task_struct *tsk)
 
     t = tsk->real_parent;
 
-    if ((process_group(t) != process_group(tsk)) &&
-        (process_session(t) == process_session(tsk)) &&
-        will_become_orphaned_pgrp(process_group(tsk), tsk) &&
-        has_stopped_jobs(process_group(tsk))) {
-        __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
-        __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk));
+    pgrp = task_pgrp(tsk);
+    if ((task_pgrp(t) != pgrp) &&
+        (task_session(t) != task_session(tsk)) &&
+        will_become_orphaned_pgrp(pgrp, tsk) &&
+        has_stopped_jobs(pgrp)) {
+        __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+        __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
     }
 
     /* Let father know we died
diff --git a/kernel/fork.c b/kernel/fork.c
index d57118da73ff..d154cc786489 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
     init_sigpending(&sig->shared_pending);
     INIT_LIST_HEAD(&sig->posix_timers);
 
-    hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
+    hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
     sig->it_real_incr.tv64 = 0;
     sig->real_timer.function = it_real_fn;
     sig->tsk = tsk;
@@ -869,7 +869,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
     sig->it_prof_incr = cputime_zero;
 
     sig->leader = 0;    /* session leadership doesn't inherit */
-    sig->tty_old_pgrp = 0;
+    sig->tty_old_pgrp = NULL;
 
     sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
     sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
@@ -1038,10 +1038,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
     p->utime = cputime_zero;
     p->stime = cputime_zero;
     p->sched_time = 0;
+#ifdef CONFIG_TASK_XACCT
     p->rchar = 0;       /* I/O counter: bytes read */
     p->wchar = 0;       /* I/O counter: bytes written */
     p->syscr = 0;       /* I/O counter: read syscalls */
     p->syscw = 0;       /* I/O counter: write syscalls */
+#endif
     task_io_accounting_init(p);
     acct_clear_integrals(p);
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a737de857d3..e749e7df14b1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 
     if (sec != MAX_SCHEDULE_TIMEOUT) {
         to = &timeout;
-        hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+        hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
         hrtimer_init_sleeper(to, current);
         to->timer.expires = ktime_set(sec, nsec);
     }
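Note: the fork.c and futex.c one-liners track the hrtimer rework below, which renames the mode constants HRTIMER_ABS and HRTIMER_REL to HRTIMER_MODE_ABS and HRTIMER_MODE_REL without changing behavior. A minimal sketch of the call pattern at such sites after the rename (callback body and expiry value are illustrative only):

    static enum hrtimer_restart my_timeout_fn(struct hrtimer *t)
    {
        /* one-shot timeout: do the work, do not re-arm */
        return HRTIMER_NORESTART;
    }

    static void arm_example(struct hrtimer *timer)
    {
        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        timer->function = my_timeout_fn;
        timer->expires = ktime_set(1, 0);   /* 1 s, relative */
    }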
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d0ba190dfeb6..476cb0c0b4a4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1,8 +1,9 @@
 /*
  *  linux/kernel/hrtimer.c
  *
- *  Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
- *  Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
+ *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
  *
  *  High-resolution kernel timers
  *
@@ -31,12 +32,17 @@
  */
 
 #include <linux/cpu.h>
+#include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/hrtimer.h>
 #include <linux/notifier.h>
 #include <linux/syscalls.h>
+#include <linux/kallsyms.h>
 #include <linux/interrupt.h>
+#include <linux/tick.h>
+#include <linux/seq_file.h>
+#include <linux/err.h>
 
 #include <asm/uaccess.h>
 
@@ -45,7 +51,7 @@
  *
  * returns the time in ktime_t format
  */
-static ktime_t ktime_get(void)
+ktime_t ktime_get(void)
 {
     struct timespec now;
 
@@ -59,7 +65,7 @@ static ktime_t ktime_get(void)
  *
  * returns the time in ktime_t format
  */
-static ktime_t ktime_get_real(void)
+ktime_t ktime_get_real(void)
 {
     struct timespec now;
 
@@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
  * This ensures that we capture erroneous accesses to these clock ids
  * rather than moving them into the range of valid clock id's.
  */
-
-#define MAX_HRTIMER_BASES 2
-
-static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
+DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
-    {
-        .index = CLOCK_REALTIME,
-        .get_time = &ktime_get_real,
-        .resolution = KTIME_REALTIME_RES,
-    },
-    {
-        .index = CLOCK_MONOTONIC,
-        .get_time = &ktime_get,
-        .resolution = KTIME_MONOTONIC_RES,
-    },
+
+    .clock_base =
+    {
+        {
+            .index = CLOCK_REALTIME,
+            .get_time = &ktime_get_real,
+            .resolution = KTIME_LOW_RES,
+        },
+        {
+            .index = CLOCK_MONOTONIC,
+            .get_time = &ktime_get,
+            .resolution = KTIME_LOW_RES,
+        },
+    }
 };
 
 /**
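Note: the per-CPU layout changes shape here: instead of an array of two independent struct hrtimer_base, each CPU now carries a single struct hrtimer_cpu_base whose clock_base[] array holds the CLOCK_REALTIME and CLOCK_MONOTONIC bases behind one shared lock. Pieced together from the fields this patch touches (the authoritative definition lives in include/linux/hrtimer.h, outside this section), the shape is roughly:

    /* Approximate shape, inferred from usage in this patch. */
    struct hrtimer_cpu_base {
        spinlock_t                lock;         /* guards both clock bases */
        struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
    #ifdef CONFIG_HIGH_RES_TIMERS
        ktime_t                   expires_next; /* next armed event */
        int                       hres_active;  /* highres mode on? */
        struct list_head          cb_pending;   /* softirq callback list */
    #endif
    };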
@@ -102,7 +109,7 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
  *
  * The function calculates the monotonic clock from the realtime
  * clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by ts.
+ * in normalized timespec format in the variable pointed to by @ts.
  */
 void ktime_get_ts(struct timespec *ts)
 {
@@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
  * Get the coarse grained time at the softirq based on xtime and
  * wall_to_monotonic.
  */
-static void hrtimer_get_softirq_time(struct hrtimer_base *base)
+static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 {
     ktime_t xtim, tomono;
+    struct timespec xts;
     unsigned long seq;
 
     do {
         seq = read_seqbegin(&xtime_lock);
-        xtim = timespec_to_ktime(xtime);
-        tomono = timespec_to_ktime(wall_to_monotonic);
-
+#ifdef CONFIG_NO_HZ
+        getnstimeofday(&xts);
+#else
+        xts = xtime;
+#endif
     } while (read_seqretry(&xtime_lock, seq));
 
-    base[CLOCK_REALTIME].softirq_time = xtim;
-    base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono);
+    xtim = timespec_to_ktime(xts);
+    tomono = timespec_to_ktime(wall_to_monotonic);
+    base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
+    base->clock_base[CLOCK_MONOTONIC].softirq_time =
+        ktime_add(xtim, tomono);
+}
+
+/*
+ * Helper function to check, whether the timer is running the callback
+ * function
+ */
+static inline int hrtimer_callback_running(struct hrtimer *timer)
+{
+    return timer->state & HRTIMER_STATE_CALLBACK;
 }
 
 /*
@@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
  */
 #ifdef CONFIG_SMP
 
-#define set_curr_timer(b, t)    do { (b)->curr_timer = (t); } while (0)
-
 /*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
@@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
  * possible to set timer->base = NULL and drop the lock: the timer remains
  * locked.
  */
-static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
-                                              unsigned long *flags)
+static
+struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+                                             unsigned long *flags)
 {
-    struct hrtimer_base *base;
+    struct hrtimer_clock_base *base;
 
     for (;;) {
         base = timer->base;
         if (likely(base != NULL)) {
-            spin_lock_irqsave(&base->lock, *flags);
+            spin_lock_irqsave(&base->cpu_base->lock, *flags);
             if (likely(base == timer->base))
                 return base;
             /* The timer has migrated to another CPU: */
-            spin_unlock_irqrestore(&base->lock, *flags);
+            spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
         }
         cpu_relax();
     }
@@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
 /*
  * Switch the timer base to the current CPU when possible.
  */
-static inline struct hrtimer_base *
-switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
+static inline struct hrtimer_clock_base *
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
 {
-    struct hrtimer_base *new_base;
+    struct hrtimer_clock_base *new_base;
+    struct hrtimer_cpu_base *new_cpu_base;
 
-    new_base = &__get_cpu_var(hrtimer_bases)[base->index];
+    new_cpu_base = &__get_cpu_var(hrtimer_bases);
+    new_base = &new_cpu_base->clock_base[base->index];
 
     if (base != new_base) {
         /*
@@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
          * completed. There is no conflict as we hold the lock until
          * the timer is enqueued.
          */
-        if (unlikely(base->curr_timer == timer))
+        if (unlikely(hrtimer_callback_running(timer)))
             return base;
 
         /* See the comment in lock_timer_base() */
         timer->base = NULL;
-        spin_unlock(&base->lock);
-        spin_lock(&new_base->lock);
+        spin_unlock(&base->cpu_base->lock);
+        spin_lock(&new_base->cpu_base->lock);
         timer->base = new_base;
     }
     return new_base;
@@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
 
 #else /* CONFIG_SMP */
 
-#define set_curr_timer(b, t)    do { } while (0)
-
-static inline struct hrtimer_base *
+static inline struct hrtimer_clock_base *
 lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 {
-    struct hrtimer_base *base = timer->base;
+    struct hrtimer_clock_base *base = timer->base;
 
-    spin_lock_irqsave(&base->lock, *flags);
+    spin_lock_irqsave(&base->cpu_base->lock, *flags);
 
     return base;
 }
 
-#define switch_hrtimer_base(t, b)   (b)
+# define switch_hrtimer_base(t, b)  (b)
 
 #endif /* !CONFIG_SMP */
 
@@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
 
     return ktime_add(kt, tmp);
 }
-
-#else /* CONFIG_KTIME_SCALAR */
-
 # endif /* !CONFIG_KTIME_SCALAR */
 
 /*
  * Divide a ktime value by a nanosecond value
  */
-static unsigned long ktime_divns(const ktime_t kt, s64 div)
+unsigned long ktime_divns(const ktime_t kt, s64 div)
 {
     u64 dclc, inc, dns;
     int sft = 0;
@@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div) | |||
281 | 299 | ||
282 | return (unsigned long) dclc; | 300 | return (unsigned long) dclc; |
283 | } | 301 | } |
284 | |||
285 | #else /* BITS_PER_LONG < 64 */ | ||
286 | # define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) | ||
287 | #endif /* BITS_PER_LONG >= 64 */ | 302 | #endif /* BITS_PER_LONG >= 64 */ |
288 | 303 | ||
304 | /* High resolution timer related functions */ | ||
305 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
306 | |||
307 | /* | ||
308 | * High resolution timer enabled ? | ||
309 | */ | ||
310 | static int hrtimer_hres_enabled __read_mostly = 1; | ||
311 | |||
312 | /* | ||
313 | * Enable / Disable high resolution mode | ||
314 | */ | ||
315 | static int __init setup_hrtimer_hres(char *str) | ||
316 | { | ||
317 | if (!strcmp(str, "off")) | ||
318 | hrtimer_hres_enabled = 0; | ||
319 | else if (!strcmp(str, "on")) | ||
320 | hrtimer_hres_enabled = 1; | ||
321 | else | ||
322 | return 0; | ||
323 | return 1; | ||
324 | } | ||
325 | |||
326 | __setup("highres=", setup_hrtimer_hres); | ||
327 | |||
328 | /* | ||
329 | * hrtimer_high_res_enabled - query, if the highres mode is enabled | ||
330 | */ | ||
331 | static inline int hrtimer_is_hres_enabled(void) | ||
332 | { | ||
333 | return hrtimer_hres_enabled; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Is the high resolution mode active ? | ||
338 | */ | ||
339 | static inline int hrtimer_hres_active(void) | ||
340 | { | ||
341 | return __get_cpu_var(hrtimer_bases).hres_active; | ||
342 | } | ||
343 | |||
344 | /* | ||
345 | * Reprogram the event source with checking both queues for the | ||
346 | * next event | ||
347 | * Called with interrupts disabled and base->lock held | ||
348 | */ | ||
349 | static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) | ||
350 | { | ||
351 | int i; | ||
352 | struct hrtimer_clock_base *base = cpu_base->clock_base; | ||
353 | ktime_t expires; | ||
354 | |||
355 | cpu_base->expires_next.tv64 = KTIME_MAX; | ||
356 | |||
357 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | ||
358 | struct hrtimer *timer; | ||
359 | |||
360 | if (!base->first) | ||
361 | continue; | ||
362 | timer = rb_entry(base->first, struct hrtimer, node); | ||
363 | expires = ktime_sub(timer->expires, base->offset); | ||
364 | if (expires.tv64 < cpu_base->expires_next.tv64) | ||
365 | cpu_base->expires_next = expires; | ||
366 | } | ||
367 | |||
368 | if (cpu_base->expires_next.tv64 != KTIME_MAX) | ||
369 | tick_program_event(cpu_base->expires_next, 1); | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * Shared reprogramming for clock_realtime and clock_monotonic | ||
374 | * | ||
375 | * When a timer is enqueued and expires earlier than the already enqueued | ||
376 | * timers, we have to check, whether it expires earlier than the timer for | ||
377 | * which the clock event device was armed. | ||
378 | * | ||
379 | * Called with interrupts disabled and base->cpu_base.lock held | ||
380 | */ | ||
381 | static int hrtimer_reprogram(struct hrtimer *timer, | ||
382 | struct hrtimer_clock_base *base) | ||
383 | { | ||
384 | ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; | ||
385 | ktime_t expires = ktime_sub(timer->expires, base->offset); | ||
386 | int res; | ||
387 | |||
388 | /* | ||
389 | * When the callback is running, we do not reprogram the clock event | ||
390 | * device. The timer callback is either running on a different CPU or | ||
391 | * the callback is executed in the hrtimer_interupt context. The | ||
392 | * reprogramming is handled either by the softirq, which called the | ||
393 | * callback or at the end of the hrtimer_interrupt. | ||
394 | */ | ||
395 | if (hrtimer_callback_running(timer)) | ||
396 | return 0; | ||
397 | |||
398 | if (expires.tv64 >= expires_next->tv64) | ||
399 | return 0; | ||
400 | |||
401 | /* | ||
402 | * Clockevents returns -ETIME, when the event was in the past. | ||
403 | */ | ||
404 | res = tick_program_event(expires, 0); | ||
405 | if (!IS_ERR_VALUE(res)) | ||
406 | *expires_next = expires; | ||
407 | return res; | ||
408 | } | ||
409 | |||
410 | |||
411 | /* | ||
412 | * Retrigger next event is called after clock was set | ||
413 | * | ||
414 | * Called with interrupts disabled via on_each_cpu() | ||
415 | */ | ||
416 | static void retrigger_next_event(void *arg) | ||
417 | { | ||
418 | struct hrtimer_cpu_base *base; | ||
419 | struct timespec realtime_offset; | ||
420 | unsigned long seq; | ||
421 | |||
422 | if (!hrtimer_hres_active()) | ||
423 | return; | ||
424 | |||
425 | do { | ||
426 | seq = read_seqbegin(&xtime_lock); | ||
427 | set_normalized_timespec(&realtime_offset, | ||
428 | -wall_to_monotonic.tv_sec, | ||
429 | -wall_to_monotonic.tv_nsec); | ||
430 | } while (read_seqretry(&xtime_lock, seq)); | ||
431 | |||
432 | base = &__get_cpu_var(hrtimer_bases); | ||
433 | |||
434 | /* Adjust CLOCK_REALTIME offset */ | ||
435 | spin_lock(&base->lock); | ||
436 | base->clock_base[CLOCK_REALTIME].offset = | ||
437 | timespec_to_ktime(realtime_offset); | ||
438 | |||
439 | hrtimer_force_reprogram(base); | ||
440 | spin_unlock(&base->lock); | ||
441 | } | ||
442 | |||
443 | /* | ||
444 | * Clock realtime was set | ||
445 | * | ||
446 | * Change the offset of the realtime clock vs. the monotonic | ||
447 | * clock. | ||
448 | * | ||
449 | * We might have to reprogram the high resolution timer interrupt. On | ||
450 | * SMP we call the architecture specific code to retrigger _all_ high | ||
451 | * resolution timer interrupts. On UP we just disable interrupts and | ||
452 | * call the high resolution interrupt code. | ||
453 | */ | ||
454 | void clock_was_set(void) | ||
455 | { | ||
456 | /* Retrigger the CPU local events everywhere */ | ||
457 | on_each_cpu(retrigger_next_event, NULL, 0, 1); | ||
458 | } | ||
459 | |||
460 | /* | ||
461 | * Check, whether the timer is on the callback pending list | ||
462 | */ | ||
463 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
464 | { | ||
465 | return timer->state & HRTIMER_STATE_PENDING; | ||
466 | } | ||
467 | |||
468 | /* | ||
469 | * Remove a timer from the callback pending list | ||
470 | */ | ||
471 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
472 | { | ||
473 | list_del_init(&timer->cb_entry); | ||
474 | } | ||
475 | |||
476 | /* | ||
477 | * Initialize the high resolution related parts of cpu_base | ||
478 | */ | ||
479 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | ||
480 | { | ||
481 | base->expires_next.tv64 = KTIME_MAX; | ||
482 | base->hres_active = 0; | ||
483 | INIT_LIST_HEAD(&base->cb_pending); | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Initialize the high resolution related parts of a hrtimer | ||
488 | */ | ||
489 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | ||
490 | { | ||
491 | INIT_LIST_HEAD(&timer->cb_entry); | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * When High resolution timers are active, try to reprogram. Note, that in case | ||
496 | * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry | ||
497 | * check happens. The timer gets enqueued into the rbtree. The reprogramming | ||
498 | * and expiry check is done in the hrtimer_interrupt or in the softirq. | ||
499 | */ | ||
500 | static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | ||
501 | struct hrtimer_clock_base *base) | ||
502 | { | ||
503 | if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { | ||
504 | |||
505 | /* Timer is expired, act upon the callback mode */ | ||
506 | switch(timer->cb_mode) { | ||
507 | case HRTIMER_CB_IRQSAFE_NO_RESTART: | ||
508 | /* | ||
509 | * We can call the callback from here. No restart | ||
510 | * happens, so no danger of recursion | ||
511 | */ | ||
512 | BUG_ON(timer->function(timer) != HRTIMER_NORESTART); | ||
513 | return 1; | ||
514 | case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: | ||
515 | /* | ||
516 | * This is solely for the sched tick emulation with | ||
517 | * dynamic tick support to ensure that we do not | ||
518 | * restart the tick right on the edge and end up with | ||
519 | * the tick timer in the softirq ! The calling site | ||
520 | * takes care of this. | ||
521 | */ | ||
522 | return 1; | ||
523 | case HRTIMER_CB_IRQSAFE: | ||
524 | case HRTIMER_CB_SOFTIRQ: | ||
525 | /* | ||
526 | * Move everything else into the softirq pending list ! | ||
527 | */ | ||
528 | list_add_tail(&timer->cb_entry, | ||
529 | &base->cpu_base->cb_pending); | ||
530 | timer->state = HRTIMER_STATE_PENDING; | ||
531 | raise_softirq(HRTIMER_SOFTIRQ); | ||
532 | return 1; | ||
533 | default: | ||
534 | BUG(); | ||
535 | } | ||
536 | } | ||
537 | return 0; | ||
538 | } | ||
539 | |||
540 | /* | ||
541 | * Switch to high resolution mode | ||
542 | */ | ||
543 | static void hrtimer_switch_to_hres(void) | ||
544 | { | ||
545 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | ||
546 | unsigned long flags; | ||
547 | |||
548 | if (base->hres_active) | ||
549 | return; | ||
550 | |||
551 | local_irq_save(flags); | ||
552 | |||
553 | if (tick_init_highres()) { | ||
554 | local_irq_restore(flags); | ||
555 | return; | ||
556 | } | ||
557 | base->hres_active = 1; | ||
558 | base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; | ||
559 | base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; | ||
560 | |||
561 | tick_setup_sched_timer(); | ||
562 | |||
563 | /* "Retrigger" the interrupt to get things going */ | ||
564 | retrigger_next_event(NULL); | ||
565 | local_irq_restore(flags); | ||
566 | printk(KERN_INFO "Switched to high resolution mode on CPU %d\n", | ||
567 | smp_processor_id()); | ||
568 | } | ||
569 | |||
570 | #else | ||
571 | |||
572 | static inline int hrtimer_hres_active(void) { return 0; } | ||
573 | static inline int hrtimer_is_hres_enabled(void) { return 0; } | ||
574 | static inline void hrtimer_switch_to_hres(void) { } | ||
575 | static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } | ||
576 | static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | ||
577 | struct hrtimer_clock_base *base) | ||
578 | { | ||
579 | return 0; | ||
580 | } | ||
581 | static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } | ||
582 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } | ||
583 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | ||
584 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | ||
585 | |||
586 | #endif /* CONFIG_HIGH_RES_TIMERS */ | ||
587 | |||
588 | #ifdef CONFIG_TIMER_STATS | ||
589 | void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) | ||
590 | { | ||
591 | if (timer->start_site) | ||
592 | return; | ||
593 | |||
594 | timer->start_site = addr; | ||
595 | memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); | ||
596 | timer->start_pid = current->pid; | ||
597 | } | ||
598 | #endif | ||
599 | |||
289 | /* | 600 | /* |
290 | * Counterpart to lock_hrtimer_base above: | 601 | * Counterpart to lock_hrtimer_base above: |
291 | */ | 602 | */ |
292 | static inline | 603 | static inline |
293 | void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | 604 | void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) |
294 | { | 605 | { |
295 | spin_unlock_irqrestore(&timer->base->lock, *flags); | 606 | spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); |
296 | } | 607 | } |
297 | 608 | ||
298 | /** | 609 | /** |
@@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) | |||
342 | * The timer is inserted in expiry order. Insertion into the | 653 | * The timer is inserted in expiry order. Insertion into the |
343 | * red black tree is O(log(n)). Must hold the base lock. | 654 | * red black tree is O(log(n)). Must hold the base lock. |
344 | */ | 655 | */ |
345 | static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | 656 | static void enqueue_hrtimer(struct hrtimer *timer, |
657 | struct hrtimer_clock_base *base, int reprogram) | ||
346 | { | 658 | { |
347 | struct rb_node **link = &base->active.rb_node; | 659 | struct rb_node **link = &base->active.rb_node; |
348 | struct rb_node *parent = NULL; | 660 | struct rb_node *parent = NULL; |
@@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
368 | * Insert the timer to the rbtree and check whether it | 680 | * Insert the timer to the rbtree and check whether it |
369 | * replaces the first pending timer | 681 | * replaces the first pending timer |
370 | */ | 682 | */ |
371 | rb_link_node(&timer->node, parent, link); | ||
372 | rb_insert_color(&timer->node, &base->active); | ||
373 | |||
374 | if (!base->first || timer->expires.tv64 < | 683 | if (!base->first || timer->expires.tv64 < |
375 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) | 684 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) { |
685 | /* | ||
686 | * Reprogram the clock event device. When the timer is already | ||
687 | * expired hrtimer_enqueue_reprogram has either called the | ||
688 | * callback or added it to the pending list and raised the | ||
689 | * softirq. | ||
690 | * | ||
691 | * This is a NOP for !HIGHRES | ||
692 | */ | ||
693 | if (reprogram && hrtimer_enqueue_reprogram(timer, base)) | ||
694 | return; | ||
695 | |||
376 | base->first = &timer->node; | 696 | base->first = &timer->node; |
697 | } | ||
698 | |||
699 | rb_link_node(&timer->node, parent, link); | ||
700 | rb_insert_color(&timer->node, &base->active); | ||
701 | /* | ||
702 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the | ||
703 | * state of a possibly running callback. | ||
704 | */ | ||
705 | timer->state |= HRTIMER_STATE_ENQUEUED; | ||
377 | } | 706 | } |
378 | 707 | ||
379 | /* | 708 | /* |
380 | * __remove_hrtimer - internal function to remove a timer | 709 | * __remove_hrtimer - internal function to remove a timer |
381 | * | 710 | * |
382 | * Caller must hold the base lock. | 711 | * Caller must hold the base lock. |
712 | * | ||
713 | * High resolution timer mode reprograms the clock event device when the | ||
714 | * timer is the one which expires next. The caller can disable this by setting | ||
715 | * reprogram to zero. This is useful when the context does a reprogramming | ||
716 | * anyway (e.g. timer interrupt) | ||
383 | */ | 717 | */ |
384 | static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | 718 | static void __remove_hrtimer(struct hrtimer *timer, |
719 | struct hrtimer_clock_base *base, | ||
720 | unsigned long newstate, int reprogram) | ||
385 | { | 721 | { |
386 | /* | 722 | /* High res. callback list. NOP for !HIGHRES */ |
387 | * Remove the timer from the rbtree and replace the | 723 | if (hrtimer_cb_pending(timer)) |
388 | * first entry pointer if necessary. | 724 | hrtimer_remove_cb_pending(timer); |
389 | */ | 725 | else { |
390 | if (base->first == &timer->node) | 726 | /* |
391 | base->first = rb_next(&timer->node); | 727 | * Remove the timer from the rbtree and replace the |
392 | rb_erase(&timer->node, &base->active); | 728 | * first entry pointer if necessary. |
393 | rb_set_parent(&timer->node, &timer->node); | 729 | */ |
730 | if (base->first == &timer->node) { | ||
731 | base->first = rb_next(&timer->node); | ||
732 | /* Reprogram the clock event device, if enabled */ | ||
733 | if (reprogram && hrtimer_hres_active()) | ||
734 | hrtimer_force_reprogram(base->cpu_base); | ||
735 | } | ||
736 | rb_erase(&timer->node, &base->active); | ||
737 | } | ||
738 | timer->state = newstate; | ||
394 | } | 739 | } |
395 | 740 | ||
396 | /* | 741 | /* |
397 | * remove hrtimer, called with base lock held | 742 | * remove hrtimer, called with base lock held |
398 | */ | 743 | */ |
399 | static inline int | 744 | static inline int |
400 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | 745 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) |
401 | { | 746 | { |
402 | if (hrtimer_active(timer)) { | 747 | if (hrtimer_is_queued(timer)) { |
403 | __remove_hrtimer(timer, base); | 748 | int reprogram; |
749 | |||
750 | /* | ||
751 | * Remove the timer and force reprogramming when high | ||
752 | * resolution mode is active and the timer is on the current | ||
753 | * CPU. If we remove a timer on another CPU, reprogramming is | ||
754 | * skipped. The interrupt event on this CPU is fired and | ||
755 | * reprogramming happens in the interrupt handler. This is a | ||
756 | * rare case and less expensive than a smp call. | ||
757 | */ | ||
758 | timer_stats_hrtimer_clear_start_info(timer); | ||
759 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); | ||
760 | __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, | ||
761 | reprogram); | ||
404 | return 1; | 762 | return 1; |
405 | } | 763 | } |
406 | return 0; | 764 | return 0; |
@@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
419 | int | 777 | int |
420 | hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | 778 | hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) |
421 | { | 779 | { |
422 | struct hrtimer_base *base, *new_base; | 780 | struct hrtimer_clock_base *base, *new_base; |
423 | unsigned long flags; | 781 | unsigned long flags; |
424 | int ret; | 782 | int ret; |
425 | 783 | ||
@@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
431 | /* Switch the timer base, if necessary: */ | 789 | /* Switch the timer base, if necessary: */ |
432 | new_base = switch_hrtimer_base(timer, base); | 790 | new_base = switch_hrtimer_base(timer, base); |
433 | 791 | ||
434 | if (mode == HRTIMER_REL) { | 792 | if (mode == HRTIMER_MODE_REL) { |
435 | tim = ktime_add(tim, new_base->get_time()); | 793 | tim = ktime_add(tim, new_base->get_time()); |
436 | /* | 794 | /* |
437 | * CONFIG_TIME_LOW_RES is a temporary way for architectures | 795 | * CONFIG_TIME_LOW_RES is a temporary way for architectures |
@@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
446 | } | 804 | } |
447 | timer->expires = tim; | 805 | timer->expires = tim; |
448 | 806 | ||
449 | enqueue_hrtimer(timer, new_base); | 807 | timer_stats_hrtimer_set_start_info(timer); |
808 | |||
809 | enqueue_hrtimer(timer, new_base, base == new_base); | ||
450 | 810 | ||
451 | unlock_hrtimer_base(timer, &flags); | 811 | unlock_hrtimer_base(timer, &flags); |
452 | 812 | ||
@@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); | |||
466 | */ | 826 | */ |
467 | int hrtimer_try_to_cancel(struct hrtimer *timer) | 827 | int hrtimer_try_to_cancel(struct hrtimer *timer) |
468 | { | 828 | { |
469 | struct hrtimer_base *base; | 829 | struct hrtimer_clock_base *base; |
470 | unsigned long flags; | 830 | unsigned long flags; |
471 | int ret = -1; | 831 | int ret = -1; |
472 | 832 | ||
473 | base = lock_hrtimer_base(timer, &flags); | 833 | base = lock_hrtimer_base(timer, &flags); |
474 | 834 | ||
475 | if (base->curr_timer != timer) | 835 | if (!hrtimer_callback_running(timer)) |
476 | ret = remove_hrtimer(timer, base); | 836 | ret = remove_hrtimer(timer, base); |
477 | 837 | ||
478 | unlock_hrtimer_base(timer, &flags); | 838 | unlock_hrtimer_base(timer, &flags); |
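Reviewer note: with the new state tracking, -1 now means "callback in flight" rather than "current timer". A hypothetical caller (my_stop is an invented name) would handle the three return values like this:

static int my_stop(struct hrtimer *timer)
{
	int ret = hrtimer_try_to_cancel(timer);

	/* 0: timer was not active, 1: timer was dequeued before expiry */
	if (ret >= 0)
		return 0;
	/* -1: callback is running; hrtimer_cancel() would spin until it ends */
	return -EBUSY;
}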
@@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); | |||
508 | */ | 868 | */ |
509 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | 869 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) |
510 | { | 870 | { |
511 | struct hrtimer_base *base; | 871 | struct hrtimer_clock_base *base; |
512 | unsigned long flags; | 872 | unsigned long flags; |
513 | ktime_t rem; | 873 | ktime_t rem; |
514 | 874 | ||
515 | base = lock_hrtimer_base(timer, &flags); | 875 | base = lock_hrtimer_base(timer, &flags); |
516 | rem = ktime_sub(timer->expires, timer->base->get_time()); | 876 | rem = ktime_sub(timer->expires, base->get_time()); |
517 | unlock_hrtimer_base(timer, &flags); | 877 | unlock_hrtimer_base(timer, &flags); |
518 | 878 | ||
519 | return rem; | 879 | return rem; |
520 | } | 880 | } |
521 | EXPORT_SYMBOL_GPL(hrtimer_get_remaining); | 881 | EXPORT_SYMBOL_GPL(hrtimer_get_remaining); |
522 | 882 | ||
523 | #ifdef CONFIG_NO_IDLE_HZ | 883 | #if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) |
524 | /** | 884 | /** |
525 | * hrtimer_get_next_event - get the time until next expiry event | 885 | * hrtimer_get_next_event - get the time until next expiry event |
526 | * | 886 | * |
@@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); | |||
529 | */ | 889 | */ |
530 | ktime_t hrtimer_get_next_event(void) | 890 | ktime_t hrtimer_get_next_event(void) |
531 | { | 891 | { |
532 | struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); | 892 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
893 | struct hrtimer_clock_base *base = cpu_base->clock_base; | ||
533 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; | 894 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; |
534 | unsigned long flags; | 895 | unsigned long flags; |
535 | int i; | 896 | int i; |
536 | 897 | ||
537 | for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { | 898 | spin_lock_irqsave(&cpu_base->lock, flags); |
538 | struct hrtimer *timer; | ||
539 | 899 | ||
540 | spin_lock_irqsave(&base->lock, flags); | 900 | if (!hrtimer_hres_active()) { |
541 | if (!base->first) { | 901 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { |
542 | spin_unlock_irqrestore(&base->lock, flags); | 902 | struct hrtimer *timer; |
543 | continue; | 903 | |
904 | if (!base->first) | ||
905 | continue; | ||
906 | |||
907 | timer = rb_entry(base->first, struct hrtimer, node); | ||
908 | delta.tv64 = timer->expires.tv64; | ||
909 | delta = ktime_sub(delta, base->get_time()); | ||
910 | if (delta.tv64 < mindelta.tv64) | ||
911 | mindelta.tv64 = delta.tv64; | ||
544 | } | 912 | } |
545 | timer = rb_entry(base->first, struct hrtimer, node); | ||
546 | delta.tv64 = timer->expires.tv64; | ||
547 | spin_unlock_irqrestore(&base->lock, flags); | ||
548 | delta = ktime_sub(delta, base->get_time()); | ||
549 | if (delta.tv64 < mindelta.tv64) | ||
550 | mindelta.tv64 = delta.tv64; | ||
551 | } | 913 | } |
914 | |||
915 | spin_unlock_irqrestore(&cpu_base->lock, flags); | ||
916 | |||
552 | if (mindelta.tv64 < 0) | 917 | if (mindelta.tv64 < 0) |
553 | mindelta.tv64 = 0; | 918 | mindelta.tv64 = 0; |
554 | return mindelta; | 919 | return mindelta; |
@@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void) | |||
564 | void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | 929 | void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, |
565 | enum hrtimer_mode mode) | 930 | enum hrtimer_mode mode) |
566 | { | 931 | { |
567 | struct hrtimer_base *bases; | 932 | struct hrtimer_cpu_base *cpu_base; |
568 | 933 | ||
569 | memset(timer, 0, sizeof(struct hrtimer)); | 934 | memset(timer, 0, sizeof(struct hrtimer)); |
570 | 935 | ||
571 | bases = __raw_get_cpu_var(hrtimer_bases); | 936 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); |
572 | 937 | ||
573 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) | 938 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) |
574 | clock_id = CLOCK_MONOTONIC; | 939 | clock_id = CLOCK_MONOTONIC; |
575 | 940 | ||
576 | timer->base = &bases[clock_id]; | 941 | timer->base = &cpu_base->clock_base[clock_id]; |
577 | rb_set_parent(&timer->node, &timer->node); | 942 | hrtimer_init_timer_hres(timer); |
943 | |||
944 | #ifdef CONFIG_TIMER_STATS | ||
945 | timer->start_site = NULL; | ||
946 | timer->start_pid = -1; | ||
947 | memset(timer->start_comm, 0, TASK_COMM_LEN); | ||
948 | #endif | ||
578 | } | 949 | } |
579 | EXPORT_SYMBOL_GPL(hrtimer_init); | 950 | EXPORT_SYMBOL_GPL(hrtimer_init); |
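Reviewer note: with the renamed mode constants, a typical user of the reworked API looks roughly like the following sketch (my_timer, my_timer_fn and the 100 ms expiry are invented for illustration):

static struct hrtimer my_timer;

static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
	/* call hrtimer_forward() and return HRTIMER_RESTART to make it periodic */
	return HRTIMER_NORESTART;
}

static void my_arm(void)
{
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	my_timer.function = my_timer_fn;
	/* one-shot, 100 ms from now */
	hrtimer_start(&my_timer, ktime_set(0, 100 * NSEC_PER_MSEC),
		      HRTIMER_MODE_REL);
}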
580 | 951 | ||
@@ -583,26 +954,164 @@ EXPORT_SYMBOL_GPL(hrtimer_init); | |||
583 | * @which_clock: which clock to query | 954 | * @which_clock: which clock to query |
584 | * @tp: pointer to timespec variable to store the resolution | 955 | * @tp: pointer to timespec variable to store the resolution |
585 | * | 956 | * |
586 | * Store the resolution of the clock selected by which_clock in the | 957 | * Store the resolution of the clock selected by @which_clock in the |
587 | * variable pointed to by tp. | 958 | * variable pointed to by @tp. |
588 | */ | 959 | */ |
589 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | 960 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) |
590 | { | 961 | { |
591 | struct hrtimer_base *bases; | 962 | struct hrtimer_cpu_base *cpu_base; |
592 | 963 | ||
593 | bases = __raw_get_cpu_var(hrtimer_bases); | 964 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); |
594 | *tp = ktime_to_timespec(bases[which_clock].resolution); | 965 | *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); |
595 | 966 | ||
596 | return 0; | 967 | return 0; |
597 | } | 968 | } |
598 | EXPORT_SYMBOL_GPL(hrtimer_get_res); | 969 | EXPORT_SYMBOL_GPL(hrtimer_get_res); |
599 | 970 | ||
971 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
972 | |||
973 | /* | ||
974 | * High resolution timer interrupt | ||
975 | * Called with interrupts disabled | ||
976 | */ | ||
977 | void hrtimer_interrupt(struct clock_event_device *dev) | ||
978 | { | ||
979 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
980 | struct hrtimer_clock_base *base; | ||
981 | ktime_t expires_next, now; | ||
982 | int i, raise = 0; | ||
983 | |||
984 | BUG_ON(!cpu_base->hres_active); | ||
985 | cpu_base->nr_events++; | ||
986 | dev->next_event.tv64 = KTIME_MAX; | ||
987 | |||
988 | retry: | ||
989 | now = ktime_get(); | ||
990 | |||
991 | expires_next.tv64 = KTIME_MAX; | ||
992 | |||
993 | base = cpu_base->clock_base; | ||
994 | |||
995 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | ||
996 | ktime_t basenow; | ||
997 | struct rb_node *node; | ||
998 | |||
999 | spin_lock(&cpu_base->lock); | ||
1000 | |||
1001 | basenow = ktime_add(now, base->offset); | ||
1002 | |||
1003 | while ((node = base->first)) { | ||
1004 | struct hrtimer *timer; | ||
1005 | |||
1006 | timer = rb_entry(node, struct hrtimer, node); | ||
1007 | |||
1008 | if (basenow.tv64 < timer->expires.tv64) { | ||
1009 | ktime_t expires; | ||
1010 | |||
1011 | expires = ktime_sub(timer->expires, | ||
1012 | base->offset); | ||
1013 | if (expires.tv64 < expires_next.tv64) | ||
1014 | expires_next = expires; | ||
1015 | break; | ||
1016 | } | ||
1017 | |||
1018 | /* Move softirq callbacks to the pending list */ | ||
1019 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { | ||
1020 | __remove_hrtimer(timer, base, | ||
1021 | HRTIMER_STATE_PENDING, 0); | ||
1022 | list_add_tail(&timer->cb_entry, | ||
1023 | &base->cpu_base->cb_pending); | ||
1024 | raise = 1; | ||
1025 | continue; | ||
1026 | } | ||
1027 | |||
1028 | __remove_hrtimer(timer, base, | ||
1029 | HRTIMER_STATE_CALLBACK, 0); | ||
1030 | timer_stats_account_hrtimer(timer); | ||
1031 | |||
1032 | /* | ||
1033 | * Note: We clear the CALLBACK bit after | ||
1034 | * enqueue_hrtimer to avoid reprogramming of | ||
1035 | * the event hardware. This happens at the end | ||
1036 | * of this function anyway. | ||
1037 | */ | ||
1038 | if (timer->function(timer) != HRTIMER_NORESTART) { | ||
1039 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
1040 | enqueue_hrtimer(timer, base, 0); | ||
1041 | } | ||
1042 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1043 | } | ||
1044 | spin_unlock(&cpu_base->lock); | ||
1045 | base++; | ||
1046 | } | ||
1047 | |||
1048 | cpu_base->expires_next = expires_next; | ||
1049 | |||
1050 | /* Reprogramming necessary? */ | ||
1051 | if (expires_next.tv64 != KTIME_MAX) { | ||
1052 | if (tick_program_event(expires_next, 0)) | ||
1053 | goto retry; | ||
1054 | } | ||
1055 | |||
1056 | /* Raise softirq? */ | ||
1057 | if (raise) | ||
1058 | raise_softirq(HRTIMER_SOFTIRQ); | ||
1059 | } | ||
1060 | |||
1061 | static void run_hrtimer_softirq(struct softirq_action *h) | ||
1062 | { | ||
1063 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1064 | |||
1065 | spin_lock_irq(&cpu_base->lock); | ||
1066 | |||
1067 | while (!list_empty(&cpu_base->cb_pending)) { | ||
1068 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1069 | struct hrtimer *timer; | ||
1070 | int restart; | ||
1071 | |||
1072 | timer = list_entry(cpu_base->cb_pending.next, | ||
1073 | struct hrtimer, cb_entry); | ||
1074 | |||
1075 | timer_stats_account_hrtimer(timer); | ||
1076 | |||
1077 | fn = timer->function; | ||
1078 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | ||
1079 | spin_unlock_irq(&cpu_base->lock); | ||
1080 | |||
1081 | restart = fn(timer); | ||
1082 | |||
1083 | spin_lock_irq(&cpu_base->lock); | ||
1084 | |||
1085 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1086 | if (restart == HRTIMER_RESTART) { | ||
1087 | BUG_ON(hrtimer_active(timer)); | ||
1088 | /* | ||
1089 | * Enqueue the timer, allow reprogramming of the event | ||
1090 | * device | ||
1091 | */ | ||
1092 | enqueue_hrtimer(timer, timer->base, 1); | ||
1093 | } else if (hrtimer_active(timer)) { | ||
1094 | /* | ||
1095 | * If the timer was rearmed on another CPU, reprogram | ||
1096 | * the event device. | ||
1097 | */ | ||
1098 | if (timer->base->first == &timer->node) | ||
1099 | hrtimer_reprogram(timer, timer->base); | ||
1100 | } | ||
1101 | } | ||
1102 | spin_unlock_irq(&cpu_base->lock); | ||
1103 | } | ||
1104 | |||
1105 | #endif /* CONFIG_HIGH_RES_TIMERS */ | ||
1106 | |||
600 | /* | 1107 | /* |
601 | * Expire the per base hrtimer-queue: | 1108 | * Expire the per base hrtimer-queue: |
602 | */ | 1109 | */ |
603 | static inline void run_hrtimer_queue(struct hrtimer_base *base) | 1110 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, |
1111 | int index) | ||
604 | { | 1112 | { |
605 | struct rb_node *node; | 1113 | struct rb_node *node; |
1114 | struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; | ||
606 | 1115 | ||
607 | if (!base->first) | 1116 | if (!base->first) |
608 | return; | 1117 | return; |
@@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) | |||
610 | if (base->get_softirq_time) | 1119 | if (base->get_softirq_time) |
611 | base->softirq_time = base->get_softirq_time(); | 1120 | base->softirq_time = base->get_softirq_time(); |
612 | 1121 | ||
613 | spin_lock_irq(&base->lock); | 1122 | spin_lock_irq(&cpu_base->lock); |
614 | 1123 | ||
615 | while ((node = base->first)) { | 1124 | while ((node = base->first)) { |
616 | struct hrtimer *timer; | 1125 | struct hrtimer *timer; |
617 | int (*fn)(struct hrtimer *); | 1126 | enum hrtimer_restart (*fn)(struct hrtimer *); |
618 | int restart; | 1127 | int restart; |
619 | 1128 | ||
620 | timer = rb_entry(node, struct hrtimer, node); | 1129 | timer = rb_entry(node, struct hrtimer, node); |
621 | if (base->softirq_time.tv64 <= timer->expires.tv64) | 1130 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
622 | break; | 1131 | break; |
623 | 1132 | ||
1133 | timer_stats_account_hrtimer(timer); | ||
1134 | |||
624 | fn = timer->function; | 1135 | fn = timer->function; |
625 | set_curr_timer(base, timer); | 1136 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); |
626 | __remove_hrtimer(timer, base); | 1137 | spin_unlock_irq(&cpu_base->lock); |
627 | spin_unlock_irq(&base->lock); | ||
628 | 1138 | ||
629 | restart = fn(timer); | 1139 | restart = fn(timer); |
630 | 1140 | ||
631 | spin_lock_irq(&base->lock); | 1141 | spin_lock_irq(&cpu_base->lock); |
632 | 1142 | ||
1143 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
633 | if (restart != HRTIMER_NORESTART) { | 1144 | if (restart != HRTIMER_NORESTART) { |
634 | BUG_ON(hrtimer_active(timer)); | 1145 | BUG_ON(hrtimer_active(timer)); |
635 | enqueue_hrtimer(timer, base); | 1146 | enqueue_hrtimer(timer, base, 0); |
636 | } | 1147 | } |
637 | } | 1148 | } |
638 | set_curr_timer(base, NULL); | 1149 | spin_unlock_irq(&cpu_base->lock); |
639 | spin_unlock_irq(&base->lock); | ||
640 | } | 1150 | } |
641 | 1151 | ||
642 | /* | 1152 | /* |
643 | * Called from timer softirq every jiffy, expire hrtimers: | 1153 | * Called from timer softirq every jiffy, expire hrtimers: |
1154 | * | ||
1155 | * For HRT it's the fallback code to run the softirq in the timer | ||
1156 | * softirq context in case the hrtimer initialization failed or has | ||
1157 | * not been done yet. | ||
644 | */ | 1158 | */ |
645 | void hrtimer_run_queues(void) | 1159 | void hrtimer_run_queues(void) |
646 | { | 1160 | { |
647 | struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); | 1161 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
648 | int i; | 1162 | int i; |
649 | 1163 | ||
650 | hrtimer_get_softirq_time(base); | 1164 | if (hrtimer_hres_active()) |
1165 | return; | ||
1166 | |||
1167 | /* | ||
1168 | * This _is_ ugly: We have to check in the softirq context | ||
1169 | * whether we can switch to highres and/or nohz mode. The | ||
1170 | * clocksource switch happens in the timer interrupt with | ||
1171 | * xtime_lock held. Notification from there only sets the | ||
1172 | * check bit in the tick_oneshot code, otherwise we might | ||
1173 | * deadlock vs. xtime_lock. | ||
1174 | */ | ||
1175 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
1176 | hrtimer_switch_to_hres(); | ||
651 | 1177 | ||
652 | for (i = 0; i < MAX_HRTIMER_BASES; i++) | 1178 | hrtimer_get_softirq_time(cpu_base); |
653 | run_hrtimer_queue(&base[i]); | 1179 | |
1180 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | ||
1181 | run_hrtimer_queue(cpu_base, i); | ||
654 | } | 1182 | } |
655 | 1183 | ||
656 | /* | 1184 | /* |
657 | * Sleep related functions: | 1185 | * Sleep related functions: |
658 | */ | 1186 | */ |
659 | static int hrtimer_wakeup(struct hrtimer *timer) | 1187 | static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) |
660 | { | 1188 | { |
661 | struct hrtimer_sleeper *t = | 1189 | struct hrtimer_sleeper *t = |
662 | container_of(timer, struct hrtimer_sleeper, timer); | 1190 | container_of(timer, struct hrtimer_sleeper, timer); |
@@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) | |||
673 | { | 1201 | { |
674 | sl->timer.function = hrtimer_wakeup; | 1202 | sl->timer.function = hrtimer_wakeup; |
675 | sl->task = task; | 1203 | sl->task = task; |
1204 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
1205 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; | ||
1206 | #endif | ||
676 | } | 1207 | } |
677 | 1208 | ||
678 | static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) | 1209 | static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) |
@@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
683 | set_current_state(TASK_INTERRUPTIBLE); | 1214 | set_current_state(TASK_INTERRUPTIBLE); |
684 | hrtimer_start(&t->timer, t->timer.expires, mode); | 1215 | hrtimer_start(&t->timer, t->timer.expires, mode); |
685 | 1216 | ||
686 | schedule(); | 1217 | if (likely(t->task)) |
1218 | schedule(); | ||
687 | 1219 | ||
688 | hrtimer_cancel(&t->timer); | 1220 | hrtimer_cancel(&t->timer); |
689 | mode = HRTIMER_ABS; | 1221 | mode = HRTIMER_MODE_ABS; |
690 | 1222 | ||
691 | } while (t->task && !signal_pending(current)); | 1223 | } while (t->task && !signal_pending(current)); |
692 | 1224 | ||
@@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) | |||
702 | 1234 | ||
703 | restart->fn = do_no_restart_syscall; | 1235 | restart->fn = do_no_restart_syscall; |
704 | 1236 | ||
705 | hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); | 1237 | hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); |
706 | t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; | 1238 | t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; |
707 | 1239 | ||
708 | if (do_nanosleep(&t, HRTIMER_ABS)) | 1240 | if (do_nanosleep(&t, HRTIMER_MODE_ABS)) |
709 | return 0; | 1241 | return 0; |
710 | 1242 | ||
711 | rmtp = (struct timespec __user *) restart->arg1; | 1243 | rmtp = (struct timespec __user *) restart->arg1; |
@@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
738 | return 0; | 1270 | return 0; |
739 | 1271 | ||
740 | /* Absolute timers do not update the rmtp value and restart: */ | 1272 | /* Absolute timers do not update the rmtp value and restart: */ |
741 | if (mode == HRTIMER_ABS) | 1273 | if (mode == HRTIMER_MODE_ABS) |
742 | return -ERESTARTNOHAND; | 1274 | return -ERESTARTNOHAND; |
743 | 1275 | ||
744 | if (rmtp) { | 1276 | if (rmtp) { |
@@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | |||
771 | if (!timespec_valid(&tu)) | 1303 | if (!timespec_valid(&tu)) |
772 | return -EINVAL; | 1304 | return -EINVAL; |
773 | 1305 | ||
774 | return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); | 1306 | return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); |
775 | } | 1307 | } |
776 | 1308 | ||
777 | /* | 1309 | /* |
@@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | |||
779 | */ | 1311 | */ |
780 | static void __devinit init_hrtimers_cpu(int cpu) | 1312 | static void __devinit init_hrtimers_cpu(int cpu) |
781 | { | 1313 | { |
782 | struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); | 1314 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); |
783 | int i; | 1315 | int i; |
784 | 1316 | ||
785 | for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { | 1317 | spin_lock_init(&cpu_base->lock); |
786 | spin_lock_init(&base->lock); | 1318 | lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key); |
787 | lockdep_set_class(&base->lock, &base->lock_key); | 1319 | |
788 | } | 1320 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
1321 | cpu_base->clock_base[i].cpu_base = cpu_base; | ||
1322 | |||
1323 | hrtimer_init_hres(cpu_base); | ||
789 | } | 1324 | } |
790 | 1325 | ||
791 | #ifdef CONFIG_HOTPLUG_CPU | 1326 | #ifdef CONFIG_HOTPLUG_CPU |
792 | 1327 | ||
793 | static void migrate_hrtimer_list(struct hrtimer_base *old_base, | 1328 | static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, |
794 | struct hrtimer_base *new_base) | 1329 | struct hrtimer_clock_base *new_base) |
795 | { | 1330 | { |
796 | struct hrtimer *timer; | 1331 | struct hrtimer *timer; |
797 | struct rb_node *node; | 1332 | struct rb_node *node; |
798 | 1333 | ||
799 | while ((node = rb_first(&old_base->active))) { | 1334 | while ((node = rb_first(&old_base->active))) { |
800 | timer = rb_entry(node, struct hrtimer, node); | 1335 | timer = rb_entry(node, struct hrtimer, node); |
801 | __remove_hrtimer(timer, old_base); | 1336 | BUG_ON(hrtimer_callback_running(timer)); |
1337 | __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); | ||
802 | timer->base = new_base; | 1338 | timer->base = new_base; |
803 | enqueue_hrtimer(timer, new_base); | 1339 | /* |
1340 | * Enqueue the timer. Allow reprogramming of the event device | ||
1341 | */ | ||
1342 | enqueue_hrtimer(timer, new_base, 1); | ||
804 | } | 1343 | } |
805 | } | 1344 | } |
806 | 1345 | ||
807 | static void migrate_hrtimers(int cpu) | 1346 | static void migrate_hrtimers(int cpu) |
808 | { | 1347 | { |
809 | struct hrtimer_base *old_base, *new_base; | 1348 | struct hrtimer_cpu_base *old_base, *new_base; |
810 | int i; | 1349 | int i; |
811 | 1350 | ||
812 | BUG_ON(cpu_online(cpu)); | 1351 | BUG_ON(cpu_online(cpu)); |
813 | old_base = per_cpu(hrtimer_bases, cpu); | 1352 | old_base = &per_cpu(hrtimer_bases, cpu); |
814 | new_base = get_cpu_var(hrtimer_bases); | 1353 | new_base = &get_cpu_var(hrtimer_bases); |
815 | |||
816 | local_irq_disable(); | ||
817 | 1354 | ||
818 | for (i = 0; i < MAX_HRTIMER_BASES; i++) { | 1355 | tick_cancel_sched_timer(cpu); |
819 | 1356 | ||
820 | spin_lock(&new_base->lock); | 1357 | local_irq_disable(); |
821 | spin_lock(&old_base->lock); | ||
822 | |||
823 | BUG_ON(old_base->curr_timer); | ||
824 | 1358 | ||
825 | migrate_hrtimer_list(old_base, new_base); | 1359 | spin_lock(&new_base->lock); |
1360 | spin_lock(&old_base->lock); | ||
826 | 1361 | ||
827 | spin_unlock(&old_base->lock); | 1362 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
828 | spin_unlock(&new_base->lock); | 1363 | migrate_hrtimer_list(&old_base->clock_base[i], |
829 | old_base++; | 1364 | &new_base->clock_base[i]); |
830 | new_base++; | ||
831 | } | 1365 | } |
1366 | spin_unlock(&old_base->lock); | ||
1367 | spin_unlock(&new_base->lock); | ||
832 | 1368 | ||
833 | local_irq_enable(); | 1369 | local_irq_enable(); |
834 | put_cpu_var(hrtimer_bases); | 1370 | put_cpu_var(hrtimer_bases); |
@@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, | |||
848 | 1384 | ||
849 | #ifdef CONFIG_HOTPLUG_CPU | 1385 | #ifdef CONFIG_HOTPLUG_CPU |
850 | case CPU_DEAD: | 1386 | case CPU_DEAD: |
1387 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); | ||
851 | migrate_hrtimers(cpu); | 1388 | migrate_hrtimers(cpu); |
852 | break; | 1389 | break; |
853 | #endif | 1390 | #endif |
@@ -868,5 +1405,8 @@ void __init hrtimers_init(void) | |||
868 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, | 1405 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, |
869 | (void *)(long)smp_processor_id()); | 1406 | (void *)(long)smp_processor_id()); |
870 | register_cpu_notifier(&hrtimers_nb); | 1407 | register_cpu_notifier(&hrtimers_nb); |
1408 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
1409 | open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); | ||
1410 | #endif | ||
871 | } | 1411 | } |
872 | 1412 | ||
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1dab0ac3f797..681c52dbfe22 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | 1 | ||
2 | obj-y := handle.o manage.o spurious.o resend.o chip.o | 2 | obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o |
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
4 | obj-$(CONFIG_PROC_FS) += proc.o | 4 | obj-$(CONFIG_PROC_FS) += proc.o |
5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d27b25855743..0133f4f9e9f0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -39,6 +39,7 @@ void dynamic_irq_init(unsigned int irq) | |||
39 | desc->chip = &no_irq_chip; | 39 | desc->chip = &no_irq_chip; |
40 | desc->handle_irq = handle_bad_irq; | 40 | desc->handle_irq = handle_bad_irq; |
41 | desc->depth = 1; | 41 | desc->depth = 1; |
42 | desc->msi_desc = NULL; | ||
42 | desc->handler_data = NULL; | 43 | desc->handler_data = NULL; |
43 | desc->chip_data = NULL; | 44 | desc->chip_data = NULL; |
44 | desc->action = NULL; | 45 | desc->action = NULL; |
@@ -74,6 +75,9 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
74 | WARN_ON(1); | 75 | WARN_ON(1); |
75 | return; | 76 | return; |
76 | } | 77 | } |
78 | desc->msi_desc = NULL; | ||
79 | desc->handler_data = NULL; | ||
80 | desc->chip_data = NULL; | ||
77 | desc->handle_irq = handle_bad_irq; | 81 | desc->handle_irq = handle_bad_irq; |
78 | desc->chip = &no_irq_chip; | 82 | desc->chip = &no_irq_chip; |
79 | spin_unlock_irqrestore(&desc->lock, flags); | 83 | spin_unlock_irqrestore(&desc->lock, flags); |
@@ -162,6 +166,30 @@ int set_irq_data(unsigned int irq, void *data) | |||
162 | EXPORT_SYMBOL(set_irq_data); | 166 | EXPORT_SYMBOL(set_irq_data); |
163 | 167 | ||
164 | /** | 168 | /** |
169 | * set_irq_msi - set MSI descriptor data for an irq | ||
170 | * @irq: Interrupt number | ||
171 | * @entry: Pointer to MSI descriptor data | ||
172 | * | ||
173 | * Set the MSI descriptor entry for an irq | ||
174 | */ | ||
175 | int set_irq_msi(unsigned int irq, struct msi_desc *entry) | ||
176 | { | ||
177 | struct irq_desc *desc; | ||
178 | unsigned long flags; | ||
179 | |||
180 | if (irq >= NR_IRQS) { | ||
181 | printk(KERN_ERR | ||
182 | "Trying to install msi data for IRQ%d\n", irq); | ||
183 | return -EINVAL; | ||
184 | } | ||
185 | desc = irq_desc + irq; | ||
186 | spin_lock_irqsave(&desc->lock, flags); | ||
187 | desc->msi_desc = entry; | ||
188 | spin_unlock_irqrestore(&desc->lock, flags); | ||
189 | return 0; | ||
190 | } | ||
191 | |||
192 | /** | ||
165 | * set_irq_chip_data - set irq chip data for an irq | 193 | * set_irq_chip_data - set irq chip data for an irq |
166 | * @irq: Interrupt number | 194 | * @irq: Interrupt number |
167 | * @data: Pointer to chip specific data | 195 | * @data: Pointer to chip specific data |
@@ -202,10 +230,6 @@ static void default_enable(unsigned int irq) | |||
202 | */ | 230 | */ |
203 | static void default_disable(unsigned int irq) | 231 | static void default_disable(unsigned int irq) |
204 | { | 232 | { |
205 | struct irq_desc *desc = irq_desc + irq; | ||
206 | |||
207 | if (!(desc->status & IRQ_DELAYED_DISABLE)) | ||
208 | desc->chip->mask(irq); | ||
209 | } | 233 | } |
210 | 234 | ||
211 | /* | 235 | /* |
@@ -270,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
270 | 294 | ||
271 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 295 | if (unlikely(desc->status & IRQ_INPROGRESS)) |
272 | goto out_unlock; | 296 | goto out_unlock; |
273 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
274 | kstat_cpu(cpu).irqs[irq]++; | 297 | kstat_cpu(cpu).irqs[irq]++; |
275 | 298 | ||
276 | action = desc->action; | 299 | action = desc->action; |
277 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | 300 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { |
301 | if (desc->chip->mask) | ||
302 | desc->chip->mask(irq); | ||
303 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
304 | desc->status |= IRQ_PENDING; | ||
278 | goto out_unlock; | 305 | goto out_unlock; |
306 | } | ||
279 | 307 | ||
308 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING); | ||
280 | desc->status |= IRQ_INPROGRESS; | 309 | desc->status |= IRQ_INPROGRESS; |
281 | spin_unlock(&desc->lock); | 310 | spin_unlock(&desc->lock); |
282 | 311 | ||
@@ -368,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
368 | 397 | ||
369 | /* | 398 | /* |
370 | * If it's disabled or no action available | 399 | * If it's disabled or no action available |
371 | * keep it masked and get out of here | 400 | * then mask it and get out of here: |
372 | */ | 401 | */ |
373 | action = desc->action; | 402 | action = desc->action; |
374 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 403 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { |
375 | desc->status |= IRQ_PENDING; | 404 | desc->status |= IRQ_PENDING; |
405 | if (desc->chip->mask) | ||
406 | desc->chip->mask(irq); | ||
376 | goto out; | 407 | goto out; |
377 | } | 408 | } |
378 | 409 | ||
@@ -534,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
534 | 565 | ||
535 | /* Uninstall? */ | 566 | /* Uninstall? */ |
536 | if (handle == handle_bad_irq) { | 567 | if (handle == handle_bad_irq) { |
537 | if (desc->chip != &no_irq_chip) { | 568 | if (desc->chip != &no_irq_chip) |
538 | desc->chip->mask(irq); | 569 | mask_ack_irq(desc, irq); |
539 | desc->chip->ack(irq); | ||
540 | } | ||
541 | desc->status |= IRQ_DISABLED; | 570 | desc->status |= IRQ_DISABLED; |
542 | desc->depth = 1; | 571 | desc->depth = 1; |
543 | } | 572 | } |
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c new file mode 100644 index 000000000000..85a430da0fb6 --- /dev/null +++ b/kernel/irq/devres.c | |||
@@ -0,0 +1,88 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/interrupt.h> | ||
3 | |||
4 | /* | ||
5 | * Device resource management aware IRQ request/free implementation. | ||
6 | */ | ||
7 | struct irq_devres { | ||
8 | unsigned int irq; | ||
9 | void *dev_id; | ||
10 | }; | ||
11 | |||
12 | static void devm_irq_release(struct device *dev, void *res) | ||
13 | { | ||
14 | struct irq_devres *this = res; | ||
15 | |||
16 | free_irq(this->irq, this->dev_id); | ||
17 | } | ||
18 | |||
19 | static int devm_irq_match(struct device *dev, void *res, void *data) | ||
20 | { | ||
21 | struct irq_devres *this = res, *match = data; | ||
22 | |||
23 | return this->irq == match->irq && this->dev_id == match->dev_id; | ||
24 | } | ||
25 | |||
26 | /** | ||
27 | * devm_request_irq - allocate an interrupt line for a managed device | ||
28 | * @dev: device to request interrupt for | ||
29 | * @irq: Interrupt line to allocate | ||
30 | * @handler: Function to be called when the IRQ occurs | ||
31 | * @irqflags: Interrupt type flags | ||
32 | * @devname: An ascii name for the claiming device | ||
33 | * @dev_id: A cookie passed back to the handler function | ||
34 | * | ||
35 | * Except for the extra @dev argument, this function takes the | ||
36 | * same arguments and performs the same function as | ||
37 | * request_irq(). IRQs requested with this function will be | ||
38 | * automatically freed on driver detach. | ||
39 | * | ||
40 | * If an IRQ allocated with this function needs to be freed | ||
41 | * separately, devm_free_irq() must be used. | ||
42 | */ | ||
43 | int devm_request_irq(struct device *dev, unsigned int irq, | ||
44 | irq_handler_t handler, unsigned long irqflags, | ||
45 | const char *devname, void *dev_id) | ||
46 | { | ||
47 | struct irq_devres *dr; | ||
48 | int rc; | ||
49 | |||
50 | dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), | ||
51 | GFP_KERNEL); | ||
52 | if (!dr) | ||
53 | return -ENOMEM; | ||
54 | |||
55 | rc = request_irq(irq, handler, irqflags, devname, dev_id); | ||
56 | if (rc) { | ||
57 | kfree(dr); | ||
58 | return rc; | ||
59 | } | ||
60 | |||
61 | dr->irq = irq; | ||
62 | dr->dev_id = dev_id; | ||
63 | devres_add(dev, dr); | ||
64 | |||
65 | return 0; | ||
66 | } | ||
67 | EXPORT_SYMBOL(devm_request_irq); | ||
68 | |||
69 | /** | ||
70 | * devm_free_irq - free an interrupt | ||
71 | * @dev: device to free interrupt for | ||
72 | * @irq: Interrupt line to free | ||
73 | * @dev_id: Device identity to free | ||
74 | * | ||
75 | * Except for the extra @dev argument, this function takes the | ||
76 | * same arguments and performs the same function as free_irq(). | ||
77 | * This function, rather than free_irq(), should be used to manually | ||
78 | * free IRQs allocated with devm_request_irq(). | ||
79 | */ | ||
80 | void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) | ||
81 | { | ||
82 | struct irq_devres match_data = { irq, dev_id }; | ||
83 | |||
84 | free_irq(irq, dev_id); | ||
85 | WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, | ||
86 | &match_data)); | ||
87 | } | ||
88 | EXPORT_SYMBOL(devm_free_irq); | ||
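Reviewer note: for context, a hypothetical driver probe using the managed variant would look like this (foo_isr, foo_probe and the platform device plumbing are invented, not part of this patch):

static irqreturn_t foo_isr(int irq, void *dev_id)
{
	/* a shared handler must check that its device really raised the IRQ */
	return IRQ_HANDLED;
}

static int foo_probe(struct platform_device *pdev)
{
	int irq = platform_get_irq(pdev, 0);

	if (irq < 0)
		return irq;
	/* released automatically on driver detach; no free_irq() in remove() */
	return devm_request_irq(&pdev->dev, irq, foo_isr, IRQF_SHARED,
				"foo", pdev);
}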
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8b961adc3bd2..5597c157442a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq) | |||
38 | } | 38 | } |
39 | EXPORT_SYMBOL(synchronize_irq); | 39 | EXPORT_SYMBOL(synchronize_irq); |
40 | 40 | ||
41 | /** | ||
42 | * irq_can_set_affinity - Check if the affinity of a given irq can be set | ||
43 | * @irq: Interrupt to check | ||
44 | * | ||
45 | */ | ||
46 | int irq_can_set_affinity(unsigned int irq) | ||
47 | { | ||
48 | struct irq_desc *desc = irq_desc + irq; | ||
49 | |||
50 | if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || | ||
51 | !desc->chip->set_affinity) | ||
52 | return 0; | ||
53 | |||
54 | return 1; | ||
55 | } | ||
56 | |||
57 | /** | ||
58 | * irq_set_affinity - Set the irq affinity of a given irq | ||
59 | * @irq: Interrupt to set affinity | ||
60 | * @cpumask: cpumask | ||
61 | * | ||
62 | */ | ||
63 | int irq_set_affinity(unsigned int irq, cpumask_t cpumask) | ||
64 | { | ||
65 | struct irq_desc *desc = irq_desc + irq; | ||
66 | |||
67 | if (!desc->chip->set_affinity) | ||
68 | return -EINVAL; | ||
69 | |||
70 | set_balance_irq_affinity(irq, cpumask); | ||
71 | |||
72 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
73 | set_pending_irq(irq, cpumask); | ||
74 | #else | ||
75 | desc->affinity = cpumask; | ||
76 | desc->chip->set_affinity(irq, cpumask); | ||
77 | #endif | ||
78 | return 0; | ||
79 | } | ||
80 | |||
41 | #endif | 81 | #endif |
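Reviewer note: callers are expected to pair the two new helpers; a hypothetical sketch (foo_pin_irq is an invented name, cpumask_of_cpu() is the existing by-value cpumask constructor):

static void foo_pin_irq(unsigned int irq)
{
	/* per-CPU IRQs and chips without ->set_affinity report 0 here */
	if (irq_can_set_affinity(irq))
		irq_set_affinity(irq, cpumask_of_cpu(0));
}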
42 | 82 | ||
43 | /** | 83 | /** |
@@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new) | |||
281 | if (new->flags & IRQF_PERCPU) | 321 | if (new->flags & IRQF_PERCPU) |
282 | desc->status |= IRQ_PER_CPU; | 322 | desc->status |= IRQ_PER_CPU; |
283 | #endif | 323 | #endif |
324 | /* Exclude IRQ from balancing */ | ||
325 | if (new->flags & IRQF_NOBALANCING) | ||
326 | desc->status |= IRQ_NO_BALANCING; | ||
327 | |||
284 | if (!shared) { | 328 | if (!shared) { |
285 | irq_chip_set_defaults(desc->chip); | 329 | irq_chip_set_defaults(desc->chip); |
286 | 330 | ||
@@ -328,12 +372,14 @@ int setup_irq(unsigned int irq, struct irqaction *new) | |||
328 | return 0; | 372 | return 0; |
329 | 373 | ||
330 | mismatch: | 374 | mismatch: |
375 | #ifdef CONFIG_DEBUG_SHIRQ | ||
331 | if (!(new->flags & IRQF_PROBE_SHARED)) { | 376 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
332 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); | 377 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); |
333 | if (old_name) | 378 | if (old_name) |
334 | printk(KERN_ERR "current handler: %s\n", old_name); | 379 | printk(KERN_ERR "current handler: %s\n", old_name); |
335 | dump_stack(); | 380 | dump_stack(); |
336 | } | 381 | } |
382 | #endif | ||
337 | spin_unlock_irqrestore(&desc->lock, flags); | 383 | spin_unlock_irqrestore(&desc->lock, flags); |
338 | return -EBUSY; | 384 | return -EBUSY; |
339 | } | 385 | } |
@@ -357,6 +403,7 @@ void free_irq(unsigned int irq, void *dev_id) | |||
357 | struct irq_desc *desc; | 403 | struct irq_desc *desc; |
358 | struct irqaction **p; | 404 | struct irqaction **p; |
359 | unsigned long flags; | 405 | unsigned long flags; |
406 | irqreturn_t (*handler)(int, void *) = NULL; | ||
360 | 407 | ||
361 | WARN_ON(in_interrupt()); | 408 | WARN_ON(in_interrupt()); |
362 | if (irq >= NR_IRQS) | 409 | if (irq >= NR_IRQS) |
@@ -396,6 +443,8 @@ void free_irq(unsigned int irq, void *dev_id) | |||
396 | 443 | ||
397 | /* Make sure it's not being used on another CPU */ | 444 | /* Make sure it's not being used on another CPU */ |
398 | synchronize_irq(irq); | 445 | synchronize_irq(irq); |
446 | if (action->flags & IRQF_SHARED) | ||
447 | handler = action->handler; | ||
399 | kfree(action); | 448 | kfree(action); |
400 | return; | 449 | return; |
401 | } | 450 | } |
@@ -403,6 +452,17 @@ void free_irq(unsigned int irq, void *dev_id) | |||
403 | spin_unlock_irqrestore(&desc->lock, flags); | 452 | spin_unlock_irqrestore(&desc->lock, flags); |
404 | return; | 453 | return; |
405 | } | 454 | } |
455 | #ifdef CONFIG_DEBUG_SHIRQ | ||
456 | if (handler) { | ||
457 | /* | ||
458 | * It's a shared IRQ -- the driver ought to be prepared for it | ||
459 | * to happen even now it's being freed, so let's make sure.... | ||
460 | * We do this after actually deregistering it, to make sure that | ||
461 | * a 'real' IRQ doesn't run in parallel with our fake | ||
462 | */ | ||
463 | handler(irq, dev_id); | ||
464 | } | ||
465 | #endif | ||
406 | } | 466 | } |
407 | EXPORT_SYMBOL(free_irq); | 467 | EXPORT_SYMBOL(free_irq); |
408 | 468 | ||
@@ -445,7 +505,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, | |||
445 | /* | 505 | /* |
446 | * Lockdep wants atomic interrupt handlers: | 506 | * Lockdep wants atomic interrupt handlers: |
447 | */ | 507 | */ |
448 | irqflags |= SA_INTERRUPT; | 508 | irqflags |= IRQF_DISABLED; |
449 | #endif | 509 | #endif |
450 | /* | 510 | /* |
451 | * Sanity-check: shared interrupts must pass in a real dev-ID, | 511 | * Sanity-check: shared interrupts must pass in a real dev-ID, |
@@ -475,6 +535,25 @@ int request_irq(unsigned int irq, irq_handler_t handler, | |||
475 | 535 | ||
476 | select_smp_affinity(irq); | 536 | select_smp_affinity(irq); |
477 | 537 | ||
538 | #ifdef CONFIG_DEBUG_SHIRQ | ||
539 | if (irqflags & IRQF_SHARED) { | ||
540 | /* | ||
541 | * It's a shared IRQ -- the driver ought to be prepared for it | ||
542 | * to happen immediately, so let's make sure.... | ||
543 | * We do this before actually registering it, to make sure that | ||
544 | * a 'real' IRQ doesn't run in parallel with our fake | ||
545 | */ | ||
546 | if (irqflags & IRQF_DISABLED) { | ||
547 | unsigned long flags; | ||
548 | |||
549 | local_irq_save(flags); | ||
550 | handler(irq, dev_id); | ||
551 | local_irq_restore(flags); | ||
552 | } else | ||
553 | handler(irq, dev_id); | ||
554 | } | ||
555 | #endif | ||
556 | |||
478 | retval = setup_irq(irq, action); | 557 | retval = setup_irq(irq, action); |
479 | if (retval) | 558 | if (retval) |
480 | kfree(action); | 559 | kfree(action); |
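Reviewer note: the DEBUG_SHIRQ hooks above invoke the handler once with no interrupt actually pending, so a shared handler has to recognize that case and return IRQ_NONE. A hedged sketch (bar_dev, BAR_STATUS and BAR_PENDING are invented):

static irqreturn_t bar_isr(int irq, void *dev_id)
{
	struct bar_dev *bar = dev_id;

	/* not our device (or the DEBUG_SHIRQ fake invocation) */
	if (!(readl(bar->regs + BAR_STATUS) & BAR_PENDING))
		return IRQ_NONE;

	/* ... acknowledge and handle the hardware event ... */
	return IRQ_HANDLED;
}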
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 4baa3bbcd25a..77b7acc875c5 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -65,12 +65,11 @@ void move_native_irq(int irq) | |||
65 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 65 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) |
66 | return; | 66 | return; |
67 | 67 | ||
68 | if (likely(!(desc->status & IRQ_DISABLED))) | 68 | if (unlikely(desc->status & IRQ_DISABLED)) |
69 | desc->chip->disable(irq); | 69 | return; |
70 | 70 | ||
71 | desc->chip->mask(irq); | ||
71 | move_masked_irq(irq); | 72 | move_masked_irq(irq); |
72 | 73 | desc->chip->unmask(irq); | |
73 | if (likely(!(desc->status & IRQ_DISABLED))) | ||
74 | desc->chip->enable(irq); | ||
75 | } | 74 | } |
76 | 75 | ||
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 61f5c717a8f5..2db91eb54ad8 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir; | |||
16 | 16 | ||
17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
18 | 18 | ||
19 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
20 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | ||
21 | { | ||
22 | set_balance_irq_affinity(irq, mask_val); | ||
23 | |||
24 | /* | ||
25 | * Save these away for later use. Re-progam when the | ||
26 | * interrupt is pending | ||
27 | */ | ||
28 | set_pending_irq(irq, mask_val); | ||
29 | } | ||
30 | #else | ||
31 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | ||
32 | { | ||
33 | set_balance_irq_affinity(irq, mask_val); | ||
34 | irq_desc[irq].affinity = mask_val; | ||
35 | irq_desc[irq].chip->set_affinity(irq, mask_val); | ||
36 | } | ||
37 | #endif | ||
38 | |||
39 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 19 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
40 | int count, int *eof, void *data) | 20 | int count, int *eof, void *data) |
41 | { | 21 | { |
@@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
55 | cpumask_t new_value, tmp; | 35 | cpumask_t new_value, tmp; |
56 | 36 | ||
57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || | 37 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || |
58 | CHECK_IRQ_PER_CPU(irq_desc[irq].status)) | 38 | irq_balancing_disabled(irq)) |
59 | return -EIO; | 39 | return -EIO; |
60 | 40 | ||
61 | err = cpumask_parse_user(buffer, count, new_value); | 41 | err = cpumask_parse_user(buffer, count, new_value); |
@@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
73 | code to set default SMP affinity. */ | 53 | code to set default SMP affinity. */ |
74 | return select_smp_affinity(irq) ? -EINVAL : full_count; | 54 | return select_smp_affinity(irq) ? -EINVAL : full_count; |
75 | 55 | ||
76 | proc_set_irq_affinity(irq, new_value); | 56 | irq_set_affinity(irq, new_value); |
77 | 57 | ||
78 | return full_count; | 58 | return full_count; |
79 | } | 59 | } |
@@ -136,7 +116,6 @@ void register_irq_proc(unsigned int irq) | |||
136 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); | 116 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); |
137 | 117 | ||
138 | if (entry) { | 118 | if (entry) { |
139 | entry->nlink = 1; | ||
140 | entry->data = (void *)(long)irq; | 119 | entry->data = (void *)(long)irq; |
141 | entry->read_proc = irq_affinity_read_proc; | 120 | entry->read_proc = irq_affinity_read_proc; |
142 | entry->write_proc = irq_affinity_write_proc; | 121 | entry->write_proc = irq_affinity_write_proc; |
diff --git a/kernel/itimer.c b/kernel/itimer.c index 204ed7939e75..307c6a632ef6 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c | |||
@@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) | |||
128 | /* | 128 | /* |
129 | * The timer is automagically restarted when interval != 0 | 129 | * The timer is automagically restarted when interval != 0 |
130 | */ | 130 | */ |
131 | int it_real_fn(struct hrtimer *timer) | 131 | enum hrtimer_restart it_real_fn(struct hrtimer *timer) |
132 | { | 132 | { |
133 | struct signal_struct *sig = | 133 | struct signal_struct *sig = |
134 | container_of(timer, struct signal_struct, real_timer); | 134 | container_of(timer, struct signal_struct, real_timer); |
135 | 135 | ||
136 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); | 136 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); |
137 | 137 | ||
138 | if (sig->it_real_incr.tv64 != 0) { | ||
139 | hrtimer_forward(timer, timer->base->softirq_time, | ||
140 | sig->it_real_incr); | ||
141 | return HRTIMER_RESTART; | ||
142 | } | ||
143 | return HRTIMER_NORESTART; | 138 | return HRTIMER_NORESTART; |
144 | } | 139 | } |
145 | 140 | ||
@@ -231,11 +226,14 @@ again: | |||
231 | spin_unlock_irq(&tsk->sighand->siglock); | 226 | spin_unlock_irq(&tsk->sighand->siglock); |
232 | goto again; | 227 | goto again; |
233 | } | 228 | } |
234 | tsk->signal->it_real_incr = | ||
235 | timeval_to_ktime(value->it_interval); | ||
236 | expires = timeval_to_ktime(value->it_value); | 229 | expires = timeval_to_ktime(value->it_value); |
237 | if (expires.tv64 != 0) | 230 | if (expires.tv64 != 0) { |
238 | hrtimer_start(timer, expires, HRTIMER_REL); | 231 | tsk->signal->it_real_incr = |
232 | timeval_to_ktime(value->it_interval); | ||
233 | hrtimer_start(timer, expires, HRTIMER_MODE_REL); | ||
234 | } else | ||
235 | tsk->signal->it_real_incr.tv64 = 0; | ||
236 | |||
239 | spin_unlock_irq(&tsk->sighand->siglock); | 237 | spin_unlock_irq(&tsk->sighand->siglock); |
240 | break; | 238 | break; |
241 | case ITIMER_VIRTUAL: | 239 | case ITIMER_VIRTUAL: |
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 5d1d907378a2..cee419143fd4 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -32,8 +32,8 @@ | |||
32 | * @gfp_mask: get_free_pages mask, passed to kmalloc() | 32 | * @gfp_mask: get_free_pages mask, passed to kmalloc() |
33 | * @lock: the lock to be used to protect the fifo buffer | 33 | * @lock: the lock to be used to protect the fifo buffer |
34 | * | 34 | * |
35 | * Do NOT pass the kfifo to kfifo_free() after use ! Simply free the | 35 | * Do NOT pass the kfifo to kfifo_free() after use! Simply free the |
36 | * struct kfifo with kfree(). | 36 | * &struct kfifo with kfree(). |
37 | */ | 37 | */ |
38 | struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, | 38 | struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, |
39 | gfp_t gfp_mask, spinlock_t *lock) | 39 | gfp_t gfp_mask, spinlock_t *lock) |
@@ -108,7 +108,7 @@ EXPORT_SYMBOL(kfifo_free); | |||
108 | * @buffer: the data to be added. | 108 | * @buffer: the data to be added. |
109 | * @len: the length of the data to be added. | 109 | * @len: the length of the data to be added. |
110 | * | 110 | * |
111 | * This function copies at most 'len' bytes from the 'buffer' into | 111 | * This function copies at most @len bytes from the @buffer into |
112 | * the FIFO depending on the free space, and returns the number of | 112 | * the FIFO depending on the free space, and returns the number of |
113 | * bytes copied. | 113 | * bytes copied. |
114 | * | 114 | * |
@@ -155,8 +155,8 @@ EXPORT_SYMBOL(__kfifo_put); | |||
155 | * @buffer: where the data must be copied. | 155 | * @buffer: where the data must be copied. |
156 | * @len: the size of the destination buffer. | 156 | * @len: the size of the destination buffer. |
157 | * | 157 | * |
158 | * This function copies at most 'len' bytes from the FIFO into the | 158 | * This function copies at most @len bytes from the FIFO into the |
159 | * 'buffer' and returns the number of copied bytes. | 159 | * @buffer and returns the number of copied bytes. |
160 | * | 160 | * |
161 | * Note that with only one concurrent reader and one concurrent | 161 | * Note that with only one concurrent reader and one concurrent |
162 | * writer, you don't need extra locking to use these functions. | 162 | * writer, you don't need extra locking to use these functions. |
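Reviewer note: these kerneldoc touch-ups describe the current kfifo API, which a short hypothetical sketch makes concrete (log_lock and log_fifo are invented; the fifo remembers the spinlock handed to kfifo_alloc() and the locked kfifo_put() wrapper takes it on each call):

static DEFINE_SPINLOCK(log_lock);
static struct kfifo *log_fifo;

static int __init log_init(void)
{
	log_fifo = kfifo_alloc(4096, GFP_KERNEL, &log_lock);
	return IS_ERR(log_fifo) ? PTR_ERR(log_fifo) : 0;
}

static void log_write(unsigned char *buf, unsigned int len)
{
	/* copies at most len bytes, returns the number actually stored */
	kfifo_put(log_fifo, buf, len);
}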
diff --git a/kernel/kmod.c b/kernel/kmod.c index 3a7379aa31ca..796276141e51 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -217,7 +217,10 @@ static int wait_for_helper(void *data) | |||
217 | sub_info->retval = ret; | 217 | sub_info->retval = ret; |
218 | } | 218 | } |
219 | 219 | ||
220 | complete(sub_info->complete); | 220 | if (sub_info->wait < 0) |
221 | kfree(sub_info); | ||
222 | else | ||
223 | complete(sub_info->complete); | ||
221 | return 0; | 224 | return 0; |
222 | } | 225 | } |
223 | 226 | ||
@@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work) | |||
239 | pid = kernel_thread(____call_usermodehelper, sub_info, | 242 | pid = kernel_thread(____call_usermodehelper, sub_info, |
240 | CLONE_VFORK | SIGCHLD); | 243 | CLONE_VFORK | SIGCHLD); |
241 | 244 | ||
245 | if (wait < 0) | ||
246 | return; | ||
247 | |||
242 | if (pid < 0) { | 248 | if (pid < 0) { |
243 | sub_info->retval = pid; | 249 | sub_info->retval = pid; |
244 | complete(sub_info->complete); | 250 | complete(sub_info->complete); |
@@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work) | |||
253 | * @envp: null-terminated environment list | 259 | * @envp: null-terminated environment list |
254 | * @session_keyring: session keyring for process (NULL for an empty keyring) | 260 | * @session_keyring: session keyring for process (NULL for an empty keyring) |
255 | * @wait: wait for the application to finish and return status. | 261 | * @wait: wait for the application to finish and return status. |
262 | * when -1 don't wait at all, but you get no useful error back when | ||
263 | * the program couldn't be exec'ed. This makes it safe to call | ||
264 | * from interrupt context. | ||
256 | * | 265 | * |
257 | * Runs a user-space application. The application is started | 266 | * Runs a user-space application. The application is started |
258 | * asynchronously if wait is not set, and runs as a child of keventd. | 267 | * asynchronously if wait is not set, and runs as a child of keventd. |
@@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
265 | struct key *session_keyring, int wait) | 274 | struct key *session_keyring, int wait) |
266 | { | 275 | { |
267 | DECLARE_COMPLETION_ONSTACK(done); | 276 | DECLARE_COMPLETION_ONSTACK(done); |
268 | struct subprocess_info sub_info = { | 277 | struct subprocess_info *sub_info; |
269 | .work = __WORK_INITIALIZER(sub_info.work, | 278 | int retval; |
270 | __call_usermodehelper), | ||
271 | .complete = &done, | ||
272 | .path = path, | ||
273 | .argv = argv, | ||
274 | .envp = envp, | ||
275 | .ring = session_keyring, | ||
276 | .wait = wait, | ||
277 | .retval = 0, | ||
278 | }; | ||
279 | 279 | ||
280 | if (!khelper_wq) | 280 | if (!khelper_wq) |
281 | return -EBUSY; | 281 | return -EBUSY; |
@@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
283 | if (path[0] == '\0') | 283 | if (path[0] == '\0') |
284 | return 0; | 284 | return 0; |
285 | 285 | ||
286 | queue_work(khelper_wq, &sub_info.work); | 286 | sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); |
287 | if (!sub_info) | ||
288 | return -ENOMEM; | ||
289 | |||
290 | INIT_WORK(&sub_info->work, __call_usermodehelper); | ||
291 | sub_info->complete = &done; | ||
292 | sub_info->path = path; | ||
293 | sub_info->argv = argv; | ||
294 | sub_info->envp = envp; | ||
295 | sub_info->ring = session_keyring; | ||
296 | sub_info->wait = wait; | ||
297 | |||
298 | queue_work(khelper_wq, &sub_info->work); | ||
299 | if (wait < 0) /* task has freed sub_info */ | ||
300 | return 0; | ||
287 | wait_for_completion(&done); | 301 | wait_for_completion(&done); |
288 | return sub_info.retval; | 302 | retval = sub_info->retval; |
303 | kfree(sub_info); | ||
304 | return retval; | ||
289 | } | 305 | } |
290 | EXPORT_SYMBOL(call_usermodehelper_keys); | 306 | EXPORT_SYMBOL(call_usermodehelper_keys); |
291 | 307 | ||
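
The wait < 0 handling above transfers ownership of sub_info to the helper thread: the caller must not touch it after queue_work(), and wait_for_helper() kfree()s it instead of signalling the completion, which is why no exit status can be reported. A fire-and-forget sketch (path and arguments invented for illustration):

    /* Safe even from contexts that cannot sleep on a completion,
     * because wait == -1 makes the helper thread free sub_info itself. */
    static void notify_userspace(void)
    {
            char *argv[] = { "/sbin/event-notify", "fired", NULL }; /* hypothetical */
            char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

            /* returns 0 once queued; the program's exit code is unavailable */
            call_usermodehelper_keys("/sbin/event-notify", argv, envp, NULL, -1);
    }
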
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6fcf8dd148d0..d25a9ada3f8e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -39,6 +39,8 @@ | |||
39 | #include <linux/moduleloader.h> | 39 | #include <linux/moduleloader.h> |
40 | #include <linux/kallsyms.h> | 40 | #include <linux/kallsyms.h> |
41 | #include <linux/freezer.h> | 41 | #include <linux/freezer.h> |
42 | #include <linux/seq_file.h> | ||
43 | #include <linux/debugfs.h> | ||
42 | #include <asm-generic/sections.h> | 44 | #include <asm-generic/sections.h> |
43 | #include <asm/cacheflush.h> | 45 | #include <asm/cacheflush.h> |
44 | #include <asm/errno.h> | 46 | #include <asm/errno.h> |
@@ -778,6 +780,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
778 | return -ENOSYS; | 780 | return -ENOSYS; |
779 | } | 781 | } |
780 | 782 | ||
783 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, | ||
784 | struct pt_regs *regs) | ||
785 | { | ||
786 | return 0; | ||
787 | } | ||
788 | |||
781 | #endif /* ARCH_SUPPORTS_KRETPROBES */ | 789 | #endif /* ARCH_SUPPORTS_KRETPROBES */ |
782 | 790 | ||
783 | void __kprobes unregister_kretprobe(struct kretprobe *rp) | 791 | void __kprobes unregister_kretprobe(struct kretprobe *rp) |
@@ -815,7 +823,109 @@ static int __init init_kprobes(void) | |||
815 | return err; | 823 | return err; |
816 | } | 824 | } |
817 | 825 | ||
818 | __initcall(init_kprobes); | 826 | #ifdef CONFIG_DEBUG_FS |
827 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | ||
828 | const char *sym, int offset,char *modname) | ||
829 | { | ||
830 | char *kprobe_type; | ||
831 | |||
832 | if (p->pre_handler == pre_handler_kretprobe) | ||
833 | kprobe_type = "r"; | ||
834 | else if (p->pre_handler == setjmp_pre_handler) | ||
835 | kprobe_type = "j"; | ||
836 | else | ||
837 | kprobe_type = "k"; | ||
838 | if (sym) | ||
839 | seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, | ||
840 | sym, offset, (modname ? modname : " ")); | ||
841 | else | ||
842 | seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); | ||
843 | } | ||
844 | |||
845 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) | ||
846 | { | ||
847 | return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; | ||
848 | } | ||
849 | |||
850 | static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) | ||
851 | { | ||
852 | (*pos)++; | ||
853 | if (*pos >= KPROBE_TABLE_SIZE) | ||
854 | return NULL; | ||
855 | return pos; | ||
856 | } | ||
857 | |||
858 | static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) | ||
859 | { | ||
860 | /* Nothing to do */ | ||
861 | } | ||
862 | |||
863 | static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | ||
864 | { | ||
865 | struct hlist_head *head; | ||
866 | struct hlist_node *node; | ||
867 | struct kprobe *p, *kp; | ||
868 | const char *sym = NULL; | ||
869 | unsigned int i = *(loff_t *) v; | ||
870 | unsigned long size, offset = 0; | ||
871 | char *modname, namebuf[128]; | ||
872 | |||
873 | head = &kprobe_table[i]; | ||
874 | preempt_disable(); | ||
875 | hlist_for_each_entry_rcu(p, node, head, hlist) { | ||
876 | sym = kallsyms_lookup((unsigned long)p->addr, &size, | ||
877 | &offset, &modname, namebuf); | ||
878 | if (p->pre_handler == aggr_pre_handler) { | ||
879 | list_for_each_entry_rcu(kp, &p->list, list) | ||
880 | report_probe(pi, kp, sym, offset, modname); | ||
881 | } else | ||
882 | report_probe(pi, p, sym, offset, modname); | ||
883 | } | ||
884 | preempt_enable(); | ||
885 | return 0; | ||
886 | } | ||
887 | |||
888 | static struct seq_operations kprobes_seq_ops = { | ||
889 | .start = kprobe_seq_start, | ||
890 | .next = kprobe_seq_next, | ||
891 | .stop = kprobe_seq_stop, | ||
892 | .show = show_kprobe_addr | ||
893 | }; | ||
894 | |||
895 | static int __kprobes kprobes_open(struct inode *inode, struct file *filp) | ||
896 | { | ||
897 | return seq_open(filp, &kprobes_seq_ops); | ||
898 | } | ||
899 | |||
900 | static struct file_operations debugfs_kprobes_operations = { | ||
901 | .open = kprobes_open, | ||
902 | .read = seq_read, | ||
903 | .llseek = seq_lseek, | ||
904 | .release = seq_release, | ||
905 | }; | ||
906 | |||
907 | static int __kprobes debugfs_kprobe_init(void) | ||
908 | { | ||
909 | struct dentry *dir, *file; | ||
910 | |||
911 | dir = debugfs_create_dir("kprobes", NULL); | ||
912 | if (!dir) | ||
913 | return -ENOMEM; | ||
914 | |||
915 | file = debugfs_create_file("list", 0444, dir , 0 , | ||
916 | &debugfs_kprobes_operations); | ||
917 | if (!file) { | ||
918 | debugfs_remove(dir); | ||
919 | return -ENOMEM; | ||
920 | } | ||
921 | |||
922 | return 0; | ||
923 | } | ||
924 | |||
925 | late_initcall(debugfs_kprobe_init); | ||
926 | #endif /* CONFIG_DEBUG_FS */ | ||
927 | |||
928 | module_init(init_kprobes); | ||
819 | 929 | ||
820 | EXPORT_SYMBOL_GPL(register_kprobe); | 930 | EXPORT_SYMBOL_GPL(register_kprobe); |
821 | EXPORT_SYMBOL_GPL(unregister_kprobe); | 931 | EXPORT_SYMBOL_GPL(unregister_kprobe); |
@@ -824,4 +934,3 @@ EXPORT_SYMBOL_GPL(unregister_jprobe); | |||
824 | EXPORT_SYMBOL_GPL(jprobe_return); | 934 | EXPORT_SYMBOL_GPL(jprobe_return); |
825 | EXPORT_SYMBOL_GPL(register_kretprobe); | 935 | EXPORT_SYMBOL_GPL(register_kretprobe); |
826 | EXPORT_SYMBOL_GPL(unregister_kretprobe); | 936 | EXPORT_SYMBOL_GPL(unregister_kretprobe); |
827 | |||
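
The debugfs listing added above is a textbook seq_file iterator over a fixed-size hash table: *pos indexes kprobe_table[], start/next merely bounds-check and advance it, and show() walks one bucket under preempt_disable() so the RCU list traversal stays safe. The same pattern in isolation (TABLE_SIZE and the payload are hypothetical):

    #include <linux/seq_file.h>

    /* Returning the pos pointer itself as the iterator is enough,
     * because show() only needs the table index. */
    static void *tbl_seq_start(struct seq_file *f, loff_t *pos)
    {
            return (*pos < TABLE_SIZE) ? pos : NULL;
    }

    static void *tbl_seq_next(struct seq_file *f, void *v, loff_t *pos)
    {
            return (++*pos < TABLE_SIZE) ? pos : NULL;
    }

    static void tbl_seq_stop(struct seq_file *f, void *v)
    {
            /* nothing to release */
    }

    static int tbl_seq_show(struct seq_file *m, void *v)
    {
            unsigned int i = *(loff_t *)v;

            seq_printf(m, "bucket %u\n", i);
            return 0;
    }
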
diff --git a/kernel/kthread.c b/kernel/kthread.c index 1db8c72d0d38..87c50ccd1d4e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -50,7 +50,7 @@ static struct kthread_stop_info kthread_stop_info; | |||
50 | /** | 50 | /** |
51 | * kthread_should_stop - should this kthread return now? | 51 | * kthread_should_stop - should this kthread return now? |
52 | * | 52 | * |
53 | * When someone calls kthread_stop on your kthread, it will be woken | 53 | * When someone calls kthread_stop() on your kthread, it will be woken |
54 | * and this will return true. You should then return, and your return | 54 | * and this will return true. You should then return, and your return |
55 | * value will be passed through to kthread_stop(). | 55 | * value will be passed through to kthread_stop(). |
56 | */ | 56 | */ |
@@ -143,7 +143,7 @@ static void keventd_create_kthread(struct work_struct *work) | |||
143 | * it. See also kthread_run(), kthread_create_on_cpu(). | 143 | * it. See also kthread_run(), kthread_create_on_cpu(). |
144 | * | 144 | * |
145 | * When woken, the thread will run @threadfn() with @data as its | 145 | * When woken, the thread will run @threadfn() with @data as its |
146 | * argument. @threadfn can either call do_exit() directly if it is a | 146 | * argument. @threadfn() can either call do_exit() directly if it is a |
147 | * standalone thread for which noone will call kthread_stop(), or | 147 | * standalone thread for which noone will call kthread_stop(), or |
148 | * return when 'kthread_should_stop()' is true (which means | 148 | * return when 'kthread_should_stop()' is true (which means |
149 | * kthread_stop() has been called). The return value should be zero | 149 | * kthread_stop() has been called). The return value should be zero |
@@ -192,7 +192,7 @@ EXPORT_SYMBOL(kthread_create); | |||
192 | * | 192 | * |
193 | * Description: This function is equivalent to set_cpus_allowed(), | 193 | * Description: This function is equivalent to set_cpus_allowed(), |
194 | * except that @cpu doesn't need to be online, and the thread must be | 194 | * except that @cpu doesn't need to be online, and the thread must be |
195 | * stopped (i.e., just returned from kthread_create(). | 195 | * stopped (i.e., just returned from kthread_create()). |
196 | */ | 196 | */ |
197 | void kthread_bind(struct task_struct *k, unsigned int cpu) | 197 | void kthread_bind(struct task_struct *k, unsigned int cpu) |
198 | { | 198 | { |
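
The kerneldoc fixes above spell out the kthread contract: once kthread_stop() is called, kthread_should_stop() returns true and the thread's return value is handed back to the stopper. The canonical worker loop that contract implies (do_work() is a stand-in):

    static int worker_fn(void *data)
    {
            while (!kthread_should_stop()) {
                    do_work(data);                      /* hypothetical work item */
                    schedule_timeout_interruptible(HZ); /* nap ~1s between passes */
            }
            return 0;   /* propagated to the kthread_stop() caller */
    }
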
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 509efd49540f..a08a17218dfa 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -70,6 +70,9 @@ static int graph_lock(void) | |||
70 | 70 | ||
71 | static inline int graph_unlock(void) | 71 | static inline int graph_unlock(void) |
72 | { | 72 | { |
73 | if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) | ||
74 | return DEBUG_LOCKS_WARN_ON(1); | ||
75 | |||
73 | __raw_spin_unlock(&lockdep_lock); | 76 | __raw_spin_unlock(&lockdep_lock); |
74 | return 0; | 77 | return 0; |
75 | } | 78 | } |
@@ -487,7 +490,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth) | |||
487 | * Add a new dependency to the head of the list: | 490 | * Add a new dependency to the head of the list: |
488 | */ | 491 | */ |
489 | static int add_lock_to_list(struct lock_class *class, struct lock_class *this, | 492 | static int add_lock_to_list(struct lock_class *class, struct lock_class *this, |
490 | struct list_head *head, unsigned long ip) | 493 | struct list_head *head, unsigned long ip, int distance) |
491 | { | 494 | { |
492 | struct lock_list *entry; | 495 | struct lock_list *entry; |
493 | /* | 496 | /* |
@@ -499,6 +502,7 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, | |||
499 | return 0; | 502 | return 0; |
500 | 503 | ||
501 | entry->class = this; | 504 | entry->class = this; |
505 | entry->distance = distance; | ||
502 | if (!save_trace(&entry->trace)) | 506 | if (!save_trace(&entry->trace)) |
503 | return 0; | 507 | return 0; |
504 | 508 | ||
@@ -712,6 +716,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) | |||
712 | struct lock_list *entry; | 716 | struct lock_list *entry; |
713 | int ret; | 717 | int ret; |
714 | 718 | ||
719 | if (!__raw_spin_is_locked(&lockdep_lock)) | ||
720 | return DEBUG_LOCKS_WARN_ON(1); | ||
721 | |||
715 | if (depth > max_recursion_depth) | 722 | if (depth > max_recursion_depth) |
716 | max_recursion_depth = depth; | 723 | max_recursion_depth = depth; |
717 | if (depth >= RECURSION_LIMIT) | 724 | if (depth >= RECURSION_LIMIT) |
@@ -900,7 +907,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, | |||
900 | */ | 907 | */ |
901 | static int | 908 | static int |
902 | check_prev_add(struct task_struct *curr, struct held_lock *prev, | 909 | check_prev_add(struct task_struct *curr, struct held_lock *prev, |
903 | struct held_lock *next) | 910 | struct held_lock *next, int distance) |
904 | { | 911 | { |
905 | struct lock_list *entry; | 912 | struct lock_list *entry; |
906 | int ret; | 913 | int ret; |
@@ -978,8 +985,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
978 | * L2 added to its dependency list, due to the first chain.) | 985 | * L2 added to its dependency list, due to the first chain.) |
979 | */ | 986 | */ |
980 | list_for_each_entry(entry, &prev->class->locks_after, entry) { | 987 | list_for_each_entry(entry, &prev->class->locks_after, entry) { |
981 | if (entry->class == next->class) | 988 | if (entry->class == next->class) { |
989 | if (distance == 1) | ||
990 | entry->distance = 1; | ||
982 | return 2; | 991 | return 2; |
992 | } | ||
983 | } | 993 | } |
984 | 994 | ||
985 | /* | 995 | /* |
@@ -987,12 +997,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
987 | * to the previous lock's dependency list: | 997 | * to the previous lock's dependency list: |
988 | */ | 998 | */ |
989 | ret = add_lock_to_list(prev->class, next->class, | 999 | ret = add_lock_to_list(prev->class, next->class, |
990 | &prev->class->locks_after, next->acquire_ip); | 1000 | &prev->class->locks_after, next->acquire_ip, distance); |
1001 | |||
991 | if (!ret) | 1002 | if (!ret) |
992 | return 0; | 1003 | return 0; |
993 | 1004 | ||
994 | ret = add_lock_to_list(next->class, prev->class, | 1005 | ret = add_lock_to_list(next->class, prev->class, |
995 | &next->class->locks_before, next->acquire_ip); | 1006 | &next->class->locks_before, next->acquire_ip, distance); |
996 | if (!ret) | 1007 | if (!ret) |
997 | return 0; | 1008 | return 0; |
998 | 1009 | ||
@@ -1040,13 +1051,14 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) | |||
1040 | goto out_bug; | 1051 | goto out_bug; |
1041 | 1052 | ||
1042 | for (;;) { | 1053 | for (;;) { |
1054 | int distance = curr->lockdep_depth - depth + 1; | ||
1043 | hlock = curr->held_locks + depth-1; | 1055 | hlock = curr->held_locks + depth-1; |
1044 | /* | 1056 | /* |
1045 | * Only non-recursive-read entries get new dependencies | 1057 | * Only non-recursive-read entries get new dependencies |
1046 | * added: | 1058 | * added: |
1047 | */ | 1059 | */ |
1048 | if (hlock->read != 2) { | 1060 | if (hlock->read != 2) { |
1049 | if (!check_prev_add(curr, hlock, next)) | 1061 | if (!check_prev_add(curr, hlock, next, distance)) |
1050 | return 0; | 1062 | return 0; |
1051 | /* | 1063 | /* |
1052 | * Stop after the first non-trylock entry, | 1064 | * Stop after the first non-trylock entry, |
@@ -1293,7 +1305,8 @@ out_unlock_set: | |||
1293 | if (!subclass || force) | 1305 | if (!subclass || force) |
1294 | lock->class_cache = class; | 1306 | lock->class_cache = class; |
1295 | 1307 | ||
1296 | DEBUG_LOCKS_WARN_ON(class->subclass != subclass); | 1308 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) |
1309 | return NULL; | ||
1297 | 1310 | ||
1298 | return class; | 1311 | return class; |
1299 | } | 1312 | } |
@@ -1308,7 +1321,8 @@ static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) | |||
1308 | struct list_head *hash_head = chainhashentry(chain_key); | 1321 | struct list_head *hash_head = chainhashentry(chain_key); |
1309 | struct lock_chain *chain; | 1322 | struct lock_chain *chain; |
1310 | 1323 | ||
1311 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); | 1324 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
1325 | return 0; | ||
1312 | /* | 1326 | /* |
1313 | * We can walk it lock-free, because entries only get added | 1327 | * We can walk it lock-free, because entries only get added |
1314 | * to the hash: | 1328 | * to the hash: |
@@ -1394,7 +1408,9 @@ static void check_chain_key(struct task_struct *curr) | |||
1394 | return; | 1408 | return; |
1395 | } | 1409 | } |
1396 | id = hlock->class - lock_classes; | 1410 | id = hlock->class - lock_classes; |
1397 | DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS); | 1411 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) |
1412 | return; | ||
1413 | |||
1398 | if (prev_hlock && (prev_hlock->irq_context != | 1414 | if (prev_hlock && (prev_hlock->irq_context != |
1399 | hlock->irq_context)) | 1415 | hlock->irq_context)) |
1400 | chain_key = 0; | 1416 | chain_key = 0; |
@@ -2205,15 +2221,24 @@ out_calc_hash: | |||
2205 | if (!check_prevs_add(curr, hlock)) | 2221 | if (!check_prevs_add(curr, hlock)) |
2206 | return 0; | 2222 | return 0; |
2207 | graph_unlock(); | 2223 | graph_unlock(); |
2208 | } | 2224 | } else |
2225 | /* after lookup_chain_cache(): */ | ||
2226 | if (unlikely(!debug_locks)) | ||
2227 | return 0; | ||
2228 | |||
2209 | curr->lockdep_depth++; | 2229 | curr->lockdep_depth++; |
2210 | check_chain_key(curr); | 2230 | check_chain_key(curr); |
2231 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
2232 | if (unlikely(!debug_locks)) | ||
2233 | return 0; | ||
2234 | #endif | ||
2211 | if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { | 2235 | if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { |
2212 | debug_locks_off(); | 2236 | debug_locks_off(); |
2213 | printk("BUG: MAX_LOCK_DEPTH too low!\n"); | 2237 | printk("BUG: MAX_LOCK_DEPTH too low!\n"); |
2214 | printk("turning off the locking correctness validator.\n"); | 2238 | printk("turning off the locking correctness validator.\n"); |
2215 | return 0; | 2239 | return 0; |
2216 | } | 2240 | } |
2241 | |||
2217 | if (unlikely(curr->lockdep_depth > max_lockdep_depth)) | 2242 | if (unlikely(curr->lockdep_depth > max_lockdep_depth)) |
2218 | max_lockdep_depth = curr->lockdep_depth; | 2243 | max_lockdep_depth = curr->lockdep_depth; |
2219 | 2244 | ||
@@ -2764,4 +2789,3 @@ void debug_show_held_locks(struct task_struct *task) | |||
2764 | } | 2789 | } |
2765 | 2790 | ||
2766 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 2791 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
2767 | |||
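
The new distance field records how far apart two locks sat on the held-lock stack when the dependency was created, so /proc/lockdep can later single out direct (distance == 1) nestings. Worked through the expression in check_prevs_add():

    /* With curr->lockdep_depth == 4, i.e. "next" will become lock #5:
     *   the lock held immediately below it (depth == 4) gets
     *       distance = 4 - 4 + 1 = 1   (a direct nesting), and
     *   the one beneath that (depth == 3) gets
     *       distance = 4 - 3 + 1 = 2, and so on down the stack. */
    int distance = curr->lockdep_depth - depth + 1;

The remaining hunks harden the debug checks themselves: DEBUG_LOCKS_WARN_ON() results are now acted upon instead of ignored, so a tripped assertion bails out early rather than continuing with inconsistent state.
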
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index b554b40a4aa6..58f35e586ee3 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -10,7 +10,6 @@ | |||
10 | * Code for /proc/lockdep and /proc/lockdep_stats: | 10 | * Code for /proc/lockdep and /proc/lockdep_stats: |
11 | * | 11 | * |
12 | */ | 12 | */ |
13 | #include <linux/sched.h> | ||
14 | #include <linux/module.h> | 13 | #include <linux/module.h> |
15 | #include <linux/proc_fs.h> | 14 | #include <linux/proc_fs.h> |
16 | #include <linux/seq_file.h> | 15 | #include <linux/seq_file.h> |
@@ -77,12 +76,29 @@ static unsigned long count_backward_deps(struct lock_class *class) | |||
77 | return ret; | 76 | return ret; |
78 | } | 77 | } |
79 | 78 | ||
79 | static void print_name(struct seq_file *m, struct lock_class *class) | ||
80 | { | ||
81 | char str[128]; | ||
82 | const char *name = class->name; | ||
83 | |||
84 | if (!name) { | ||
85 | name = __get_key_name(class->key, str); | ||
86 | seq_printf(m, "%s", name); | ||
87 | } else{ | ||
88 | seq_printf(m, "%s", name); | ||
89 | if (class->name_version > 1) | ||
90 | seq_printf(m, "#%d", class->name_version); | ||
91 | if (class->subclass) | ||
92 | seq_printf(m, "/%d", class->subclass); | ||
93 | } | ||
94 | } | ||
95 | |||
80 | static int l_show(struct seq_file *m, void *v) | 96 | static int l_show(struct seq_file *m, void *v) |
81 | { | 97 | { |
82 | unsigned long nr_forward_deps, nr_backward_deps; | 98 | unsigned long nr_forward_deps, nr_backward_deps; |
83 | struct lock_class *class = m->private; | 99 | struct lock_class *class = m->private; |
84 | char str[128], c1, c2, c3, c4; | 100 | struct lock_list *entry; |
85 | const char *name; | 101 | char c1, c2, c3, c4; |
86 | 102 | ||
87 | seq_printf(m, "%p", class->key); | 103 | seq_printf(m, "%p", class->key); |
88 | #ifdef CONFIG_DEBUG_LOCKDEP | 104 | #ifdef CONFIG_DEBUG_LOCKDEP |
@@ -97,16 +113,16 @@ static int l_show(struct seq_file *m, void *v) | |||
97 | get_usage_chars(class, &c1, &c2, &c3, &c4); | 113 | get_usage_chars(class, &c1, &c2, &c3, &c4); |
98 | seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); | 114 | seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); |
99 | 115 | ||
100 | name = class->name; | 116 | seq_printf(m, ": "); |
101 | if (!name) { | 117 | print_name(m, class); |
102 | name = __get_key_name(class->key, str); | 118 | seq_puts(m, "\n"); |
103 | seq_printf(m, ": %s", name); | 119 | |
104 | } else{ | 120 | list_for_each_entry(entry, &class->locks_after, entry) { |
105 | seq_printf(m, ": %s", name); | 121 | if (entry->distance == 1) { |
106 | if (class->name_version > 1) | 122 | seq_printf(m, " -> [%p] ", entry->class); |
107 | seq_printf(m, "#%d", class->name_version); | 123 | print_name(m, entry->class); |
108 | if (class->subclass) | 124 | seq_puts(m, "\n"); |
109 | seq_printf(m, "/%d", class->subclass); | 125 | } |
110 | } | 126 | } |
111 | seq_puts(m, "\n"); | 127 | seq_puts(m, "\n"); |
112 | 128 | ||
@@ -227,7 +243,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) | |||
227 | 243 | ||
228 | sum_forward_deps += count_forward_deps(class); | 244 | sum_forward_deps += count_forward_deps(class); |
229 | } | 245 | } |
230 | #ifdef CONFIG_LOCKDEP_DEBUG | 246 | #ifdef CONFIG_DEBUG_LOCKDEP |
231 | DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); | 247 | DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); |
232 | #endif | 248 | #endif |
233 | seq_printf(m, " lock-classes: %11lu [max: %lu]\n", | 249 | seq_printf(m, " lock-classes: %11lu [max: %lu]\n", |
diff --git a/kernel/module.c b/kernel/module.c index d0f2260a0210..f77e893e4620 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -537,6 +537,8 @@ static int already_uses(struct module *a, struct module *b) | |||
537 | static int use_module(struct module *a, struct module *b) | 537 | static int use_module(struct module *a, struct module *b) |
538 | { | 538 | { |
539 | struct module_use *use; | 539 | struct module_use *use; |
540 | int no_warn; | ||
541 | |||
540 | if (b == NULL || already_uses(a, b)) return 1; | 542 | if (b == NULL || already_uses(a, b)) return 1; |
541 | 543 | ||
542 | if (!strong_try_module_get(b)) | 544 | if (!strong_try_module_get(b)) |
@@ -552,6 +554,7 @@ static int use_module(struct module *a, struct module *b) | |||
552 | 554 | ||
553 | use->module_which_uses = a; | 555 | use->module_which_uses = a; |
554 | list_add(&use->list, &b->modules_which_use_me); | 556 | list_add(&use->list, &b->modules_which_use_me); |
557 | no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); | ||
555 | return 1; | 558 | return 1; |
556 | } | 559 | } |
557 | 560 | ||
@@ -569,6 +572,7 @@ static void module_unload_free(struct module *mod) | |||
569 | module_put(i); | 572 | module_put(i); |
570 | list_del(&use->list); | 573 | list_del(&use->list); |
571 | kfree(use); | 574 | kfree(use); |
575 | sysfs_remove_link(i->holders_dir, mod->name); | ||
572 | /* There can be at most one match. */ | 576 | /* There can be at most one match. */ |
573 | break; | 577 | break; |
574 | } | 578 | } |
@@ -1064,7 +1068,8 @@ static inline void remove_sect_attrs(struct module *mod) | |||
1064 | } | 1068 | } |
1065 | #endif /* CONFIG_KALLSYMS */ | 1069 | #endif /* CONFIG_KALLSYMS */ |
1066 | 1070 | ||
1067 | static int module_add_modinfo_attrs(struct module *mod) | 1071 | #ifdef CONFIG_SYSFS |
1072 | int module_add_modinfo_attrs(struct module *mod) | ||
1068 | { | 1073 | { |
1069 | struct module_attribute *attr; | 1074 | struct module_attribute *attr; |
1070 | struct module_attribute *temp_attr; | 1075 | struct module_attribute *temp_attr; |
@@ -1090,7 +1095,7 @@ static int module_add_modinfo_attrs(struct module *mod) | |||
1090 | return error; | 1095 | return error; |
1091 | } | 1096 | } |
1092 | 1097 | ||
1093 | static void module_remove_modinfo_attrs(struct module *mod) | 1098 | void module_remove_modinfo_attrs(struct module *mod) |
1094 | { | 1099 | { |
1095 | struct module_attribute *attr; | 1100 | struct module_attribute *attr; |
1096 | int i; | 1101 | int i; |
@@ -1105,10 +1110,10 @@ static void module_remove_modinfo_attrs(struct module *mod) | |||
1105 | } | 1110 | } |
1106 | kfree(mod->modinfo_attrs); | 1111 | kfree(mod->modinfo_attrs); |
1107 | } | 1112 | } |
1113 | #endif | ||
1108 | 1114 | ||
1109 | static int mod_sysfs_setup(struct module *mod, | 1115 | #ifdef CONFIG_SYSFS |
1110 | struct kernel_param *kparam, | 1116 | int mod_sysfs_init(struct module *mod) |
1111 | unsigned int num_params) | ||
1112 | { | 1117 | { |
1113 | int err; | 1118 | int err; |
1114 | 1119 | ||
@@ -1125,21 +1130,30 @@ static int mod_sysfs_setup(struct module *mod, | |||
1125 | kobj_set_kset_s(&mod->mkobj, module_subsys); | 1130 | kobj_set_kset_s(&mod->mkobj, module_subsys); |
1126 | mod->mkobj.mod = mod; | 1131 | mod->mkobj.mod = mod; |
1127 | 1132 | ||
1128 | /* delay uevent until full sysfs population */ | ||
1129 | kobject_init(&mod->mkobj.kobj); | 1133 | kobject_init(&mod->mkobj.kobj); |
1134 | |||
1135 | out: | ||
1136 | return err; | ||
1137 | } | ||
1138 | |||
1139 | int mod_sysfs_setup(struct module *mod, | ||
1140 | struct kernel_param *kparam, | ||
1141 | unsigned int num_params) | ||
1142 | { | ||
1143 | int err; | ||
1144 | |||
1145 | /* delay uevent until full sysfs population */ | ||
1130 | err = kobject_add(&mod->mkobj.kobj); | 1146 | err = kobject_add(&mod->mkobj.kobj); |
1131 | if (err) | 1147 | if (err) |
1132 | goto out; | 1148 | goto out; |
1133 | 1149 | ||
1134 | mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers"); | 1150 | mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders"); |
1135 | if (!mod->drivers_dir) { | 1151 | if (!mod->holders_dir) |
1136 | err = -ENOMEM; | ||
1137 | goto out_unreg; | 1152 | goto out_unreg; |
1138 | } | ||
1139 | 1153 | ||
1140 | err = module_param_sysfs_setup(mod, kparam, num_params); | 1154 | err = module_param_sysfs_setup(mod, kparam, num_params); |
1141 | if (err) | 1155 | if (err) |
1142 | goto out_unreg_drivers; | 1156 | goto out_unreg_holders; |
1143 | 1157 | ||
1144 | err = module_add_modinfo_attrs(mod); | 1158 | err = module_add_modinfo_attrs(mod); |
1145 | if (err) | 1159 | if (err) |
@@ -1150,21 +1164,22 @@ static int mod_sysfs_setup(struct module *mod, | |||
1150 | 1164 | ||
1151 | out_unreg_param: | 1165 | out_unreg_param: |
1152 | module_param_sysfs_remove(mod); | 1166 | module_param_sysfs_remove(mod); |
1153 | out_unreg_drivers: | 1167 | out_unreg_holders: |
1154 | kobject_unregister(mod->drivers_dir); | 1168 | kobject_unregister(mod->holders_dir); |
1155 | out_unreg: | 1169 | out_unreg: |
1156 | kobject_del(&mod->mkobj.kobj); | 1170 | kobject_del(&mod->mkobj.kobj); |
1157 | kobject_put(&mod->mkobj.kobj); | 1171 | kobject_put(&mod->mkobj.kobj); |
1158 | out: | 1172 | out: |
1159 | return err; | 1173 | return err; |
1160 | } | 1174 | } |
1175 | #endif | ||
1161 | 1176 | ||
1162 | static void mod_kobject_remove(struct module *mod) | 1177 | static void mod_kobject_remove(struct module *mod) |
1163 | { | 1178 | { |
1164 | module_remove_modinfo_attrs(mod); | 1179 | module_remove_modinfo_attrs(mod); |
1165 | module_param_sysfs_remove(mod); | 1180 | module_param_sysfs_remove(mod); |
1166 | kobject_unregister(mod->drivers_dir); | 1181 | kobject_unregister(mod->mkobj.drivers_dir); |
1167 | 1182 | kobject_unregister(mod->holders_dir); | |
1168 | kobject_unregister(&mod->mkobj.kobj); | 1183 | kobject_unregister(&mod->mkobj.kobj); |
1169 | } | 1184 | } |
1170 | 1185 | ||
@@ -1768,6 +1783,10 @@ static struct module *load_module(void __user *umod, | |||
1768 | /* Now we've moved module, initialize linked lists, etc. */ | 1783 | /* Now we've moved module, initialize linked lists, etc. */ |
1769 | module_unload_init(mod); | 1784 | module_unload_init(mod); |
1770 | 1785 | ||
1786 | /* Initialize kobject, so we can reference it. */ | ||
1787 | if (mod_sysfs_init(mod) != 0) | ||
1788 | goto cleanup; | ||
1789 | |||
1771 | /* Set up license info based on the info section */ | 1790 | /* Set up license info based on the info section */ |
1772 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1791 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
1773 | 1792 | ||
@@ -2327,6 +2346,7 @@ void print_modules(void) | |||
2327 | printk("\n"); | 2346 | printk("\n"); |
2328 | } | 2347 | } |
2329 | 2348 | ||
2349 | #ifdef CONFIG_SYSFS | ||
2330 | static char *make_driver_name(struct device_driver *drv) | 2350 | static char *make_driver_name(struct device_driver *drv) |
2331 | { | 2351 | { |
2332 | char *driver_name; | 2352 | char *driver_name; |
@@ -2340,19 +2360,43 @@ static char *make_driver_name(struct device_driver *drv) | |||
2340 | return driver_name; | 2360 | return driver_name; |
2341 | } | 2361 | } |
2342 | 2362 | ||
2363 | static void module_create_drivers_dir(struct module_kobject *mk) | ||
2364 | { | ||
2365 | if (!mk || mk->drivers_dir) | ||
2366 | return; | ||
2367 | |||
2368 | mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers"); | ||
2369 | } | ||
2370 | |||
2343 | void module_add_driver(struct module *mod, struct device_driver *drv) | 2371 | void module_add_driver(struct module *mod, struct device_driver *drv) |
2344 | { | 2372 | { |
2345 | char *driver_name; | 2373 | char *driver_name; |
2346 | int no_warn; | 2374 | int no_warn; |
2375 | struct module_kobject *mk = NULL; | ||
2347 | 2376 | ||
2348 | if (!mod || !drv) | 2377 | if (!drv) |
2378 | return; | ||
2379 | |||
2380 | if (mod) | ||
2381 | mk = &mod->mkobj; | ||
2382 | else if (drv->mod_name) { | ||
2383 | struct kobject *mkobj; | ||
2384 | |||
2385 | /* Lookup built-in module entry in /sys/modules */ | ||
2386 | mkobj = kset_find_obj(&module_subsys.kset, drv->mod_name); | ||
2387 | if (mkobj) | ||
2388 | mk = container_of(mkobj, struct module_kobject, kobj); | ||
2389 | } | ||
2390 | |||
2391 | if (!mk) | ||
2349 | return; | 2392 | return; |
2350 | 2393 | ||
2351 | /* Don't check return codes; these calls are idempotent */ | 2394 | /* Don't check return codes; these calls are idempotent */ |
2352 | no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); | 2395 | no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module"); |
2353 | driver_name = make_driver_name(drv); | 2396 | driver_name = make_driver_name(drv); |
2354 | if (driver_name) { | 2397 | if (driver_name) { |
2355 | no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, | 2398 | module_create_drivers_dir(mk); |
2399 | no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj, | ||
2356 | driver_name); | 2400 | driver_name); |
2357 | kfree(driver_name); | 2401 | kfree(driver_name); |
2358 | } | 2402 | } |
@@ -2367,16 +2411,23 @@ void module_remove_driver(struct device_driver *drv) | |||
2367 | return; | 2411 | return; |
2368 | 2412 | ||
2369 | sysfs_remove_link(&drv->kobj, "module"); | 2413 | sysfs_remove_link(&drv->kobj, "module"); |
2370 | if (drv->owner && drv->owner->drivers_dir) { | 2414 | if (drv->owner && drv->owner->mkobj.drivers_dir) { |
2371 | driver_name = make_driver_name(drv); | 2415 | driver_name = make_driver_name(drv); |
2372 | if (driver_name) { | 2416 | if (driver_name) { |
2373 | sysfs_remove_link(drv->owner->drivers_dir, | 2417 | sysfs_remove_link(drv->owner->mkobj.drivers_dir, |
2374 | driver_name); | 2418 | driver_name); |
2375 | kfree(driver_name); | 2419 | kfree(driver_name); |
2376 | } | 2420 | } |
2377 | } | 2421 | } |
2422 | /* | ||
2423 | * Undo the additional reference we added in module_add_driver() | ||
2424 | * via kset_find_obj() | ||
2425 | */ | ||
2426 | if (drv->mod_name) | ||
2427 | kobject_put(&drv->kobj); | ||
2378 | } | 2428 | } |
2379 | EXPORT_SYMBOL(module_remove_driver); | 2429 | EXPORT_SYMBOL(module_remove_driver); |
2430 | #endif | ||
2380 | 2431 | ||
2381 | #ifdef CONFIG_MODVERSIONS | 2432 | #ifdef CONFIG_MODVERSIONS |
2382 | /* Generate the signature for struct module here, too, for modversions. */ | 2433 | /* Generate the signature for struct module here, too, for modversions. */ |
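
Two sysfs additions run through the hunks above: use_module() now materialises module dependencies as symlinks under a new holders directory, and module_add_driver() learns to find the /sys/module entry of a built-in driver via drv->mod_name when mod is NULL. The resulting layout, illustrated with a real dependency pair:

    /*
     * After "modprobe usb-storage" (usb-storage pins usbcore):
     *
     *   /sys/module/usbcore/holders/usb_storage -> ../../usb_storage
     *
     * Each symlink under <b>/holders names a module that keeps b loaded;
     * it is created in use_module() and removed in module_unload_free(),
     * mirroring lsmod's "Used by" column.
     */
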
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 841539d72c55..d17436cdea1b 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -13,7 +13,6 @@ | |||
13 | * Released under the General Public License (GPL). | 13 | * Released under the General Public License (GPL). |
14 | */ | 14 | */ |
15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
16 | #include <linux/sched.h> | ||
17 | #include <linux/delay.h> | 16 | #include <linux/delay.h> |
18 | #include <linux/module.h> | 17 | #include <linux/module.h> |
19 | #include <linux/poison.h> | 18 | #include <linux/poison.h> |
diff --git a/kernel/panic.c b/kernel/panic.c index 525e365f7239..623d1828259a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -150,6 +150,7 @@ EXPORT_SYMBOL(panic); | |||
150 | * 'R' - User forced a module unload. | 150 | * 'R' - User forced a module unload. |
151 | * 'M' - Machine had a machine check experience. | 151 | * 'M' - Machine had a machine check experience. |
152 | * 'B' - System has hit bad_page. | 152 | * 'B' - System has hit bad_page. |
153 | * 'U' - Userspace-defined naughtiness. | ||
153 | * | 154 | * |
154 | * The string is overwritten by the next call to print_taint(). | 155 | * The string is overwritten by the next call to print_taint(). |
155 | */ | 156 | */ |
@@ -158,13 +159,14 @@ const char *print_tainted(void) | |||
158 | { | 159 | { |
159 | static char buf[20]; | 160 | static char buf[20]; |
160 | if (tainted) { | 161 | if (tainted) { |
161 | snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c", | 162 | snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", |
162 | tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', | 163 | tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', |
163 | tainted & TAINT_FORCED_MODULE ? 'F' : ' ', | 164 | tainted & TAINT_FORCED_MODULE ? 'F' : ' ', |
164 | tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', | 165 | tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', |
165 | tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', | 166 | tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', |
166 | tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', | 167 | tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', |
167 | tainted & TAINT_BAD_PAGE ? 'B' : ' '); | 168 | tainted & TAINT_BAD_PAGE ? 'B' : ' ', |
169 | tainted & TAINT_USER ? 'U' : ' '); | ||
168 | } | 170 | } |
169 | else | 171 | else |
170 | snprintf(buf, sizeof(buf), "Not tainted"); | 172 | snprintf(buf, sizeof(buf), "Not tainted"); |
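
print_tainted() emits one character per taint bit; this change wires the new 'U' (TAINT_USER) flag into the format string. An equivalent table-driven form of the mapping (a sketch, using the TAINT_* constants from include/linux/kernel.h):

    static const struct {
            unsigned int bit;
            char set, unset;
    } taint_flags[] = {
            { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
            { TAINT_FORCED_MODULE,      'F', ' ' },
            { TAINT_UNSAFE_SMP,         'S', ' ' },
            { TAINT_FORCED_RMMOD,       'R', ' ' },
            { TAINT_MACHINE_CHECK,      'M', ' ' },
            { TAINT_BAD_PAGE,           'B', ' ' },
            { TAINT_USER,               'U', ' ' },  /* added here */
    };
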
diff --git a/kernel/params.c b/kernel/params.c index 718945da8f58..e265b13195b1 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -389,6 +389,7 @@ struct module_param_attrs | |||
389 | struct param_attribute attrs[0]; | 389 | struct param_attribute attrs[0]; |
390 | }; | 390 | }; |
391 | 391 | ||
392 | #ifdef CONFIG_SYSFS | ||
392 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr); | 393 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr); |
393 | 394 | ||
394 | static ssize_t param_attr_show(struct module_attribute *mattr, | 395 | static ssize_t param_attr_show(struct module_attribute *mattr, |
@@ -424,6 +425,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, | |||
424 | return len; | 425 | return len; |
425 | return err; | 426 | return err; |
426 | } | 427 | } |
428 | #endif | ||
427 | 429 | ||
428 | #ifdef CONFIG_MODULES | 430 | #ifdef CONFIG_MODULES |
429 | #define __modinit | 431 | #define __modinit |
@@ -431,6 +433,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, | |||
431 | #define __modinit __init | 433 | #define __modinit __init |
432 | #endif | 434 | #endif |
433 | 435 | ||
436 | #ifdef CONFIG_SYSFS | ||
434 | /* | 437 | /* |
435 | * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME | 438 | * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME |
436 | * @mk: struct module_kobject (contains parent kobject) | 439 | * @mk: struct module_kobject (contains parent kobject) |
@@ -498,9 +501,7 @@ param_sysfs_setup(struct module_kobject *mk, | |||
498 | return mp; | 501 | return mp; |
499 | } | 502 | } |
500 | 503 | ||
501 | |||
502 | #ifdef CONFIG_MODULES | 504 | #ifdef CONFIG_MODULES |
503 | |||
504 | /* | 505 | /* |
505 | * module_param_sysfs_setup - setup sysfs support for one module | 506 | * module_param_sysfs_setup - setup sysfs support for one module |
506 | * @mod: module | 507 | * @mod: module |
@@ -561,14 +562,11 @@ static void __init kernel_param_sysfs_setup(const char *name, | |||
561 | mk->mod = THIS_MODULE; | 562 | mk->mod = THIS_MODULE; |
562 | kobj_set_kset_s(mk, module_subsys); | 563 | kobj_set_kset_s(mk, module_subsys); |
563 | kobject_set_name(&mk->kobj, name); | 564 | kobject_set_name(&mk->kobj, name); |
564 | ret = kobject_register(&mk->kobj); | 565 | kobject_init(&mk->kobj); |
566 | ret = kobject_add(&mk->kobj); | ||
565 | BUG_ON(ret < 0); | 567 | BUG_ON(ret < 0); |
566 | 568 | param_sysfs_setup(mk, kparam, num_params, name_skip); | |
567 | /* no need to keep the kobject if no parameter is exported */ | 569 | kobject_uevent(&mk->kobj, KOBJ_ADD); |
568 | if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { | ||
569 | kobject_unregister(&mk->kobj); | ||
570 | kfree(mk); | ||
571 | } | ||
572 | } | 570 | } |
573 | 571 | ||
574 | /* | 572 | /* |
@@ -626,7 +624,6 @@ static void __init param_sysfs_builtin(void) | |||
626 | 624 | ||
627 | 625 | ||
628 | /* module-related sysfs stuff */ | 626 | /* module-related sysfs stuff */ |
629 | #ifdef CONFIG_SYSFS | ||
630 | 627 | ||
631 | #define to_module_attr(n) container_of(n, struct module_attribute, attr); | 628 | #define to_module_attr(n) container_of(n, struct module_attribute, attr); |
632 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj); | 629 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj); |
@@ -674,19 +671,27 @@ static struct sysfs_ops module_sysfs_ops = { | |||
674 | .store = module_attr_store, | 671 | .store = module_attr_store, |
675 | }; | 672 | }; |
676 | 673 | ||
677 | #else | 674 | static struct kobj_type module_ktype; |
678 | static struct sysfs_ops module_sysfs_ops = { | 675 | |
679 | .show = NULL, | 676 | static int uevent_filter(struct kset *kset, struct kobject *kobj) |
680 | .store = NULL, | 677 | { |
678 | struct kobj_type *ktype = get_ktype(kobj); | ||
679 | |||
680 | if (ktype == &module_ktype) | ||
681 | return 1; | ||
682 | return 0; | ||
683 | } | ||
684 | |||
685 | static struct kset_uevent_ops module_uevent_ops = { | ||
686 | .filter = uevent_filter, | ||
681 | }; | 687 | }; |
682 | #endif | 688 | |
689 | decl_subsys(module, &module_ktype, &module_uevent_ops); | ||
683 | 690 | ||
684 | static struct kobj_type module_ktype = { | 691 | static struct kobj_type module_ktype = { |
685 | .sysfs_ops = &module_sysfs_ops, | 692 | .sysfs_ops = &module_sysfs_ops, |
686 | }; | 693 | }; |
687 | 694 | ||
688 | decl_subsys(module, &module_ktype, NULL); | ||
689 | |||
690 | /* | 695 | /* |
691 | * param_sysfs_init - wrapper for built-in params support | 696 | * param_sysfs_init - wrapper for built-in params support |
692 | */ | 697 | */ |
@@ -707,6 +712,15 @@ static int __init param_sysfs_init(void) | |||
707 | } | 712 | } |
708 | subsys_initcall(param_sysfs_init); | 713 | subsys_initcall(param_sysfs_init); |
709 | 714 | ||
715 | #else | ||
716 | #if 0 | ||
717 | static struct sysfs_ops module_sysfs_ops = { | ||
718 | .show = NULL, | ||
719 | .store = NULL, | ||
720 | }; | ||
721 | #endif | ||
722 | #endif | ||
723 | |||
710 | EXPORT_SYMBOL(param_set_byte); | 724 | EXPORT_SYMBOL(param_set_byte); |
711 | EXPORT_SYMBOL(param_get_byte); | 725 | EXPORT_SYMBOL(param_get_byte); |
712 | EXPORT_SYMBOL(param_set_short); | 726 | EXPORT_SYMBOL(param_set_short); |
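
The uevent machinery added above does two things: module_uevent_ops filters KOBJ_* events so only kobjects whose ktype is module_ktype (the modules themselves, not their child directories) reach userspace, and kernel_param_sysfs_setup() switches from kobject_register() to the split init/add sequence so the ADD event fires only after the parameter attributes exist. The idiom in isolation (the populate step is hypothetical):

    /* "Register quietly, announce when populated":
     * kobject_register() would emit KOBJ_ADD immediately, so split it. */
    kobject_init(&mk->kobj);
    err = kobject_add(&mk->kobj);          /* visible in sysfs, no event yet */
    if (!err) {
            populate_param_attrs(&mk->kobj);       /* hypothetical populate step */
            kobject_uevent(&mk->kobj, KOBJ_ADD);   /* now notify userspace */
    }
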
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 7c3e1e6dfb5b..657f77697415 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | |||
304 | * should be able to see it. | 304 | * should be able to see it. |
305 | */ | 305 | */ |
306 | struct task_struct *p; | 306 | struct task_struct *p; |
307 | read_lock(&tasklist_lock); | 307 | rcu_read_lock(); |
308 | p = find_task_by_pid(pid); | 308 | p = find_task_by_pid(pid); |
309 | if (p) { | 309 | if (p) { |
310 | if (CPUCLOCK_PERTHREAD(which_clock)) { | 310 | if (CPUCLOCK_PERTHREAD(which_clock)) { |
@@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | |||
312 | error = cpu_clock_sample(which_clock, | 312 | error = cpu_clock_sample(which_clock, |
313 | p, &rtn); | 313 | p, &rtn); |
314 | } | 314 | } |
315 | } else if (p->tgid == pid && p->signal) { | 315 | } else { |
316 | error = cpu_clock_sample_group(which_clock, | 316 | read_lock(&tasklist_lock); |
317 | p, &rtn); | 317 | if (p->tgid == pid && p->signal) { |
318 | error = | ||
319 | cpu_clock_sample_group(which_clock, | ||
320 | p, &rtn); | ||
321 | } | ||
322 | read_unlock(&tasklist_lock); | ||
318 | } | 323 | } |
319 | } | 324 | } |
320 | read_unlock(&tasklist_lock); | 325 | rcu_read_unlock(); |
321 | } | 326 | } |
322 | 327 | ||
323 | if (error) | 328 | if (error) |
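
The locking change narrows the critical section: an RCU read-side section is enough to keep the task_struct returned by find_task_by_pid() alive, and tasklist_lock is now taken only around the process-wide sample, where ->signal must stay stable. The lookup pattern on its own (use_task() is a stand-in):

    static void with_task(pid_t pid)
    {
            struct task_struct *p;

            rcu_read_lock();
            p = find_task_by_pid(pid);
            if (p)
                    use_task(p);    /* p is pinned for the RCU section */
            rcu_read_unlock();      /* p must not be touched past here */
    }
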
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5fe87de10ff0..44318ca71978 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int, | |||
145 | struct itimerspec *, struct itimerspec *); | 145 | struct itimerspec *, struct itimerspec *); |
146 | static int common_timer_del(struct k_itimer *timer); | 146 | static int common_timer_del(struct k_itimer *timer); |
147 | 147 | ||
148 | static int posix_timer_fn(struct hrtimer *data); | 148 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); |
149 | 149 | ||
150 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); | 150 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); |
151 | 151 | ||
@@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event); | |||
334 | 334 | ||
335 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. | 335 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. |
336 | */ | 336 | */ |
337 | static int posix_timer_fn(struct hrtimer *timer) | 337 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) |
338 | { | 338 | { |
339 | struct k_itimer *timr; | 339 | struct k_itimer *timr; |
340 | unsigned long flags; | 340 | unsigned long flags; |
341 | int si_private = 0; | 341 | int si_private = 0; |
342 | int ret = HRTIMER_NORESTART; | 342 | enum hrtimer_restart ret = HRTIMER_NORESTART; |
343 | 343 | ||
344 | timr = container_of(timer, struct k_itimer, it.real.timer); | 344 | timr = container_of(timer, struct k_itimer, it.real.timer); |
345 | spin_lock_irqsave(&timr->it_lock, flags); | 345 | spin_lock_irqsave(&timr->it_lock, flags); |
@@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer) | |||
356 | if (timr->it.real.interval.tv64 != 0) { | 356 | if (timr->it.real.interval.tv64 != 0) { |
357 | timr->it_overrun += | 357 | timr->it_overrun += |
358 | hrtimer_forward(timer, | 358 | hrtimer_forward(timer, |
359 | timer->base->softirq_time, | 359 | hrtimer_cb_get_time(timer), |
360 | timr->it.real.interval); | 360 | timr->it.real.interval); |
361 | ret = HRTIMER_RESTART; | 361 | ret = HRTIMER_RESTART; |
362 | ++timr->it_requeue_pending; | 362 | ++timr->it_requeue_pending; |
@@ -399,10 +399,9 @@ EXPORT_SYMBOL_GPL(register_posix_clock); | |||
399 | static struct k_itimer * alloc_posix_timer(void) | 399 | static struct k_itimer * alloc_posix_timer(void) |
400 | { | 400 | { |
401 | struct k_itimer *tmr; | 401 | struct k_itimer *tmr; |
402 | tmr = kmem_cache_alloc(posix_timers_cache, GFP_KERNEL); | 402 | tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); |
403 | if (!tmr) | 403 | if (!tmr) |
404 | return tmr; | 404 | return tmr; |
405 | memset(tmr, 0, sizeof (struct k_itimer)); | ||
406 | if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { | 405 | if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { |
407 | kmem_cache_free(posix_timers_cache, tmr); | 406 | kmem_cache_free(posix_timers_cache, tmr); |
408 | tmr = NULL; | 407 | tmr = NULL; |
@@ -723,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags, | |||
723 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) | 722 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) |
724 | return 0; | 723 | return 0; |
725 | 724 | ||
726 | mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; | 725 | mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; |
727 | hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); | 726 | hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); |
728 | timr->it.real.timer.function = posix_timer_fn; | 727 | timr->it.real.timer.function = posix_timer_fn; |
729 | 728 | ||
@@ -735,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags, | |||
735 | /* SIGEV_NONE timers are not queued ! See common_timer_get */ | 734 | /* SIGEV_NONE timers are not queued ! See common_timer_get */ |
736 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { | 735 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { |
737 | /* Setup correct expiry time for relative timers */ | 736 | /* Setup correct expiry time for relative timers */ |
738 | if (mode == HRTIMER_REL) | 737 | if (mode == HRTIMER_MODE_REL) |
739 | timer->expires = ktime_add(timer->expires, | 738 | timer->expires = ktime_add(timer->expires, |
740 | timer->base->get_time()); | 739 | timer->base->get_time()); |
741 | return 0; | 740 | return 0; |
@@ -951,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags, | |||
951 | struct timespec *tsave, struct timespec __user *rmtp) | 950 | struct timespec *tsave, struct timespec __user *rmtp) |
952 | { | 951 | { |
953 | return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? | 952 | return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? |
954 | HRTIMER_ABS : HRTIMER_REL, which_clock); | 953 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, |
954 | which_clock); | ||
955 | } | 955 | } |
956 | 956 | ||
957 | asmlinkage long | 957 | asmlinkage long |
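
Besides the kmem_cache_zalloc() cleanup and the HRTIMER_MODE_* renames, the substantive change is typing: hrtimer callbacks now return enum hrtimer_restart, and the restart decision reads the base time via hrtimer_cb_get_time() instead of poking at timer->base directly. A minimal periodic callback in the new style (do_tick() is a stand-in):

    static enum hrtimer_restart tick_fn(struct hrtimer *t)
    {
            do_tick();                              /* hypothetical payload */
            hrtimer_forward(t, hrtimer_cb_get_time(t),
                            ktime_set(1, 0));       /* re-arm +1 second */
            return HRTIMER_RESTART;                 /* NORESTART would end it */
    }
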
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ed296225dcd4..95f6657fff73 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -131,3 +131,29 @@ config SUSPEND_SMP | |||
131 | bool | 131 | bool |
132 | depends on HOTPLUG_CPU && X86 && PM | 132 | depends on HOTPLUG_CPU && X86 && PM |
133 | default y | 133 | default y |
134 | |||
135 | config APM_EMULATION | ||
136 | tristate "Advanced Power Management Emulation" | ||
137 | depends on PM && SYS_SUPPORTS_APM_EMULATION | ||
138 | help | ||
139 | APM is a BIOS specification for saving power using several different | ||
140 | techniques. This is mostly useful for battery powered laptops with | ||
141 | APM compliant BIOSes. If you say Y here, the system time will be | ||
142 | reset after a RESUME operation, the /proc/apm device will provide | ||
143 | battery status information, and user-space programs will receive | ||
144 | notification of APM "events" (e.g. battery status change). | ||
145 | |||
146 | In order to use APM, you will need supporting software. For location | ||
147 | and more information, read <file:Documentation/pm.txt> and the | ||
148 | Battery Powered Linux mini-HOWTO, available from | ||
149 | <http://www.tldp.org/docs.html#howto>. | ||
150 | |||
151 | This driver does not spin down disk drives (see the hdparm(8) | ||
152 | manpage ("man 8 hdparm") for that), and it doesn't turn off | ||
153 | VESA-compliant "green" monitors. | ||
154 | |||
155 | Generally, if you don't have a battery in your machine, there isn't | ||
156 | much point in using this driver and you should say N. If you get | ||
157 | random kernel OOPSes or reboots that don't seem to be related to | ||
158 | anything, try disabling/enabling this option (or disabling/enabling | ||
159 | APM in your BIOS). | ||
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 88fc5d7ac737..406b20adb27a 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -87,52 +87,24 @@ static inline void platform_finish(void) | |||
87 | } | 87 | } |
88 | } | 88 | } |
89 | 89 | ||
90 | static void unprepare_processes(void) | ||
91 | { | ||
92 | thaw_processes(); | ||
93 | pm_restore_console(); | ||
94 | } | ||
95 | |||
90 | static int prepare_processes(void) | 96 | static int prepare_processes(void) |
91 | { | 97 | { |
92 | int error = 0; | 98 | int error = 0; |
93 | 99 | ||
94 | pm_prepare_console(); | 100 | pm_prepare_console(); |
95 | |||
96 | error = disable_nonboot_cpus(); | ||
97 | if (error) | ||
98 | goto enable_cpus; | ||
99 | |||
100 | if (freeze_processes()) { | 101 | if (freeze_processes()) { |
101 | error = -EBUSY; | 102 | error = -EBUSY; |
102 | goto thaw; | 103 | unprepare_processes(); |
103 | } | 104 | } |
104 | |||
105 | if (pm_disk_mode == PM_DISK_TESTPROC) { | ||
106 | printk("swsusp debug: Waiting for 5 seconds.\n"); | ||
107 | mdelay(5000); | ||
108 | goto thaw; | ||
109 | } | ||
110 | |||
111 | error = platform_prepare(); | ||
112 | if (error) | ||
113 | goto thaw; | ||
114 | |||
115 | /* Free memory before shutting down devices. */ | ||
116 | if (!(error = swsusp_shrink_memory())) | ||
117 | return 0; | ||
118 | |||
119 | platform_finish(); | ||
120 | thaw: | ||
121 | thaw_processes(); | ||
122 | enable_cpus: | ||
123 | enable_nonboot_cpus(); | ||
124 | pm_restore_console(); | ||
125 | return error; | 105 | return error; |
126 | } | 106 | } |
127 | 107 | ||
128 | static void unprepare_processes(void) | ||
129 | { | ||
130 | platform_finish(); | ||
131 | thaw_processes(); | ||
132 | enable_nonboot_cpus(); | ||
133 | pm_restore_console(); | ||
134 | } | ||
135 | |||
136 | /** | 108 | /** |
137 | * pm_suspend_disk - The granpappy of hibernation power management. | 109 | * pm_suspend_disk - The granpappy of hibernation power management. |
138 | * | 110 | * |
@@ -150,29 +122,45 @@ int pm_suspend_disk(void) | |||
150 | if (error) | 122 | if (error) |
151 | return error; | 123 | return error; |
152 | 124 | ||
153 | if (pm_disk_mode == PM_DISK_TESTPROC) | 125 | if (pm_disk_mode == PM_DISK_TESTPROC) { |
154 | return 0; | 126 | printk("swsusp debug: Waiting for 5 seconds.\n"); |
127 | mdelay(5000); | ||
128 | goto Thaw; | ||
129 | } | ||
130 | /* Free memory before shutting down devices. */ | ||
131 | error = swsusp_shrink_memory(); | ||
132 | if (error) | ||
133 | goto Thaw; | ||
134 | |||
135 | error = platform_prepare(); | ||
136 | if (error) | ||
137 | goto Thaw; | ||
155 | 138 | ||
156 | suspend_console(); | 139 | suspend_console(); |
157 | error = device_suspend(PMSG_FREEZE); | 140 | error = device_suspend(PMSG_FREEZE); |
158 | if (error) { | 141 | if (error) { |
159 | resume_console(); | 142 | printk(KERN_ERR "PM: Some devices failed to suspend\n"); |
160 | printk("Some devices failed to suspend\n"); | 143 | goto Resume_devices; |
161 | goto Thaw; | ||
162 | } | 144 | } |
145 | error = disable_nonboot_cpus(); | ||
146 | if (error) | ||
147 | goto Enable_cpus; | ||
163 | 148 | ||
164 | if (pm_disk_mode == PM_DISK_TEST) { | 149 | if (pm_disk_mode == PM_DISK_TEST) { |
165 | printk("swsusp debug: Waiting for 5 seconds.\n"); | 150 | printk("swsusp debug: Waiting for 5 seconds.\n"); |
166 | mdelay(5000); | 151 | mdelay(5000); |
167 | goto Done; | 152 | goto Enable_cpus; |
168 | } | 153 | } |
169 | 154 | ||
170 | pr_debug("PM: snapshotting memory.\n"); | 155 | pr_debug("PM: snapshotting memory.\n"); |
171 | in_suspend = 1; | 156 | in_suspend = 1; |
172 | if ((error = swsusp_suspend())) | 157 | error = swsusp_suspend(); |
173 | goto Done; | 158 | if (error) |
159 | goto Enable_cpus; | ||
174 | 160 | ||
175 | if (in_suspend) { | 161 | if (in_suspend) { |
162 | enable_nonboot_cpus(); | ||
163 | platform_finish(); | ||
176 | device_resume(); | 164 | device_resume(); |
177 | resume_console(); | 165 | resume_console(); |
178 | pr_debug("PM: writing image.\n"); | 166 | pr_debug("PM: writing image.\n"); |
@@ -188,7 +176,10 @@ int pm_suspend_disk(void) | |||
188 | } | 176 | } |
189 | 177 | ||
190 | swsusp_free(); | 178 | swsusp_free(); |
191 | Done: | 179 | Enable_cpus: |
180 | enable_nonboot_cpus(); | ||
181 | Resume_devices: | ||
182 | platform_finish(); | ||
192 | device_resume(); | 183 | device_resume(); |
193 | resume_console(); | 184 | resume_console(); |
194 | Thaw: | 185 | Thaw: |
@@ -237,19 +228,28 @@ static int software_resume(void) | |||
237 | 228 | ||
238 | pr_debug("PM: Checking swsusp image.\n"); | 229 | pr_debug("PM: Checking swsusp image.\n"); |
239 | 230 | ||
240 | if ((error = swsusp_check())) | 231 | error = swsusp_check(); |
232 | if (error) | ||
241 | goto Done; | 233 | goto Done; |
242 | 234 | ||
243 | pr_debug("PM: Preparing processes for restore.\n"); | 235 | pr_debug("PM: Preparing processes for restore.\n"); |
244 | 236 | ||
245 | if ((error = prepare_processes())) { | 237 | error = prepare_processes(); |
238 | if (error) { | ||
246 | swsusp_close(); | 239 | swsusp_close(); |
247 | goto Done; | 240 | goto Done; |
248 | } | 241 | } |
249 | 242 | ||
243 | error = platform_prepare(); | ||
244 | if (error) { | ||
245 | swsusp_free(); | ||
246 | goto Thaw; | ||
247 | } | ||
248 | |||
250 | pr_debug("PM: Reading swsusp image.\n"); | 249 | pr_debug("PM: Reading swsusp image.\n"); |
251 | 250 | ||
252 | if ((error = swsusp_read())) { | 251 | error = swsusp_read(); |
252 | if (error) { | ||
253 | swsusp_free(); | 253 | swsusp_free(); |
254 | goto Thaw; | 254 | goto Thaw; |
255 | } | 255 | } |
@@ -257,21 +257,22 @@ static int software_resume(void) | |||
257 | pr_debug("PM: Preparing devices for restore.\n"); | 257 | pr_debug("PM: Preparing devices for restore.\n"); |
258 | 258 | ||
259 | suspend_console(); | 259 | suspend_console(); |
260 | if ((error = device_suspend(PMSG_PRETHAW))) { | 260 | error = device_suspend(PMSG_PRETHAW); |
261 | resume_console(); | 261 | if (error) |
262 | printk("Some devices failed to suspend\n"); | 262 | goto Free; |
263 | swsusp_free(); | ||
264 | goto Thaw; | ||
265 | } | ||
266 | 263 | ||
267 | mb(); | 264 | error = disable_nonboot_cpus(); |
265 | if (!error) | ||
266 | swsusp_resume(); | ||
268 | 267 | ||
269 | pr_debug("PM: Restoring saved image.\n"); | 268 | enable_nonboot_cpus(); |
270 | swsusp_resume(); | 269 | Free: |
271 | pr_debug("PM: Restore failed, recovering.n"); | 270 | swsusp_free(); |
271 | platform_finish(); | ||
272 | device_resume(); | 272 | device_resume(); |
273 | resume_console(); | 273 | resume_console(); |
274 | Thaw: | 274 | Thaw: |
275 | printk(KERN_ERR "PM: Restore failed, recovering.\n"); | ||
275 | unprepare_processes(); | 276 | unprepare_processes(); |
276 | Done: | 277 | Done: |
277 | /* For success case, the suspend path will release the lock */ | 278 | /* For success case, the suspend path will release the lock */ |
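
The rework moves disable_nonboot_cpus() to after device_suspend() in both the suspend and resume paths and straightens the error labels so each one undoes exactly the steps taken before the failure. Distilled, pm_suspend_disk() now follows this ladder (a sketch of the control flow above, not new code):

    error = device_suspend(PMSG_FREEZE);
    if (error)
            goto Resume_devices;
    error = disable_nonboot_cpus();
    if (error)
            goto Enable_cpus;
    /* ... take the snapshot, write the image ... */
    Enable_cpus:
            enable_nonboot_cpus();
    Resume_devices:
            platform_finish();
            device_resume();
            resume_console();
    Thaw:
            unprepare_processes();
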
diff --git a/kernel/power/main.c b/kernel/power/main.c index ff3a6182f5f0..a064dfd8877a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/cpu.h> | 20 | #include <linux/cpu.h> |
21 | #include <linux/resume-trace.h> | 21 | #include <linux/resume-trace.h> |
22 | #include <linux/freezer.h> | 22 | #include <linux/freezer.h> |
23 | #include <linux/vmstat.h> | ||
23 | 24 | ||
24 | #include "power.h" | 25 | #include "power.h" |
25 | 26 | ||
@@ -43,6 +44,11 @@ void pm_set_ops(struct pm_ops * ops) | |||
43 | mutex_unlock(&pm_mutex); | 44 | mutex_unlock(&pm_mutex); |
44 | } | 45 | } |
45 | 46 | ||
47 | static inline void pm_finish(suspend_state_t state) | ||
48 | { | ||
49 | if (pm_ops->finish) | ||
50 | pm_ops->finish(state); | ||
51 | } | ||
46 | 52 | ||
47 | /** | 53 | /** |
48 | * suspend_prepare - Do prep work before entering low-power state. | 54 | * suspend_prepare - Do prep work before entering low-power state. |
@@ -63,16 +69,13 @@ static int suspend_prepare(suspend_state_t state) | |||
63 | 69 | ||
64 | pm_prepare_console(); | 70 | pm_prepare_console(); |
65 | 71 | ||
66 | error = disable_nonboot_cpus(); | ||
67 | if (error) | ||
68 | goto Enable_cpu; | ||
69 | |||
70 | if (freeze_processes()) { | 72 | if (freeze_processes()) { |
71 | error = -EAGAIN; | 73 | error = -EAGAIN; |
72 | goto Thaw; | 74 | goto Thaw; |
73 | } | 75 | } |
74 | 76 | ||
75 | if ((free_pages = nr_free_pages()) < FREE_PAGE_NUMBER) { | 77 | if ((free_pages = global_page_state(NR_FREE_PAGES)) |
78 | < FREE_PAGE_NUMBER) { | ||
76 | pr_debug("PM: free some memory\n"); | 79 | pr_debug("PM: free some memory\n"); |
77 | shrink_all_memory(FREE_PAGE_NUMBER - free_pages); | 80 | shrink_all_memory(FREE_PAGE_NUMBER - free_pages); |
78 | if (nr_free_pages() < FREE_PAGE_NUMBER) { | 81 | if (nr_free_pages() < FREE_PAGE_NUMBER) { |
@@ -88,18 +91,22 @@ static int suspend_prepare(suspend_state_t state) | |||
88 | } | 91 | } |
89 | 92 | ||
90 | suspend_console(); | 93 | suspend_console(); |
91 | if ((error = device_suspend(PMSG_SUSPEND))) { | 94 | error = device_suspend(PMSG_SUSPEND); |
95 | if (error) { | ||
92 | printk(KERN_ERR "Some devices failed to suspend\n"); | 96 | printk(KERN_ERR "Some devices failed to suspend\n"); |
93 | goto Finish; | 97 | goto Resume_devices; |
94 | } | 98 | } |
95 | return 0; | 99 | error = disable_nonboot_cpus(); |
96 | Finish: | 100 | if (!error) |
97 | if (pm_ops->finish) | 101 | return 0; |
98 | pm_ops->finish(state); | 102 | |
103 | enable_nonboot_cpus(); | ||
104 | Resume_devices: | ||
105 | pm_finish(state); | ||
106 | device_resume(); | ||
107 | resume_console(); | ||
99 | Thaw: | 108 | Thaw: |
100 | thaw_processes(); | 109 | thaw_processes(); |
101 | Enable_cpu: | ||
102 | enable_nonboot_cpus(); | ||
103 | pm_restore_console(); | 110 | pm_restore_console(); |
104 | return error; | 111 | return error; |
105 | } | 112 | } |
@@ -134,12 +141,11 @@ int suspend_enter(suspend_state_t state) | |||
134 | 141 | ||
135 | static void suspend_finish(suspend_state_t state) | 142 | static void suspend_finish(suspend_state_t state) |
136 | { | 143 | { |
144 | enable_nonboot_cpus(); | ||
145 | pm_finish(state); | ||
137 | device_resume(); | 146 | device_resume(); |
138 | resume_console(); | 147 | resume_console(); |
139 | thaw_processes(); | 148 | thaw_processes(); |
140 | enable_nonboot_cpus(); | ||
141 | if (pm_ops && pm_ops->finish) | ||
142 | pm_ops->finish(state); | ||
143 | pm_restore_console(); | 149 | pm_restore_console(); |
144 | } | 150 | } |
145 | 151 | ||
@@ -161,7 +167,10 @@ static inline int valid_state(suspend_state_t state) | |||
161 | if (state == PM_SUSPEND_DISK) | 167 | if (state == PM_SUSPEND_DISK) |
162 | return 1; | 168 | return 1; |
163 | 169 | ||
164 | if (pm_ops && pm_ops->valid && !pm_ops->valid(state)) | 170 | /* all other states need lowlevel support and need to be |
171 | * valid to the lowlevel implementation, no valid callback | ||
172 | * implies that all are valid. */ | ||
173 | if (!pm_ops || (pm_ops->valid && !pm_ops->valid(state))) | ||
165 | return 0; | 174 | return 0; |
166 | return 1; | 175 | return 1; |
167 | } | 176 | } |
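
The rewritten valid_state() check encodes two rules: no pm_ops at all means no suspend state is valid, while pm_ops without a ->valid hook means every state is accepted. A small sketch of the same predicate (hypothetical names, ignoring the always-valid PM_SUSPEND_DISK case):

#include <assert.h>
#include <stddef.h>

struct ops {
	int (*valid)(int state);	/* NULL means "accept everything" */
};

static int valid_state(const struct ops *ops, int state)
{
	if (!ops)			/* no low-level support: reject all */
		return 0;
	if (ops->valid && !ops->valid(state))
		return 0;		/* hook exists and rejects this state */
	return 1;
}

static int only_even(int state) { return (state & 1) == 0; }

int main(void)
{
	struct ops with_hook = { .valid = only_even };
	struct ops no_hook = { .valid = NULL };

	assert(!valid_state(NULL, 2));		/* no ops: invalid */
	assert(valid_state(&no_hook, 3));	/* no hook: all valid */
	assert(valid_state(&with_hook, 2));	/* hook accepts */
	assert(!valid_state(&with_hook, 3));	/* hook rejects */
	return 0;
}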
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c024606221c4..fc53ad068128 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -591,7 +591,7 @@ static unsigned int count_free_highmem_pages(void) | |||
591 | 591 | ||
592 | for_each_zone(zone) | 592 | for_each_zone(zone) |
593 | if (populated_zone(zone) && is_highmem(zone)) | 593 | if (populated_zone(zone) && is_highmem(zone)) |
594 | cnt += zone->free_pages; | 594 | cnt += zone_page_state(zone, NR_FREE_PAGES); |
595 | 595 | ||
596 | return cnt; | 596 | return cnt; |
597 | } | 597 | } |
@@ -869,7 +869,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) | |||
869 | for_each_zone(zone) { | 869 | for_each_zone(zone) { |
870 | meta += snapshot_additional_pages(zone); | 870 | meta += snapshot_additional_pages(zone); |
871 | if (!is_highmem(zone)) | 871 | if (!is_highmem(zone)) |
872 | free += zone->free_pages; | 872 | free += zone_page_state(zone, NR_FREE_PAGES); |
873 | } | 873 | } |
874 | 874 | ||
875 | nr_pages += count_pages_for_highmem(nr_highmem); | 875 | nr_pages += count_pages_for_highmem(nr_highmem); |
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 31aa0390c777..7fb834397a0d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -230,9 +230,10 @@ int swsusp_shrink_memory(void) | |||
230 | for_each_zone (zone) | 230 | for_each_zone (zone) |
231 | if (populated_zone(zone)) { | 231 | if (populated_zone(zone)) { |
232 | if (is_highmem(zone)) { | 232 | if (is_highmem(zone)) { |
233 | highmem_size -= zone->free_pages; | 233 | highmem_size -= |
234 | zone_page_state(zone, NR_FREE_PAGES); | ||
234 | } else { | 235 | } else { |
235 | tmp -= zone->free_pages; | 236 | tmp -= zone_page_state(zone, NR_FREE_PAGES); |
236 | tmp += zone->lowmem_reserve[ZONE_NORMAL]; | 237 | tmp += zone->lowmem_reserve[ZONE_NORMAL]; |
237 | tmp += snapshot_additional_pages(zone); | 238 | tmp += snapshot_additional_pages(zone); |
238 | } | 239 | } |
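
The snapshot.c and swsusp.c hunks above replace direct reads of zone->free_pages with the zone_page_state() accessor from the vmstat rework, so all counter bookkeeping stays behind one interface. A userspace sketch of the accessor pattern, with made-up zone data:

#include <stdio.h>

enum zone_stat_item { NR_FREE_PAGES, NR_ZONE_STATS };

struct zone {
	const char *name;
	long vm_stat[NR_ZONE_STATS];	/* per-zone counters */
};

/* One accessor instead of ad-hoc field reads at every call site. */
static long zone_page_state(const struct zone *z, enum zone_stat_item item)
{
	long v = z->vm_stat[item];

	return v < 0 ? 0 : v;	/* guard against transiently negative counters */
}

int main(void)
{
	struct zone zones[] = {
		{ "DMA",    { [NR_FREE_PAGES] = 1024 } },
		{ "Normal", { [NR_FREE_PAGES] = 65536 } },
	};
	long free = 0;
	unsigned int i;

	for (i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
		free += zone_page_state(&zones[i], NR_FREE_PAGES);
	printf("free pages: %ld\n", free);
	return 0;
}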
diff --git a/kernel/power/user.c b/kernel/power/user.c index f7b7a785a5c6..dd09efe7df54 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -37,6 +37,7 @@ static struct snapshot_data { | |||
37 | int mode; | 37 | int mode; |
38 | char frozen; | 38 | char frozen; |
39 | char ready; | 39 | char ready; |
40 | char platform_suspend; | ||
40 | } snapshot_state; | 41 | } snapshot_state; |
41 | 42 | ||
42 | static atomic_t device_available = ATOMIC_INIT(1); | 43 | static atomic_t device_available = ATOMIC_INIT(1); |
@@ -66,6 +67,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
66 | data->bitmap = NULL; | 67 | data->bitmap = NULL; |
67 | data->frozen = 0; | 68 | data->frozen = 0; |
68 | data->ready = 0; | 69 | data->ready = 0; |
70 | data->platform_suspend = 0; | ||
69 | 71 | ||
70 | return 0; | 72 | return 0; |
71 | } | 73 | } |
@@ -122,6 +124,92 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, | |||
122 | return res; | 124 | return res; |
123 | } | 125 | } |
124 | 126 | ||
127 | static inline int platform_prepare(void) | ||
128 | { | ||
129 | int error = 0; | ||
130 | |||
131 | if (pm_ops && pm_ops->prepare) | ||
132 | error = pm_ops->prepare(PM_SUSPEND_DISK); | ||
133 | |||
134 | return error; | ||
135 | } | ||
136 | |||
137 | static inline void platform_finish(void) | ||
138 | { | ||
139 | if (pm_ops && pm_ops->finish) | ||
140 | pm_ops->finish(PM_SUSPEND_DISK); | ||
141 | } | ||
142 | |||
143 | static inline int snapshot_suspend(int platform_suspend) | ||
144 | { | ||
145 | int error; | ||
146 | |||
147 | mutex_lock(&pm_mutex); | ||
148 | /* Free memory before shutting down devices. */ | ||
149 | error = swsusp_shrink_memory(); | ||
150 | if (error) | ||
151 | goto Finish; | ||
152 | |||
153 | if (platform_suspend) { | ||
154 | error = platform_prepare(); | ||
155 | if (error) | ||
156 | goto Finish; | ||
157 | } | ||
158 | suspend_console(); | ||
159 | error = device_suspend(PMSG_FREEZE); | ||
160 | if (error) | ||
161 | goto Resume_devices; | ||
162 | |||
163 | error = disable_nonboot_cpus(); | ||
164 | if (!error) { | ||
165 | in_suspend = 1; | ||
166 | error = swsusp_suspend(); | ||
167 | } | ||
168 | enable_nonboot_cpus(); | ||
169 | Resume_devices: | ||
170 | if (platform_suspend) | ||
171 | platform_finish(); | ||
172 | |||
173 | device_resume(); | ||
174 | resume_console(); | ||
175 | Finish: | ||
176 | mutex_unlock(&pm_mutex); | ||
177 | return error; | ||
178 | } | ||
179 | |||
180 | static inline int snapshot_restore(int platform_suspend) | ||
181 | { | ||
182 | int error; | ||
183 | |||
184 | mutex_lock(&pm_mutex); | ||
185 | pm_prepare_console(); | ||
186 | if (platform_suspend) { | ||
187 | error = platform_prepare(); | ||
188 | if (error) | ||
189 | goto Finish; | ||
190 | } | ||
191 | suspend_console(); | ||
192 | error = device_suspend(PMSG_PRETHAW); | ||
193 | if (error) | ||
194 | goto Resume_devices; | ||
195 | |||
196 | error = disable_nonboot_cpus(); | ||
197 | if (!error) | ||
198 | error = swsusp_resume(); | ||
199 | |||
200 | enable_nonboot_cpus(); | ||
201 | Resume_devices: | ||
202 | if (platform_suspend) | ||
203 | platform_finish(); | ||
204 | |||
205 | device_resume(); | ||
206 | resume_console(); | ||
207 | Finish: | ||
208 | pm_restore_console(); | ||
209 | mutex_unlock(&pm_mutex); | ||
210 | return error; | ||
211 | } | ||
212 | |||
125 | static int snapshot_ioctl(struct inode *inode, struct file *filp, | 213 | static int snapshot_ioctl(struct inode *inode, struct file *filp, |
126 | unsigned int cmd, unsigned long arg) | 214 | unsigned int cmd, unsigned long arg) |
127 | { | 215 | { |
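
The snapshot_suspend()/snapshot_restore() helpers added above route all optional platform callbacks through platform_prepare()/platform_finish(), so a missing pm_ops or a missing hook degrades to a no-op instead of an inline NULL check at every call site. A compact sketch of that guarded-hook idiom (illustrative names):

#include <stdio.h>

struct pm_ops {
	int  (*prepare)(int state);
	void (*finish)(int state);
};

static struct pm_ops *pm_ops;	/* may legitimately be NULL */

static int platform_prepare(int state)
{
	if (pm_ops && pm_ops->prepare)
		return pm_ops->prepare(state);
	return 0;	/* no ops or no hook: nothing to do */
}

static void platform_finish(int state)
{
	if (pm_ops && pm_ops->finish)
		pm_ops->finish(state);
}

static int demo_prepare(int state) { printf("prepare %d\n", state); return 0; }
static void demo_finish(int state) { printf("finish %d\n", state); }

static struct pm_ops demo = { demo_prepare, demo_finish };

int main(void)
{
	platform_prepare(1);	/* pm_ops == NULL: silently succeeds */

	pm_ops = &demo;
	if (!platform_prepare(1))
		platform_finish(1);	/* finish pairs with a successful prepare */
	return 0;
}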
@@ -145,14 +233,9 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
145 | if (data->frozen) | 233 | if (data->frozen) |
146 | break; | 234 | break; |
147 | mutex_lock(&pm_mutex); | 235 | mutex_lock(&pm_mutex); |
148 | error = disable_nonboot_cpus(); | 236 | if (freeze_processes()) { |
149 | if (!error) { | 237 | thaw_processes(); |
150 | error = freeze_processes(); | 238 | error = -EBUSY; |
151 | if (error) { | ||
152 | thaw_processes(); | ||
153 | enable_nonboot_cpus(); | ||
154 | error = -EBUSY; | ||
155 | } | ||
156 | } | 239 | } |
157 | mutex_unlock(&pm_mutex); | 240 | mutex_unlock(&pm_mutex); |
158 | if (!error) | 241 | if (!error) |
@@ -164,7 +247,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
164 | break; | 247 | break; |
165 | mutex_lock(&pm_mutex); | 248 | mutex_lock(&pm_mutex); |
166 | thaw_processes(); | 249 | thaw_processes(); |
167 | enable_nonboot_cpus(); | ||
168 | mutex_unlock(&pm_mutex); | 250 | mutex_unlock(&pm_mutex); |
169 | data->frozen = 0; | 251 | data->frozen = 0; |
170 | break; | 252 | break; |
@@ -174,20 +256,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
174 | error = -EPERM; | 256 | error = -EPERM; |
175 | break; | 257 | break; |
176 | } | 258 | } |
177 | mutex_lock(&pm_mutex); | 259 | error = snapshot_suspend(data->platform_suspend); |
178 | /* Free memory before shutting down devices. */ | ||
179 | error = swsusp_shrink_memory(); | ||
180 | if (!error) { | ||
181 | suspend_console(); | ||
182 | error = device_suspend(PMSG_FREEZE); | ||
183 | if (!error) { | ||
184 | in_suspend = 1; | ||
185 | error = swsusp_suspend(); | ||
186 | device_resume(); | ||
187 | } | ||
188 | resume_console(); | ||
189 | } | ||
190 | mutex_unlock(&pm_mutex); | ||
191 | if (!error) | 260 | if (!error) |
192 | error = put_user(in_suspend, (unsigned int __user *)arg); | 261 | error = put_user(in_suspend, (unsigned int __user *)arg); |
193 | if (!error) | 262 | if (!error) |
@@ -201,17 +270,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
201 | error = -EPERM; | 270 | error = -EPERM; |
202 | break; | 271 | break; |
203 | } | 272 | } |
204 | mutex_lock(&pm_mutex); | 273 | error = snapshot_restore(data->platform_suspend); |
205 | pm_prepare_console(); | ||
206 | suspend_console(); | ||
207 | error = device_suspend(PMSG_PRETHAW); | ||
208 | if (!error) { | ||
209 | error = swsusp_resume(); | ||
210 | device_resume(); | ||
211 | } | ||
212 | resume_console(); | ||
213 | pm_restore_console(); | ||
214 | mutex_unlock(&pm_mutex); | ||
215 | break; | 274 | break; |
216 | 275 | ||
217 | case SNAPSHOT_FREE: | 276 | case SNAPSHOT_FREE: |
@@ -282,6 +341,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
282 | break; | 341 | break; |
283 | 342 | ||
284 | case SNAPSHOT_S2RAM: | 343 | case SNAPSHOT_S2RAM: |
344 | if (!pm_ops) { | ||
345 | error = -ENOSYS; | ||
346 | break; | ||
347 | } | ||
348 | |||
285 | if (!data->frozen) { | 349 | if (!data->frozen) { |
286 | error = -EPERM; | 350 | error = -EPERM; |
287 | break; | 351 | break; |
@@ -319,28 +383,35 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
319 | break; | 383 | break; |
320 | 384 | ||
321 | case SNAPSHOT_PMOPS: | 385 | case SNAPSHOT_PMOPS: |
386 | error = -EINVAL; | ||
387 | |||
322 | switch (arg) { | 388 | switch (arg) { |
323 | 389 | ||
324 | case PMOPS_PREPARE: | 390 | case PMOPS_PREPARE: |
325 | if (pm_ops->prepare) { | 391 | if (pm_ops && pm_ops->enter) { |
326 | error = pm_ops->prepare(PM_SUSPEND_DISK); | 392 | data->platform_suspend = 1; |
393 | error = 0; | ||
394 | } else { | ||
395 | error = -ENOSYS; | ||
327 | } | 396 | } |
328 | break; | 397 | break; |
329 | 398 | ||
330 | case PMOPS_ENTER: | 399 | case PMOPS_ENTER: |
331 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | 400 | if (data->platform_suspend) { |
332 | error = pm_ops->enter(PM_SUSPEND_DISK); | 401 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); |
402 | error = pm_ops->enter(PM_SUSPEND_DISK); | ||
403 | error = 0; | ||
404 | } | ||
333 | break; | 405 | break; |
334 | 406 | ||
335 | case PMOPS_FINISH: | 407 | case PMOPS_FINISH: |
336 | if (pm_ops && pm_ops->finish) { | 408 | if (data->platform_suspend) |
337 | pm_ops->finish(PM_SUSPEND_DISK); | 409 | error = 0; |
338 | } | 410 | |
339 | break; | 411 | break; |
340 | 412 | ||
341 | default: | 413 | default: |
342 | printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); | 414 | printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); |
343 | error = -EINVAL; | ||
344 | 415 | ||
345 | } | 416 | } |
346 | break; | 417 | break; |
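
The SNAPSHOT_PMOPS rework turns the three sub-commands into a small state machine: PMOPS_PREPARE verifies pm_ops->enter exists and merely records that platform methods are wanted, PMOPS_ENTER refuses to run unless that flag was set, and PMOPS_FINISH just acknowledges it. A sketch of the protocol detached from the ioctl plumbing (the enum values here are illustrative, not the ABI numbers):

#include <assert.h>
#include <errno.h>

enum { PMOPS_PREPARE = 1, PMOPS_ENTER, PMOPS_FINISH };

struct session {
	int platform_suspend;	/* set by PREPARE, required by ENTER/FINISH */
};

static int have_platform_enter = 1;	/* stand-in for pm_ops && pm_ops->enter */

static int pmops(struct session *s, int op)
{
	switch (op) {
	case PMOPS_PREPARE:
		if (!have_platform_enter)
			return -ENOSYS;
		s->platform_suspend = 1;
		return 0;
	case PMOPS_ENTER:
		if (!s->platform_suspend)
			return -EINVAL;	/* PREPARE must come first */
		/* ... shutdown preparation and pm_ops->enter() go here ... */
		return 0;
	case PMOPS_FINISH:
		return s->platform_suspend ? 0 : -EINVAL;
	default:
		return -EINVAL;
	}
}

int main(void)
{
	struct session s = { 0 };

	assert(pmops(&s, PMOPS_ENTER) == -EINVAL);	/* out of order */
	assert(pmops(&s, PMOPS_PREPARE) == 0);
	assert(pmops(&s, PMOPS_ENTER) == 0);
	assert(pmops(&s, PMOPS_FINISH) == 0);
	return 0;
}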
diff --git a/kernel/printk.c b/kernel/printk.c index c770e1a4e882..4b47e59248df 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -54,7 +54,7 @@ int console_printk[4] = { | |||
54 | }; | 54 | }; |
55 | 55 | ||
56 | /* | 56 | /* |
57 | * Low lever drivers may need that to know if they can schedule in | 57 | * Low level drivers may need that to know if they can schedule in |
58 | * their unblank() callback or not. So let's export it. | 58 | * their unblank() callback or not. So let's export it. |
59 | */ | 59 | */ |
60 | int oops_in_progress; | 60 | int oops_in_progress; |
@@ -483,7 +483,7 @@ static int have_callable_console(void) | |||
483 | * printk - print a kernel message | 483 | * printk - print a kernel message |
484 | * @fmt: format string | 484 | * @fmt: format string |
485 | * | 485 | * |
486 | * This is printk. It can be called from any context. We want it to work. | 486 | * This is printk(). It can be called from any context. We want it to work. |
487 | * | 487 | * |
488 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | 488 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and |
489 | * call the console drivers. If we fail to get the semaphore we place the output | 489 | * call the console drivers. If we fail to get the semaphore we place the output |
@@ -529,7 +529,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
529 | zap_locks(); | 529 | zap_locks(); |
530 | 530 | ||
531 | /* This stops the holder of console_sem just where we want him */ | 531 | /* This stops the holder of console_sem just where we want him */ |
532 | local_irq_save(flags); | 532 | raw_local_irq_save(flags); |
533 | lockdep_off(); | 533 | lockdep_off(); |
534 | spin_lock(&logbuf_lock); | 534 | spin_lock(&logbuf_lock); |
535 | printk_cpu = smp_processor_id(); | 535 | printk_cpu = smp_processor_id(); |
@@ -618,7 +618,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
618 | up(&console_sem); | 618 | up(&console_sem); |
619 | } | 619 | } |
620 | lockdep_on(); | 620 | lockdep_on(); |
621 | local_irq_restore(flags); | 621 | raw_local_irq_restore(flags); |
622 | } else { | 622 | } else { |
623 | /* | 623 | /* |
624 | * Someone else owns the drivers. We drop the spinlock, which | 624 | * Someone else owns the drivers. We drop the spinlock, which |
@@ -628,7 +628,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
628 | printk_cpu = UINT_MAX; | 628 | printk_cpu = UINT_MAX; |
629 | spin_unlock(&logbuf_lock); | 629 | spin_unlock(&logbuf_lock); |
630 | lockdep_on(); | 630 | lockdep_on(); |
631 | local_irq_restore(flags); | 631 | raw_local_irq_restore(flags); |
632 | } | 632 | } |
633 | 633 | ||
634 | preempt_enable(); | 634 | preempt_enable(); |
@@ -783,6 +783,12 @@ int is_console_locked(void) | |||
783 | return console_locked; | 783 | return console_locked; |
784 | } | 784 | } |
785 | 785 | ||
786 | void wake_up_klogd(void) | ||
787 | { | ||
788 | if (!oops_in_progress && waitqueue_active(&log_wait)) | ||
789 | wake_up_interruptible(&log_wait); | ||
790 | } | ||
791 | |||
786 | /** | 792 | /** |
787 | * release_console_sem - unlock the console system | 793 | * release_console_sem - unlock the console system |
788 | * | 794 | * |
@@ -825,8 +831,8 @@ void release_console_sem(void) | |||
825 | console_locked = 0; | 831 | console_locked = 0; |
826 | up(&console_sem); | 832 | up(&console_sem); |
827 | spin_unlock_irqrestore(&logbuf_lock, flags); | 833 | spin_unlock_irqrestore(&logbuf_lock, flags); |
828 | if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) | 834 | if (wake_klogd) |
829 | wake_up_interruptible(&log_wait); | 835 | wake_up_klogd(); |
830 | } | 836 | } |
831 | EXPORT_SYMBOL(release_console_sem); | 837 | EXPORT_SYMBOL(release_console_sem); |
832 | 838 | ||
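
wake_up_klogd() factors out the "only wake the log reader if someone is actually waiting" test so vprintk() and release_console_sem() share one condition. A rough userspace analogue using pthreads (simplified: the kernel's waitqueue_active() check is lockless, while this sketch holds the mutex; build with -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int nr_waiters;
static int have_data;

static void *logger(void *arg)
{
	pthread_mutex_lock(&lock);
	nr_waiters++;
	while (!have_data)
		pthread_cond_wait(&cond, &lock);
	nr_waiters--;
	pthread_mutex_unlock(&lock);
	puts("logger woke");
	return NULL;
}

/* Analogue of wake_up_klogd(): skip the signal when nobody waits. */
static void wake_logger(void)
{
	pthread_mutex_lock(&lock);
	have_data = 1;
	if (nr_waiters > 0)	/* like waitqueue_active(&log_wait) */
		pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;
	struct timespec ts = { 0, 50 * 1000 * 1000 };

	pthread_create(&t, NULL, logger, NULL);
	nanosleep(&ts, NULL);	/* crude: let the logger start waiting */
	wake_logger();
	pthread_join(&t, NULL);
	return 0;
}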
diff --git a/kernel/profile.c b/kernel/profile.c index d6579d511069..9bfadb248dd8 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -449,7 +449,6 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) | |||
449 | /* create /proc/irq/prof_cpu_mask */ | 449 | /* create /proc/irq/prof_cpu_mask */ |
450 | if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) | 450 | if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) |
451 | return; | 451 | return; |
452 | entry->nlink = 1; | ||
453 | entry->data = (void *)&prof_cpu_mask; | 452 | entry->data = (void *)&prof_cpu_mask; |
454 | entry->read_proc = prof_cpu_mask_read_proc; | 453 | entry->read_proc = prof_cpu_mask_read_proc; |
455 | entry->write_proc = prof_cpu_mask_write_proc; | 454 | entry->write_proc = prof_cpu_mask_write_proc; |
diff --git a/kernel/relay.c b/kernel/relay.c index 284e2e8b4eed..ef8a935710a2 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -7,6 +7,8 @@ | |||
7 | * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) | 7 | * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) |
8 | * | 8 | * |
9 | * Moved to kernel/relay.c by Paul Mundt, 2006. | 9 | * Moved to kernel/relay.c by Paul Mundt, 2006. |
10 | * November 2006 - CPU hotplug support by Mathieu Desnoyers | ||
11 | * (mathieu.desnoyers@polymtl.ca) | ||
10 | * | 12 | * |
11 | * This file is released under the GPL. | 13 | * This file is released under the GPL. |
12 | */ | 14 | */ |
@@ -18,6 +20,11 @@ | |||
18 | #include <linux/relay.h> | 20 | #include <linux/relay.h> |
19 | #include <linux/vmalloc.h> | 21 | #include <linux/vmalloc.h> |
20 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/cpu.h> | ||
24 | |||
25 | /* list of open channels, for cpu hotplug */ | ||
26 | static DEFINE_MUTEX(relay_channels_mutex); | ||
27 | static LIST_HEAD(relay_channels); | ||
21 | 28 | ||
22 | /* | 29 | /* |
23 | * close() vm_op implementation for relay file mapping. | 30 | * close() vm_op implementation for relay file mapping. |
@@ -187,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf) | |||
187 | __free_page(buf->page_array[i]); | 194 | __free_page(buf->page_array[i]); |
188 | kfree(buf->page_array); | 195 | kfree(buf->page_array); |
189 | } | 196 | } |
197 | chan->buf[buf->cpu] = NULL; | ||
190 | kfree(buf->padding); | 198 | kfree(buf->padding); |
191 | kfree(buf); | 199 | kfree(buf); |
192 | kref_put(&chan->kref, relay_destroy_channel); | 200 | kref_put(&chan->kref, relay_destroy_channel); |
@@ -320,7 +328,7 @@ static void wakeup_readers(struct work_struct *work) | |||
320 | * @buf: the channel buffer | 328 | * @buf: the channel buffer |
321 | * @init: 1 if this is a first-time initialization | 329 | * @init: 1 if this is a first-time initialization |
322 | * | 330 | * |
323 | * See relay_reset for description of effect. | 331 | * See relay_reset() for description of effect. |
324 | */ | 332 | */ |
325 | static void __relay_reset(struct rchan_buf *buf, unsigned int init) | 333 | static void __relay_reset(struct rchan_buf *buf, unsigned int init) |
326 | { | 334 | { |
@@ -356,57 +364,75 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) | |||
356 | * and restarting the channel in its initial state. The buffers | 364 | * and restarting the channel in its initial state. The buffers |
357 | * are not freed, so any mappings are still in effect. | 365 | * are not freed, so any mappings are still in effect. |
358 | * | 366 | * |
359 | * NOTE: Care should be taken that the channel isn't actually | 367 | * NOTE. Care should be taken that the channel isn't actually |
360 | * being used by anything when this call is made. | 368 | * being used by anything when this call is made. |
361 | */ | 369 | */ |
362 | void relay_reset(struct rchan *chan) | 370 | void relay_reset(struct rchan *chan) |
363 | { | 371 | { |
364 | unsigned int i; | 372 | unsigned int i; |
365 | struct rchan_buf *prev = NULL; | ||
366 | 373 | ||
367 | if (!chan) | 374 | if (!chan) |
368 | return; | 375 | return; |
369 | 376 | ||
370 | for (i = 0; i < NR_CPUS; i++) { | 377 | if (chan->is_global && chan->buf[0]) { |
371 | if (!chan->buf[i] || chan->buf[i] == prev) | 378 | __relay_reset(chan->buf[0], 0); |
372 | break; | 379 | return; |
373 | __relay_reset(chan->buf[i], 0); | ||
374 | prev = chan->buf[i]; | ||
375 | } | 380 | } |
381 | |||
382 | mutex_lock(&relay_channels_mutex); | ||
383 | for_each_online_cpu(i) | ||
384 | if (chan->buf[i]) | ||
385 | __relay_reset(chan->buf[i], 0); | ||
386 | mutex_unlock(&relay_channels_mutex); | ||
376 | } | 387 | } |
377 | EXPORT_SYMBOL_GPL(relay_reset); | 388 | EXPORT_SYMBOL_GPL(relay_reset); |
378 | 389 | ||
379 | /* | 390 | /* |
380 | * relay_open_buf - create a new relay channel buffer | 391 | * relay_open_buf - create a new relay channel buffer |
381 | * | 392 | * |
382 | * Internal - used by relay_open(). | 393 | * used by relay_open() and CPU hotplug. |
383 | */ | 394 | */ |
384 | static struct rchan_buf *relay_open_buf(struct rchan *chan, | 395 | static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) |
385 | const char *filename, | ||
386 | struct dentry *parent, | ||
387 | int *is_global) | ||
388 | { | 396 | { |
389 | struct rchan_buf *buf; | 397 | struct rchan_buf *buf = NULL; |
390 | struct dentry *dentry; | 398 | struct dentry *dentry; |
399 | char *tmpname; | ||
391 | 400 | ||
392 | if (*is_global) | 401 | if (chan->is_global) |
393 | return chan->buf[0]; | 402 | return chan->buf[0]; |
394 | 403 | ||
404 | tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); | ||
405 | if (!tmpname) | ||
406 | goto end; | ||
407 | snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); | ||
408 | |||
395 | buf = relay_create_buf(chan); | 409 | buf = relay_create_buf(chan); |
396 | if (!buf) | 410 | if (!buf) |
397 | return NULL; | 411 | goto free_name; |
412 | |||
413 | buf->cpu = cpu; | ||
414 | __relay_reset(buf, 1); | ||
398 | 415 | ||
399 | /* Create file in fs */ | 416 | /* Create file in fs */ |
400 | dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR, | 417 | dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, |
401 | buf, is_global); | 418 | buf, &chan->is_global); |
402 | if (!dentry) { | 419 | if (!dentry) |
403 | relay_destroy_buf(buf); | 420 | goto free_buf; |
404 | return NULL; | ||
405 | } | ||
406 | 421 | ||
407 | buf->dentry = dentry; | 422 | buf->dentry = dentry; |
408 | __relay_reset(buf, 1); | ||
409 | 423 | ||
424 | if(chan->is_global) { | ||
425 | chan->buf[0] = buf; | ||
426 | buf->cpu = 0; | ||
427 | } | ||
428 | |||
429 | goto free_name; | ||
430 | |||
431 | free_buf: | ||
432 | relay_destroy_buf(buf); | ||
433 | free_name: | ||
434 | kfree(tmpname); | ||
435 | end: | ||
410 | return buf; | 436 | return buf; |
411 | } | 437 | } |
412 | 438 | ||
@@ -448,31 +474,71 @@ static void setup_callbacks(struct rchan *chan, | |||
448 | } | 474 | } |
449 | 475 | ||
450 | /** | 476 | /** |
477 | * | ||
478 | * relay_hotcpu_callback - CPU hotplug callback | ||
479 | * @nb: notifier block | ||
480 | * @action: hotplug action to take | ||
481 | * @hcpu: CPU number | ||
482 | * | ||
483 | * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD) | ||
484 | */ | ||
485 | static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, | ||
486 | unsigned long action, | ||
487 | void *hcpu) | ||
488 | { | ||
489 | unsigned int hotcpu = (unsigned long)hcpu; | ||
490 | struct rchan *chan; | ||
491 | |||
492 | switch(action) { | ||
493 | case CPU_UP_PREPARE: | ||
494 | mutex_lock(&relay_channels_mutex); | ||
495 | list_for_each_entry(chan, &relay_channels, list) { | ||
496 | if (chan->buf[hotcpu]) | ||
497 | continue; | ||
498 | chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); | ||
499 | if(!chan->buf[hotcpu]) { | ||
500 | printk(KERN_ERR | ||
501 | "relay_hotcpu_callback: cpu %d buffer " | ||
502 | "creation failed\n", hotcpu); | ||
503 | mutex_unlock(&relay_channels_mutex); | ||
504 | return NOTIFY_BAD; | ||
505 | } | ||
506 | } | ||
507 | mutex_unlock(&relay_channels_mutex); | ||
508 | break; | ||
509 | case CPU_DEAD: | ||
510 | /* No need to flush the cpu : will be flushed upon | ||
511 | * final relay_flush() call. */ | ||
512 | break; | ||
513 | } | ||
514 | return NOTIFY_OK; | ||
515 | } | ||
516 | |||
517 | /** | ||
451 | * relay_open - create a new relay channel | 518 | * relay_open - create a new relay channel |
452 | * @base_filename: base name of files to create | 519 | * @base_filename: base name of files to create |
453 | * @parent: dentry of parent directory, %NULL for root directory | 520 | * @parent: dentry of parent directory, %NULL for root directory |
454 | * @subbuf_size: size of sub-buffers | 521 | * @subbuf_size: size of sub-buffers |
455 | * @n_subbufs: number of sub-buffers | 522 | * @n_subbufs: number of sub-buffers |
456 | * @cb: client callback functions | 523 | * @cb: client callback functions |
524 | * @private_data: user-defined data | ||
457 | * | 525 | * |
458 | * Returns channel pointer if successful, %NULL otherwise. | 526 | * Returns channel pointer if successful, %NULL otherwise. |
459 | * | 527 | * |
460 | * Creates a channel buffer for each cpu using the sizes and | 528 | * Creates a channel buffer for each cpu using the sizes and |
461 | * attributes specified. The created channel buffer files | 529 | * attributes specified. The created channel buffer files |
462 | * will be named base_filename0...base_filenameN-1. File | 530 | * will be named base_filename0...base_filenameN-1. File |
463 | * permissions will be S_IRUSR. | 531 | * permissions will be %S_IRUSR. |
464 | */ | 532 | */ |
465 | struct rchan *relay_open(const char *base_filename, | 533 | struct rchan *relay_open(const char *base_filename, |
466 | struct dentry *parent, | 534 | struct dentry *parent, |
467 | size_t subbuf_size, | 535 | size_t subbuf_size, |
468 | size_t n_subbufs, | 536 | size_t n_subbufs, |
469 | struct rchan_callbacks *cb) | 537 | struct rchan_callbacks *cb, |
538 | void *private_data) | ||
470 | { | 539 | { |
471 | unsigned int i; | 540 | unsigned int i; |
472 | struct rchan *chan; | 541 | struct rchan *chan; |
473 | char *tmpname; | ||
474 | int is_global = 0; | ||
475 | |||
476 | if (!base_filename) | 542 | if (!base_filename) |
477 | return NULL; | 543 | return NULL; |
478 | 544 | ||
@@ -487,38 +553,32 @@ struct rchan *relay_open(const char *base_filename, | |||
487 | chan->n_subbufs = n_subbufs; | 553 | chan->n_subbufs = n_subbufs; |
488 | chan->subbuf_size = subbuf_size; | 554 | chan->subbuf_size = subbuf_size; |
489 | chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); | 555 | chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); |
556 | chan->parent = parent; | ||
557 | chan->private_data = private_data; | ||
558 | strlcpy(chan->base_filename, base_filename, NAME_MAX); | ||
490 | setup_callbacks(chan, cb); | 559 | setup_callbacks(chan, cb); |
491 | kref_init(&chan->kref); | 560 | kref_init(&chan->kref); |
492 | 561 | ||
493 | tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL); | 562 | mutex_lock(&relay_channels_mutex); |
494 | if (!tmpname) | ||
495 | goto free_chan; | ||
496 | |||
497 | for_each_online_cpu(i) { | 563 | for_each_online_cpu(i) { |
498 | sprintf(tmpname, "%s%d", base_filename, i); | 564 | chan->buf[i] = relay_open_buf(chan, i); |
499 | chan->buf[i] = relay_open_buf(chan, tmpname, parent, | ||
500 | &is_global); | ||
501 | if (!chan->buf[i]) | 565 | if (!chan->buf[i]) |
502 | goto free_bufs; | 566 | goto free_bufs; |
503 | |||
504 | chan->buf[i]->cpu = i; | ||
505 | } | 567 | } |
568 | list_add(&chan->list, &relay_channels); | ||
569 | mutex_unlock(&relay_channels_mutex); | ||
506 | 570 | ||
507 | kfree(tmpname); | ||
508 | return chan; | 571 | return chan; |
509 | 572 | ||
510 | free_bufs: | 573 | free_bufs: |
511 | for (i = 0; i < NR_CPUS; i++) { | 574 | for_each_online_cpu(i) { |
512 | if (!chan->buf[i]) | 575 | if (!chan->buf[i]) |
513 | break; | 576 | break; |
514 | relay_close_buf(chan->buf[i]); | 577 | relay_close_buf(chan->buf[i]); |
515 | if (is_global) | ||
516 | break; | ||
517 | } | 578 | } |
518 | kfree(tmpname); | ||
519 | 579 | ||
520 | free_chan: | ||
521 | kref_put(&chan->kref, relay_destroy_channel); | 580 | kref_put(&chan->kref, relay_destroy_channel); |
581 | mutex_unlock(&relay_channels_mutex); | ||
522 | return NULL; | 582 | return NULL; |
523 | } | 583 | } |
524 | EXPORT_SYMBOL_GPL(relay_open); | 584 | EXPORT_SYMBOL_GPL(relay_open); |
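
With the new signature, a relay client passes the base filename and parent dentry once to relay_open() and can attach private data to the channel; per-cpu file naming now happens inside relay_open_buf(). A hedged sketch of a minimal client, assuming debugfs and this era's rchan_callbacks layout:

#include <linux/module.h>
#include <linux/relay.h>
#include <linux/debugfs.h>

static struct rchan *chan;

/* Back each per-cpu buffer with a debugfs file. */
static struct dentry *demo_create_buf_file(const char *filename,
					   struct dentry *parent, int mode,
					   struct rchan_buf *buf,
					   int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}

static int demo_remove_buf_file(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

static struct rchan_callbacks demo_cb = {
	.create_buf_file = demo_create_buf_file,
	.remove_buf_file = demo_remove_buf_file,
};

static int __init demo_init(void)
{
	/* 8 sub-buffers of 16 KiB each, no private data. */
	chan = relay_open("demo", NULL, 16 * 1024, 8, &demo_cb, NULL);
	return chan ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	relay_close(chan);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Opened this way, the channel also survives CPU hotplug: the new relay_hotcpu_callback() creates the missing per-cpu buffer when a CPU comes up.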
@@ -588,7 +648,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf); | |||
588 | * subbufs_consumed should be the number of sub-buffers newly consumed, | 648 | * subbufs_consumed should be the number of sub-buffers newly consumed, |
589 | * not the total consumed. | 649 | * not the total consumed. |
590 | * | 650 | * |
591 | * NOTE: Kernel clients don't need to call this function if the channel | 651 | * NOTE. Kernel clients don't need to call this function if the channel |
592 | * mode is 'overwrite'. | 652 | * mode is 'overwrite'. |
593 | */ | 653 | */ |
594 | void relay_subbufs_consumed(struct rchan *chan, | 654 | void relay_subbufs_consumed(struct rchan *chan, |
@@ -619,24 +679,26 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed); | |||
619 | void relay_close(struct rchan *chan) | 679 | void relay_close(struct rchan *chan) |
620 | { | 680 | { |
621 | unsigned int i; | 681 | unsigned int i; |
622 | struct rchan_buf *prev = NULL; | ||
623 | 682 | ||
624 | if (!chan) | 683 | if (!chan) |
625 | return; | 684 | return; |
626 | 685 | ||
627 | for (i = 0; i < NR_CPUS; i++) { | 686 | mutex_lock(&relay_channels_mutex); |
628 | if (!chan->buf[i] || chan->buf[i] == prev) | 687 | if (chan->is_global && chan->buf[0]) |
629 | break; | 688 | relay_close_buf(chan->buf[0]); |
630 | relay_close_buf(chan->buf[i]); | 689 | else |
631 | prev = chan->buf[i]; | 690 | for_each_possible_cpu(i) |
632 | } | 691 | if (chan->buf[i]) |
692 | relay_close_buf(chan->buf[i]); | ||
633 | 693 | ||
634 | if (chan->last_toobig) | 694 | if (chan->last_toobig) |
635 | printk(KERN_WARNING "relay: one or more items not logged " | 695 | printk(KERN_WARNING "relay: one or more items not logged " |
636 | "[item size (%Zd) > sub-buffer size (%Zd)]\n", | 696 | "[item size (%Zd) > sub-buffer size (%Zd)]\n", |
637 | chan->last_toobig, chan->subbuf_size); | 697 | chan->last_toobig, chan->subbuf_size); |
638 | 698 | ||
699 | list_del(&chan->list); | ||
639 | kref_put(&chan->kref, relay_destroy_channel); | 700 | kref_put(&chan->kref, relay_destroy_channel); |
701 | mutex_unlock(&relay_channels_mutex); | ||
640 | } | 702 | } |
641 | EXPORT_SYMBOL_GPL(relay_close); | 703 | EXPORT_SYMBOL_GPL(relay_close); |
642 | 704 | ||
@@ -649,17 +711,20 @@ EXPORT_SYMBOL_GPL(relay_close); | |||
649 | void relay_flush(struct rchan *chan) | 711 | void relay_flush(struct rchan *chan) |
650 | { | 712 | { |
651 | unsigned int i; | 713 | unsigned int i; |
652 | struct rchan_buf *prev = NULL; | ||
653 | 714 | ||
654 | if (!chan) | 715 | if (!chan) |
655 | return; | 716 | return; |
656 | 717 | ||
657 | for (i = 0; i < NR_CPUS; i++) { | 718 | if (chan->is_global && chan->buf[0]) { |
658 | if (!chan->buf[i] || chan->buf[i] == prev) | 719 | relay_switch_subbuf(chan->buf[0], 0); |
659 | break; | 720 | return; |
660 | relay_switch_subbuf(chan->buf[i], 0); | ||
661 | prev = chan->buf[i]; | ||
662 | } | 721 | } |
722 | |||
723 | mutex_lock(&relay_channels_mutex); | ||
724 | for_each_possible_cpu(i) | ||
725 | if (chan->buf[i]) | ||
726 | relay_switch_subbuf(chan->buf[i], 0); | ||
727 | mutex_unlock(&relay_channels_mutex); | ||
663 | } | 728 | } |
664 | EXPORT_SYMBOL_GPL(relay_flush); | 729 | EXPORT_SYMBOL_GPL(relay_flush); |
665 | 730 | ||
@@ -684,7 +749,7 @@ static int relay_file_open(struct inode *inode, struct file *filp) | |||
684 | * @filp: the file | 749 | * @filp: the file |
685 | * @vma: the vma describing what to map | 750 | * @vma: the vma describing what to map |
686 | * | 751 | * |
687 | * Calls upon relay_mmap_buf to map the file into user space. | 752 | * Calls upon relay_mmap_buf() to map the file into user space. |
688 | */ | 753 | */ |
689 | static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) | 754 | static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) |
690 | { | 755 | { |
@@ -826,7 +891,7 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, | |||
826 | * @read_pos: file read position | 891 | * @read_pos: file read position |
827 | * @buf: relay channel buffer | 892 | * @buf: relay channel buffer |
828 | * | 893 | * |
829 | * If the read_pos is in the middle of padding, return the | 894 | * If the @read_pos is in the middle of padding, return the |
830 | * position of the first actually available byte, otherwise | 895 | * position of the first actually available byte, otherwise |
831 | * return the original value. | 896 | * return the original value. |
832 | */ | 897 | */ |
@@ -1022,3 +1087,12 @@ const struct file_operations relay_file_operations = { | |||
1022 | .sendfile = relay_file_sendfile, | 1087 | .sendfile = relay_file_sendfile, |
1023 | }; | 1088 | }; |
1024 | EXPORT_SYMBOL_GPL(relay_file_operations); | 1089 | EXPORT_SYMBOL_GPL(relay_file_operations); |
1090 | |||
1091 | static __init int relay_init(void) | ||
1092 | { | ||
1093 | |||
1094 | hotcpu_notifier(relay_hotcpu_callback, 0); | ||
1095 | return 0; | ||
1096 | } | ||
1097 | |||
1098 | module_init(relay_init); | ||
diff --git a/kernel/resource.c b/kernel/resource.c index 7b9a497419d9..bdb55a33f969 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -8,7 +8,6 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/sched.h> | ||
12 | #include <linux/errno.h> | 11 | #include <linux/errno.h> |
13 | #include <linux/ioport.h> | 12 | #include <linux/ioport.h> |
14 | #include <linux/init.h> | 13 | #include <linux/init.h> |
@@ -17,6 +16,7 @@ | |||
17 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
18 | #include <linux/proc_fs.h> | 17 | #include <linux/proc_fs.h> |
19 | #include <linux/seq_file.h> | 18 | #include <linux/seq_file.h> |
19 | #include <linux/device.h> | ||
20 | #include <asm/io.h> | 20 | #include <asm/io.h> |
21 | 21 | ||
22 | 22 | ||
@@ -618,6 +618,67 @@ void __release_region(struct resource *parent, resource_size_t start, | |||
618 | EXPORT_SYMBOL(__release_region); | 618 | EXPORT_SYMBOL(__release_region); |
619 | 619 | ||
620 | /* | 620 | /* |
621 | * Managed region resource | ||
622 | */ | ||
623 | struct region_devres { | ||
624 | struct resource *parent; | ||
625 | resource_size_t start; | ||
626 | resource_size_t n; | ||
627 | }; | ||
628 | |||
629 | static void devm_region_release(struct device *dev, void *res) | ||
630 | { | ||
631 | struct region_devres *this = res; | ||
632 | |||
633 | __release_region(this->parent, this->start, this->n); | ||
634 | } | ||
635 | |||
636 | static int devm_region_match(struct device *dev, void *res, void *match_data) | ||
637 | { | ||
638 | struct region_devres *this = res, *match = match_data; | ||
639 | |||
640 | return this->parent == match->parent && | ||
641 | this->start == match->start && this->n == match->n; | ||
642 | } | ||
643 | |||
644 | struct resource * __devm_request_region(struct device *dev, | ||
645 | struct resource *parent, resource_size_t start, | ||
646 | resource_size_t n, const char *name) | ||
647 | { | ||
648 | struct region_devres *dr = NULL; | ||
649 | struct resource *res; | ||
650 | |||
651 | dr = devres_alloc(devm_region_release, sizeof(struct region_devres), | ||
652 | GFP_KERNEL); | ||
653 | if (!dr) | ||
654 | return NULL; | ||
655 | |||
656 | dr->parent = parent; | ||
657 | dr->start = start; | ||
658 | dr->n = n; | ||
659 | |||
660 | res = __request_region(parent, start, n, name); | ||
661 | if (res) | ||
662 | devres_add(dev, dr); | ||
663 | else | ||
664 | devres_free(dr); | ||
665 | |||
666 | return res; | ||
667 | } | ||
668 | EXPORT_SYMBOL(__devm_request_region); | ||
669 | |||
670 | void __devm_release_region(struct device *dev, struct resource *parent, | ||
671 | resource_size_t start, resource_size_t n) | ||
672 | { | ||
673 | struct region_devres match_data = { parent, start, n }; | ||
674 | |||
675 | __release_region(parent, start, n); | ||
676 | WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, | ||
677 | &match_data)); | ||
678 | } | ||
679 | EXPORT_SYMBOL(__devm_release_region); | ||
680 | |||
681 | /* | ||
621 | * Called from init/main.c to reserve IO ports. | 682 | * Called from init/main.c to reserve IO ports. |
622 | */ | 683 | */ |
623 | #define MAXRESERVE 4 | 684 | #define MAXRESERVE 4 |
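
__devm_request_region() ties an I/O region's lifetime to a device through devres: devm_region_release() runs automatically when the device is unbound, and __devm_release_region() both releases the region and destroys the matching devres entry. A hedged sketch of a driver relying on it (the demo_* names and port range are illustrative; the convenience wrappers around this export are assumed to live in ioport.h):

#include <linux/module.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>

#define DEMO_IO_BASE 0x300	/* illustrative I/O port range */
#define DEMO_IO_LEN  8

static int demo_probe(struct platform_device *pdev)
{
	/* Released automatically when the device is unbound. */
	if (!__devm_request_region(&pdev->dev, &ioport_resource,
				   DEMO_IO_BASE, DEMO_IO_LEN, "demo"))
		return -EBUSY;

	/* ... program the hardware; no explicit release path needed ... */
	return 0;
}

static struct platform_driver demo_driver = {
	.probe	= demo_probe,
	.driver	= { .name = "demo" },
};

static int __init demo_init(void)
{
	return platform_driver_register(&demo_driver);
}

static void __exit demo_exit(void)
{
	platform_driver_unregister(&demo_driver);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");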
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 4ab17da46fd8..180978cb2f75 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
625 | /* Setup the timer, when timeout != NULL */ | 625 | /* Setup the timer, when timeout != NULL */ |
626 | if (unlikely(timeout)) | 626 | if (unlikely(timeout)) |
627 | hrtimer_start(&timeout->timer, timeout->timer.expires, | 627 | hrtimer_start(&timeout->timer, timeout->timer.expires, |
628 | HRTIMER_ABS); | 628 | HRTIMER_MODE_ABS); |
629 | 629 | ||
630 | for (;;) { | 630 | for (;;) { |
631 | /* Try to acquire the lock: */ | 631 | /* Try to acquire the lock: */ |
diff --git a/kernel/sched.c b/kernel/sched.c index cca93cc0dd7d..0dc757246d89 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -57,6 +57,16 @@ | |||
57 | #include <asm/unistd.h> | 57 | #include <asm/unistd.h> |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * Scheduler clock - returns current time in nanosec units. | ||
61 | * This is default implementation. | ||
62 | * Architectures and sub-architectures can override this. | ||
63 | */ | ||
64 | unsigned long long __attribute__((weak)) sched_clock(void) | ||
65 | { | ||
66 | return (unsigned long long)jiffies * (1000000000 / HZ); | ||
67 | } | ||
68 | |||
69 | /* | ||
60 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 70 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
61 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 71 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
62 | * and back. | 72 | * and back. |
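
The weak sched_clock() default means an architecture overrides it simply by providing a strong definition of the same symbol; the jiffies-based fallback compiles in only when nothing better exists. A userspace demonstration of the linkage trick (GCC/Clang; demo_clock is a hypothetical name):

#include <stdio.h>

/* Weak default: used only if no strong definition is linked in. */
unsigned long long __attribute__((weak)) demo_clock(void)
{
	return 0;	/* coarse fallback, like the jiffies-based default */
}

int main(void)
{
	printf("clock = %llu\n", demo_clock());
	return 0;
}

/*
 * Linking another object that defines a strong demo_clock() silently
 * replaces the fallback above -- exactly how an architecture supplies
 * its own sched_clock() without touching the generic file.
 */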
@@ -1843,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
1843 | struct mm_struct *mm = next->mm; | 1853 | struct mm_struct *mm = next->mm; |
1844 | struct mm_struct *oldmm = prev->active_mm; | 1854 | struct mm_struct *oldmm = prev->active_mm; |
1845 | 1855 | ||
1856 | /* | ||
1857 | * For paravirt, this is coupled with an exit in switch_to to | ||
1858 | * combine the page table reload and the switch backend into | ||
1859 | * one hypercall. | ||
1860 | */ | ||
1861 | arch_enter_lazy_cpu_mode(); | ||
1862 | |||
1846 | if (!mm) { | 1863 | if (!mm) { |
1847 | next->active_mm = oldmm; | 1864 | next->active_mm = oldmm; |
1848 | atomic_inc(&oldmm->mm_count); | 1865 | atomic_inc(&oldmm->mm_count); |
@@ -2887,14 +2904,16 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
2887 | static void update_load(struct rq *this_rq) | 2904 | static void update_load(struct rq *this_rq) |
2888 | { | 2905 | { |
2889 | unsigned long this_load; | 2906 | unsigned long this_load; |
2890 | int i, scale; | 2907 | unsigned int i, scale; |
2891 | 2908 | ||
2892 | this_load = this_rq->raw_weighted_load; | 2909 | this_load = this_rq->raw_weighted_load; |
2893 | 2910 | ||
2894 | /* Update our load: */ | 2911 | /* Update our load: */ |
2895 | for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { | 2912 | for (i = 0, scale = 1; i < 3; i++, scale += scale) { |
2896 | unsigned long old_load, new_load; | 2913 | unsigned long old_load, new_load; |
2897 | 2914 | ||
2915 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
2916 | |||
2898 | old_load = this_rq->cpu_load[i]; | 2917 | old_load = this_rq->cpu_load[i]; |
2899 | new_load = this_load; | 2918 | new_load = this_load; |
2900 | /* | 2919 | /* |
@@ -2904,7 +2923,7 @@ static void update_load(struct rq *this_rq) | |||
2904 | */ | 2923 | */ |
2905 | if (new_load > old_load) | 2924 | if (new_load > old_load) |
2906 | new_load += scale-1; | 2925 | new_load += scale-1; |
2907 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; | 2926 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; |
2908 | } | 2927 | } |
2909 | } | 2928 | } |
2910 | 2929 | ||
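
The update_load() change swaps a division for a shift: rebuilding scale with "scale += scale" keeps the invariant scale == 1 << i, so ">> i" divides by scale exactly. A quick standalone check of the arithmetic:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long old_load = 1000, this_load = 1640;
	unsigned int i, scale;

	for (i = 0, scale = 1; i < 3; i++, scale += scale) {
		unsigned long new_load = this_load;

		assert(scale == 1UL << i);	/* invariant the patch relies on */
		if (new_load > old_load)
			new_load += scale - 1;	/* round up so rising load isn't truncated away */

		/* shift and division agree for power-of-two scales */
		assert((old_load * (scale - 1) + new_load) >> i ==
		       (old_load * (scale - 1) + new_load) / scale);

		printf("cpu_load[%u] = %lu\n", i,
		       (old_load * (scale - 1) + new_load) >> i);
	}
	return 0;
}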
@@ -4193,13 +4212,12 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
4193 | } | 4212 | } |
4194 | 4213 | ||
4195 | /** | 4214 | /** |
4196 | * sched_setscheduler - change the scheduling policy and/or RT priority of | 4215 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. |
4197 | * a thread. | ||
4198 | * @p: the task in question. | 4216 | * @p: the task in question. |
4199 | * @policy: new policy. | 4217 | * @policy: new policy. |
4200 | * @param: structure containing the new RT priority. | 4218 | * @param: structure containing the new RT priority. |
4201 | * | 4219 | * |
4202 | * NOTE: the task may be already dead | 4220 | * NOTE that the task may be already dead. |
4203 | */ | 4221 | */ |
4204 | int sched_setscheduler(struct task_struct *p, int policy, | 4222 | int sched_setscheduler(struct task_struct *p, int policy, |
4205 | struct sched_param *param) | 4223 | struct sched_param *param) |
@@ -4567,7 +4585,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
4567 | /** | 4585 | /** |
4568 | * sys_sched_yield - yield the current processor to other threads. | 4586 | * sys_sched_yield - yield the current processor to other threads. |
4569 | * | 4587 | * |
4570 | * this function yields the current CPU by moving the calling thread | 4588 | * This function yields the current CPU by moving the calling thread |
4571 | * to the expired array. If there are no other threads running on this | 4589 | * to the expired array. If there are no other threads running on this |
4572 | * CPU then this function will return. | 4590 | * CPU then this function will return. |
4573 | */ | 4591 | */ |
@@ -4694,7 +4712,7 @@ EXPORT_SYMBOL(cond_resched_softirq); | |||
4694 | /** | 4712 | /** |
4695 | * yield - yield the current processor to other threads. | 4713 | * yield - yield the current processor to other threads. |
4696 | * | 4714 | * |
4697 | * this is a shortcut for kernel-space yielding - it marks the | 4715 | * This is a shortcut for kernel-space yielding - it marks the |
4698 | * thread runnable and calls sys_sched_yield(). | 4716 | * thread runnable and calls sys_sched_yield(). |
4699 | */ | 4717 | */ |
4700 | void __sched yield(void) | 4718 | void __sched yield(void) |
diff --git a/kernel/signal.c b/kernel/signal.c index 8a04869402fb..3670225ecbc0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | |||
456 | int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | 456 | int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) |
457 | { | 457 | { |
458 | int signr = __dequeue_signal(&tsk->pending, mask, info); | 458 | int signr = __dequeue_signal(&tsk->pending, mask, info); |
459 | if (!signr) | 459 | if (!signr) { |
460 | signr = __dequeue_signal(&tsk->signal->shared_pending, | 460 | signr = __dequeue_signal(&tsk->signal->shared_pending, |
461 | mask, info); | 461 | mask, info); |
462 | /* | ||
463 | * itimer signal ? | ||
464 | * | ||
465 | * itimers are process shared and we restart periodic | ||
466 | * itimers in the signal delivery path to prevent DoS | ||
467 | * attacks in the high resolution timer case. This is | ||
468 | * compliant with the old way of self restarting | ||
469 | * itimers, as the SIGALRM is a legacy signal and only | ||
470 | * queued once. Changing the restart behaviour to | ||
471 | * restart the timer in the signal dequeue path is | ||
472 | * reducing the timer noise on heavy loaded !highres | ||
473 | * systems too. | ||
474 | */ | ||
475 | if (unlikely(signr == SIGALRM)) { | ||
476 | struct hrtimer *tmr = &tsk->signal->real_timer; | ||
477 | |||
478 | if (!hrtimer_is_queued(tmr) && | ||
479 | tsk->signal->it_real_incr.tv64 != 0) { | ||
480 | hrtimer_forward(tmr, tmr->base->get_time(), | ||
481 | tsk->signal->it_real_incr); | ||
482 | hrtimer_restart(tmr); | ||
483 | } | ||
484 | } | ||
485 | } | ||
462 | recalc_sigpending_tsk(tsk); | 486 | recalc_sigpending_tsk(tsk); |
463 | if (signr && unlikely(sig_kernel_stop(signr))) { | 487 | if (signr && unlikely(sig_kernel_stop(signr))) { |
464 | /* | 488 | /* |
465 | * Set a marker that we have dequeued a stop signal. Our | 489 | * Set a marker that we have dequeued a stop signal. Our |
466 | * caller might release the siglock and then the pending | 490 | * caller might release the siglock and then the pending |
467 | * stop signal it is about to process is no longer in the | 491 | * stop signal it is about to process is no longer in the |
468 | * pending bitmasks, but must still be cleared by a SIGCONT | 492 | * pending bitmasks, but must still be cleared by a SIGCONT |
469 | * (and overruled by a SIGKILL). So those cases clear this | 493 | * (and overruled by a SIGKILL). So those cases clear this |
470 | * shared flag after we've set it. Note that this flag may | 494 | * shared flag after we've set it. Note that this flag may |
471 | * remain set after the signal we return is ignored or | 495 | * remain set after the signal we return is ignored or |
472 | * handled. That doesn't matter because its only purpose | 496 | * handled. That doesn't matter because its only purpose |
473 | * is to alert stop-signal processing code when another | 497 | * is to alert stop-signal processing code when another |
474 | * processor has come along and cleared the flag. | 498 | * processor has come along and cleared the flag. |
475 | */ | 499 | */ |
476 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) | 500 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) |
477 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 501 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; |
478 | } | 502 | } |
479 | if ( signr && | 503 | if ( signr && |
480 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | 504 | ((info->si_code & __SI_MASK) == __SI_TIMER) && |
481 | info->si_sys_private){ | 505 | info->si_sys_private){ |
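
The itimer rearm added to dequeue_signal() leans on hrtimer_forward(), which pushes a timer's expiry forward in whole multiples of its period until it lies in the future, so a periodic itimer restarted from the delivery path does not drift. A userspace model of that computation (plain integers stand in for ktime values; the rounding matches the idea, not the exact kernel code):

#include <assert.h>

/* Advance *expires past now in multiples of interval; return overruns. */
static unsigned long forward(unsigned long long *expires,
			     unsigned long long now,
			     unsigned long long interval)
{
	unsigned long overruns;

	if (*expires > now)
		return 0;	/* already in the future: nothing to do */

	overruns = (now - *expires) / interval + 1;
	*expires += (unsigned long long)overruns * interval;
	return overruns;
}

int main(void)
{
	unsigned long long exp = 100;

	/* Due at 100, now 350, period 100: skip the missed firings, land on 400. */
	assert(forward(&exp, 350, 100) == 3);
	assert(exp == 400);
	return 0;
}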
@@ -1096,42 +1120,21 @@ int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) | |||
1096 | return retval; | 1120 | return retval; |
1097 | } | 1121 | } |
1098 | 1122 | ||
1099 | int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) | ||
1100 | { | ||
1101 | if (pgrp <= 0) | ||
1102 | return -EINVAL; | ||
1103 | |||
1104 | return __kill_pgrp_info(sig, info, find_pid(pgrp)); | ||
1105 | } | ||
1106 | |||
1107 | int | ||
1108 | kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) | ||
1109 | { | ||
1110 | int retval; | ||
1111 | |||
1112 | read_lock(&tasklist_lock); | ||
1113 | retval = __kill_pg_info(sig, info, pgrp); | ||
1114 | read_unlock(&tasklist_lock); | ||
1115 | |||
1116 | return retval; | ||
1117 | } | ||
1118 | |||
1119 | int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) | 1123 | int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) |
1120 | { | 1124 | { |
1121 | int error; | 1125 | int error; |
1122 | int acquired_tasklist_lock = 0; | ||
1123 | struct task_struct *p; | 1126 | struct task_struct *p; |
1124 | 1127 | ||
1125 | rcu_read_lock(); | 1128 | rcu_read_lock(); |
1126 | if (unlikely(sig_needs_tasklist(sig))) { | 1129 | if (unlikely(sig_needs_tasklist(sig))) |
1127 | read_lock(&tasklist_lock); | 1130 | read_lock(&tasklist_lock); |
1128 | acquired_tasklist_lock = 1; | 1131 | |
1129 | } | ||
1130 | p = pid_task(pid, PIDTYPE_PID); | 1132 | p = pid_task(pid, PIDTYPE_PID); |
1131 | error = -ESRCH; | 1133 | error = -ESRCH; |
1132 | if (p) | 1134 | if (p) |
1133 | error = group_send_sig_info(sig, info, p); | 1135 | error = group_send_sig_info(sig, info, p); |
1134 | if (unlikely(acquired_tasklist_lock)) | 1136 | |
1137 | if (unlikely(sig_needs_tasklist(sig))) | ||
1135 | read_unlock(&tasklist_lock); | 1138 | read_unlock(&tasklist_lock); |
1136 | rcu_read_unlock(); | 1139 | rcu_read_unlock(); |
1137 | return error; | 1140 | return error; |
@@ -1193,8 +1196,10 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); | |||
1193 | 1196 | ||
1194 | static int kill_something_info(int sig, struct siginfo *info, int pid) | 1197 | static int kill_something_info(int sig, struct siginfo *info, int pid) |
1195 | { | 1198 | { |
1199 | int ret; | ||
1200 | rcu_read_lock(); | ||
1196 | if (!pid) { | 1201 | if (!pid) { |
1197 | return kill_pg_info(sig, info, process_group(current)); | 1202 | ret = kill_pgrp_info(sig, info, task_pgrp(current)); |
1198 | } else if (pid == -1) { | 1203 | } else if (pid == -1) { |
1199 | int retval = 0, count = 0; | 1204 | int retval = 0, count = 0; |
1200 | struct task_struct * p; | 1205 | struct task_struct * p; |
@@ -1209,12 +1214,14 @@ static int kill_something_info(int sig, struct siginfo *info, int pid) | |||
1209 | } | 1214 | } |
1210 | } | 1215 | } |
1211 | read_unlock(&tasklist_lock); | 1216 | read_unlock(&tasklist_lock); |
1212 | return count ? retval : -ESRCH; | 1217 | ret = count ? retval : -ESRCH; |
1213 | } else if (pid < 0) { | 1218 | } else if (pid < 0) { |
1214 | return kill_pg_info(sig, info, -pid); | 1219 | ret = kill_pgrp_info(sig, info, find_pid(-pid)); |
1215 | } else { | 1220 | } else { |
1216 | return kill_proc_info(sig, info, pid); | 1221 | ret = kill_pid_info(sig, info, find_pid(pid)); |
1217 | } | 1222 | } |
1223 | rcu_read_unlock(); | ||
1224 | return ret; | ||
1218 | } | 1225 | } |
1219 | 1226 | ||
1220 | /* | 1227 | /* |
@@ -1313,12 +1320,6 @@ int kill_pid(struct pid *pid, int sig, int priv) | |||
1313 | EXPORT_SYMBOL(kill_pid); | 1320 | EXPORT_SYMBOL(kill_pid); |
1314 | 1321 | ||
1315 | int | 1322 | int |
1316 | kill_pg(pid_t pgrp, int sig, int priv) | ||
1317 | { | ||
1318 | return kill_pg_info(sig, __si_special(priv), pgrp); | ||
1319 | } | ||
1320 | |||
1321 | int | ||
1322 | kill_proc(pid_t pid, int sig, int priv) | 1323 | kill_proc(pid_t pid, int sig, int priv) |
1323 | { | 1324 | { |
1324 | return kill_proc_info(sig, __si_special(priv), pid); | 1325 | return kill_proc_info(sig, __si_special(priv), pid); |
@@ -1907,7 +1908,7 @@ relock: | |||
1907 | 1908 | ||
1908 | /* signals can be posted during this window */ | 1909 | /* signals can be posted during this window */ |
1909 | 1910 | ||
1910 | if (is_orphaned_pgrp(process_group(current))) | 1911 | if (is_current_pgrp_orphaned()) |
1911 | goto relock; | 1912 | goto relock; |
1912 | 1913 | ||
1913 | spin_lock_irq(¤t->sighand->siglock); | 1914 | spin_lock_irq(¤t->sighand->siglock); |
@@ -1957,7 +1958,6 @@ EXPORT_SYMBOL(recalc_sigpending); | |||
1957 | EXPORT_SYMBOL_GPL(dequeue_signal); | 1958 | EXPORT_SYMBOL_GPL(dequeue_signal); |
1958 | EXPORT_SYMBOL(flush_signals); | 1959 | EXPORT_SYMBOL(flush_signals); |
1959 | EXPORT_SYMBOL(force_sig); | 1960 | EXPORT_SYMBOL(force_sig); |
1960 | EXPORT_SYMBOL(kill_pg); | ||
1961 | EXPORT_SYMBOL(kill_proc); | 1961 | EXPORT_SYMBOL(kill_proc); |
1962 | EXPORT_SYMBOL(ptrace_notify); | 1962 | EXPORT_SYMBOL(ptrace_notify); |
1963 | EXPORT_SYMBOL(send_sig); | 1963 | EXPORT_SYMBOL(send_sig); |
@@ -2284,7 +2284,7 @@ static int do_tkill(int tgid, int pid, int sig) | |||
2284 | * @pid: the PID of the thread | 2284 | * @pid: the PID of the thread |
2285 | * @sig: signal to be sent | 2285 | * @sig: signal to be sent |
2286 | * | 2286 | * |
2287 | * This syscall also checks the tgid and returns -ESRCH even if the PID | 2287 | * This syscall also checks the @tgid and returns -ESRCH even if the PID |
2288 | * exists but it's not belonging to the target process anymore. This | 2288 | * exists but it's not belonging to the target process anymore. This |
2289 | * method solves the problem of threads exiting and PIDs getting reused. | 2289 | * method solves the problem of threads exiting and PIDs getting reused. |
2290 | */ | 2290 | */ |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 918e52df090e..8b75008e2bd8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
18 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
20 | #include <linux/tick.h> | ||
20 | 21 | ||
21 | #include <asm/irq.h> | 22 | #include <asm/irq.h> |
22 | /* | 23 | /* |
@@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq); | |||
273 | 274 | ||
274 | #endif | 275 | #endif |
275 | 276 | ||
277 | /* | ||
278 | * Enter an interrupt context. | ||
279 | */ | ||
280 | void irq_enter(void) | ||
281 | { | ||
282 | __irq_enter(); | ||
283 | #ifdef CONFIG_NO_HZ | ||
284 | if (idle_cpu(smp_processor_id())) | ||
285 | tick_nohz_update_jiffies(); | ||
286 | #endif | ||
287 | } | ||
288 | |||
276 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | 289 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED |
277 | # define invoke_softirq() __do_softirq() | 290 | # define invoke_softirq() __do_softirq() |
278 | #else | 291 | #else |
@@ -289,6 +302,12 @@ void irq_exit(void) | |||
289 | sub_preempt_count(IRQ_EXIT_OFFSET); | 302 | sub_preempt_count(IRQ_EXIT_OFFSET); |
290 | if (!in_interrupt() && local_softirq_pending()) | 303 | if (!in_interrupt() && local_softirq_pending()) |
291 | invoke_softirq(); | 304 | invoke_softirq(); |
305 | |||
306 | #ifdef CONFIG_NO_HZ | ||
307 | /* Make sure that timer wheel updates are propagated */ | ||
308 | if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) | ||
309 | tick_nohz_stop_sched_tick(); | ||
310 | #endif | ||
292 | preempt_enable_no_resched(); | 311 | preempt_enable_no_resched(); |
293 | } | 312 | } |
294 | 313 | ||
diff --git a/kernel/sys.c b/kernel/sys.c index 6e2101dec0fc..123b165080e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -215,7 +215,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); | |||
215 | * This routine uses RCU to synchronize with changes to the chain. | 215 | * This routine uses RCU to synchronize with changes to the chain. |
216 | * | 216 | * |
217 | * If the return value of the notifier can be and'ed | 217 | * If the return value of the notifier can be and'ed |
218 | * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain | 218 | * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() |
219 | * will return immediately, with the return value of | 219 | * will return immediately, with the return value of |
220 | * the notifier function which halted execution. | 220 | * the notifier function which halted execution. |
221 | * Otherwise the return value is the return value | 221 | * Otherwise the return value is the return value |
@@ -313,7 +313,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); | |||
313 | * run in a process context, so they are allowed to block. | 313 | * run in a process context, so they are allowed to block. |
314 | * | 314 | * |
315 | * If the return value of the notifier can be and'ed | 315 | * If the return value of the notifier can be and'ed |
316 | * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain | 316 | * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() |
317 | * will return immediately, with the return value of | 317 | * will return immediately, with the return value of |
318 | * the notifier function which halted execution. | 318 | * the notifier function which halted execution. |
319 | * Otherwise the return value is the return value | 319 | * Otherwise the return value is the return value |
@@ -393,7 +393,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); | |||
393 | * All locking must be provided by the caller. | 393 | * All locking must be provided by the caller. |
394 | * | 394 | * |
395 | * If the return value of the notifier can be and'ed | 395 | * If the return value of the notifier can be and'ed |
396 | * with %NOTIFY_STOP_MASK then raw_notifier_call_chain | 396 | * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() |
397 | * will return immediately, with the return value of | 397 | * will return immediately, with the return value of |
398 | * the notifier function which halted execution. | 398 | * the notifier function which halted execution. |
399 | * Otherwise the return value is the return value | 399 | * Otherwise the return value is the return value |
@@ -487,7 +487,7 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); | |||
487 | * run in a process context, so they are allowed to block. | 487 | * run in a process context, so they are allowed to block. |
488 | * | 488 | * |
489 | * If the return value of the notifier can be and'ed | 489 | * If the return value of the notifier can be and'ed |
490 | * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain | 490 | * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() |
491 | * will return immediately, with the return value of | 491 | * will return immediately, with the return value of |
492 | * the notifier function which halted execution. | 492 | * the notifier function which halted execution. |
493 | * Otherwise the return value is the return value | 493 | * Otherwise the return value is the return value |
@@ -538,7 +538,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head); | |||
538 | * Registers a function with the list of functions | 538 | * Registers a function with the list of functions |
539 | * to be called at reboot time. | 539 | * to be called at reboot time. |
540 | * | 540 | * |
541 | * Currently always returns zero, as blocking_notifier_chain_register | 541 | * Currently always returns zero, as blocking_notifier_chain_register() |
542 | * always returns zero. | 542 | * always returns zero. |
543 | */ | 543 | */ |
544 | 544 | ||
@@ -596,6 +596,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) | |||
596 | struct task_struct *g, *p; | 596 | struct task_struct *g, *p; |
597 | struct user_struct *user; | 597 | struct user_struct *user; |
598 | int error = -EINVAL; | 598 | int error = -EINVAL; |
599 | struct pid *pgrp; | ||
599 | 600 | ||
600 | if (which > 2 || which < 0) | 601 | if (which > 2 || which < 0) |
601 | goto out; | 602 | goto out; |
@@ -610,18 +611,21 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) | |||
610 | read_lock(&tasklist_lock); | 611 | read_lock(&tasklist_lock); |
611 | switch (which) { | 612 | switch (which) { |
612 | case PRIO_PROCESS: | 613 | case PRIO_PROCESS: |
613 | if (!who) | 614 | if (who) |
614 | who = current->pid; | 615 | p = find_task_by_pid(who); |
615 | p = find_task_by_pid(who); | 616 | else |
617 | p = current; | ||
616 | if (p) | 618 | if (p) |
617 | error = set_one_prio(p, niceval, error); | 619 | error = set_one_prio(p, niceval, error); |
618 | break; | 620 | break; |
619 | case PRIO_PGRP: | 621 | case PRIO_PGRP: |
620 | if (!who) | 622 | if (who) |
621 | who = process_group(current); | 623 | pgrp = find_pid(who); |
622 | do_each_task_pid(who, PIDTYPE_PGID, p) { | 624 | else |
625 | pgrp = task_pgrp(current); | ||
626 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | ||
623 | error = set_one_prio(p, niceval, error); | 627 | error = set_one_prio(p, niceval, error); |
624 | } while_each_task_pid(who, PIDTYPE_PGID, p); | 628 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); |
625 | break; | 629 | break; |
626 | case PRIO_USER: | 630 | case PRIO_USER: |
627 | user = current->user; | 631 | user = current->user; |
@@ -656,6 +660,7 @@ asmlinkage long sys_getpriority(int which, int who) | |||
656 | struct task_struct *g, *p; | 660 | struct task_struct *g, *p; |
657 | struct user_struct *user; | 661 | struct user_struct *user; |
658 | long niceval, retval = -ESRCH; | 662 | long niceval, retval = -ESRCH; |
663 | struct pid *pgrp; | ||
659 | 664 | ||
660 | if (which > 2 || which < 0) | 665 | if (which > 2 || which < 0) |
661 | return -EINVAL; | 666 | return -EINVAL; |
@@ -663,9 +668,10 @@ asmlinkage long sys_getpriority(int which, int who) | |||
663 | read_lock(&tasklist_lock); | 668 | read_lock(&tasklist_lock); |
664 | switch (which) { | 669 | switch (which) { |
665 | case PRIO_PROCESS: | 670 | case PRIO_PROCESS: |
666 | if (!who) | 671 | if (who) |
667 | who = current->pid; | 672 | p = find_task_by_pid(who); |
668 | p = find_task_by_pid(who); | 673 | else |
674 | p = current; | ||
669 | if (p) { | 675 | if (p) { |
670 | niceval = 20 - task_nice(p); | 676 | niceval = 20 - task_nice(p); |
671 | if (niceval > retval) | 677 | if (niceval > retval) |
@@ -673,13 +679,15 @@ asmlinkage long sys_getpriority(int which, int who) | |||
673 | } | 679 | } |
674 | break; | 680 | break; |
675 | case PRIO_PGRP: | 681 | case PRIO_PGRP: |
676 | if (!who) | 682 | if (who) |
677 | who = process_group(current); | 683 | pgrp = find_pid(who); |
678 | do_each_task_pid(who, PIDTYPE_PGID, p) { | 684 | else |
685 | pgrp = task_pgrp(current); | ||
686 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | ||
679 | niceval = 20 - task_nice(p); | 687 | niceval = 20 - task_nice(p); |
680 | if (niceval > retval) | 688 | if (niceval > retval) |
681 | retval = niceval; | 689 | retval = niceval; |
682 | } while_each_task_pid(who, PIDTYPE_PGID, p); | 690 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); |
683 | break; | 691 | break; |
684 | case PRIO_USER: | 692 | case PRIO_USER: |
685 | user = current->user; | 693 | user = current->user; |
@@ -1388,7 +1396,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1388 | 1396 | ||
1389 | if (p->real_parent == group_leader) { | 1397 | if (p->real_parent == group_leader) { |
1390 | err = -EPERM; | 1398 | err = -EPERM; |
1391 | if (process_session(p) != process_session(group_leader)) | 1399 | if (task_session(p) != task_session(group_leader)) |
1392 | goto out; | 1400 | goto out; |
1393 | err = -EACCES; | 1401 | err = -EACCES; |
1394 | if (p->did_exec) | 1402 | if (p->did_exec) |
@@ -1407,7 +1415,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1407 | struct task_struct *g = | 1415 | struct task_struct *g = |
1408 | find_task_by_pid_type(PIDTYPE_PGID, pgid); | 1416 | find_task_by_pid_type(PIDTYPE_PGID, pgid); |
1409 | 1417 | ||
1410 | if (!g || process_session(g) != process_session(group_leader)) | 1418 | if (!g || task_session(g) != task_session(group_leader)) |
1411 | goto out; | 1419 | goto out; |
1412 | } | 1420 | } |
1413 | 1421 | ||
@@ -1510,7 +1518,6 @@ asmlinkage long sys_setsid(void) | |||
1510 | 1518 | ||
1511 | spin_lock(&group_leader->sighand->siglock); | 1519 | spin_lock(&group_leader->sighand->siglock); |
1512 | group_leader->signal->tty = NULL; | 1520 | group_leader->signal->tty = NULL; |
1513 | group_leader->signal->tty_old_pgrp = 0; | ||
1514 | spin_unlock(&group_leader->sighand->siglock); | 1521 | spin_unlock(&group_leader->sighand->siglock); |
1515 | 1522 | ||
1516 | err = process_group(group_leader); | 1523 | err = process_group(group_leader); |
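The setpriority/getpriority hunks above replace iteration keyed by a numeric process-group ID (do_each_task_pid) with iteration over a struct pid (do_each_pid_task). A struct pid reference stays valid even if the numeric value is recycled, which is exactly the property described in the comment retained at the top of this section. A minimal sketch of the new lookup-and-walk pattern, using only helpers visible in the hunks (if find_pid() finds nothing, the walk is simply empty):

    struct pid *pgrp;
    struct task_struct *p;

    read_lock(&tasklist_lock);
    /* who == 0 means "the caller's own process group" */
    pgrp = who ? find_pid(who) : task_pgrp(current);
    do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
    	error = set_one_prio(p, niceval, error);
    } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
    read_unlock(&tasklist_lock);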
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 600b33358ded..3ca1d5ff0319 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -90,12 +90,6 @@ extern char modprobe_path[]; | |||
90 | #ifdef CONFIG_CHR_DEV_SG | 90 | #ifdef CONFIG_CHR_DEV_SG |
91 | extern int sg_big_buff; | 91 | extern int sg_big_buff; |
92 | #endif | 92 | #endif |
93 | #ifdef CONFIG_SYSVIPC | ||
94 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, | ||
95 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
96 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, | ||
97 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
98 | #endif | ||
99 | 93 | ||
100 | #ifdef __sparc__ | 94 | #ifdef __sparc__ |
101 | extern char reboot_command []; | 95 | extern char reboot_command []; |
@@ -135,22 +129,12 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, | |||
135 | void __user *, size_t, ctl_table *); | 129 | void __user *, size_t, ctl_table *); |
136 | #endif | 130 | #endif |
137 | 131 | ||
138 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
139 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
140 | |||
141 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
142 | void __user *oldval, size_t __user *oldlenp, | ||
143 | void __user *newval, size_t newlen); | ||
144 | |||
145 | #ifdef CONFIG_SYSVIPC | ||
146 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
147 | void __user *oldval, size_t __user *oldlenp, | ||
148 | void __user *newval, size_t newlen); | ||
149 | #endif | ||
150 | 132 | ||
151 | #ifdef CONFIG_PROC_SYSCTL | 133 | #ifdef CONFIG_PROC_SYSCTL |
152 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 134 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
153 | void __user *buffer, size_t *lenp, loff_t *ppos); | 135 | void __user *buffer, size_t *lenp, loff_t *ppos); |
136 | static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp, | ||
137 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
154 | #endif | 138 | #endif |
155 | 139 | ||
156 | static ctl_table root_table[]; | 140 | static ctl_table root_table[]; |
@@ -174,59 +158,6 @@ extern ctl_table inotify_table[]; | |||
174 | int sysctl_legacy_va_layout; | 158 | int sysctl_legacy_va_layout; |
175 | #endif | 159 | #endif |
176 | 160 | ||
177 | static void *get_uts(ctl_table *table, int write) | ||
178 | { | ||
179 | char *which = table->data; | ||
180 | #ifdef CONFIG_UTS_NS | ||
181 | struct uts_namespace *uts_ns = current->nsproxy->uts_ns; | ||
182 | which = (which - (char *)&init_uts_ns) + (char *)uts_ns; | ||
183 | #endif | ||
184 | if (!write) | ||
185 | down_read(&uts_sem); | ||
186 | else | ||
187 | down_write(&uts_sem); | ||
188 | return which; | ||
189 | } | ||
190 | |||
191 | static void put_uts(ctl_table *table, int write, void *which) | ||
192 | { | ||
193 | if (!write) | ||
194 | up_read(&uts_sem); | ||
195 | else | ||
196 | up_write(&uts_sem); | ||
197 | } | ||
198 | |||
199 | #ifdef CONFIG_SYSVIPC | ||
200 | static void *get_ipc(ctl_table *table, int write) | ||
201 | { | ||
202 | char *which = table->data; | ||
203 | struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; | ||
204 | which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; | ||
205 | return which; | ||
206 | } | ||
207 | #else | ||
208 | #define get_ipc(T,W) ((T)->data) | ||
209 | #endif | ||
210 | |||
211 | /* /proc declarations: */ | ||
212 | |||
213 | #ifdef CONFIG_PROC_SYSCTL | ||
214 | |||
215 | static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); | ||
216 | static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); | ||
217 | static int proc_opensys(struct inode *, struct file *); | ||
218 | |||
219 | const struct file_operations proc_sys_file_operations = { | ||
220 | .open = proc_opensys, | ||
221 | .read = proc_readsys, | ||
222 | .write = proc_writesys, | ||
223 | }; | ||
224 | |||
225 | extern struct proc_dir_entry *proc_sys_root; | ||
226 | |||
227 | static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *); | ||
228 | static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); | ||
229 | #endif | ||
230 | 161 | ||
231 | /* The default sysctl tables: */ | 162 | /* The default sysctl tables: */ |
232 | 163 | ||
@@ -275,51 +206,6 @@ static ctl_table root_table[] = { | |||
275 | 206 | ||
276 | static ctl_table kern_table[] = { | 207 | static ctl_table kern_table[] = { |
277 | { | 208 | { |
278 | .ctl_name = KERN_OSTYPE, | ||
279 | .procname = "ostype", | ||
280 | .data = init_uts_ns.name.sysname, | ||
281 | .maxlen = sizeof(init_uts_ns.name.sysname), | ||
282 | .mode = 0444, | ||
283 | .proc_handler = &proc_do_uts_string, | ||
284 | .strategy = &sysctl_uts_string, | ||
285 | }, | ||
286 | { | ||
287 | .ctl_name = KERN_OSRELEASE, | ||
288 | .procname = "osrelease", | ||
289 | .data = init_uts_ns.name.release, | ||
290 | .maxlen = sizeof(init_uts_ns.name.release), | ||
291 | .mode = 0444, | ||
292 | .proc_handler = &proc_do_uts_string, | ||
293 | .strategy = &sysctl_uts_string, | ||
294 | }, | ||
295 | { | ||
296 | .ctl_name = KERN_VERSION, | ||
297 | .procname = "version", | ||
298 | .data = init_uts_ns.name.version, | ||
299 | .maxlen = sizeof(init_uts_ns.name.version), | ||
300 | .mode = 0444, | ||
301 | .proc_handler = &proc_do_uts_string, | ||
302 | .strategy = &sysctl_uts_string, | ||
303 | }, | ||
304 | { | ||
305 | .ctl_name = KERN_NODENAME, | ||
306 | .procname = "hostname", | ||
307 | .data = init_uts_ns.name.nodename, | ||
308 | .maxlen = sizeof(init_uts_ns.name.nodename), | ||
309 | .mode = 0644, | ||
310 | .proc_handler = &proc_do_uts_string, | ||
311 | .strategy = &sysctl_uts_string, | ||
312 | }, | ||
313 | { | ||
314 | .ctl_name = KERN_DOMAINNAME, | ||
315 | .procname = "domainname", | ||
316 | .data = init_uts_ns.name.domainname, | ||
317 | .maxlen = sizeof(init_uts_ns.name.domainname), | ||
318 | .mode = 0644, | ||
319 | .proc_handler = &proc_do_uts_string, | ||
320 | .strategy = &sysctl_uts_string, | ||
321 | }, | ||
322 | { | ||
323 | .ctl_name = KERN_PANIC, | 209 | .ctl_name = KERN_PANIC, |
324 | .procname = "panic", | 210 | .procname = "panic", |
325 | .data = &panic_timeout, | 211 | .data = &panic_timeout, |
@@ -344,14 +230,16 @@ static ctl_table kern_table[] = { | |||
344 | .proc_handler = &proc_dostring, | 230 | .proc_handler = &proc_dostring, |
345 | .strategy = &sysctl_string, | 231 | .strategy = &sysctl_string, |
346 | }, | 232 | }, |
233 | #ifdef CONFIG_PROC_SYSCTL | ||
347 | { | 234 | { |
348 | .ctl_name = KERN_TAINTED, | 235 | .ctl_name = KERN_TAINTED, |
349 | .procname = "tainted", | 236 | .procname = "tainted", |
350 | .data = &tainted, | 237 | .data = &tainted, |
351 | .maxlen = sizeof(int), | 238 | .maxlen = sizeof(int), |
352 | .mode = 0444, | 239 | .mode = 0644, |
353 | .proc_handler = &proc_dointvec, | 240 | .proc_handler = &proc_dointvec_taint, |
354 | }, | 241 | }, |
242 | #endif | ||
355 | { | 243 | { |
356 | .ctl_name = KERN_CAP_BSET, | 244 | .ctl_name = KERN_CAP_BSET, |
357 | .procname = "cap-bound", | 245 | .procname = "cap-bound", |
@@ -473,71 +361,6 @@ static ctl_table kern_table[] = { | |||
473 | .proc_handler = &proc_dointvec, | 361 | .proc_handler = &proc_dointvec, |
474 | }, | 362 | }, |
475 | #endif | 363 | #endif |
476 | #ifdef CONFIG_SYSVIPC | ||
477 | { | ||
478 | .ctl_name = KERN_SHMMAX, | ||
479 | .procname = "shmmax", | ||
480 | .data = &init_ipc_ns.shm_ctlmax, | ||
481 | .maxlen = sizeof (init_ipc_ns.shm_ctlmax), | ||
482 | .mode = 0644, | ||
483 | .proc_handler = &proc_ipc_doulongvec_minmax, | ||
484 | .strategy = sysctl_ipc_data, | ||
485 | }, | ||
486 | { | ||
487 | .ctl_name = KERN_SHMALL, | ||
488 | .procname = "shmall", | ||
489 | .data = &init_ipc_ns.shm_ctlall, | ||
490 | .maxlen = sizeof (init_ipc_ns.shm_ctlall), | ||
491 | .mode = 0644, | ||
492 | .proc_handler = &proc_ipc_doulongvec_minmax, | ||
493 | .strategy = sysctl_ipc_data, | ||
494 | }, | ||
495 | { | ||
496 | .ctl_name = KERN_SHMMNI, | ||
497 | .procname = "shmmni", | ||
498 | .data = &init_ipc_ns.shm_ctlmni, | ||
499 | .maxlen = sizeof (init_ipc_ns.shm_ctlmni), | ||
500 | .mode = 0644, | ||
501 | .proc_handler = &proc_ipc_dointvec, | ||
502 | .strategy = sysctl_ipc_data, | ||
503 | }, | ||
504 | { | ||
505 | .ctl_name = KERN_MSGMAX, | ||
506 | .procname = "msgmax", | ||
507 | .data = &init_ipc_ns.msg_ctlmax, | ||
508 | .maxlen = sizeof (init_ipc_ns.msg_ctlmax), | ||
509 | .mode = 0644, | ||
510 | .proc_handler = &proc_ipc_dointvec, | ||
511 | .strategy = sysctl_ipc_data, | ||
512 | }, | ||
513 | { | ||
514 | .ctl_name = KERN_MSGMNI, | ||
515 | .procname = "msgmni", | ||
516 | .data = &init_ipc_ns.msg_ctlmni, | ||
517 | .maxlen = sizeof (init_ipc_ns.msg_ctlmni), | ||
518 | .mode = 0644, | ||
519 | .proc_handler = &proc_ipc_dointvec, | ||
520 | .strategy = sysctl_ipc_data, | ||
521 | }, | ||
522 | { | ||
523 | .ctl_name = KERN_MSGMNB, | ||
524 | .procname = "msgmnb", | ||
525 | .data = &init_ipc_ns.msg_ctlmnb, | ||
526 | .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), | ||
527 | .mode = 0644, | ||
528 | .proc_handler = &proc_ipc_dointvec, | ||
529 | .strategy = sysctl_ipc_data, | ||
530 | }, | ||
531 | { | ||
532 | .ctl_name = KERN_SEM, | ||
533 | .procname = "sem", | ||
534 | .data = &init_ipc_ns.sem_ctls, | ||
535 | .maxlen = 4*sizeof (int), | ||
536 | .mode = 0644, | ||
537 | .proc_handler = &proc_ipc_dointvec, | ||
538 | .strategy = sysctl_ipc_data, | ||
539 | }, | ||
540 | #endif | ||
541 | #ifdef CONFIG_MAGIC_SYSRQ | 364 | #ifdef CONFIG_MAGIC_SYSRQ |
542 | { | 365 | { |
543 | .ctl_name = KERN_SYSRQ, | 366 | .ctl_name = KERN_SYSRQ, |
@@ -1038,6 +861,12 @@ static ctl_table vm_table[] = { | |||
1038 | { .ctl_name = 0 } | 861 | { .ctl_name = 0 } |
1039 | }; | 862 | }; |
1040 | 863 | ||
864 | #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) | ||
865 | static ctl_table binfmt_misc_table[] = { | ||
866 | { .ctl_name = 0 } | ||
867 | }; | ||
868 | #endif | ||
869 | |||
1041 | static ctl_table fs_table[] = { | 870 | static ctl_table fs_table[] = { |
1042 | { | 871 | { |
1043 | .ctl_name = FS_NRINODE, | 872 | .ctl_name = FS_NRINODE, |
@@ -1161,6 +990,14 @@ static ctl_table fs_table[] = { | |||
1161 | .mode = 0644, | 990 | .mode = 0644, |
1162 | .proc_handler = &proc_dointvec, | 991 | .proc_handler = &proc_dointvec, |
1163 | }, | 992 | }, |
993 | #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) | ||
994 | { | ||
995 | .ctl_name = CTL_UNNUMBERED, | ||
996 | .procname = "binfmt_misc", | ||
997 | .mode = 0555, | ||
998 | .child = binfmt_misc_table, | ||
999 | }, | ||
1000 | #endif | ||
1164 | { .ctl_name = 0 } | 1001 | { .ctl_name = 0 } |
1165 | }; | 1002 | }; |
1166 | 1003 | ||
@@ -1172,8 +1009,6 @@ static ctl_table dev_table[] = { | |||
1172 | { .ctl_name = 0 } | 1009 | { .ctl_name = 0 } |
1173 | }; | 1010 | }; |
1174 | 1011 | ||
1175 | extern void init_irq_proc (void); | ||
1176 | |||
1177 | static DEFINE_SPINLOCK(sysctl_lock); | 1012 | static DEFINE_SPINLOCK(sysctl_lock); |
1178 | 1013 | ||
1179 | /* called under sysctl_lock */ | 1014 | /* called under sysctl_lock */ |
@@ -1215,19 +1050,47 @@ static void start_unregistering(struct ctl_table_header *p) | |||
1215 | list_del_init(&p->ctl_entry); | 1050 | list_del_init(&p->ctl_entry); |
1216 | } | 1051 | } |
1217 | 1052 | ||
1218 | void __init sysctl_init(void) | 1053 | void sysctl_head_finish(struct ctl_table_header *head) |
1219 | { | 1054 | { |
1220 | #ifdef CONFIG_PROC_SYSCTL | 1055 | if (!head) |
1221 | register_proc_table(root_table, proc_sys_root, &root_table_header); | 1056 | return; |
1222 | init_irq_proc(); | 1057 | spin_lock(&sysctl_lock); |
1223 | #endif | 1058 | unuse_table(head); |
1059 | spin_unlock(&sysctl_lock); | ||
1060 | } | ||
1061 | |||
1062 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | ||
1063 | { | ||
1064 | struct ctl_table_header *head; | ||
1065 | struct list_head *tmp; | ||
1066 | spin_lock(&sysctl_lock); | ||
1067 | if (prev) { | ||
1068 | tmp = &prev->ctl_entry; | ||
1069 | unuse_table(prev); | ||
1070 | goto next; | ||
1071 | } | ||
1072 | tmp = &root_table_header.ctl_entry; | ||
1073 | for (;;) { | ||
1074 | head = list_entry(tmp, struct ctl_table_header, ctl_entry); | ||
1075 | |||
1076 | if (!use_table(head)) | ||
1077 | goto next; | ||
1078 | spin_unlock(&sysctl_lock); | ||
1079 | return head; | ||
1080 | next: | ||
1081 | tmp = tmp->next; | ||
1082 | if (tmp == &root_table_header.ctl_entry) | ||
1083 | break; | ||
1084 | } | ||
1085 | spin_unlock(&sysctl_lock); | ||
1086 | return NULL; | ||
1224 | } | 1087 | } |
1225 | 1088 | ||
1226 | #ifdef CONFIG_SYSCTL_SYSCALL | 1089 | #ifdef CONFIG_SYSCTL_SYSCALL |
1227 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, | 1090 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, |
1228 | void __user *newval, size_t newlen) | 1091 | void __user *newval, size_t newlen) |
1229 | { | 1092 | { |
1230 | struct list_head *tmp; | 1093 | struct ctl_table_header *head; |
1231 | int error = -ENOTDIR; | 1094 | int error = -ENOTDIR; |
1232 | 1095 | ||
1233 | if (nlen <= 0 || nlen >= CTL_MAXNAME) | 1096 | if (nlen <= 0 || nlen >= CTL_MAXNAME) |
@@ -1237,26 +1100,16 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
1237 | if (!oldlenp || get_user(old_len, oldlenp)) | 1100 | if (!oldlenp || get_user(old_len, oldlenp)) |
1238 | return -EFAULT; | 1101 | return -EFAULT; |
1239 | } | 1102 | } |
1240 | spin_lock(&sysctl_lock); | ||
1241 | tmp = &root_table_header.ctl_entry; | ||
1242 | do { | ||
1243 | struct ctl_table_header *head = | ||
1244 | list_entry(tmp, struct ctl_table_header, ctl_entry); | ||
1245 | |||
1246 | if (!use_table(head)) | ||
1247 | continue; | ||
1248 | |||
1249 | spin_unlock(&sysctl_lock); | ||
1250 | 1103 | ||
1104 | for (head = sysctl_head_next(NULL); head; | ||
1105 | head = sysctl_head_next(head)) { | ||
1251 | error = parse_table(name, nlen, oldval, oldlenp, | 1106 | error = parse_table(name, nlen, oldval, oldlenp, |
1252 | newval, newlen, head->ctl_table); | 1107 | newval, newlen, head->ctl_table); |
1253 | 1108 | if (error != -ENOTDIR) { | |
1254 | spin_lock(&sysctl_lock); | 1109 | sysctl_head_finish(head); |
1255 | unuse_table(head); | ||
1256 | if (error != -ENOTDIR) | ||
1257 | break; | 1110 | break; |
1258 | } while ((tmp = tmp->next) != &root_table_header.ctl_entry); | 1111 | } |
1259 | spin_unlock(&sysctl_lock); | 1112 | } |
1260 | return error; | 1113 | return error; |
1261 | } | 1114 | } |
1262 | 1115 | ||
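The rewritten do_sysctl() above now drives the new sysctl_head_next()/sysctl_head_finish() iterator, which hides the use_table()/unuse_table() refcounting behind a cursor: each call pins the next registered table header and unpins the previous one, and sysctl_head_finish() releases the pin on an early exit. The usage pattern as a sketch; the process_table() consumer is hypothetical:

    struct ctl_table_header *head;

    for (head = sysctl_head_next(NULL); head;
         head = sysctl_head_next(head)) {
    	/* head->ctl_table cannot be unregistered while we hold it */
    	if (process_table(head->ctl_table)) {   /* hypothetical consumer */
    		sysctl_head_finish(head);       /* drop the pin on early exit */
    		break;
    	}
    }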
@@ -1277,7 +1130,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) | |||
1277 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 1130 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
1278 | 1131 | ||
1279 | /* | 1132 | /* |
1280 | * ctl_perm does NOT grant the superuser all rights automatically, because | 1133 | * sysctl_perm does NOT grant the superuser all rights automatically, because |
1281 | * some sysctl variables are readonly even to root. | 1134 | * some sysctl variables are readonly even to root. |
1282 | */ | 1135 | */ |
1283 | 1136 | ||
@@ -1292,7 +1145,7 @@ static int test_perm(int mode, int op) | |||
1292 | return -EACCES; | 1145 | return -EACCES; |
1293 | } | 1146 | } |
1294 | 1147 | ||
1295 | static inline int ctl_perm(ctl_table *table, int op) | 1148 | int sysctl_perm(ctl_table *table, int op) |
1296 | { | 1149 | { |
1297 | int error; | 1150 | int error; |
1298 | error = security_sysctl(table, op); | 1151 | error = security_sysctl(table, op); |
@@ -1316,19 +1169,11 @@ repeat: | |||
1316 | for ( ; table->ctl_name || table->procname; table++) { | 1169 | for ( ; table->ctl_name || table->procname; table++) { |
1317 | if (!table->ctl_name) | 1170 | if (!table->ctl_name) |
1318 | continue; | 1171 | continue; |
1319 | if (n == table->ctl_name || table->ctl_name == CTL_ANY) { | 1172 | if (n == table->ctl_name) { |
1320 | int error; | 1173 | int error; |
1321 | if (table->child) { | 1174 | if (table->child) { |
1322 | if (ctl_perm(table, 001)) | 1175 | if (sysctl_perm(table, 001)) |
1323 | return -EPERM; | 1176 | return -EPERM; |
1324 | if (table->strategy) { | ||
1325 | error = table->strategy( | ||
1326 | table, name, nlen, | ||
1327 | oldval, oldlenp, | ||
1328 | newval, newlen); | ||
1329 | if (error) | ||
1330 | return error; | ||
1331 | } | ||
1332 | name++; | 1177 | name++; |
1333 | nlen--; | 1178 | nlen--; |
1334 | table = table->child; | 1179 | table = table->child; |
@@ -1356,7 +1201,7 @@ int do_sysctl_strategy (ctl_table *table, | |||
1356 | op |= 004; | 1201 | op |= 004; |
1357 | if (newval) | 1202 | if (newval) |
1358 | op |= 002; | 1203 | op |= 002; |
1359 | if (ctl_perm(table, op)) | 1204 | if (sysctl_perm(table, op)) |
1360 | return -EPERM; | 1205 | return -EPERM; |
1361 | 1206 | ||
1362 | if (table->strategy) { | 1207 | if (table->strategy) { |
@@ -1395,10 +1240,26 @@ int do_sysctl_strategy (ctl_table *table, | |||
1395 | } | 1240 | } |
1396 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 1241 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
1397 | 1242 | ||
1243 | static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) | ||
1244 | { | ||
1245 | for (; table->ctl_name || table->procname; table++) { | ||
1246 | table->parent = parent; | ||
1247 | if (table->child) | ||
1248 | sysctl_set_parent(table, table->child); | ||
1249 | } | ||
1250 | } | ||
1251 | |||
1252 | static __init int sysctl_init(void) | ||
1253 | { | ||
1254 | sysctl_set_parent(NULL, root_table); | ||
1255 | return 0; | ||
1256 | } | ||
1257 | |||
1258 | core_initcall(sysctl_init); | ||
1259 | |||
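sysctl_set_parent() above threads a ->parent back-pointer through every node of a hierarchy (core_initcall runs it once for the static root_table, and register_sysctl_table() below applies it to dynamically registered tables), so code holding a leaf ctl_table can walk back to the root. A hedged sketch of the kind of upward walk this enables; the helper itself is hypothetical:

    /* Hypothetical helper: print a table's /proc/sys path, root first. */
    static void print_sysctl_path(struct ctl_table *table)
    {
    	if (table->parent)
    		print_sysctl_path(table->parent);
    	if (table->procname)
    		printk("/%s", table->procname);
    }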
1398 | /** | 1260 | /** |
1399 | * register_sysctl_table - register a sysctl hierarchy | 1261 | * register_sysctl_table - register a sysctl hierarchy |
1400 | * @table: the top-level table structure | 1262 | * @table: the top-level table structure |
1401 | * @insert_at_head: whether the entry should be inserted in front or at the end | ||
1402 | * | 1263 | * |
1403 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | 1264 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table |
1404 | * array. An entry with a ctl_name of 0 terminates the table. | 1265 | * array. An entry with a ctl_name of 0 terminates the table. |
@@ -1464,8 +1325,7 @@ int do_sysctl_strategy (ctl_table *table, | |||
1464 | * This routine returns %NULL on a failure to register, and a pointer | 1325 | * This routine returns %NULL on a failure to register, and a pointer |
1465 | * to the table header on success. | 1326 | * to the table header on success. |
1466 | */ | 1327 | */ |
1467 | struct ctl_table_header *register_sysctl_table(ctl_table * table, | 1328 | struct ctl_table_header *register_sysctl_table(ctl_table * table) |
1468 | int insert_at_head) | ||
1469 | { | 1329 | { |
1470 | struct ctl_table_header *tmp; | 1330 | struct ctl_table_header *tmp; |
1471 | tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); | 1331 | tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); |
@@ -1475,15 +1335,10 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, | |||
1475 | INIT_LIST_HEAD(&tmp->ctl_entry); | 1335 | INIT_LIST_HEAD(&tmp->ctl_entry); |
1476 | tmp->used = 0; | 1336 | tmp->used = 0; |
1477 | tmp->unregistering = NULL; | 1337 | tmp->unregistering = NULL; |
1338 | sysctl_set_parent(NULL, table); | ||
1478 | spin_lock(&sysctl_lock); | 1339 | spin_lock(&sysctl_lock); |
1479 | if (insert_at_head) | 1340 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); |
1480 | list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); | ||
1481 | else | ||
1482 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); | ||
1483 | spin_unlock(&sysctl_lock); | 1341 | spin_unlock(&sysctl_lock); |
1484 | #ifdef CONFIG_PROC_SYSCTL | ||
1485 | register_proc_table(table, proc_sys_root, tmp); | ||
1486 | #endif | ||
1487 | return tmp; | 1342 | return tmp; |
1488 | } | 1343 | } |
1489 | 1344 | ||
@@ -1499,9 +1354,6 @@ void unregister_sysctl_table(struct ctl_table_header * header) | |||
1499 | might_sleep(); | 1354 | might_sleep(); |
1500 | spin_lock(&sysctl_lock); | 1355 | spin_lock(&sysctl_lock); |
1501 | start_unregistering(header); | 1356 | start_unregistering(header); |
1502 | #ifdef CONFIG_PROC_SYSCTL | ||
1503 | unregister_proc_table(header->ctl_table, proc_sys_root); | ||
1504 | #endif | ||
1505 | spin_unlock(&sysctl_lock); | 1357 | spin_unlock(&sysctl_lock); |
1506 | kfree(header); | 1358 | kfree(header); |
1507 | } | 1359 | } |
@@ -1525,155 +1377,6 @@ void unregister_sysctl_table(struct ctl_table_header * table) | |||
1525 | 1377 | ||
1526 | #ifdef CONFIG_PROC_SYSCTL | 1378 | #ifdef CONFIG_PROC_SYSCTL |
1527 | 1379 | ||
1528 | /* Scan the sysctl entries in table and add them all into /proc */ | ||
1529 | static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) | ||
1530 | { | ||
1531 | struct proc_dir_entry *de; | ||
1532 | int len; | ||
1533 | mode_t mode; | ||
1534 | |||
1535 | for (; table->ctl_name || table->procname; table++) { | ||
1536 | /* Can't do anything without a proc name. */ | ||
1537 | if (!table->procname) | ||
1538 | continue; | ||
1539 | /* Maybe we can't do anything with it... */ | ||
1540 | if (!table->proc_handler && !table->child) { | ||
1541 | printk(KERN_WARNING "SYSCTL: Can't register %s\n", | ||
1542 | table->procname); | ||
1543 | continue; | ||
1544 | } | ||
1545 | |||
1546 | len = strlen(table->procname); | ||
1547 | mode = table->mode; | ||
1548 | |||
1549 | de = NULL; | ||
1550 | if (table->proc_handler) | ||
1551 | mode |= S_IFREG; | ||
1552 | else { | ||
1553 | mode |= S_IFDIR; | ||
1554 | for (de = root->subdir; de; de = de->next) { | ||
1555 | if (proc_match(len, table->procname, de)) | ||
1556 | break; | ||
1557 | } | ||
1558 | /* If the subdir exists already, de is non-NULL */ | ||
1559 | } | ||
1560 | |||
1561 | if (!de) { | ||
1562 | de = create_proc_entry(table->procname, mode, root); | ||
1563 | if (!de) | ||
1564 | continue; | ||
1565 | de->set = set; | ||
1566 | de->data = (void *) table; | ||
1567 | if (table->proc_handler) | ||
1568 | de->proc_fops = &proc_sys_file_operations; | ||
1569 | } | ||
1570 | table->de = de; | ||
1571 | if (de->mode & S_IFDIR) | ||
1572 | register_proc_table(table->child, de, set); | ||
1573 | } | ||
1574 | } | ||
1575 | |||
1576 | /* | ||
1577 | * Unregister a /proc sysctl table and any subdirectories. | ||
1578 | */ | ||
1579 | static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) | ||
1580 | { | ||
1581 | struct proc_dir_entry *de; | ||
1582 | for (; table->ctl_name || table->procname; table++) { | ||
1583 | if (!(de = table->de)) | ||
1584 | continue; | ||
1585 | if (de->mode & S_IFDIR) { | ||
1586 | if (!table->child) { | ||
1587 | printk (KERN_ALERT "Help - malformed sysctl tree on free\n"); | ||
1588 | continue; | ||
1589 | } | ||
1590 | unregister_proc_table(table->child, de); | ||
1591 | |||
1592 | /* Don't unregister directories which still have entries.. */ | ||
1593 | if (de->subdir) | ||
1594 | continue; | ||
1595 | } | ||
1596 | |||
1597 | /* | ||
1598 | * In any case, mark the entry as goner; we'll keep it | ||
1599 | * around if it's busy, but we'll know to do nothing with | ||
1600 | * its fields. We are under sysctl_lock here. | ||
1601 | */ | ||
1602 | de->data = NULL; | ||
1603 | |||
1604 | /* Don't unregister proc entries that are still being used.. */ | ||
1605 | if (atomic_read(&de->count)) | ||
1606 | continue; | ||
1607 | |||
1608 | table->de = NULL; | ||
1609 | remove_proc_entry(table->procname, root); | ||
1610 | } | ||
1611 | } | ||
1612 | |||
1613 | static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, | ||
1614 | size_t count, loff_t *ppos) | ||
1615 | { | ||
1616 | int op; | ||
1617 | struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); | ||
1618 | struct ctl_table *table; | ||
1619 | size_t res; | ||
1620 | ssize_t error = -ENOTDIR; | ||
1621 | |||
1622 | spin_lock(&sysctl_lock); | ||
1623 | if (de && de->data && use_table(de->set)) { | ||
1624 | /* | ||
1625 | * at that point we know that sysctl was not unregistered | ||
1626 | * and won't be until we finish | ||
1627 | */ | ||
1628 | spin_unlock(&sysctl_lock); | ||
1629 | table = (struct ctl_table *) de->data; | ||
1630 | if (!table || !table->proc_handler) | ||
1631 | goto out; | ||
1632 | error = -EPERM; | ||
1633 | op = (write ? 002 : 004); | ||
1634 | if (ctl_perm(table, op)) | ||
1635 | goto out; | ||
1636 | |||
1637 | /* careful: calling conventions are nasty here */ | ||
1638 | res = count; | ||
1639 | error = (*table->proc_handler)(table, write, file, | ||
1640 | buf, &res, ppos); | ||
1641 | if (!error) | ||
1642 | error = res; | ||
1643 | out: | ||
1644 | spin_lock(&sysctl_lock); | ||
1645 | unuse_table(de->set); | ||
1646 | } | ||
1647 | spin_unlock(&sysctl_lock); | ||
1648 | return error; | ||
1649 | } | ||
1650 | |||
1651 | static int proc_opensys(struct inode *inode, struct file *file) | ||
1652 | { | ||
1653 | if (file->f_mode & FMODE_WRITE) { | ||
1654 | /* | ||
1655 | * sysctl entries that are not writable, | ||
1656 | * are _NOT_ writable, capabilities or not. | ||
1657 | */ | ||
1658 | if (!(inode->i_mode & S_IWUSR)) | ||
1659 | return -EPERM; | ||
1660 | } | ||
1661 | |||
1662 | return 0; | ||
1663 | } | ||
1664 | |||
1665 | static ssize_t proc_readsys(struct file * file, char __user * buf, | ||
1666 | size_t count, loff_t *ppos) | ||
1667 | { | ||
1668 | return do_rw_proc(0, file, buf, count, ppos); | ||
1669 | } | ||
1670 | |||
1671 | static ssize_t proc_writesys(struct file * file, const char __user * buf, | ||
1672 | size_t count, loff_t *ppos) | ||
1673 | { | ||
1674 | return do_rw_proc(1, file, (char __user *) buf, count, ppos); | ||
1675 | } | ||
1676 | |||
1677 | static int _proc_do_string(void* data, int maxlen, int write, | 1380 | static int _proc_do_string(void* data, int maxlen, int write, |
1678 | struct file *filp, void __user *buffer, | 1381 | struct file *filp, void __user *buffer, |
1679 | size_t *lenp, loff_t *ppos) | 1382 | size_t *lenp, loff_t *ppos) |
@@ -1681,13 +1384,12 @@ static int _proc_do_string(void* data, int maxlen, int write, | |||
1681 | size_t len; | 1384 | size_t len; |
1682 | char __user *p; | 1385 | char __user *p; |
1683 | char c; | 1386 | char c; |
1684 | 1387 | ||
1685 | if (!data || !maxlen || !*lenp || | 1388 | if (!data || !maxlen || !*lenp) { |
1686 | (*ppos && !write)) { | ||
1687 | *lenp = 0; | 1389 | *lenp = 0; |
1688 | return 0; | 1390 | return 0; |
1689 | } | 1391 | } |
1690 | 1392 | ||
1691 | if (write) { | 1393 | if (write) { |
1692 | len = 0; | 1394 | len = 0; |
1693 | p = buffer; | 1395 | p = buffer; |
@@ -1708,6 +1410,15 @@ static int _proc_do_string(void* data, int maxlen, int write, | |||
1708 | len = strlen(data); | 1410 | len = strlen(data); |
1709 | if (len > maxlen) | 1411 | if (len > maxlen) |
1710 | len = maxlen; | 1412 | len = maxlen; |
1413 | |||
1414 | if (*ppos > len) { | ||
1415 | *lenp = 0; | ||
1416 | return 0; | ||
1417 | } | ||
1418 | |||
1419 | data += *ppos; | ||
1420 | len -= *ppos; | ||
1421 | |||
1711 | if (len > *lenp) | 1422 | if (len > *lenp) |
1712 | len = *lenp; | 1423 | len = *lenp; |
1713 | if (len) | 1424 | if (len) |
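The _proc_do_string() change above removes the old "non-zero *ppos on a read returns nothing" special case: a read now serves the remainder of the string past the file position, so a reader with a small buffer can walk the whole value across several read() calls. The added windowing logic in isolation, as a self-contained sketch:

    #include <stddef.h>

    /* How many bytes a read at offset ppos may return from a string of
     * length len into a buffer of size bufsz (mirrors the hunk above). */
    static size_t read_window(size_t len, size_t ppos, size_t bufsz)
    {
    	if (ppos > len)
    		return 0;        /* past end-of-string: EOF */
    	len -= ppos;             /* skip what was already read */
    	return len < bufsz ? len : bufsz;
    }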
@@ -1749,21 +1460,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, | |||
1749 | buffer, lenp, ppos); | 1460 | buffer, lenp, ppos); |
1750 | } | 1461 | } |
1751 | 1462 | ||
1752 | /* | ||
1753 | * Special case of dostring for the UTS structure. This has locks | ||
1754 | * to observe. Should this be in kernel/sys.c ???? | ||
1755 | */ | ||
1756 | |||
1757 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
1758 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1759 | { | ||
1760 | int r; | ||
1761 | void *which; | ||
1762 | which = get_uts(table, write); | ||
1763 | r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); | ||
1764 | put_uts(table, write, which); | ||
1765 | return r; | ||
1766 | } | ||
1767 | 1463 | ||
1768 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, | 1464 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, |
1769 | int *valp, | 1465 | int *valp, |
@@ -1927,6 +1623,7 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, | |||
1927 | 1623 | ||
1928 | #define OP_SET 0 | 1624 | #define OP_SET 0 |
1929 | #define OP_AND 1 | 1625 | #define OP_AND 1 |
1626 | #define OP_OR 2 | ||
1930 | 1627 | ||
1931 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | 1628 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, |
1932 | int *valp, | 1629 | int *valp, |
@@ -1938,6 +1635,7 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | |||
1938 | switch(op) { | 1635 | switch(op) { |
1939 | case OP_SET: *valp = val; break; | 1636 | case OP_SET: *valp = val; break; |
1940 | case OP_AND: *valp &= val; break; | 1637 | case OP_AND: *valp &= val; break; |
1638 | case OP_OR: *valp |= val; break; | ||
1941 | } | 1639 | } |
1942 | } else { | 1640 | } else { |
1943 | int val = *valp; | 1641 | int val = *valp; |
@@ -1961,7 +1659,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, | |||
1961 | { | 1659 | { |
1962 | int op; | 1660 | int op; |
1963 | 1661 | ||
1964 | if (!capable(CAP_SYS_MODULE)) { | 1662 | if (write && !capable(CAP_SYS_MODULE)) { |
1965 | return -EPERM; | 1663 | return -EPERM; |
1966 | } | 1664 | } |
1967 | 1665 | ||
@@ -1970,6 +1668,22 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, | |||
1970 | do_proc_dointvec_bset_conv,&op); | 1668 | do_proc_dointvec_bset_conv,&op); |
1971 | } | 1669 | } |
1972 | 1670 | ||
1671 | /* | ||
1672 | * Taint values can only be increased | ||
1673 | */ | ||
1674 | static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp, | ||
1675 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1676 | { | ||
1677 | int op; | ||
1678 | |||
1679 | if (!capable(CAP_SYS_ADMIN)) | ||
1680 | return -EPERM; | ||
1681 | |||
1682 | op = OP_OR; | ||
1683 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, | ||
1684 | do_proc_dointvec_bset_conv,&op); | ||
1685 | } | ||
1686 | |||
1973 | struct do_proc_dointvec_minmax_conv_param { | 1687 | struct do_proc_dointvec_minmax_conv_param { |
1974 | int *min; | 1688 | int *min; |
1975 | int *max; | 1689 | int *max; |
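proc_dointvec_taint() above reuses the bset conversion helper with the new OP_OR operation, which makes the taint flags monotonic: a write can set bits but never clear them, so /proc/sys/kernel/tainted (now mode 0644, gated on CAP_SYS_ADMIN) can only ever add taint. The invariant in one line:

    /* OP_OR semantics: the stored value only gains bits, never loses them. */
    static int apply_taint(int current_flags, int written_value)
    {
    	return current_flags | written_value;
    }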
@@ -2331,27 +2045,6 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, | |||
2331 | do_proc_dointvec_ms_jiffies_conv, NULL); | 2045 | do_proc_dointvec_ms_jiffies_conv, NULL); |
2332 | } | 2046 | } |
2333 | 2047 | ||
2334 | #ifdef CONFIG_SYSVIPC | ||
2335 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, | ||
2336 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2337 | { | ||
2338 | void *which; | ||
2339 | which = get_ipc(table, write); | ||
2340 | return __do_proc_dointvec(which, table, write, filp, buffer, | ||
2341 | lenp, ppos, NULL, NULL); | ||
2342 | } | ||
2343 | |||
2344 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
2345 | struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2346 | { | ||
2347 | void *which; | ||
2348 | which = get_ipc(table, write); | ||
2349 | return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, | ||
2350 | lenp, ppos, 1l, 1l); | ||
2351 | } | ||
2352 | |||
2353 | #endif | ||
2354 | |||
2355 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 2048 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
2356 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2049 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2357 | { | 2050 | { |
@@ -2382,31 +2075,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, | |||
2382 | return -ENOSYS; | 2075 | return -ENOSYS; |
2383 | } | 2076 | } |
2384 | 2077 | ||
2385 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
2386 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2387 | { | ||
2388 | return -ENOSYS; | ||
2389 | } | ||
2390 | |||
2391 | #ifdef CONFIG_SYSVIPC | ||
2392 | static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | ||
2393 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2394 | { | ||
2395 | return -ENOSYS; | ||
2396 | } | ||
2397 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, | ||
2398 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2399 | { | ||
2400 | return -ENOSYS; | ||
2401 | } | ||
2402 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
2403 | struct file *filp, void __user *buffer, | ||
2404 | size_t *lenp, loff_t *ppos) | ||
2405 | { | ||
2406 | return -ENOSYS; | ||
2407 | } | ||
2408 | #endif | ||
2409 | |||
2410 | int proc_dointvec(ctl_table *table, int write, struct file *filp, | 2078 | int proc_dointvec(ctl_table *table, int write, struct file *filp, |
2411 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2079 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2412 | { | 2080 | { |
@@ -2553,17 +2221,23 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | |||
2553 | void __user *oldval, size_t __user *oldlenp, | 2221 | void __user *oldval, size_t __user *oldlenp, |
2554 | void __user *newval, size_t newlen) | 2222 | void __user *newval, size_t newlen) |
2555 | { | 2223 | { |
2556 | if (oldval) { | 2224 | if (oldval && oldlenp) { |
2557 | size_t olen; | 2225 | size_t olen; |
2558 | if (oldlenp) { | 2226 | |
2559 | if (get_user(olen, oldlenp)) | 2227 | if (get_user(olen, oldlenp)) |
2228 | return -EFAULT; | ||
2229 | if (olen) { | ||
2230 | int val; | ||
2231 | |||
2232 | if (olen < sizeof(int)) | ||
2233 | return -EINVAL; | ||
2234 | |||
2235 | val = *(int *)(table->data) / HZ; | ||
2236 | if (put_user(val, (int __user *)oldval)) | ||
2237 | return -EFAULT; | ||
2238 | if (put_user(sizeof(int), oldlenp)) | ||
2560 | return -EFAULT; | 2239 | return -EFAULT; |
2561 | if (olen!=sizeof(int)) | ||
2562 | return -EINVAL; | ||
2563 | } | 2240 | } |
2564 | if (put_user(*(int *)(table->data)/HZ, (int __user *)oldval) || | ||
2565 | (oldlenp && put_user(sizeof(int),oldlenp))) | ||
2566 | return -EFAULT; | ||
2567 | } | 2241 | } |
2568 | if (newval && newlen) { | 2242 | if (newval && newlen) { |
2569 | int new; | 2243 | int new; |
@@ -2581,17 +2255,23 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
2581 | void __user *oldval, size_t __user *oldlenp, | 2255 | void __user *oldval, size_t __user *oldlenp, |
2582 | void __user *newval, size_t newlen) | 2256 | void __user *newval, size_t newlen) |
2583 | { | 2257 | { |
2584 | if (oldval) { | 2258 | if (oldval && oldlenp) { |
2585 | size_t olen; | 2259 | size_t olen; |
2586 | if (oldlenp) { | 2260 | |
2587 | if (get_user(olen, oldlenp)) | 2261 | if (get_user(olen, oldlenp)) |
2262 | return -EFAULT; | ||
2263 | if (olen) { | ||
2264 | int val; | ||
2265 | |||
2266 | if (olen < sizeof(int)) | ||
2267 | return -EINVAL; | ||
2268 | |||
2269 | val = jiffies_to_msecs(*(int *)(table->data)); | ||
2270 | if (put_user(val, (int __user *)oldval)) | ||
2271 | return -EFAULT; | ||
2272 | if (put_user(sizeof(int), oldlenp)) | ||
2588 | return -EFAULT; | 2273 | return -EFAULT; |
2589 | if (olen!=sizeof(int)) | ||
2590 | return -EINVAL; | ||
2591 | } | 2274 | } |
2592 | if (put_user(jiffies_to_msecs(*(int *)(table->data)), (int __user *)oldval) || | ||
2593 | (oldlenp && put_user(sizeof(int),oldlenp))) | ||
2594 | return -EFAULT; | ||
2595 | } | 2275 | } |
2596 | if (newval && newlen) { | 2276 | if (newval && newlen) { |
2597 | int new; | 2277 | int new; |
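Both strategy rewrites above tighten the binary sysctl(2) read protocol: olen == 0 now means "the caller wants no old value", a buffer smaller than an int fails with -EINVAL (rather than demanding an exact size match), and oldlenp is updated with the size actually written. A hedged userspace sketch of a caller of this legacy binary interface (long deprecated; CTL_KERN and KERN_OSTYPE come from <linux/sysctl.h>):

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/sysctl.h>

    int main(void)
    {
    	int name[] = { CTL_KERN, KERN_OSTYPE };
    	char buf[64];
    	size_t len = sizeof(buf);
    	struct __sysctl_args args;

    	memset(&args, 0, sizeof(args));
    	args.name = name;
    	args.nlen = 2;
    	args.oldval = buf;
    	args.oldlenp = &len;	/* in: buffer size; out: bytes written */

    	if (syscall(SYS__sysctl, &args) == 0)
    		printf("%.*s\n", (int)len, buf);
    	return 0;
    }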
@@ -2605,62 +2285,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
2605 | } | 2285 | } |
2606 | 2286 | ||
2607 | 2287 | ||
2608 | /* The generic string strategy routine: */ | ||
2609 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
2610 | void __user *oldval, size_t __user *oldlenp, | ||
2611 | void __user *newval, size_t newlen) | ||
2612 | { | ||
2613 | struct ctl_table uts_table; | ||
2614 | int r, write; | ||
2615 | write = newval && newlen; | ||
2616 | memcpy(&uts_table, table, sizeof(uts_table)); | ||
2617 | uts_table.data = get_uts(table, write); | ||
2618 | r = sysctl_string(&uts_table, name, nlen, | ||
2619 | oldval, oldlenp, newval, newlen); | ||
2620 | put_uts(table, write, uts_table.data); | ||
2621 | return r; | ||
2622 | } | ||
2623 | |||
2624 | #ifdef CONFIG_SYSVIPC | ||
2625 | /* The generic sysctl ipc data routine. */ | ||
2626 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
2627 | void __user *oldval, size_t __user *oldlenp, | ||
2628 | void __user *newval, size_t newlen) | ||
2629 | { | ||
2630 | size_t len; | ||
2631 | void *data; | ||
2632 | |||
2633 | /* Get out of I don't have a variable */ | ||
2634 | if (!table->data || !table->maxlen) | ||
2635 | return -ENOTDIR; | ||
2636 | |||
2637 | data = get_ipc(table, 1); | ||
2638 | if (!data) | ||
2639 | return -ENOTDIR; | ||
2640 | |||
2641 | if (oldval && oldlenp) { | ||
2642 | if (get_user(len, oldlenp)) | ||
2643 | return -EFAULT; | ||
2644 | if (len) { | ||
2645 | if (len > table->maxlen) | ||
2646 | len = table->maxlen; | ||
2647 | if (copy_to_user(oldval, data, len)) | ||
2648 | return -EFAULT; | ||
2649 | if (put_user(len, oldlenp)) | ||
2650 | return -EFAULT; | ||
2651 | } | ||
2652 | } | ||
2653 | |||
2654 | if (newval && newlen) { | ||
2655 | if (newlen > table->maxlen) | ||
2656 | newlen = table->maxlen; | ||
2657 | |||
2658 | if (copy_from_user(data, newval, newlen)) | ||
2659 | return -EFAULT; | ||
2660 | } | ||
2661 | return 1; | ||
2662 | } | ||
2663 | #endif | ||
2664 | 2288 | ||
2665 | #else /* CONFIG_SYSCTL_SYSCALL */ | 2289 | #else /* CONFIG_SYSCTL_SYSCALL */ |
2666 | 2290 | ||
@@ -2726,18 +2350,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
2726 | return -ENOSYS; | 2350 | return -ENOSYS; |
2727 | } | 2351 | } |
2728 | 2352 | ||
2729 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
2730 | void __user *oldval, size_t __user *oldlenp, | ||
2731 | void __user *newval, size_t newlen) | ||
2732 | { | ||
2733 | return -ENOSYS; | ||
2734 | } | ||
2735 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
2736 | void __user *oldval, size_t __user *oldlenp, | ||
2737 | void __user *newval, size_t newlen) | ||
2738 | { | ||
2739 | return -ENOSYS; | ||
2740 | } | ||
2741 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 2353 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
2742 | 2354 | ||
2743 | /* | 2355 | /* |
diff --git a/kernel/time.c b/kernel/time.c index 0e017bff4c19..c6c80ea5d0ea 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec) | |||
470 | return tv; | 470 | return tv; |
471 | } | 471 | } |
472 | 472 | ||
473 | /* | ||
474 | * Convert jiffies to milliseconds and back. | ||
475 | * | ||
476 | * Avoid unnecessary multiplications/divisions in the | ||
477 | * two most common HZ cases: | ||
478 | */ | ||
479 | unsigned int jiffies_to_msecs(const unsigned long j) | ||
480 | { | ||
481 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | ||
482 | return (MSEC_PER_SEC / HZ) * j; | ||
483 | #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) | ||
484 | return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); | ||
485 | #else | ||
486 | return (j * MSEC_PER_SEC) / HZ; | ||
487 | #endif | ||
488 | } | ||
489 | EXPORT_SYMBOL(jiffies_to_msecs); | ||
490 | |||
491 | unsigned int jiffies_to_usecs(const unsigned long j) | ||
492 | { | ||
493 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | ||
494 | return (USEC_PER_SEC / HZ) * j; | ||
495 | #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) | ||
496 | return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); | ||
497 | #else | ||
498 | return (j * USEC_PER_SEC) / HZ; | ||
499 | #endif | ||
500 | } | ||
501 | EXPORT_SYMBOL(jiffies_to_usecs); | ||
502 | |||
503 | /* | ||
504 | * When we convert to jiffies, we interpret incoming values ||
505 | * as follows: ||
506 | * | ||
507 | * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) | ||
508 | * | ||
509 | * - 'too large' values [that would result in larger than | ||
510 | * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. | ||
511 | * | ||
512 | * - all other values are converted to jiffies by either multiplying | ||
513 | * the input value by a factor or dividing it by a factor ||
514 | * | ||
515 | * We must also be careful about 32-bit overflows. | ||
516 | */ | ||
517 | unsigned long msecs_to_jiffies(const unsigned int m) | ||
518 | { | ||
519 | /* | ||
520 | * A negative value means an infinite timeout: ||
521 | */ | ||
522 | if ((int)m < 0) | ||
523 | return MAX_JIFFY_OFFSET; | ||
524 | |||
525 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | ||
526 | /* | ||
527 | * HZ is equal to or smaller than 1000, and 1000 is a nice | ||
528 | * round multiple of HZ, so divide by the factor between them, ||
529 | * but round upwards: | ||
530 | */ | ||
531 | return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); | ||
532 | #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) | ||
533 | /* | ||
534 | * HZ is larger than 1000, and HZ is a nice round multiple of | ||
535 | * 1000 - simply multiply by the factor between them. ||
536 | * | ||
537 | * But first make sure the multiplication result cannot | ||
538 | * overflow: | ||
539 | */ | ||
540 | if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) | ||
541 | return MAX_JIFFY_OFFSET; | ||
542 | |||
543 | return m * (HZ / MSEC_PER_SEC); | ||
544 | #else | ||
545 | /* | ||
546 | * Generic case - multiply, round and divide. But first | ||
547 | * check that, if we are doing a net multiplication, ||
548 | * we wouldn't overflow: ||
549 | */ | ||
550 | if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) | ||
551 | return MAX_JIFFY_OFFSET; | ||
552 | |||
553 | return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; | ||
554 | #endif | ||
555 | } | ||
556 | EXPORT_SYMBOL(msecs_to_jiffies); | ||
557 | |||
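The three compile-time branches of msecs_to_jiffies() reduce to "round up when dividing, check for overflow when multiplying". A self-contained userspace rendition of the common divide branch; HZ = 250 is an assumed config value, purely for illustration:

    #include <stdio.h>

    #define HZ 250                  /* assumed tick rate, for illustration */
    #define MSEC_PER_SEC 1000UL
    #define MAX_JIFFY_OFFSET ((~0UL >> 1) - 1)

    static unsigned long msecs_to_jiffies_demo(unsigned int m)
    {
    	if ((int)m < 0)
    		return MAX_JIFFY_OFFSET;  /* negative: infinite timeout */
    	/* HZ <= 1000 and 1000 % HZ == 0: divide by the factor, rounding up */
    	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
    }

    int main(void)
    {
    	/* 1 ms still yields one full jiffy (4 ms at HZ=250): timeouts
    	 * are rounded up, never truncated to zero. Prints "1 250". */
    	printf("%lu %lu\n", msecs_to_jiffies_demo(1),
    	       msecs_to_jiffies_demo(1000));
    	return 0;
    }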
558 | unsigned long usecs_to_jiffies(const unsigned int u) | ||
559 | { | ||
560 | if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) | ||
561 | return MAX_JIFFY_OFFSET; | ||
562 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | ||
563 | return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); | ||
564 | #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) | ||
565 | return u * (HZ / USEC_PER_SEC); | ||
566 | #else | ||
567 | return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; | ||
568 | #endif | ||
569 | } | ||
570 | EXPORT_SYMBOL(usecs_to_jiffies); | ||
571 | |||
572 | /* | ||
573 | * The TICK_NSEC - 1 rounds up the value to the next resolution. Note | ||
574 | * that a remainder subtract here would not do the right thing as the | ||
575 | * resolution values don't fall on second boundaries. I.e. the line: ||
576 | * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. | ||
577 | * | ||
578 | * Rather, we just shift the bits off the right. | ||
579 | * | ||
580 | * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec | ||
581 | * value to a scaled second value. | ||
582 | */ | ||
583 | unsigned long | ||
584 | timespec_to_jiffies(const struct timespec *value) | ||
585 | { | ||
586 | unsigned long sec = value->tv_sec; | ||
587 | long nsec = value->tv_nsec + TICK_NSEC - 1; | ||
588 | |||
589 | if (sec >= MAX_SEC_IN_JIFFIES){ | ||
590 | sec = MAX_SEC_IN_JIFFIES; | ||
591 | nsec = 0; | ||
592 | } | ||
593 | return (((u64)sec * SEC_CONVERSION) + | ||
594 | (((u64)nsec * NSEC_CONVERSION) >> | ||
595 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | ||
596 | |||
597 | } | ||
598 | EXPORT_SYMBOL(timespec_to_jiffies); | ||
599 | |||
600 | void | ||
601 | jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) | ||
602 | { | ||
603 | /* | ||
604 | * Convert jiffies to nanoseconds and separate with | ||
605 | * one divide. | ||
606 | */ | ||
607 | u64 nsec = (u64)jiffies * TICK_NSEC; | ||
608 | value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); | ||
609 | } | ||
610 | EXPORT_SYMBOL(jiffies_to_timespec); | ||
611 | |||
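timespec_to_jiffies() rounds up (the TICK_NSEC - 1 bias) while jiffies_to_timespec() converts back exactly, so a round trip never shortens a timeout; it can lengthen it by up to one tick. A kernel-style sketch of that round trip:

    struct timespec ts = { .tv_sec = 0, .tv_nsec = 1 };
    unsigned long j;

    j = timespec_to_jiffies(&ts);	/* 1 ns rounds up to one full jiffy */
    jiffies_to_timespec(j, &ts);	/* back: exactly TICK_NSEC, not 1 ns */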
612 | /* Same for "timeval" | ||
613 | * | ||
614 | * Well, almost. The problem here is that the real system resolution is | ||
615 | * in nanoseconds and the value being converted is in microseconds. ||
616 | * Also for some machines (those that use HZ = 1024, in particular), ||
617 | * there is a LARGE error in the tick size in microseconds. ||
618 | * ||
619 | * The solution we use is to do the rounding AFTER we convert the ||
620 | * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. ||
621 | * Instruction-wise, this should cost only an additional add-with-carry ||
622 | * instruction compared with the way it was done above. ||
623 | */ | ||
624 | unsigned long | ||
625 | timeval_to_jiffies(const struct timeval *value) | ||
626 | { | ||
627 | unsigned long sec = value->tv_sec; | ||
628 | long usec = value->tv_usec; | ||
629 | |||
630 | if (sec >= MAX_SEC_IN_JIFFIES){ | ||
631 | sec = MAX_SEC_IN_JIFFIES; | ||
632 | usec = 0; | ||
633 | } | ||
634 | return (((u64)sec * SEC_CONVERSION) + | ||
635 | (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> | ||
636 | (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | ||
637 | } | ||
638 | |||
639 | void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) | ||
640 | { | ||
641 | /* | ||
642 | * Convert jiffies to nanoseconds and separate with | ||
643 | * one divide. | ||
644 | */ | ||
645 | u64 nsec = (u64)jiffies * TICK_NSEC; | ||
646 | long tv_usec; | ||
647 | |||
648 | value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); | ||
649 | tv_usec /= NSEC_PER_USEC; | ||
650 | value->tv_usec = tv_usec; | ||
651 | } | ||
652 | |||
653 | /* | ||
654 | * Convert jiffies/jiffies_64 to clock_t and back. | ||
655 | */ | ||
656 | clock_t jiffies_to_clock_t(long x) | ||
657 | { | ||
658 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 | ||
659 | return x / (HZ / USER_HZ); | ||
660 | #else | ||
661 | u64 tmp = (u64)x * TICK_NSEC; | ||
662 | do_div(tmp, (NSEC_PER_SEC / USER_HZ)); | ||
663 | return (long)tmp; | ||
664 | #endif | ||
665 | } | ||
666 | EXPORT_SYMBOL(jiffies_to_clock_t); | ||
667 | |||
668 | unsigned long clock_t_to_jiffies(unsigned long x) | ||
669 | { | ||
670 | #if (HZ % USER_HZ)==0 | ||
671 | if (x >= ~0UL / (HZ / USER_HZ)) | ||
672 | return ~0UL; | ||
673 | return x * (HZ / USER_HZ); | ||
674 | #else | ||
675 | u64 jif; | ||
676 | |||
677 | /* Don't worry about loss of precision here .. */ | ||
678 | if (x >= ~0UL / HZ * USER_HZ) | ||
679 | return ~0UL; | ||
680 | |||
681 | /* .. but do try to contain it here */ | ||
682 | jif = x * (u64) HZ; | ||
683 | do_div(jif, USER_HZ); | ||
684 | return jif; | ||
685 | #endif | ||
686 | } | ||
687 | EXPORT_SYMBOL(clock_t_to_jiffies); | ||
688 | |||
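HZ is a kernel build-time choice, but the clock_t values exported to userspace (times(2), /proc/*/stat) are in fixed USER_HZ units, which is why these helpers translate between the two rates. A self-contained rendition of the generic (non-divisible) path, with HZ assumed to be 250 purely for illustration:

    #include <stdio.h>

    #define HZ           250UL		/* assumed kernel tick rate */
    #define USER_HZ      100UL		/* fixed userspace-visible rate */
    #define NSEC_PER_SEC 1000000000ULL
    #define TICK_NSEC    (NSEC_PER_SEC / HZ)

    static long jiffies_to_clock_t_demo(long x)
    {
    	/* widen first so the multiply cannot overflow a 32-bit long */
    	unsigned long long tmp = (unsigned long long)x * TICK_NSEC;
    	return (long)(tmp / (NSEC_PER_SEC / USER_HZ));
    }

    int main(void)
    {
    	/* one second's worth of jiffies comes out as 100 clock ticks */
    	printf("%ld\n", jiffies_to_clock_t_demo(HZ));
    	return 0;
    }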
689 | u64 jiffies_64_to_clock_t(u64 x) | ||
690 | { | ||
691 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 | ||
692 | do_div(x, HZ / USER_HZ); | ||
693 | #else | ||
694 | /* | ||
695 | * There are better ways that don't overflow early, | ||
696 | * but even this doesn't overflow in hundreds of years | ||
697 | * in 64 bits, so.. | ||
698 | */ | ||
699 | x *= TICK_NSEC; | ||
700 | do_div(x, (NSEC_PER_SEC / USER_HZ)); | ||
701 | #endif | ||
702 | return x; | ||
703 | } | ||
704 | |||
705 | EXPORT_SYMBOL(jiffies_64_to_clock_t); | ||
706 | |||
707 | u64 nsec_to_clock_t(u64 x) | ||
708 | { | ||
709 | #if (NSEC_PER_SEC % USER_HZ) == 0 | ||
710 | do_div(x, (NSEC_PER_SEC / USER_HZ)); | ||
711 | #elif (USER_HZ % 512) == 0 | ||
712 | x *= USER_HZ/512; | ||
713 | do_div(x, (NSEC_PER_SEC / 512)); | ||
714 | #else | ||
715 | /* | ||
716 | * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, | ||
717 | * overflow after 64.99 years. | ||
718 | * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... | ||
719 | */ | ||
720 | x *= 9; | ||
721 | do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) / | ||
722 | USER_HZ)); | ||
723 | #endif | ||
724 | return x; | ||
725 | } | ||
726 | |||
473 | #if (BITS_PER_LONG < 64) | 727 | #if (BITS_PER_LONG < 64) |
474 | u64 get_jiffies_64(void) | 728 | u64 get_jiffies_64(void) |
475 | { | 729 | { |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig new file mode 100644 index 000000000000..f66351126544 --- /dev/null +++ b/kernel/time/Kconfig | |||
@@ -0,0 +1,25 @@ | |||
1 | # | ||
2 | # Timer subsystem related configuration options | ||
3 | # | ||
4 | config TICK_ONESHOT | ||
5 | bool | ||
6 | default n | ||
7 | |||
8 | config NO_HZ | ||
9 | bool "Tickless System (Dynamic Ticks)" | ||
10 | depends on GENERIC_TIME && GENERIC_CLOCKEVENTS | ||
11 | select TICK_ONESHOT | ||
12 | help | ||
13 | This option enables a tickless system: timer interrupts will | ||
14 | only trigger on an as-needed basis both when the system is | ||
15 | busy and when the system is idle. | ||
16 | |||
17 | config HIGH_RES_TIMERS | ||
18 | bool "High Resolution Timer Support" | ||
19 | depends on GENERIC_TIME && GENERIC_CLOCKEVENTS | ||
20 | select TICK_ONESHOT | ||
21 | help | ||
22 | This option enables high resolution timer support. If your | ||
23 | hardware is not capable, this option only increases | ||
24 | the size of the kernel image. | ||
25 | |||
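For reference, a .config fragment with the new options enabled might look like the sketch below; it assumes an architecture that already provides GENERIC_TIME and GENERIC_CLOCKEVENTS:

	CONFIG_GENERIC_TIME=y
	CONFIG_GENERIC_CLOCKEVENTS=y
	CONFIG_TICK_ONESHOT=y
	CONFIG_NO_HZ=y
	CONFIG_HIGH_RES_TIMERS=y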
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 61a3907d16fb..93bccba1f265 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1 +1,8 @@ | |||
1 | obj-y += ntp.o clocksource.o jiffies.o | 1 | obj-y += ntp.o clocksource.o jiffies.o timer_list.o |
2 | |||
3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o | ||
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | ||
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o | ||
6 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | ||
7 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | ||
8 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | ||
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c new file mode 100644 index 000000000000..67932ea78c17 --- /dev/null +++ b/kernel/time/clockevents.c | |||
@@ -0,0 +1,345 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/clockevents.c | ||
3 | * | ||
4 | * This file contains functions which manage clock event devices. | ||
5 | * | ||
6 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | ||
7 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | ||
8 | * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner | ||
9 | * | ||
10 | * This code is licensed under the GPL version 2. For details see | ||
11 | * kernel-base/COPYING. | ||
12 | */ | ||
13 | |||
14 | #include <linux/clockchips.h> | ||
15 | #include <linux/hrtimer.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/notifier.h> | ||
19 | #include <linux/smp.h> | ||
20 | #include <linux/sysdev.h> | ||
21 | |||
22 | /* The registered clock event devices */ | ||
23 | static LIST_HEAD(clockevent_devices); | ||
24 | static LIST_HEAD(clockevents_released); | ||
25 | |||
26 | /* Notification for clock events */ | ||
27 | static RAW_NOTIFIER_HEAD(clockevents_chain); | ||
28 | |||
29 | /* Protection for the above */ | ||
30 | static DEFINE_SPINLOCK(clockevents_lock); | ||
31 | |||
32 | /** | ||
33 | * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds | ||
34 | * @latch: value to convert | ||
35 | * @evt: pointer to clock event device descriptor | ||
36 | * | ||
37 | * Math helper: returns the latch value converted to nanoseconds (bounds-checked) | ||
38 | */ | ||
39 | unsigned long clockevent_delta2ns(unsigned long latch, | ||
40 | struct clock_event_device *evt) | ||
41 | { | ||
42 | u64 clc = ((u64) latch << evt->shift); | ||
43 | |||
44 | do_div(clc, evt->mult); | ||
45 | if (clc < 1000) | ||
46 | clc = 1000; | ||
47 | if (clc > LONG_MAX) | ||
48 | clc = LONG_MAX; | ||
49 | |||
50 | return (unsigned long) clc; | ||
51 | } | ||
52 | |||
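To make the mult/shift arithmetic concrete, a rough worked example (the numbers are illustrative, not taken from this patch): an i8253 PIT running at 1,193,182 Hz with shift = 32 gets mult = (1193182 << 32) / 10^9 ≈ 5,124,677. The maximum 16-bit latch of 0xffff then converts to (65535 << 32) / 5124677 ≈ 54,925,000 ns — the familiar ~54.9 ms PIT ceiling. The clamps guard against latch values that would convert to less than 1 us or to more than LONG_MAX ns.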
53 | /** | ||
54 | * clockevents_set_mode - set the operating mode of a clock event device | ||
55 | * @dev: device to modify | ||
56 | * @mode: new mode | ||
57 | * | ||
58 | * Must be called with interrupts disabled! | ||
59 | */ | ||
60 | void clockevents_set_mode(struct clock_event_device *dev, | ||
61 | enum clock_event_mode mode) | ||
62 | { | ||
63 | if (dev->mode != mode) { | ||
64 | dev->set_mode(mode, dev); | ||
65 | dev->mode = mode; | ||
66 | } | ||
67 | } | ||
68 | |||
69 | /** | ||
70 | * clockevents_program_event - Reprogram the clock event device. | ||
71 | * @expires: absolute expiry time (monotonic clock) | ||
72 | * | ||
73 | * Returns 0 on success, -ETIME when the event is in the past. | ||
74 | */ | ||
75 | int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | ||
76 | ktime_t now) | ||
77 | { | ||
78 | unsigned long long clc; | ||
79 | int64_t delta; | ||
80 | |||
81 | delta = ktime_to_ns(ktime_sub(expires, now)); | ||
82 | |||
83 | if (delta <= 0) | ||
84 | return -ETIME; | ||
85 | |||
86 | dev->next_event = expires; | ||
87 | |||
88 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | ||
89 | return 0; | ||
90 | |||
91 | if (delta > dev->max_delta_ns) | ||
92 | delta = dev->max_delta_ns; | ||
93 | if (delta < dev->min_delta_ns) | ||
94 | delta = dev->min_delta_ns; | ||
95 | |||
96 | clc = delta * dev->mult; | ||
97 | clc >>= dev->shift; | ||
98 | |||
99 | return dev->set_next_event((unsigned long) clc, dev); | ||
100 | } | ||
101 | |||
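The clc computation is the inverse of clockevent_delta2ns() above: device ticks = delta * mult >> shift. Continuing the illustrative PIT numbers, a 1 ms event (delta = 1,000,000 ns) programs 1000000 * 5124677 >> 32 ≈ 1193 PIT ticks.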
102 | /** | ||
103 | * clockevents_register_notifier - register a clock events change listener | ||
104 | */ | ||
105 | int clockevents_register_notifier(struct notifier_block *nb) | ||
106 | { | ||
107 | int ret; | ||
108 | |||
109 | spin_lock(&clockevents_lock); | ||
110 | ret = raw_notifier_chain_register(&clockevents_chain, nb); | ||
111 | spin_unlock(&clockevents_lock); | ||
112 | |||
113 | return ret; | ||
114 | } | ||
115 | |||
116 | /** | ||
117 | * clockevents_unregister_notifier - unregister a clock events change listener | ||
118 | */ | ||
119 | void clockevents_unregister_notifier(struct notifier_block *nb) | ||
120 | { | ||
121 | spin_lock(&clockevents_lock); | ||
122 | raw_notifier_chain_unregister(&clockevents_chain, nb); | ||
123 | spin_unlock(&clockevents_lock); | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Notify about a clock event change. Called with clockevents_lock | ||
128 | * held. | ||
129 | */ | ||
130 | static void clockevents_do_notify(unsigned long reason, void *dev) | ||
131 | { | ||
132 | raw_notifier_call_chain(&clockevents_chain, reason, dev); | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * Called after a notify add to make devices available which were | ||
137 | * released from the notifier call. | ||
138 | */ | ||
139 | static void clockevents_notify_released(void) | ||
140 | { | ||
141 | struct clock_event_device *dev; | ||
142 | |||
143 | while (!list_empty(&clockevents_released)) { | ||
144 | dev = list_entry(clockevents_released.next, | ||
145 | struct clock_event_device, list); | ||
146 | list_del(&dev->list); | ||
147 | list_add(&dev->list, &clockevent_devices); | ||
148 | clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); | ||
149 | } | ||
150 | } | ||
151 | |||
152 | /** | ||
153 | * clockevents_register_device - register a clock event device | ||
154 | * @dev: device to register | ||
155 | */ | ||
156 | void clockevents_register_device(struct clock_event_device *dev) | ||
157 | { | ||
158 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
159 | |||
160 | spin_lock(&clockevents_lock); | ||
161 | |||
162 | list_add(&dev->list, &clockevent_devices); | ||
163 | clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); | ||
164 | clockevents_notify_released(); | ||
165 | |||
166 | spin_unlock(&clockevents_lock); | ||
167 | } | ||
168 | |||
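A minimal registration sketch from a hypothetical timer driver (all my_* names are illustrative; mult, the delta bounds and the cpumask must be filled in before registering). Note that a static, zero-initialized device starts out in CLOCK_EVT_MODE_UNUSED, which is exactly what the BUG_ON above demands:

	#include <linux/clockchips.h>

	static void my_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
	{
		/* program the hardware for periodic/oneshot/shutdown */
	}

	static int my_set_next_event(unsigned long cycles,
				     struct clock_event_device *evt)
	{
		/* arm the hardware to fire after 'cycles' device ticks */
		return 0;
	}

	static struct clock_event_device my_evt = {
		.name		= "my_timer",
		.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
		.shift		= 32,
		.rating		= 300,
		.set_mode	= my_set_mode,
		.set_next_event	= my_set_next_event,
	};

	/* in the driver init path, with my_clock_freq in Hz (hypothetical): */
	my_evt.mult		= div_sc(my_clock_freq, NSEC_PER_SEC, my_evt.shift);
	my_evt.max_delta_ns	= clockevent_delta2ns(0x7fffffff, &my_evt);
	my_evt.min_delta_ns	= clockevent_delta2ns(2, &my_evt);
	my_evt.cpumask		= cpumask_of_cpu(smp_processor_id());
	clockevents_register_device(&my_evt);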
169 | /* | ||
170 | * Noop handler when we shut down an event device | ||
171 | */ | ||
172 | static void clockevents_handle_noop(struct clock_event_device *dev) | ||
173 | { | ||
174 | } | ||
175 | |||
176 | /** | ||
177 | * clockevents_exchange_device - release and request clock devices | ||
178 | * @old: device to release (can be NULL) | ||
179 | * @new: device to request (can be NULL) | ||
180 | * | ||
181 | * Called from the notifier chain. clockevents_lock is held already | ||
182 | */ | ||
183 | void clockevents_exchange_device(struct clock_event_device *old, | ||
184 | struct clock_event_device *new) | ||
185 | { | ||
186 | unsigned long flags; | ||
187 | |||
188 | local_irq_save(flags); | ||
189 | /* | ||
190 | * Caller releases a clock event device. We queue it into the | ||
191 | * released list and do a notify add later. | ||
192 | */ | ||
193 | if (old) { | ||
194 | old->event_handler = clockevents_handle_noop; | ||
195 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); | ||
196 | list_del(&old->list); | ||
197 | list_add(&old->list, &clockevents_released); | ||
198 | } | ||
199 | |||
200 | if (new) { | ||
201 | BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); | ||
202 | clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); | ||
203 | } | ||
204 | local_irq_restore(flags); | ||
205 | } | ||
206 | |||
207 | /** | ||
208 | * clockevents_request_device - request a device matching the given features and cpumask | ||
209 | */ | ||
210 | struct clock_event_device *clockevents_request_device(unsigned int features, | ||
211 | cpumask_t cpumask) | ||
212 | { | ||
213 | struct clock_event_device *cur, *dev = NULL; | ||
214 | struct list_head *tmp; | ||
215 | |||
216 | spin_lock(&clockevents_lock); | ||
217 | |||
218 | list_for_each(tmp, &clockevent_devices) { | ||
219 | cur = list_entry(tmp, struct clock_event_device, list); | ||
220 | |||
221 | if ((cur->features & features) == features && | ||
222 | cpus_equal(cpumask, cur->cpumask)) { | ||
223 | if (!dev || dev->rating < cur->rating) | ||
224 | dev = cur; | ||
225 | } | ||
226 | } | ||
227 | |||
228 | clockevents_exchange_device(NULL, dev); | ||
229 | |||
230 | spin_unlock(&clockevents_lock); | ||
231 | |||
232 | return dev; | ||
233 | } | ||
234 | |||
235 | /** | ||
236 | * clockevents_release_device - release a previously requested clock event device | ||
237 | */ | ||
238 | void clockevents_release_device(struct clock_event_device *dev) | ||
239 | { | ||
240 | spin_lock(&clockevents_lock); | ||
241 | |||
242 | clockevents_exchange_device(dev, NULL); | ||
243 | clockevents_notify_released(); | ||
244 | |||
245 | spin_unlock(&clockevents_lock); | ||
246 | } | ||
247 | |||
248 | /** | ||
249 | * clockevents_notify - notification about relevant events | ||
250 | */ | ||
251 | void clockevents_notify(unsigned long reason, void *arg) | ||
252 | { | ||
253 | spin_lock(&clockevents_lock); | ||
254 | clockevents_do_notify(reason, arg); | ||
255 | |||
256 | switch (reason) { | ||
257 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | ||
258 | /* | ||
259 | * Unregister the clock event devices which were | ||
260 | * released from the users in the notify chain. | ||
261 | */ | ||
262 | while (!list_empty(&clockevents_released)) { | ||
263 | struct clock_event_device *dev; | ||
264 | |||
265 | dev = list_entry(clockevents_released.next, | ||
266 | struct clock_event_device, list); | ||
267 | list_del(&dev->list); | ||
268 | } | ||
269 | break; | ||
270 | default: | ||
271 | break; | ||
272 | } | ||
273 | spin_unlock(&clockevents_lock); | ||
274 | } | ||
275 | EXPORT_SYMBOL_GPL(clockevents_notify); | ||
276 | |||
277 | #ifdef CONFIG_SYSFS | ||
278 | |||
279 | /** | ||
280 | * clockevents_show_registered - sysfs interface for listing clockevents | ||
281 | * @dev: unused | ||
282 | * @buf: char buffer to be filled with clock events list | ||
283 | * | ||
284 | * Provides sysfs interface for listing registered clock event devices | ||
285 | */ | ||
286 | static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf) | ||
287 | { | ||
288 | struct list_head *tmp; | ||
289 | char *p = buf; | ||
290 | int cpu; | ||
291 | |||
292 | spin_lock(&clockevents_lock); | ||
293 | |||
294 | list_for_each(tmp, &clockevent_devices) { | ||
295 | struct clock_event_device *ce; | ||
296 | |||
297 | ce = list_entry(tmp, struct clock_event_device, list); | ||
298 | p += sprintf(p, "%-20s F:%04x M:%d", ce->name, | ||
299 | ce->features, ce->mode); | ||
300 | p += sprintf(p, " C:"); | ||
301 | if (!cpus_equal(ce->cpumask, cpu_possible_map)) { | ||
302 | for_each_cpu_mask(cpu, ce->cpumask) | ||
303 | p += sprintf(p, " %d", cpu); | ||
304 | } else { | ||
305 | /* | ||
306 | * FIXME: Add the cpu which is handling this sucker | ||
307 | */ | ||
308 | } | ||
309 | p += sprintf(p, "\n"); | ||
310 | } | ||
311 | |||
312 | spin_unlock(&clockevents_lock); | ||
313 | |||
314 | return p - buf; | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * Sysfs setup bits: | ||
319 | */ | ||
320 | static SYSDEV_ATTR(registered, 0600, | ||
321 | clockevents_show_registered, NULL); | ||
322 | |||
323 | static struct sysdev_class clockevents_sysclass = { | ||
324 | set_kset_name("clockevents"), | ||
325 | }; | ||
326 | |||
327 | static struct sys_device clockevents_sys_device = { | ||
328 | .id = 0, | ||
329 | .cls = &clockevents_sysclass, | ||
330 | }; | ||
331 | |||
332 | static int __init clockevents_sysfs_init(void) | ||
333 | { | ||
334 | int error = sysdev_class_register(&clockevents_sysclass); | ||
335 | |||
336 | if (!error) | ||
337 | error = sysdev_register(&clockevents_sys_device); | ||
338 | if (!error) | ||
339 | error = sysdev_create_file( | ||
340 | &clockevents_sys_device, | ||
341 | &attr_registered); | ||
342 | return error; | ||
343 | } | ||
344 | device_initcall(clockevents_sysfs_init); | ||
345 | #endif | ||
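With the sysfs bits compiled in, reading the new attribute (under the sysdev naming scheme: /sys/devices/system/clockevents/clockevents0/registered) emits one line per device in the sprintf format above; illustrative output for a PIT bound to CPU 0 might be:

	pit                  F:0003 M:3 C: 0

where F holds the feature bits, M the current mode, and C the CPUs the device is bound to.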
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 22504afc0d34..193a0793af95 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -28,6 +28,8 @@ | |||
28 | #include <linux/sysdev.h> | 28 | #include <linux/sysdev.h> |
29 | #include <linux/init.h> | 29 | #include <linux/init.h> |
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/sched.h> /* spin_unlock_irq() uses preempt_count() on m68k */ | ||
32 | #include <linux/tick.h> | ||
31 | 33 | ||
32 | /* XXX - Would like a better way for initializing curr_clocksource */ | 34 | /* XXX - Would like a better way for initializing curr_clocksource */ |
33 | extern struct clocksource clocksource_jiffies; | 35 | extern struct clocksource clocksource_jiffies; |
@@ -47,6 +49,7 @@ extern struct clocksource clocksource_jiffies; | |||
47 | */ | 49 | */ |
48 | static struct clocksource *curr_clocksource = &clocksource_jiffies; | 50 | static struct clocksource *curr_clocksource = &clocksource_jiffies; |
49 | static struct clocksource *next_clocksource; | 51 | static struct clocksource *next_clocksource; |
52 | static struct clocksource *clocksource_override; | ||
50 | static LIST_HEAD(clocksource_list); | 53 | static LIST_HEAD(clocksource_list); |
51 | static DEFINE_SPINLOCK(clocksource_lock); | 54 | static DEFINE_SPINLOCK(clocksource_lock); |
52 | static char override_name[32]; | 55 | static char override_name[32]; |
@@ -61,9 +64,123 @@ static int __init clocksource_done_booting(void) | |||
61 | finished_booting = 1; | 64 | finished_booting = 1; |
62 | return 0; | 65 | return 0; |
63 | } | 66 | } |
64 | |||
65 | late_initcall(clocksource_done_booting); | 67 | late_initcall(clocksource_done_booting); |
66 | 68 | ||
69 | #ifdef CONFIG_CLOCKSOURCE_WATCHDOG | ||
70 | static LIST_HEAD(watchdog_list); | ||
71 | static struct clocksource *watchdog; | ||
72 | static struct timer_list watchdog_timer; | ||
73 | static DEFINE_SPINLOCK(watchdog_lock); | ||
74 | static cycle_t watchdog_last; | ||
75 | /* | ||
76 | * Interval: 0.5 sec, threshold: 0.0625 sec | ||
77 | */ | ||
78 | #define WATCHDOG_INTERVAL (HZ >> 1) | ||
79 | #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) | ||
80 | |||
81 | static void clocksource_ratewd(struct clocksource *cs, int64_t delta) | ||
82 | { | ||
83 | if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) | ||
84 | return; | ||
85 | |||
86 | printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", | ||
87 | cs->name, delta); | ||
88 | cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); | ||
89 | clocksource_change_rating(cs, 0); | ||
90 | cs->flags &= ~CLOCK_SOURCE_WATCHDOG; | ||
91 | list_del(&cs->wd_list); | ||
92 | } | ||
93 | |||
94 | static void clocksource_watchdog(unsigned long data) | ||
95 | { | ||
96 | struct clocksource *cs, *tmp; | ||
97 | cycle_t csnow, wdnow; | ||
98 | int64_t wd_nsec, cs_nsec; | ||
99 | |||
100 | spin_lock(&watchdog_lock); | ||
101 | |||
102 | wdnow = watchdog->read(); | ||
103 | wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); | ||
104 | watchdog_last = wdnow; | ||
105 | |||
106 | list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { | ||
107 | csnow = cs->read(); | ||
108 | /* Initialized? */ | ||
109 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { | ||
110 | if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && | ||
111 | (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { | ||
112 | cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; | ||
113 | /* | ||
114 | * We just marked the clocksource as | ||
115 | * highres-capable, notify the rest of the | ||
116 | * system as well so that we transition | ||
117 | * into high-res mode: | ||
118 | */ | ||
119 | tick_clock_notify(); | ||
120 | } | ||
121 | cs->flags |= CLOCK_SOURCE_WATCHDOG; | ||
122 | cs->wd_last = csnow; | ||
123 | } else { | ||
124 | cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); | ||
125 | cs->wd_last = csnow; | ||
126 | /* Check the delta. Might remove it from the list! */ | ||
127 | clocksource_ratewd(cs, cs_nsec - wd_nsec); | ||
128 | } | ||
129 | } | ||
130 | |||
131 | if (!list_empty(&watchdog_list)) { | ||
132 | __mod_timer(&watchdog_timer, | ||
133 | watchdog_timer.expires + WATCHDOG_INTERVAL); | ||
134 | } | ||
135 | spin_unlock(&watchdog_lock); | ||
136 | } | ||
137 | static void clocksource_check_watchdog(struct clocksource *cs) | ||
138 | { | ||
139 | struct clocksource *cse; | ||
140 | unsigned long flags; | ||
141 | |||
142 | spin_lock_irqsave(&watchdog_lock, flags); | ||
143 | if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { | ||
144 | int started = !list_empty(&watchdog_list); | ||
145 | |||
146 | list_add(&cs->wd_list, &watchdog_list); | ||
147 | if (!started && watchdog) { | ||
148 | watchdog_last = watchdog->read(); | ||
149 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | ||
150 | add_timer(&watchdog_timer); | ||
151 | } | ||
152 | } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) { | ||
153 | cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; | ||
154 | |||
155 | if (!watchdog || cs->rating > watchdog->rating) { | ||
156 | if (watchdog) | ||
157 | del_timer(&watchdog_timer); | ||
158 | watchdog = cs; | ||
159 | init_timer(&watchdog_timer); | ||
160 | watchdog_timer.function = clocksource_watchdog; | ||
161 | |||
162 | /* Reset watchdog cycles */ | ||
163 | list_for_each_entry(cse, &watchdog_list, wd_list) | ||
164 | cse->flags &= ~CLOCK_SOURCE_WATCHDOG; | ||
165 | /* Start if list is not empty */ | ||
166 | if (!list_empty(&watchdog_list)) { | ||
167 | watchdog_last = watchdog->read(); | ||
168 | watchdog_timer.expires = | ||
169 | jiffies + WATCHDOG_INTERVAL; | ||
170 | add_timer(&watchdog_timer); | ||
171 | } | ||
172 | } | ||
173 | } | ||
174 | spin_unlock_irqrestore(&watchdog_lock, flags); | ||
175 | } | ||
176 | #else | ||
177 | static void clocksource_check_watchdog(struct clocksource *cs) | ||
178 | { | ||
179 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) | ||
180 | cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; | ||
181 | } | ||
182 | #endif | ||
183 | |||
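Plugging in numbers (assuming HZ=250): WATCHDOG_INTERVAL = 250 >> 1 = 125 jiffies, i.e. a check every 0.5 s, and WATCHDOG_THRESHOLD = 10^9 >> 4 = 62,500,000 ns = 62.5 ms. A clocksource whose half-second interval disagrees with the watchdog's by more than 62.5 ms (12.5%) is demoted to rating 0 and loses its high-resolution validity.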
67 | /** | 184 | /** |
68 | * clocksource_get_next - Returns the selected clocksource | 185 | * clocksource_get_next - Returns the selected clocksource |
69 | * | 186 | * |
@@ -83,60 +200,54 @@ struct clocksource *clocksource_get_next(void) | |||
83 | } | 200 | } |
84 | 201 | ||
85 | /** | 202 | /** |
86 | * select_clocksource - Finds the best registered clocksource. | 203 | * select_clocksource - Selects the best registered clocksource. |
87 | * | 204 | * |
88 | * Private function. Must hold clocksource_lock when called. | 205 | * Private function. Must hold clocksource_lock when called. |
89 | * | 206 | * |
90 | * Looks through the list of registered clocksources, returning | 207 | * Select the clocksource with the best rating, or the clocksource |
91 | * the one with the highest rating value. If there is a clocksource | 208 | * which is selected by the userspace override. |
92 | * name that matches the override string, it returns that clocksource. | ||
93 | */ | 209 | */ |
94 | static struct clocksource *select_clocksource(void) | 210 | static struct clocksource *select_clocksource(void) |
95 | { | 211 | { |
96 | struct clocksource *best = NULL; | 212 | struct clocksource *next; |
97 | struct list_head *tmp; | ||
98 | 213 | ||
99 | list_for_each(tmp, &clocksource_list) { | 214 | if (list_empty(&clocksource_list)) |
100 | struct clocksource *src; | 215 | return NULL; |
101 | 216 | ||
102 | src = list_entry(tmp, struct clocksource, list); | 217 | if (clocksource_override) |
103 | if (!best) | 218 | next = clocksource_override; |
104 | best = src; | 219 | else |
105 | 220 | next = list_entry(clocksource_list.next, struct clocksource, | |
106 | /* check for override: */ | 221 | list); |
107 | if (strlen(src->name) == strlen(override_name) && | 222 | |
108 | !strcmp(src->name, override_name)) { | 223 | if (next == curr_clocksource) |
109 | best = src; | 224 | return NULL; |
110 | break; | ||
111 | } | ||
112 | /* pick the highest rating: */ | ||
113 | if (src->rating > best->rating) | ||
114 | best = src; | ||
115 | } | ||
116 | 225 | ||
117 | return best; | 226 | return next; |
118 | } | 227 | } |
119 | 228 | ||
120 | /** | 229 | /* |
121 | * is_registered_source - Checks if clocksource is registered | 230 | * Enqueue the clocksource sorted by rating |
122 | * @c: pointer to a clocksource | ||
123 | * | ||
124 | * Private helper function. Must hold clocksource_lock when called. | ||
125 | * | ||
126 | * Returns one if the clocksource is already registered, zero otherwise. | ||
127 | */ | 231 | */ |
128 | static int is_registered_source(struct clocksource *c) | 232 | static int clocksource_enqueue(struct clocksource *c) |
129 | { | 233 | { |
130 | int len = strlen(c->name); | 234 | struct list_head *tmp, *entry = &clocksource_list; |
131 | struct list_head *tmp; | ||
132 | 235 | ||
133 | list_for_each(tmp, &clocksource_list) { | 236 | list_for_each(tmp, &clocksource_list) { |
134 | struct clocksource *src; | 237 | struct clocksource *cs; |
135 | 238 | ||
136 | src = list_entry(tmp, struct clocksource, list); | 239 | cs = list_entry(tmp, struct clocksource, list); |
137 | if (strlen(src->name) == len && !strcmp(src->name, c->name)) | 240 | if (cs == c) |
138 | return 1; | 241 | return -EBUSY; |
242 | /* Keep track of where to insert */ | ||
243 | if (cs->rating >= c->rating) | ||
244 | entry = tmp; | ||
139 | } | 245 | } |
246 | list_add(&c->list, entry); | ||
247 | |||
248 | if (strlen(c->name) == strlen(override_name) && | ||
249 | !strcmp(c->name, override_name)) | ||
250 | clocksource_override = c; | ||
140 | 251 | ||
141 | return 0; | 252 | return 0; |
142 | } | 253 | } |
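The enqueue keeps the list sorted by descending rating. A worked example: with registered ratings [400, 300, 100], inserting a clocksource rated 250 walks past the 400 and 300 entries (both >= 250), so 'entry' ends up pointing at the 300 node and list_add() places the newcomer right after it, giving [400, 300, 250, 100]. select_clocksource() can then simply take the list head whenever no override is active.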
@@ -149,42 +260,35 @@ static int is_registered_source(struct clocksource *c) | |||
149 | */ | 260 | */ |
150 | int clocksource_register(struct clocksource *c) | 261 | int clocksource_register(struct clocksource *c) |
151 | { | 262 | { |
152 | int ret = 0; | ||
153 | unsigned long flags; | 263 | unsigned long flags; |
264 | int ret; | ||
154 | 265 | ||
155 | spin_lock_irqsave(&clocksource_lock, flags); | 266 | spin_lock_irqsave(&clocksource_lock, flags); |
156 | /* check if clocksource is already registered */ | 267 | ret = clocksource_enqueue(c); |
157 | if (is_registered_source(c)) { | 268 | if (!ret) |
158 | printk("register_clocksource: Cannot register %s. " | ||
159 | "Already registered!", c->name); | ||
160 | ret = -EBUSY; | ||
161 | } else { | ||
162 | /* register it */ | ||
163 | list_add(&c->list, &clocksource_list); | ||
164 | /* scan the registered clocksources, and pick the best one */ | ||
165 | next_clocksource = select_clocksource(); | 269 | next_clocksource = select_clocksource(); |
166 | } | ||
167 | spin_unlock_irqrestore(&clocksource_lock, flags); | 270 | spin_unlock_irqrestore(&clocksource_lock, flags); |
271 | if (!ret) | ||
272 | clocksource_check_watchdog(c); | ||
168 | return ret; | 273 | return ret; |
169 | } | 274 | } |
170 | EXPORT_SYMBOL(clocksource_register); | 275 | EXPORT_SYMBOL(clocksource_register); |
171 | 276 | ||
172 | /** | 277 | /** |
173 | * clocksource_reselect - Rescan list for next clocksource | 278 | * clocksource_change_rating - Change the rating of a registered clocksource |
174 | * | 279 | * |
175 | * A quick helper function to be used if a clocksource changes its | ||
176 | * rating. Forces the clocksource list to be re-scanned for the best | ||
177 | * clocksource. | ||
178 | */ | 280 | */ |
179 | void clocksource_reselect(void) | 281 | void clocksource_change_rating(struct clocksource *cs, int rating) |
180 | { | 282 | { |
181 | unsigned long flags; | 283 | unsigned long flags; |
182 | 284 | ||
183 | spin_lock_irqsave(&clocksource_lock, flags); | 285 | spin_lock_irqsave(&clocksource_lock, flags); |
286 | list_del(&cs->list); | ||
287 | cs->rating = rating; | ||
288 | clocksource_enqueue(cs); | ||
184 | next_clocksource = select_clocksource(); | 289 | next_clocksource = select_clocksource(); |
185 | spin_unlock_irqrestore(&clocksource_lock, flags); | 290 | spin_unlock_irqrestore(&clocksource_lock, flags); |
186 | } | 291 | } |
187 | EXPORT_SYMBOL(clocksource_reselect); | ||
188 | 292 | ||
189 | #ifdef CONFIG_SYSFS | 293 | #ifdef CONFIG_SYSFS |
190 | /** | 294 | /** |
@@ -220,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf) | |||
220 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, | 324 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, |
221 | const char *buf, size_t count) | 325 | const char *buf, size_t count) |
222 | { | 326 | { |
327 | struct clocksource *ovr = NULL; | ||
328 | struct list_head *tmp; | ||
223 | size_t ret = count; | 329 | size_t ret = count; |
330 | int len; | ||
331 | |||
224 | /* strings from sysfs write are not 0 terminated! */ | 332 | /* strings from sysfs write are not 0 terminated! */ |
225 | if (count >= sizeof(override_name)) | 333 | if (count >= sizeof(override_name)) |
226 | return -EINVAL; | 334 | return -EINVAL; |
@@ -228,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, | |||
228 | /* strip off \n: */ | 336 | /* strip off \n: */ |
229 | if (buf[count-1] == '\n') | 337 | if (buf[count-1] == '\n') |
230 | count--; | 338 | count--; |
231 | if (count < 1) | ||
232 | return -EINVAL; | ||
233 | 339 | ||
234 | spin_lock_irq(&clocksource_lock); | 340 | spin_lock_irq(&clocksource_lock); |
235 | 341 | ||
236 | /* copy the name given: */ | 342 | if (count > 0) |
237 | memcpy(override_name, buf, count); | 343 | memcpy(override_name, buf, count); |
238 | override_name[count] = 0; | 344 | override_name[count] = 0; |
239 | 345 | ||
240 | /* try to select it: */ | 346 | len = strlen(override_name); |
241 | next_clocksource = select_clocksource(); | 347 | if (len) { |
348 | ovr = clocksource_override; | ||
349 | /* try to select it: */ | ||
350 | list_for_each(tmp, &clocksource_list) { | ||
351 | struct clocksource *cs; | ||
352 | |||
353 | cs = list_entry(tmp, struct clocksource, list); | ||
354 | if (strlen(cs->name) == len && | ||
355 | !strcmp(cs->name, override_name)) | ||
356 | ovr = cs; | ||
357 | } | ||
358 | } | ||
359 | |||
360 | /* Reselect when the override name has changed */ | ||
361 | if (ovr != clocksource_override) { | ||
362 | clocksource_override = ovr; | ||
363 | next_clocksource = select_clocksource(); | ||
364 | } | ||
242 | 365 | ||
243 | spin_unlock_irq(&clocksource_lock); | 366 | spin_unlock_irq(&clocksource_lock); |
244 | 367 | ||
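This override is what userspace drives through sysfs; an illustrative shell session (the available clocksource names depend on the hardware and architecture):

	# cat /sys/devices/system/clocksource/clocksource0/current_clocksource
	tsc
	# echo acpi_pm > /sys/devices/system/clocksource/clocksource0/current_clocksource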
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a99b2a6e6a07..3be8da8fed7e 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = { | |||
62 | .mask = 0xffffffff, /*32bits*/ | 62 | .mask = 0xffffffff, /*32bits*/ |
63 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 63 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ |
64 | .shift = JIFFIES_SHIFT, | 64 | .shift = JIFFIES_SHIFT, |
65 | .is_continuous = 0, /* tick based, not free running */ | ||
66 | }; | 65 | }; |
67 | 66 | ||
68 | static int __init init_jiffies_clocksource(void) | 67 | static int __init init_jiffies_clocksource(void) |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3afeaa3a73f9..eb12509e00bd 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base; | |||
24 | 24 | ||
25 | #define MAX_TICKADJ 500 /* microsecs */ | 25 | #define MAX_TICKADJ 500 /* microsecs */ |
26 | #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ | 26 | #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ |
27 | TICK_LENGTH_SHIFT) / HZ) | 27 | TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) |
28 | 28 | ||
29 | /* | 29 | /* |
30 | * phase-lock loop variables | 30 | * phase-lock loop variables |
@@ -46,13 +46,17 @@ long time_adjust; | |||
46 | 46 | ||
47 | static void ntp_update_frequency(void) | 47 | static void ntp_update_frequency(void) |
48 | { | 48 | { |
49 | tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; | 49 | u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) |
50 | tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; | 50 | << TICK_LENGTH_SHIFT; |
51 | tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); | 51 | second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; |
52 | second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); | ||
52 | 53 | ||
53 | do_div(tick_length_base, HZ); | 54 | tick_length_base = second_length; |
54 | 55 | ||
55 | tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; | 56 | do_div(second_length, HZ); |
57 | tick_nsec = second_length >> TICK_LENGTH_SHIFT; | ||
58 | |||
59 | do_div(tick_length_base, NTP_INTERVAL_FREQ); | ||
56 | } | 60 | } |
57 | 61 | ||
58 | /** | 62 | /** |
@@ -162,7 +166,7 @@ void second_overflow(void) | |||
162 | tick_length -= MAX_TICKADJ_SCALED; | 166 | tick_length -= MAX_TICKADJ_SCALED; |
163 | } else { | 167 | } else { |
164 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / | 168 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / |
165 | HZ) << TICK_LENGTH_SHIFT; | 169 | NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; |
166 | time_adjust = 0; | 170 | time_adjust = 0; |
167 | } | 171 | } |
168 | } | 172 | } |
@@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc) | |||
239 | result = -EINVAL; | 243 | result = -EINVAL; |
240 | goto leave; | 244 | goto leave; |
241 | } | 245 | } |
242 | time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); | 246 | time_freq = ((s64)txc->freq * NSEC_PER_USEC) |
247 | >> (SHIFT_USEC - SHIFT_NSEC); | ||
243 | } | 248 | } |
244 | 249 | ||
245 | if (txc->modes & ADJ_MAXERROR) { | 250 | if (txc->modes & ADJ_MAXERROR) { |
@@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc) | |||
309 | freq_adj += time_freq; | 314 | freq_adj += time_freq; |
310 | freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); | 315 | freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); |
311 | time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); | 316 | time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); |
312 | time_offset = (time_offset / HZ) << SHIFT_UPDATE; | 317 | time_offset = (time_offset / NTP_INTERVAL_FREQ) |
318 | << SHIFT_UPDATE; | ||
313 | } /* STA_PLL */ | 319 | } /* STA_PLL */ |
314 | } /* txc->modes & ADJ_OFFSET */ | 320 | } /* txc->modes & ADJ_OFFSET */ |
315 | if (txc->modes & ADJ_TICK) | 321 | if (txc->modes & ADJ_TICK) |
@@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) | |||
324 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | 330 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) |
325 | txc->offset = save_adjust; | 331 | txc->offset = save_adjust; |
326 | else | 332 | else |
327 | txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; | 333 | txc->offset = shift_right(time_offset, SHIFT_UPDATE) |
328 | txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); | 334 | * NTP_INTERVAL_FREQ / 1000; |
335 | txc->freq = (time_freq / NSEC_PER_USEC) | ||
336 | << (SHIFT_USEC - SHIFT_NSEC); | ||
329 | txc->maxerror = time_maxerror; | 337 | txc->maxerror = time_maxerror; |
330 | txc->esterror = time_esterror; | 338 | txc->esterror = time_esterror; |
331 | txc->status = time_status; | 339 | txc->status = time_status; |
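The switch from HZ to NTP_INTERVAL_FREQ decouples NTP accumulation from the tick rate, since a tickless kernel no longer runs the timer interrupt once per jiffy. A worked check with USER_HZ=100 and tick_usec=10000: the base second length is 10000 * 1000 * 100 = 10^9 ns (carried at TICK_LENGTH_SHIFT precision) plus the tick and frequency corrections; tick_nsec becomes that divided by HZ, while tick_length_base is the same per-second value divided by NTP_INTERVAL_FREQ — so each NTP interval accumulates exactly its share of the adjusted second, whatever HZ is.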
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c new file mode 100644 index 000000000000..12b3efeb9f6f --- /dev/null +++ b/kernel/time/tick-broadcast.c | |||
@@ -0,0 +1,480 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/tick-broadcast.c | ||
3 | * | ||
4 | * This file contains functions which emulate a local clock-event | ||
5 | * device via a broadcast event source. | ||
6 | * | ||
7 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | ||
8 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | ||
9 | * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner | ||
10 | * | ||
11 | * This code is licensed under the GPL version 2. For details see | ||
12 | * kernel-base/COPYING. | ||
13 | */ | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <linux/hrtimer.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/percpu.h> | ||
19 | #include <linux/profile.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/tick.h> | ||
22 | |||
23 | #include "tick-internal.h" | ||
24 | |||
25 | /* | ||
26 | * Broadcast support for broken x86 hardware, where the local apic | ||
27 | * timer stops in C3 state. | ||
28 | */ | ||
29 | |||
30 | struct tick_device tick_broadcast_device; | ||
31 | static cpumask_t tick_broadcast_mask; | ||
32 | static DEFINE_SPINLOCK(tick_broadcast_lock); | ||
33 | |||
34 | /* | ||
35 | * Debugging: see timer_list.c | ||
36 | */ | ||
37 | struct tick_device *tick_get_broadcast_device(void) | ||
38 | { | ||
39 | return &tick_broadcast_device; | ||
40 | } | ||
41 | |||
42 | cpumask_t *tick_get_broadcast_mask(void) | ||
43 | { | ||
44 | return &tick_broadcast_mask; | ||
45 | } | ||
46 | |||
47 | /* | ||
48 | * Start the device in periodic mode | ||
49 | */ | ||
50 | static void tick_broadcast_start_periodic(struct clock_event_device *bc) | ||
51 | { | ||
52 | if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) | ||
53 | tick_setup_periodic(bc, 1); | ||
54 | } | ||
55 | |||
56 | /* | ||
57 | * Check if the device can be used as the broadcast device: | ||
58 | */ | ||
59 | int tick_check_broadcast_device(struct clock_event_device *dev) | ||
60 | { | ||
61 | if (tick_broadcast_device.evtdev || | ||
62 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
63 | return 0; | ||
64 | |||
65 | clockevents_exchange_device(NULL, dev); | ||
66 | tick_broadcast_device.evtdev = dev; | ||
67 | if (!cpus_empty(tick_broadcast_mask)) | ||
68 | tick_broadcast_start_periodic(dev); | ||
69 | return 1; | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Check if the device is the broadcast device | ||
74 | */ | ||
75 | int tick_is_broadcast_device(struct clock_event_device *dev) | ||
76 | { | ||
77 | return (dev && tick_broadcast_device.evtdev == dev); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Check if the device is dysfunctional and merely a placeholder which | ||
82 | * needs to be handled by the broadcast device. | ||
83 | */ | ||
84 | int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | ||
85 | { | ||
86 | unsigned long flags; | ||
87 | int ret = 0; | ||
88 | |||
89 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
90 | |||
91 | /* | ||
92 | * Devices might be registered with both periodic and oneshot | ||
93 | * mode disabled. This signals that the device needs to be | ||
94 | * operated from the broadcast device and is a placeholder for | ||
95 | * the cpu local device. | ||
96 | */ | ||
97 | if (!tick_device_is_functional(dev)) { | ||
98 | dev->event_handler = tick_handle_periodic; | ||
99 | cpu_set(cpu, tick_broadcast_mask); | ||
100 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); | ||
101 | ret = 1; | ||
102 | } | ||
103 | |||
104 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
105 | return ret; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Broadcast the event to the cpus, which are set in the mask | ||
110 | */ | ||
111 | int tick_do_broadcast(cpumask_t mask) | ||
112 | { | ||
113 | int ret = 0, cpu = smp_processor_id(); | ||
114 | struct tick_device *td; | ||
115 | |||
116 | /* | ||
117 | * Check if the current cpu is in the mask | ||
118 | */ | ||
119 | if (cpu_isset(cpu, mask)) { | ||
120 | cpu_clear(cpu, mask); | ||
121 | td = &per_cpu(tick_cpu_device, cpu); | ||
122 | td->evtdev->event_handler(td->evtdev); | ||
123 | ret = 1; | ||
124 | } | ||
125 | |||
126 | if (!cpus_empty(mask)) { | ||
127 | /* | ||
128 | * It might be necessary to actually check whether the devices | ||
129 | * have different broadcast functions. For now, just use the | ||
130 | * one of the first device. This works as long as we have this | ||
131 | * misfeature only on x86 (lapic) | ||
132 | */ | ||
133 | cpu = first_cpu(mask); | ||
134 | td = &per_cpu(tick_cpu_device, cpu); | ||
135 | td->evtdev->broadcast(mask); | ||
136 | ret = 1; | ||
137 | } | ||
138 | return ret; | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * Periodic broadcast: | ||
143 | * - invoke the broadcast handlers | ||
144 | */ | ||
145 | static void tick_do_periodic_broadcast(void) | ||
146 | { | ||
147 | cpumask_t mask; | ||
148 | |||
149 | spin_lock(&tick_broadcast_lock); | ||
150 | |||
151 | cpus_and(mask, cpu_online_map, tick_broadcast_mask); | ||
152 | tick_do_broadcast(mask); | ||
153 | |||
154 | spin_unlock(&tick_broadcast_lock); | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Event handler for periodic broadcast ticks | ||
159 | */ | ||
160 | static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | ||
161 | { | ||
162 | dev->next_event.tv64 = KTIME_MAX; | ||
163 | |||
164 | tick_do_periodic_broadcast(); | ||
165 | |||
166 | /* | ||
167 | * The device is in periodic mode. No reprogramming necessary: | ||
168 | */ | ||
169 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | ||
170 | return; | ||
171 | |||
172 | /* | ||
173 | * Setup the next period for devices, which do not have | ||
174 | * periodic mode: | ||
175 | */ | ||
176 | for (;;) { | ||
177 | ktime_t next = ktime_add(dev->next_event, tick_period); | ||
178 | |||
179 | if (!clockevents_program_event(dev, next, ktime_get())) | ||
180 | return; | ||
181 | tick_do_periodic_broadcast(); | ||
182 | } | ||
183 | } | ||
184 | |||
185 | /* | ||
186 | * Powerstate information: The system enters/leaves a state, where | ||
187 | * affected devices might stop | ||
188 | */ | ||
189 | static void tick_do_broadcast_on_off(void *why) | ||
190 | { | ||
191 | struct clock_event_device *bc, *dev; | ||
192 | struct tick_device *td; | ||
193 | unsigned long flags, *reason = why; | ||
194 | int cpu; | ||
195 | |||
196 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
197 | |||
198 | cpu = smp_processor_id(); | ||
199 | td = &per_cpu(tick_cpu_device, cpu); | ||
200 | dev = td->evtdev; | ||
201 | bc = tick_broadcast_device.evtdev; | ||
202 | |||
203 | /* | ||
204 | * Is the device in broadcast mode forever or is it not | ||
205 | * affected by the powerstate ? | ||
206 | */ | ||
207 | if (!dev || !tick_device_is_functional(dev) || | ||
208 | !(dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
209 | goto out; | ||
210 | |||
211 | if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { | ||
212 | if (!cpu_isset(cpu, tick_broadcast_mask)) { | ||
213 | cpu_set(cpu, tick_broadcast_mask); | ||
214 | if (td->mode == TICKDEV_MODE_PERIODIC) | ||
215 | clockevents_set_mode(dev, | ||
216 | CLOCK_EVT_MODE_SHUTDOWN); | ||
217 | } | ||
218 | } else { | ||
219 | if (cpu_isset(cpu, tick_broadcast_mask)) { | ||
220 | cpu_clear(cpu, tick_broadcast_mask); | ||
221 | if (td->mode == TICKDEV_MODE_PERIODIC) | ||
222 | tick_setup_periodic(dev, 0); | ||
223 | } | ||
224 | } | ||
225 | |||
226 | if (cpus_empty(tick_broadcast_mask)) | ||
227 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | ||
228 | else { | ||
229 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) | ||
230 | tick_broadcast_start_periodic(bc); | ||
231 | else | ||
232 | tick_broadcast_setup_oneshot(bc); | ||
233 | } | ||
234 | out: | ||
235 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
236 | } | ||
237 | |||
238 | /* | ||
239 | * Powerstate information: The system enters/leaves a state, where | ||
240 | * affected devices might stop. | ||
241 | */ | ||
242 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) | ||
243 | { | ||
244 | int cpu = get_cpu(); | ||
245 | |||
246 | if (cpu == *oncpu) | ||
247 | tick_do_broadcast_on_off(&reason); | ||
248 | else | ||
249 | smp_call_function_single(*oncpu, tick_do_broadcast_on_off, | ||
250 | &reason, 1, 1); | ||
251 | put_cpu(); | ||
252 | } | ||
253 | |||
254 | /* | ||
255 | * Set the periodic handler depending on broadcast on/off | ||
256 | */ | ||
257 | void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | ||
258 | { | ||
259 | if (!broadcast) | ||
260 | dev->event_handler = tick_handle_periodic; | ||
261 | else | ||
262 | dev->event_handler = tick_handle_periodic_broadcast; | ||
263 | } | ||
264 | |||
265 | /* | ||
266 | * Remove a CPU from broadcasting | ||
267 | */ | ||
268 | void tick_shutdown_broadcast(unsigned int *cpup) | ||
269 | { | ||
270 | struct clock_event_device *bc; | ||
271 | unsigned long flags; | ||
272 | unsigned int cpu = *cpup; | ||
273 | |||
274 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
275 | |||
276 | bc = tick_broadcast_device.evtdev; | ||
277 | cpu_clear(cpu, tick_broadcast_mask); | ||
278 | |||
279 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { | ||
280 | if (bc && cpus_empty(tick_broadcast_mask)) | ||
281 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | ||
282 | } | ||
283 | |||
284 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
285 | } | ||
286 | |||
287 | #ifdef CONFIG_TICK_ONESHOT | ||
288 | |||
289 | static cpumask_t tick_broadcast_oneshot_mask; | ||
290 | |||
291 | /* | ||
292 | * Debugging: see timer_list.c | ||
293 | */ | ||
294 | cpumask_t *tick_get_broadcast_oneshot_mask(void) | ||
295 | { | ||
296 | return &tick_broadcast_oneshot_mask; | ||
297 | } | ||
298 | |||
299 | static int tick_broadcast_set_event(ktime_t expires, int force) | ||
300 | { | ||
301 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | ||
302 | ktime_t now = ktime_get(); | ||
303 | int res; | ||
304 | |||
305 | for (;;) { | ||
306 | res = clockevents_program_event(bc, expires, now); | ||
307 | if (!res || !force) | ||
308 | return res; | ||
309 | now = ktime_get(); | ||
310 | expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); | ||
311 | } | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * Reprogram the broadcast device: | ||
316 | * | ||
317 | * Called with tick_broadcast_lock held and interrupts disabled. | ||
318 | */ | ||
319 | static int tick_broadcast_reprogram(void) | ||
320 | { | ||
321 | ktime_t expires = { .tv64 = KTIME_MAX }; | ||
322 | struct tick_device *td; | ||
323 | int cpu; | ||
324 | |||
325 | /* | ||
326 | * Find the event which expires next: | ||
327 | */ | ||
328 | for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; | ||
329 | cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { | ||
330 | td = &per_cpu(tick_cpu_device, cpu); | ||
331 | if (td->evtdev->next_event.tv64 < expires.tv64) | ||
332 | expires = td->evtdev->next_event; | ||
333 | } | ||
334 | |||
335 | if (expires.tv64 == KTIME_MAX) | ||
336 | return 0; | ||
337 | |||
338 | return tick_broadcast_set_event(expires, 0); | ||
339 | } | ||
340 | |||
341 | /* | ||
342 | * Handle oneshot mode broadcasting | ||
343 | */ | ||
344 | static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) | ||
345 | { | ||
346 | struct tick_device *td; | ||
347 | cpumask_t mask; | ||
348 | ktime_t now; | ||
349 | int cpu; | ||
350 | |||
351 | spin_lock(&tick_broadcast_lock); | ||
352 | again: | ||
353 | dev->next_event.tv64 = KTIME_MAX; | ||
354 | mask = CPU_MASK_NONE; | ||
355 | now = ktime_get(); | ||
356 | /* Find all expired events */ | ||
357 | for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; | ||
358 | cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { | ||
359 | td = &per_cpu(tick_cpu_device, cpu); | ||
360 | if (td->evtdev->next_event.tv64 <= now.tv64) | ||
361 | cpu_set(cpu, mask); | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Wakeup the cpus which have an expired event. The broadcast | ||
366 | * device is reprogrammed in the return from idle code. | ||
367 | */ | ||
368 | if (!tick_do_broadcast(mask)) { | ||
369 | /* | ||
370 | * The global event did not expire any CPU local | ||
371 | * events. This happens in dyntick mode, as the | ||
372 | * maximum PIT delta is quite small. | ||
373 | */ | ||
374 | if (tick_broadcast_reprogram()) | ||
375 | goto again; | ||
376 | } | ||
377 | spin_unlock(&tick_broadcast_lock); | ||
378 | } | ||
379 | |||
380 | /* | ||
381 | * Powerstate information: The system enters/leaves a state, where | ||
382 | * affected devices might stop | ||
383 | */ | ||
384 | void tick_broadcast_oneshot_control(unsigned long reason) | ||
385 | { | ||
386 | struct clock_event_device *bc, *dev; | ||
387 | struct tick_device *td; | ||
388 | unsigned long flags; | ||
389 | int cpu; | ||
390 | |||
391 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
392 | |||
393 | /* | ||
394 | * Periodic mode does not care about the enter/exit of power | ||
395 | * states | ||
396 | */ | ||
397 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) | ||
398 | goto out; | ||
399 | |||
400 | bc = tick_broadcast_device.evtdev; | ||
401 | cpu = smp_processor_id(); | ||
402 | td = &per_cpu(tick_cpu_device, cpu); | ||
403 | dev = td->evtdev; | ||
404 | |||
405 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
406 | goto out; | ||
407 | |||
408 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | ||
409 | if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { | ||
410 | cpu_set(cpu, tick_broadcast_oneshot_mask); | ||
411 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | ||
412 | if (dev->next_event.tv64 < bc->next_event.tv64) | ||
413 | tick_broadcast_set_event(dev->next_event, 1); | ||
414 | } | ||
415 | } else { | ||
416 | if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { | ||
417 | cpu_clear(cpu, tick_broadcast_oneshot_mask); | ||
418 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | ||
419 | if (dev->next_event.tv64 != KTIME_MAX) | ||
420 | tick_program_event(dev->next_event, 1); | ||
421 | } | ||
422 | } | ||
423 | |||
424 | out: | ||
425 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
426 | } | ||
427 | |||
428 | /** | ||
429 | * tick_broadcast_setup_oneshot - setup the broadcast device for oneshot mode | ||
430 | */ | ||
431 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
432 | { | ||
433 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { | ||
434 | bc->event_handler = tick_handle_oneshot_broadcast; | ||
435 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | ||
436 | bc->next_event.tv64 = KTIME_MAX; | ||
437 | } | ||
438 | } | ||
439 | |||
440 | /* | ||
441 | * Select oneshot operating mode for the broadcast device | ||
442 | */ | ||
443 | void tick_broadcast_switch_to_oneshot(void) | ||
444 | { | ||
445 | struct clock_event_device *bc; | ||
446 | unsigned long flags; | ||
447 | |||
448 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
449 | |||
450 | tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; | ||
451 | bc = tick_broadcast_device.evtdev; | ||
452 | if (bc) | ||
453 | tick_broadcast_setup_oneshot(bc); | ||
454 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
455 | } | ||
456 | |||
457 | |||
458 | /* | ||
459 | * Remove a dead CPU from broadcasting | ||
460 | */ | ||
461 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | ||
462 | { | ||
463 | struct clock_event_device *bc; | ||
464 | unsigned long flags; | ||
465 | unsigned int cpu = *cpup; | ||
466 | |||
467 | spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
468 | |||
469 | bc = tick_broadcast_device.evtdev; | ||
470 | cpu_clear(cpu, tick_broadcast_oneshot_mask); | ||
471 | |||
472 | if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { | ||
473 | if (bc && cpus_empty(tick_broadcast_oneshot_mask)) | ||
474 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | ||
475 | } | ||
476 | |||
477 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
478 | } | ||
479 | |||
480 | #endif | ||
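A sketch of how a deep-idle path is expected to drive the notifications above (the actual call site would live in the ACPI idle code, not in this patch):

	int cpu = smp_processor_id();

	/* the local APIC timer stops in C3: hand wakeup duty to broadcast */
	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
	/* ... enter the C3 sleep state ... */
	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);

On ENTER, tick_broadcast_oneshot_control() shuts the CPU's local device down and, if needed, pulls the broadcast expiry forward to the CPU's next event; on EXIT it switches the local device back to oneshot mode and reprograms it.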
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c new file mode 100644 index 000000000000..0986a2bfab49 --- /dev/null +++ b/kernel/time/tick-common.c | |||
@@ -0,0 +1,347 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/tick-common.c | ||
3 | * | ||
4 | * This file contains the base functions to manage periodic tick | ||
5 | * related events. | ||
6 | * | ||
7 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | ||
8 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | ||
9 | * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner | ||
10 | * | ||
11 | * This code is licensed under the GPL version 2. For details see | ||
12 | * kernel-base/COPYING. | ||
13 | */ | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <linux/hrtimer.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/percpu.h> | ||
19 | #include <linux/profile.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/tick.h> | ||
22 | |||
23 | #include "tick-internal.h" | ||
24 | |||
25 | /* | ||
26 | * Tick devices | ||
27 | */ | ||
28 | DEFINE_PER_CPU(struct tick_device, tick_cpu_device); | ||
29 | /* | ||
30 | * Tick next event: keeps track of the tick time | ||
31 | */ | ||
32 | ktime_t tick_next_period; | ||
33 | ktime_t tick_period; | ||
34 | static int tick_do_timer_cpu = -1; | ||
35 | DEFINE_SPINLOCK(tick_device_lock); | ||
36 | |||
37 | /* | ||
38 | * Debugging: see timer_list.c | ||
39 | */ | ||
40 | struct tick_device *tick_get_device(int cpu) | ||
41 | { | ||
42 | return &per_cpu(tick_cpu_device, cpu); | ||
43 | } | ||
44 | |||
45 | /** | ||
46 | * tick_is_oneshot_available - check for a oneshot capable event device | ||
47 | */ | ||
48 | int tick_is_oneshot_available(void) | ||
49 | { | ||
50 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | ||
51 | |||
52 | return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Periodic tick | ||
57 | */ | ||
58 | static void tick_periodic(int cpu) | ||
59 | { | ||
60 | if (tick_do_timer_cpu == cpu) { | ||
61 | write_seqlock(&xtime_lock); | ||
62 | |||
63 | /* Keep track of the next tick event */ | ||
64 | tick_next_period = ktime_add(tick_next_period, tick_period); | ||
65 | |||
66 | do_timer(1); | ||
67 | write_sequnlock(&xtime_lock); | ||
68 | } | ||
69 | |||
70 | update_process_times(user_mode(get_irq_regs())); | ||
71 | profile_tick(CPU_PROFILING); | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Event handler for periodic ticks | ||
76 | */ | ||
77 | void tick_handle_periodic(struct clock_event_device *dev) | ||
78 | { | ||
79 | int cpu = smp_processor_id(); | ||
80 | ktime_t next; | ||
81 | |||
82 | tick_periodic(cpu); | ||
83 | |||
84 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | ||
85 | return; | ||
86 | /* | ||
87 | * Setup the next period for devices, which do not have | ||
88 | * periodic mode: | ||
89 | */ | ||
90 | next = ktime_add(dev->next_event, tick_period); | ||
91 | for (;;) { | ||
92 | if (!clockevents_program_event(dev, next, ktime_get())) | ||
93 | return; | ||
94 | tick_periodic(cpu); | ||
95 | next = ktime_add(next, tick_period); | ||
96 | } | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * Setup the device for a periodic tick | ||
101 | */ | ||
102 | void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | ||
103 | { | ||
104 | tick_set_periodic_handler(dev, broadcast); | ||
105 | |||
106 | /* Broadcast setup? */ | ||
107 | if (!tick_device_is_functional(dev)) | ||
108 | return; | ||
109 | |||
110 | if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { | ||
111 | clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); | ||
112 | } else { | ||
113 | unsigned long seq; | ||
114 | ktime_t next; | ||
115 | |||
116 | do { | ||
117 | seq = read_seqbegin(&xtime_lock); | ||
118 | next = tick_next_period; | ||
119 | } while (read_seqretry(&xtime_lock, seq)); | ||
120 | |||
121 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | ||
122 | |||
123 | for (;;) { | ||
124 | if (!clockevents_program_event(dev, next, ktime_get())) | ||
125 | return; | ||
126 | next = ktime_add(next, tick_period); | ||
127 | } | ||
128 | } | ||
129 | } | ||
130 | |||
131 | /* | ||
132 | * Setup the tick device | ||
133 | */ | ||
134 | static void tick_setup_device(struct tick_device *td, | ||
135 | struct clock_event_device *newdev, int cpu, | ||
136 | cpumask_t cpumask) | ||
137 | { | ||
138 | ktime_t next_event; | ||
139 | void (*handler)(struct clock_event_device *) = NULL; | ||
140 | |||
141 | /* | ||
142 | * First device setup? | ||
143 | */ | ||
144 | if (!td->evtdev) { | ||
145 | /* | ||
146 | * If no cpu took the do_timer update, assign it to | ||
147 | * this cpu: | ||
148 | */ | ||
149 | if (tick_do_timer_cpu == -1) { | ||
150 | tick_do_timer_cpu = cpu; | ||
151 | tick_next_period = ktime_get(); | ||
152 | tick_period = ktime_set(0, NSEC_PER_SEC / HZ); | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * Startup in periodic mode first. | ||
157 | */ | ||
158 | td->mode = TICKDEV_MODE_PERIODIC; | ||
159 | } else { | ||
160 | handler = td->evtdev->event_handler; | ||
161 | next_event = td->evtdev->next_event; | ||
162 | } | ||
163 | |||
164 | td->evtdev = newdev; | ||
165 | |||
166 | /* | ||
167 | * When the device is not per cpu, pin the interrupt to the | ||
168 | * current cpu: | ||
169 | */ | ||
170 | if (!cpus_equal(newdev->cpumask, cpumask)) | ||
171 | irq_set_affinity(newdev->irq, cpumask); | ||
172 | |||
173 | /* | ||
174 | * When global broadcasting is active, check if the current | ||
175 | * device is registered as a placeholder for broadcast mode. | ||
176 | * This allows us to handle this x86 misfeature in a generic | ||
177 | * way. | ||
178 | */ | ||
179 | if (tick_device_uses_broadcast(newdev, cpu)) | ||
180 | return; | ||
181 | |||
182 | if (td->mode == TICKDEV_MODE_PERIODIC) | ||
183 | tick_setup_periodic(newdev, 0); | ||
184 | else | ||
185 | tick_setup_oneshot(newdev, handler, next_event); | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * Check if the newly registered device should be used. | ||
190 | */ | ||
191 | static int tick_check_new_device(struct clock_event_device *newdev) | ||
192 | { | ||
193 | struct clock_event_device *curdev; | ||
194 | struct tick_device *td; | ||
195 | int cpu, ret = NOTIFY_OK; | ||
196 | unsigned long flags; | ||
197 | cpumask_t cpumask; | ||
198 | |||
199 | spin_lock_irqsave(&tick_device_lock, flags); | ||
200 | |||
201 | cpu = smp_processor_id(); | ||
202 | if (!cpu_isset(cpu, newdev->cpumask)) | ||
203 | goto out; | ||
204 | |||
205 | td = &per_cpu(tick_cpu_device, cpu); | ||
206 | curdev = td->evtdev; | ||
207 | cpumask = cpumask_of_cpu(cpu); | ||
208 | |||
209 | /* cpu local device ? */ | ||
210 | if (!cpus_equal(newdev->cpumask, cpumask)) { | ||
211 | |||
212 | /* | ||
213 | * If the cpu affinity of the device interrupt can not | ||
214 | * be set, ignore it. | ||
215 | */ | ||
216 | if (!irq_can_set_affinity(newdev->irq)) | ||
217 | goto out_bc; | ||
218 | |||
219 | /* | ||
220 | * If we have a cpu local device already, do not replace it | ||
221 | * by a non cpu local device | ||
222 | */ | ||
223 | if (curdev && cpus_equal(curdev->cpumask, cpumask)) | ||
224 | goto out_bc; | ||
225 | } | ||
226 | |||
227 | /* | ||
228 | * If we have an active device, then check the rating and the oneshot | ||
229 | * feature. | ||
230 | */ | ||
231 | if (curdev) { | ||
232 | /* | ||
233 | * Prefer oneshot-capable devices! | ||
234 | */ | ||
235 | if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && | ||
236 | !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
237 | goto out_bc; | ||
238 | /* | ||
239 | * Check the rating | ||
240 | */ | ||
241 | if (curdev->rating >= newdev->rating) | ||
242 | goto out_bc; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Replace the existing device, if any, by the new | ||
247 | * device. If the current device is the broadcast device, do | ||
248 | * not give it back to the clockevents layer! | ||
249 | */ | ||
250 | if (tick_is_broadcast_device(curdev)) { | ||
251 | clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); | ||
252 | curdev = NULL; | ||
253 | } | ||
254 | clockevents_exchange_device(curdev, newdev); | ||
255 | tick_setup_device(td, newdev, cpu, cpumask); | ||
256 | if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) | ||
257 | tick_oneshot_notify(); | ||
258 | |||
259 | spin_unlock_irqrestore(&tick_device_lock, flags); | ||
260 | return NOTIFY_STOP; | ||
261 | |||
262 | out_bc: | ||
263 | /* | ||
264 | * Can the new device be used as a broadcast device? | ||
265 | */ | ||
266 | if (tick_check_broadcast_device(newdev)) | ||
267 | ret = NOTIFY_STOP; | ||
268 | out: | ||
269 | spin_unlock_irqrestore(&tick_device_lock, flags); | ||
270 | |||
271 | return ret; | ||
272 | } | ||
273 | |||
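The selection logic above is easiest to see with a concrete pair of devices. The ratings and flags below are hypothetical, chosen only to illustrate the decision path; assume percpu_dev.cpumask covers just the registering CPU while global_dev.cpumask spans all CPUs:

	/* illustrative only - hypothetical ratings and feature flags */
	struct clock_event_device global_dev = {
		.rating   = 100,	/* global, periodic-only */
		.features = CLOCK_EVT_FEAT_PERIODIC,
	};
	struct clock_event_device percpu_dev = {
		.rating   = 300,	/* cpu-local, oneshot-capable */
		.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	};

Whichever registration order occurs, percpu_dev ends up as the tick device: registered second, it passes the oneshot check and wins the rating comparison; registered first, the later global_dev fails the cpu-local test, jumps to out_bc and is offered to tick_check_broadcast_device() instead.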
274 | /* | ||
275 | * Shutdown an event device on a given cpu: | ||
276 | * | ||
277 | * This is called on a live CPU when another CPU is dead, so we | ||
278 | * cannot access the dead CPU's hardware device itself. | ||
279 | * We just set the mode and remove it from the lists. | ||
280 | */ | ||
281 | static void tick_shutdown(unsigned int *cpup) | ||
282 | { | ||
283 | struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); | ||
284 | struct clock_event_device *dev = td->evtdev; | ||
285 | unsigned long flags; | ||
286 | |||
287 | spin_lock_irqsave(&tick_device_lock, flags); | ||
288 | td->mode = TICKDEV_MODE_PERIODIC; | ||
289 | if (dev) { | ||
290 | /* | ||
291 | * Prevent the clock events layer from calling | ||
292 | * the set_mode function! | ||
293 | */ | ||
294 | dev->mode = CLOCK_EVT_MODE_UNUSED; | ||
295 | clockevents_exchange_device(dev, NULL); | ||
296 | td->evtdev = NULL; | ||
297 | } | ||
298 | spin_unlock_irqrestore(&tick_device_lock, flags); | ||
299 | } | ||
300 | |||
301 | /* | ||
302 | * Notification about clock event devices | ||
303 | */ | ||
304 | static int tick_notify(struct notifier_block *nb, unsigned long reason, | ||
305 | void *dev) | ||
306 | { | ||
307 | switch (reason) { | ||
308 | |||
309 | case CLOCK_EVT_NOTIFY_ADD: | ||
310 | return tick_check_new_device(dev); | ||
311 | |||
312 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | ||
313 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | ||
314 | tick_broadcast_on_off(reason, dev); | ||
315 | break; | ||
316 | |||
317 | case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: | ||
318 | case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: | ||
319 | tick_broadcast_oneshot_control(reason); | ||
320 | break; | ||
321 | |||
322 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | ||
323 | tick_shutdown_broadcast_oneshot(dev); | ||
324 | tick_shutdown_broadcast(dev); | ||
325 | tick_shutdown(dev); | ||
326 | break; | ||
327 | |||
328 | default: | ||
329 | break; | ||
330 | } | ||
331 | |||
332 | return NOTIFY_OK; | ||
333 | } | ||
334 | |||
335 | static struct notifier_block tick_notifier = { | ||
336 | .notifier_call = tick_notify, | ||
337 | }; | ||
338 | |||
339 | /** | ||
340 | * tick_init - initialize the tick control | ||
341 | * | ||
342 | * Register the notifier with the clockevents framework | ||
343 | */ | ||
344 | void __init tick_init(void) | ||
345 | { | ||
346 | clockevents_register_notifier(&tick_notifier); | ||
347 | } | ||
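For completeness, the driver side of this handshake is a single registration call. A minimal sketch, assuming the clockevents_register_device() entry point from the accompanying clockevents core and hypothetical my_timer_* callbacks:

	static struct clock_event_device my_timer = {
		.name		= "my_timer",
		.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
		.rating		= 300,
		.set_mode	= my_timer_set_mode,		/* hypothetical */
		.set_next_event	= my_timer_set_next_event,	/* hypothetical */
	};

	static void my_timer_register(void)
	{
		my_timer.cpumask = cpumask_of_cpu(smp_processor_id());
		clockevents_register_device(&my_timer);
	}

Registration raises CLOCK_EVT_NOTIFY_ADD, which arrives in tick_notify() above and funnels into tick_check_new_device().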
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h new file mode 100644 index 000000000000..54861a0f29ff --- /dev/null +++ b/kernel/time/tick-internal.h | |||
@@ -0,0 +1,110 @@ | |||
1 | /* | ||
2 | * Tick-internal variables and functions used by the low/high resolution code | ||
3 | */ | ||
4 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); | ||
5 | extern spinlock_t tick_device_lock; | ||
6 | extern ktime_t tick_next_period; | ||
7 | extern ktime_t tick_period; | ||
8 | |||
9 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | ||
10 | extern void tick_handle_periodic(struct clock_event_device *dev); | ||
11 | |||
12 | /* | ||
13 | * NO_HZ / high resolution timer shared code | ||
14 | */ | ||
15 | #ifdef CONFIG_TICK_ONESHOT | ||
16 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | ||
17 | void (*handler)(struct clock_event_device *), | ||
18 | ktime_t nextevt); | ||
19 | extern int tick_program_event(ktime_t expires, int force); | ||
20 | extern void tick_oneshot_notify(void); | ||
21 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | ||
22 | |||
23 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
24 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | ||
25 | extern void tick_broadcast_oneshot_control(unsigned long reason); | ||
26 | extern void tick_broadcast_switch_to_oneshot(void); | ||
27 | extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | ||
28 | # else /* BROADCAST */ | ||
29 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
30 | { | ||
31 | BUG(); | ||
32 | } | ||
33 | static inline void tick_broadcast_oneshot_control(unsigned long reason) { } | ||
34 | static inline void tick_broadcast_switch_to_oneshot(void) { } | ||
35 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | ||
36 | # endif /* !BROADCAST */ | ||
37 | |||
38 | #else /* !ONESHOT */ | ||
39 | static inline | ||
40 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
41 | void (*handler)(struct clock_event_device *), | ||
42 | ktime_t nextevt) | ||
43 | { | ||
44 | BUG(); | ||
45 | } | ||
46 | static inline int tick_program_event(ktime_t expires, int force) | ||
47 | { | ||
48 | return 0; | ||
49 | } | ||
50 | static inline void tick_oneshot_notify(void) { } | ||
51 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
52 | { | ||
53 | BUG(); | ||
54 | } | ||
55 | static inline void tick_broadcast_oneshot_control(unsigned long reason) { } | ||
56 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | ||
57 | #endif /* !TICK_ONESHOT */ | ||
58 | |||
59 | /* | ||
60 | * Broadcasting support | ||
61 | */ | ||
62 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
63 | extern int tick_do_broadcast(cpumask_t mask); | ||
64 | |||
65 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | ||
66 | extern int tick_check_broadcast_device(struct clock_event_device *dev); | ||
67 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
68 | extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); | ||
69 | extern void tick_shutdown_broadcast(unsigned int *cpup); | ||
70 | |||
71 | extern void | ||
72 | tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
73 | |||
74 | #else /* !BROADCAST */ | ||
75 | |||
76 | static inline int tick_check_broadcast_device(struct clock_event_device *dev) | ||
77 | { | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) | ||
82 | { | ||
83 | return 0; | ||
84 | } | ||
85 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, | ||
86 | int cpu) | ||
87 | { | ||
88 | return 0; | ||
89 | } | ||
90 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
91 | static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } | ||
92 | static inline void tick_shutdown_broadcast(unsigned int *cpup) { } | ||
93 | |||
94 | /* | ||
95 | * Set the periodic handler in non-broadcast mode | ||
96 | */ | ||
97 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, | ||
98 | int broadcast) | ||
99 | { | ||
100 | dev->event_handler = tick_handle_periodic; | ||
101 | } | ||
102 | #endif /* !BROADCAST */ | ||
103 | |||
104 | /* | ||
105 | * Check, if the device is functional or a dummy for broadcast | ||
106 | */ | ||
107 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
108 | { | ||
109 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
110 | } | ||
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c new file mode 100644 index 000000000000..2e8b7ff863cc --- /dev/null +++ b/kernel/time/tick-oneshot.c | |||
@@ -0,0 +1,84 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/tick-oneshot.c | ||
3 | * | ||
4 | * This file contains functions which manage high resolution tick | ||
5 | * related events. | ||
6 | * | ||
7 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | ||
8 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | ||
9 | * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner | ||
10 | * | ||
11 | * This code is licenced under the GPL version 2. For details see | ||
12 | * kernel-base/COPYING. | ||
13 | */ | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <linux/hrtimer.h> | ||
17 | #include <linux/irq.h> | ||
18 | #include <linux/percpu.h> | ||
19 | #include <linux/profile.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/tick.h> | ||
22 | |||
23 | #include "tick-internal.h" | ||
24 | |||
25 | /** | ||
26 | * tick_program_event - program the CPU-local event device for the next event | ||
27 | */ | ||
28 | int tick_program_event(ktime_t expires, int force) | ||
29 | { | ||
30 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | ||
31 | ktime_t now = ktime_get(); | ||
32 | |||
33 | while (1) { | ||
34 | int ret = clockevents_program_event(dev, expires, now); | ||
35 | |||
36 | if (!ret || !force) | ||
37 | return ret; | ||
38 | now = ktime_get(); | ||
39 | expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); | ||
40 | } | ||
41 | } | ||
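A note on the force loop: clockevents_program_event() is expected to fail when the requested expiry already lies in the past; with force set, the expiry is pushed min_delta_ns beyond the freshly read 'now' and retried until programming sticks. Non-forcing callers handle the miss themselves, roughly:

	/* sketch of a non-forcing caller, mirroring the nohz code below */
	if (tick_program_event(expires, 0)) {
		/* expires was already in the past: catch up and retry */
	}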
42 | |||
43 | /** | ||
44 | * tick_setup_oneshot - set up the event device for oneshot mode (hres or nohz) | ||
45 | */ | ||
46 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
47 | void (*handler)(struct clock_event_device *), | ||
48 | ktime_t next_event) | ||
49 | { | ||
50 | newdev->event_handler = handler; | ||
51 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); | ||
52 | clockevents_program_event(newdev, next_event, ktime_get()); | ||
53 | } | ||
54 | |||
55 | /** | ||
56 | * tick_switch_to_oneshot - switch to oneshot mode | ||
57 | */ | ||
58 | int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | ||
59 | { | ||
60 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | ||
61 | struct clock_event_device *dev = td->evtdev; | ||
62 | |||
63 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || | ||
64 | !tick_device_is_functional(dev)) | ||
65 | return -EINVAL; | ||
66 | |||
67 | td->mode = TICKDEV_MODE_ONESHOT; | ||
68 | dev->event_handler = handler; | ||
69 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | ||
70 | tick_broadcast_switch_to_oneshot(); | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
75 | /** | ||
76 | * tick_init_highres - switch to high resolution mode | ||
77 | * | ||
78 | * Called with interrupts disabled. | ||
79 | */ | ||
80 | int tick_init_highres(void) | ||
81 | { | ||
82 | return tick_switch_to_oneshot(hrtimer_interrupt); | ||
83 | } | ||
84 | #endif | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c new file mode 100644 index 000000000000..51556b95f60f --- /dev/null +++ b/kernel/time/tick-sched.c | |||
@@ -0,0 +1,567 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/tick-sched.c | ||
3 | * | ||
4 | * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar | ||
6 | * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner | ||
7 | * | ||
8 | * NO_HZ (no idle tick) implementation for low and high resolution timers | ||
9 | * | ||
10 | * Started by: Thomas Gleixner and Ingo Molnar | ||
11 | * | ||
12 | * For licencing details see kernel-base/COPYING | ||
13 | */ | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <linux/hrtimer.h> | ||
17 | #include <linux/interrupt.h> | ||
18 | #include <linux/kernel_stat.h> | ||
19 | #include <linux/percpu.h> | ||
20 | #include <linux/profile.h> | ||
21 | #include <linux/sched.h> | ||
22 | #include <linux/tick.h> | ||
23 | |||
24 | #include <asm/irq_regs.h> | ||
25 | |||
26 | #include "tick-internal.h" | ||
27 | |||
28 | /* | ||
29 | * Per cpu nohz control structure | ||
30 | */ | ||
31 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | ||
32 | |||
33 | /* | ||
34 | * The time when the last jiffy update happened. Protected by xtime_lock. | ||
35 | */ | ||
36 | static ktime_t last_jiffies_update; | ||
37 | |||
38 | struct tick_sched *tick_get_tick_sched(int cpu) | ||
39 | { | ||
40 | return &per_cpu(tick_cpu_sched, cpu); | ||
41 | } | ||
42 | |||
43 | /* | ||
44 | * Must be called with interrupts disabled! | ||
45 | */ | ||
46 | static void tick_do_update_jiffies64(ktime_t now) | ||
47 | { | ||
48 | unsigned long ticks = 0; | ||
49 | ktime_t delta; | ||
50 | |||
51 | /* Reevaluate with xtime_lock held */ | ||
52 | write_seqlock(&xtime_lock); | ||
53 | |||
54 | delta = ktime_sub(now, last_jiffies_update); | ||
55 | if (delta.tv64 >= tick_period.tv64) { | ||
56 | |||
57 | delta = ktime_sub(delta, tick_period); | ||
58 | last_jiffies_update = ktime_add(last_jiffies_update, | ||
59 | tick_period); | ||
60 | |||
61 | /* Slow path for long timeouts */ | ||
62 | if (unlikely(delta.tv64 >= tick_period.tv64)) { | ||
63 | s64 incr = ktime_to_ns(tick_period); | ||
64 | |||
65 | ticks = ktime_divns(delta, incr); | ||
66 | |||
67 | last_jiffies_update = ktime_add_ns(last_jiffies_update, | ||
68 | incr * ticks); | ||
69 | } | ||
70 | do_timer(++ticks); | ||
71 | } | ||
72 | write_sequnlock(&xtime_lock); | ||
73 | } | ||
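A worked trace, assuming HZ=250 and therefore a 4 ms tick_period: if 13 ms elapsed since last_jiffies_update, the first check consumes one period (delta becomes 9 ms, last_jiffies_update advances 4 ms, ticks stays 0); the slow path then computes ticks = 9 / 4 = 2 and advances last_jiffies_update another 8 ms; do_timer(++ticks) finally accounts 3 jiffies. last_jiffies_update ends up 1 ms behind now, which is correct because the fourth jiffy boundary has not been crossed yet.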
74 | |||
75 | /* | ||
76 | * Initialize last_jiffies_update, if necessary, and return its value. | ||
77 | */ | ||
78 | static ktime_t tick_init_jiffy_update(void) | ||
79 | { | ||
80 | ktime_t period; | ||
81 | |||
82 | write_seqlock(&xtime_lock); | ||
83 | /* Did we start the jiffies update yet? */ | ||
84 | if (last_jiffies_update.tv64 == 0) | ||
85 | last_jiffies_update = tick_next_period; | ||
86 | period = last_jiffies_update; | ||
87 | write_sequnlock(&xtime_lock); | ||
88 | return period; | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * NOHZ - aka dynamic tick functionality | ||
93 | */ | ||
94 | #ifdef CONFIG_NO_HZ | ||
95 | /* | ||
96 | * NO_HZ enabled? | ||
97 | */ | ||
98 | static int tick_nohz_enabled __read_mostly = 1; | ||
99 | |||
100 | /* | ||
101 | * Enable/disable tickless mode | ||
102 | */ | ||
103 | static int __init setup_tick_nohz(char *str) | ||
104 | { | ||
105 | if (!strcmp(str, "off")) | ||
106 | tick_nohz_enabled = 0; | ||
107 | else if (!strcmp(str, "on")) | ||
108 | tick_nohz_enabled = 1; | ||
109 | else | ||
110 | return 0; | ||
111 | return 1; | ||
112 | } | ||
113 | |||
114 | __setup("nohz=", setup_tick_nohz); | ||
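The __setup() hook makes this a boot-time switch; since tick_nohz_enabled defaults to 1, the interesting case is disabling dynamic ticks on the kernel command line:

	nohz=off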
115 | |||
116 | /** | ||
117 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | ||
118 | * | ||
119 | * Called from interrupt entry when the CPU was idle | ||
120 | * | ||
121 | * In case the sched_tick was stopped on this CPU, we have to check if jiffies | ||
122 | * must be updated. Otherwise an interrupt handler could use a stale jiffy | ||
123 | * value. We do this unconditionally on any cpu, as we don't know whether the | ||
124 | * cpu which has the jiffies update duty assigned is in a long sleep. | ||
125 | */ | ||
126 | void tick_nohz_update_jiffies(void) | ||
127 | { | ||
128 | int cpu = smp_processor_id(); | ||
129 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
130 | unsigned long flags; | ||
131 | ktime_t now; | ||
132 | |||
133 | if (!ts->tick_stopped) | ||
134 | return; | ||
135 | |||
136 | cpu_clear(cpu, nohz_cpu_mask); | ||
137 | now = ktime_get(); | ||
138 | |||
139 | local_irq_save(flags); | ||
140 | tick_do_update_jiffies64(now); | ||
141 | local_irq_restore(flags); | ||
142 | } | ||
143 | |||
144 | /** | ||
145 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task | ||
146 | * | ||
147 | * When the next event is more than a tick into the future, stop the idle tick. | ||
148 | * Called either from the idle loop or from irq_exit() when an idle period was | ||
149 | * just interrupted by an interrupt which did not cause a reschedule. | ||
150 | */ | ||
151 | void tick_nohz_stop_sched_tick(void) | ||
152 | { | ||
153 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | ||
154 | struct tick_sched *ts; | ||
155 | ktime_t last_update, expires, now, delta; | ||
156 | int cpu; | ||
157 | |||
158 | local_irq_save(flags); | ||
159 | |||
160 | cpu = smp_processor_id(); | ||
161 | ts = &per_cpu(tick_cpu_sched, cpu); | ||
162 | |||
163 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | ||
164 | goto end; | ||
165 | |||
166 | if (need_resched()) | ||
167 | goto end; | ||
168 | |||
169 | cpu = smp_processor_id(); | ||
170 | if (unlikely(local_softirq_pending())) | ||
171 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | ||
172 | local_softirq_pending()); | ||
173 | |||
174 | now = ktime_get(); | ||
175 | /* | ||
176 | * When called from irq_exit we need to account the idle sleep time | ||
177 | * correctly. | ||
178 | */ | ||
179 | if (ts->tick_stopped) { | ||
180 | delta = ktime_sub(now, ts->idle_entrytime); | ||
181 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
182 | } | ||
183 | |||
184 | ts->idle_entrytime = now; | ||
185 | ts->idle_calls++; | ||
186 | |||
187 | /* Read jiffies and the time when jiffies were updated last */ | ||
188 | do { | ||
189 | seq = read_seqbegin(&xtime_lock); | ||
190 | last_update = last_jiffies_update; | ||
191 | last_jiffies = jiffies; | ||
192 | } while (read_seqretry(&xtime_lock, seq)); | ||
193 | |||
194 | /* Get the next timer wheel timer */ | ||
195 | next_jiffies = get_next_timer_interrupt(last_jiffies); | ||
196 | delta_jiffies = next_jiffies - last_jiffies; | ||
197 | |||
198 | if (rcu_needs_cpu(cpu)) | ||
199 | delta_jiffies = 1; | ||
200 | /* | ||
201 | * Do not stop the tick if we are only one jiffy off, | ||
202 | * or if the cpu is required for RCU. | ||
203 | */ | ||
204 | if (!ts->tick_stopped && delta_jiffies == 1) | ||
205 | goto out; | ||
206 | |||
207 | /* Schedule the tick if we are at least one jiffy off */ | ||
208 | if ((long)delta_jiffies >= 1) { | ||
209 | |||
210 | if (delta_jiffies > 1) | ||
211 | cpu_set(cpu, nohz_cpu_mask); | ||
212 | /* | ||
213 | * tick_nohz_stop_sched_tick() can be called several times before | ||
214 | * tick_nohz_restart_sched_tick() is called. This happens when | ||
215 | * interrupts arrive which do not cause a reschedule. In the | ||
216 | * first call we save the current tick time, so we can restart | ||
217 | * the scheduler tick in tick_nohz_restart_sched_tick(). | ||
218 | */ | ||
219 | if (!ts->tick_stopped) { | ||
220 | ts->idle_tick = ts->sched_timer.expires; | ||
221 | ts->tick_stopped = 1; | ||
222 | ts->idle_jiffies = last_jiffies; | ||
223 | } | ||
224 | /* | ||
225 | * calculate the expiry time for the next timer wheel | ||
226 | * timer | ||
227 | */ | ||
228 | expires = ktime_add_ns(last_update, tick_period.tv64 * | ||
229 | delta_jiffies); | ||
230 | ts->idle_expires = expires; | ||
231 | ts->idle_sleeps++; | ||
232 | |||
233 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { | ||
234 | hrtimer_start(&ts->sched_timer, expires, | ||
235 | HRTIMER_MODE_ABS); | ||
236 | /* Check if the timer was already in the past */ | ||
237 | if (hrtimer_active(&ts->sched_timer)) | ||
238 | goto out; | ||
239 | } else if (!tick_program_event(expires, 0)) | ||
240 | goto out; | ||
241 | /* | ||
242 | * We are past the event already. So we crossed a | ||
243 | * jiffy boundary. Update jiffies and raise the | ||
244 | * softirq. | ||
245 | */ | ||
246 | tick_do_update_jiffies64(ktime_get()); | ||
247 | cpu_clear(cpu, nohz_cpu_mask); | ||
248 | } | ||
249 | raise_softirq_irqoff(TIMER_SOFTIRQ); | ||
250 | out: | ||
251 | ts->next_jiffies = next_jiffies; | ||
252 | ts->last_jiffies = last_jiffies; | ||
253 | end: | ||
254 | local_irq_restore(flags); | ||
255 | } | ||
256 | |||
257 | /** | ||
258 | * tick_nohz_restart_sched_tick - restart the idle tick from the idle task | ||
259 | * | ||
260 | * Restart the idle tick when the CPU is woken up from idle | ||
261 | */ | ||
262 | void tick_nohz_restart_sched_tick(void) | ||
263 | { | ||
264 | int cpu = smp_processor_id(); | ||
265 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
266 | unsigned long ticks; | ||
267 | ktime_t now, delta; | ||
268 | |||
269 | if (!ts->tick_stopped) | ||
270 | return; | ||
271 | |||
272 | /* Update jiffies first */ | ||
273 | now = ktime_get(); | ||
274 | |||
275 | local_irq_disable(); | ||
276 | tick_do_update_jiffies64(now); | ||
277 | cpu_clear(cpu, nohz_cpu_mask); | ||
278 | |||
279 | /* Account the idle time */ | ||
280 | delta = ktime_sub(now, ts->idle_entrytime); | ||
281 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
282 | |||
283 | /* | ||
284 | * We stopped the tick in idle. Updating process times would miss the | ||
285 | * time we slept, as update_process_times() accounts only a single | ||
286 | * tick. Make sure that time is accounted to idle! | ||
287 | */ | ||
288 | ticks = jiffies - ts->idle_jiffies; | ||
289 | /* | ||
290 | * We might be off by one. Do not randomly account a huge number of ticks! | ||
291 | */ | ||
292 | if (ticks && ticks < LONG_MAX) { | ||
293 | add_preempt_count(HARDIRQ_OFFSET); | ||
294 | account_system_time(current, HARDIRQ_OFFSET, | ||
295 | jiffies_to_cputime(ticks)); | ||
296 | sub_preempt_count(HARDIRQ_OFFSET); | ||
297 | } | ||
298 | |||
299 | /* | ||
300 | * Cancel the scheduled timer and restore the tick | ||
301 | */ | ||
302 | ts->tick_stopped = 0; | ||
303 | hrtimer_cancel(&ts->sched_timer); | ||
304 | ts->sched_timer.expires = ts->idle_tick; | ||
305 | |||
306 | while (1) { | ||
307 | /* Forward the time to expire in the future */ | ||
308 | hrtimer_forward(&ts->sched_timer, now, tick_period); | ||
309 | |||
310 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { | ||
311 | hrtimer_start(&ts->sched_timer, | ||
312 | ts->sched_timer.expires, | ||
313 | HRTIMER_MODE_ABS); | ||
314 | /* Check if the timer was already in the past */ | ||
315 | if (hrtimer_active(&ts->sched_timer)) | ||
316 | break; | ||
317 | } else { | ||
318 | if (!tick_program_event(ts->sched_timer.expires, 0)) | ||
319 | break; | ||
320 | } | ||
321 | /* Update jiffies and reread time */ | ||
322 | tick_do_update_jiffies64(now); | ||
323 | now = ktime_get(); | ||
324 | } | ||
325 | local_irq_enable(); | ||
326 | } | ||
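The intended call pattern is the architecture idle loop bracketing its low-power wait with this stop/restart pair. A minimal sketch, where cpu_sleep() stands in for a hypothetical arch-specific halt primitive:

	/* sketch of an arch idle loop using the nohz entry points */
	void cpu_idle(void)
	{
		for (;;) {
			tick_nohz_stop_sched_tick();
			while (!need_resched())
				cpu_sleep();	/* hypothetical arch halt */
			tick_nohz_restart_sched_tick();
			schedule();
		}
	}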
327 | |||
328 | static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) | ||
329 | { | ||
330 | hrtimer_forward(&ts->sched_timer, now, tick_period); | ||
331 | return tick_program_event(ts->sched_timer.expires, 0); | ||
332 | } | ||
333 | |||
334 | /* | ||
335 | * The nohz low res interrupt handler | ||
336 | */ | ||
337 | static void tick_nohz_handler(struct clock_event_device *dev) | ||
338 | { | ||
339 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
340 | struct pt_regs *regs = get_irq_regs(); | ||
341 | ktime_t now = ktime_get(); | ||
342 | |||
343 | dev->next_event.tv64 = KTIME_MAX; | ||
344 | |||
345 | /* Check if jiffies need an update */ | ||
346 | tick_do_update_jiffies64(now); | ||
347 | |||
348 | /* | ||
349 | * When we are idle and the tick is stopped, we have to touch | ||
350 | * the watchdog as we might not schedule for a really long | ||
351 | * time. This happens on complete idle SMP systems while | ||
352 | * waiting on the login prompt. We also increment the "start | ||
353 | * of idle" jiffy stamp so the idle accounting adjustment we | ||
354 | * do when we go busy again does not account too much ticks. | ||
355 | */ | ||
356 | if (ts->tick_stopped) { | ||
357 | touch_softlockup_watchdog(); | ||
358 | ts->idle_jiffies++; | ||
359 | } | ||
360 | |||
361 | update_process_times(user_mode(regs)); | ||
362 | profile_tick(CPU_PROFILING); | ||
363 | |||
364 | /* Do not restart when we are in the idle loop */ | ||
365 | if (ts->tick_stopped) | ||
366 | return; | ||
367 | |||
368 | while (tick_nohz_reprogram(ts, now)) { | ||
369 | now = ktime_get(); | ||
370 | tick_do_update_jiffies64(now); | ||
371 | } | ||
372 | } | ||
373 | |||
374 | /** | ||
375 | * tick_nohz_switch_to_nohz - switch to nohz mode | ||
376 | */ | ||
377 | static void tick_nohz_switch_to_nohz(void) | ||
378 | { | ||
379 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
380 | ktime_t next; | ||
381 | |||
382 | if (!tick_nohz_enabled) | ||
383 | return; | ||
384 | |||
385 | local_irq_disable(); | ||
386 | if (tick_switch_to_oneshot(tick_nohz_handler)) { | ||
387 | local_irq_enable(); | ||
388 | return; | ||
389 | } | ||
390 | |||
391 | ts->nohz_mode = NOHZ_MODE_LOWRES; | ||
392 | |||
393 | /* | ||
394 | * Recycle the hrtimer in ts, so we can share the | ||
395 | * hrtimer_forward with the highres code. | ||
396 | */ | ||
397 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
398 | /* Get the next period */ | ||
399 | next = tick_init_jiffy_update(); | ||
400 | |||
401 | for (;;) { | ||
402 | ts->sched_timer.expires = next; | ||
403 | if (!tick_program_event(next, 0)) | ||
404 | break; | ||
405 | next = ktime_add(next, tick_period); | ||
406 | } | ||
407 | local_irq_enable(); | ||
408 | |||
409 | printk(KERN_INFO "Switched to NOHZ mode on CPU #%d\n", | ||
410 | smp_processor_id()); | ||
411 | } | ||
412 | |||
413 | #else | ||
414 | |||
415 | static inline void tick_nohz_switch_to_nohz(void) { } | ||
416 | |||
417 | #endif /* NO_HZ */ | ||
418 | |||
419 | /* | ||
420 | * High resolution timer specific code | ||
421 | */ | ||
422 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
423 | /* | ||
424 | * We rearm the timer until we get disabled by the idle code. | ||
425 | * Called with interrupts disabled and timer->base->cpu_base->lock held. | ||
426 | */ | ||
427 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | ||
428 | { | ||
429 | struct tick_sched *ts = | ||
430 | container_of(timer, struct tick_sched, sched_timer); | ||
431 | struct hrtimer_cpu_base *base = timer->base->cpu_base; | ||
432 | struct pt_regs *regs = get_irq_regs(); | ||
433 | ktime_t now = ktime_get(); | ||
434 | |||
435 | /* Check if jiffies need an update */ | ||
436 | tick_do_update_jiffies64(now); | ||
437 | |||
438 | /* | ||
439 | * Do not call update_process_times() when we are not in irq | ||
440 | * context and have no valid regs pointer. | ||
441 | */ | ||
442 | if (regs) { | ||
443 | /* | ||
444 | * When we are idle and the tick is stopped, we have to touch | ||
445 | * the watchdog as we might not schedule for a really long | ||
446 | * time. This happens on complete idle SMP systems while | ||
447 | * waiting on the login prompt. We also increment the "start of | ||
448 | * idle" jiffy stamp so the idle accounting adjustment we do | ||
449 | * when we go busy again does not account too much ticks. | ||
450 | */ | ||
451 | if (ts->tick_stopped) { | ||
452 | touch_softlockup_watchdog(); | ||
453 | ts->idle_jiffies++; | ||
454 | } | ||
455 | /* | ||
456 | * update_process_times() might take tasklist_lock, hence | ||
457 | * drop the base lock. sched-tick hrtimers are per-CPU and | ||
458 | * never accessible by userspace APIs, so this is safe to do. | ||
459 | */ | ||
460 | spin_unlock(&base->lock); | ||
461 | update_process_times(user_mode(regs)); | ||
462 | profile_tick(CPU_PROFILING); | ||
463 | spin_lock(&base->lock); | ||
464 | } | ||
465 | |||
466 | /* Do not restart when we are in the idle loop */ | ||
467 | if (ts->tick_stopped) | ||
468 | return HRTIMER_NORESTART; | ||
469 | |||
470 | hrtimer_forward(timer, now, tick_period); | ||
471 | |||
472 | return HRTIMER_RESTART; | ||
473 | } | ||
474 | |||
475 | /** | ||
476 | * tick_setup_sched_timer - setup the tick emulation timer | ||
477 | */ | ||
478 | void tick_setup_sched_timer(void) | ||
479 | { | ||
480 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
481 | ktime_t now = ktime_get(); | ||
482 | |||
483 | /* | ||
484 | * Emulate tick processing via per-CPU hrtimers: | ||
485 | */ | ||
486 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
487 | ts->sched_timer.function = tick_sched_timer; | ||
488 | ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
489 | |||
490 | /* Get the next period */ | ||
491 | ts->sched_timer.expires = tick_init_jiffy_update(); | ||
492 | |||
493 | for (;;) { | ||
494 | hrtimer_forward(&ts->sched_timer, now, tick_period); | ||
495 | hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, | ||
496 | HRTIMER_MODE_ABS); | ||
497 | /* Check if the timer was already in the past */ | ||
498 | if (hrtimer_active(&ts->sched_timer)) | ||
499 | break; | ||
500 | now = ktime_get(); | ||
501 | } | ||
502 | |||
503 | #ifdef CONFIG_NO_HZ | ||
504 | if (tick_nohz_enabled) | ||
505 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | ||
506 | #endif | ||
507 | } | ||
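tick_setup_sched_timer() is the second half of the switch to high resolution mode; the hrtimer core is expected to perform roughly this sequence once a oneshot-capable device and a continuous clocksource are available (a sketch, not the literal switch-over code):

	if (tick_init_highres())	/* from tick-oneshot.c above */
		return;			/* no oneshot-capable tick device */
	tick_setup_sched_timer();	/* start per-cpu tick emulation */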
508 | |||
509 | void tick_cancel_sched_timer(int cpu) | ||
510 | { | ||
511 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
512 | |||
513 | if (ts->sched_timer.base) | ||
514 | hrtimer_cancel(&ts->sched_timer); | ||
515 | ts->tick_stopped = 0; | ||
516 | ts->nohz_mode = NOHZ_MODE_INACTIVE; | ||
517 | } | ||
518 | #endif /* HIGH_RES_TIMERS */ | ||
519 | |||
520 | /** | ||
521 | * Async notification about clocksource changes | ||
522 | */ | ||
523 | void tick_clock_notify(void) | ||
524 | { | ||
525 | int cpu; | ||
526 | |||
527 | for_each_possible_cpu(cpu) | ||
528 | set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); | ||
529 | } | ||
530 | |||
531 | /* | ||
532 | * Async notification about clock event changes | ||
533 | */ | ||
534 | void tick_oneshot_notify(void) | ||
535 | { | ||
536 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
537 | |||
538 | set_bit(0, &ts->check_clocks); | ||
539 | } | ||
540 | |||
541 | /** | ||
542 | * Check whether a change happened that makes oneshot mode possible. | ||
543 | * | ||
544 | * Called cyclically from the hrtimer softirq (driven by the timer | ||
545 | * softirq). allow_nohz signals that we can switch into low-res nohz | ||
546 | * mode, because high resolution timers are disabled (either at | ||
547 | * compile time or at runtime). | ||
548 | */ | ||
549 | int tick_check_oneshot_change(int allow_nohz) | ||
550 | { | ||
551 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
552 | |||
553 | if (!test_and_clear_bit(0, &ts->check_clocks)) | ||
554 | return 0; | ||
555 | |||
556 | if (ts->nohz_mode != NOHZ_MODE_INACTIVE) | ||
557 | return 0; | ||
558 | |||
559 | if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) | ||
560 | return 0; | ||
561 | |||
562 | if (!allow_nohz) | ||
563 | return 1; | ||
564 | |||
565 | tick_nohz_switch_to_nohz(); | ||
566 | return 0; | ||
567 | } | ||
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c new file mode 100644 index 000000000000..f82c635c3d5c --- /dev/null +++ b/kernel/time/timer_list.c | |||
@@ -0,0 +1,287 @@ | |||
1 | /* | ||
2 | * kernel/time/timer_list.c | ||
3 | * | ||
4 | * List pending timers | ||
5 | * | ||
6 | * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License version 2 as | ||
10 | * published by the Free Software Foundation. | ||
11 | */ | ||
12 | |||
13 | #include <linux/proc_fs.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/spinlock.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/seq_file.h> | ||
18 | #include <linux/kallsyms.h> | ||
19 | #include <linux/tick.h> | ||
20 | |||
21 | #include <asm/uaccess.h> | ||
22 | |||
23 | typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); | ||
24 | |||
25 | DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); | ||
26 | |||
27 | /* | ||
28 | * This allows printing both to /proc/timer_list and | ||
29 | * to the console (on SysRq-Q): | ||
30 | */ | ||
31 | #define SEQ_printf(m, x...) \ | ||
32 | do { \ | ||
33 | if (m) \ | ||
34 | seq_printf(m, x); \ | ||
35 | else \ | ||
36 | printk(x); \ | ||
37 | } while (0) | ||
38 | |||
39 | static void print_name_offset(struct seq_file *m, void *sym) | ||
40 | { | ||
41 | unsigned long addr = (unsigned long)sym; | ||
42 | char namebuf[KSYM_NAME_LEN+1]; | ||
43 | unsigned long size, offset; | ||
44 | const char *sym_name; | ||
45 | char *modname; | ||
46 | |||
47 | sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); | ||
48 | if (sym_name) | ||
49 | SEQ_printf(m, "%s", sym_name); | ||
50 | else | ||
51 | SEQ_printf(m, "<%p>", sym); | ||
52 | } | ||
53 | |||
54 | static void | ||
55 | print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) | ||
56 | { | ||
57 | #ifdef CONFIG_TIMER_STATS | ||
58 | char tmp[TASK_COMM_LEN + 1]; | ||
59 | #endif | ||
60 | SEQ_printf(m, " #%d: ", idx); | ||
61 | print_name_offset(m, timer); | ||
62 | SEQ_printf(m, ", "); | ||
63 | print_name_offset(m, timer->function); | ||
64 | SEQ_printf(m, ", S:%02lx", timer->state); | ||
65 | #ifdef CONFIG_TIMER_STATS | ||
66 | SEQ_printf(m, ", "); | ||
67 | print_name_offset(m, timer->start_site); | ||
68 | memcpy(tmp, timer->start_comm, TASK_COMM_LEN); | ||
69 | tmp[TASK_COMM_LEN] = 0; | ||
70 | SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); | ||
71 | #endif | ||
72 | SEQ_printf(m, "\n"); | ||
73 | SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", | ||
74 | (unsigned long long)ktime_to_ns(timer->expires), | ||
75 | (unsigned long long)(ktime_to_ns(timer->expires) - now)); | ||
76 | } | ||
77 | |||
78 | static void | ||
79 | print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, | ||
80 | u64 now) | ||
81 | { | ||
82 | struct hrtimer *timer, tmp; | ||
83 | unsigned long next = 0, i; | ||
84 | struct rb_node *curr; | ||
85 | unsigned long flags; | ||
86 | |||
87 | next_one: | ||
88 | i = 0; | ||
89 | spin_lock_irqsave(&base->cpu_base->lock, flags); | ||
90 | |||
91 | curr = base->first; | ||
92 | /* | ||
93 | * Crude, but we have to do this O(N*N) thing because | ||
94 | * we have to unlock the base when printing: | ||
95 | */ | ||
96 | while (curr && i < next) { | ||
97 | curr = rb_next(curr); | ||
98 | i++; | ||
99 | } | ||
100 | |||
101 | if (curr) { | ||
102 | |||
103 | timer = rb_entry(curr, struct hrtimer, node); | ||
104 | tmp = *timer; | ||
105 | spin_unlock_irqrestore(&base->cpu_base->lock, flags); | ||
106 | |||
107 | print_timer(m, &tmp, i, now); | ||
108 | next++; | ||
109 | goto next_one; | ||
110 | } | ||
111 | spin_unlock_irqrestore(&base->cpu_base->lock, flags); | ||
112 | } | ||
113 | |||
114 | static void | ||
115 | print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) | ||
116 | { | ||
117 | SEQ_printf(m, " .index: %d\n", | ||
118 | base->index); | ||
119 | SEQ_printf(m, " .resolution: %Ld nsecs\n", | ||
120 | (unsigned long long)ktime_to_ns(base->resolution)); | ||
121 | SEQ_printf(m, " .get_time: "); | ||
122 | print_name_offset(m, base->get_time); | ||
123 | SEQ_printf(m, "\n"); | ||
124 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
125 | SEQ_printf(m, " .offset: %Ld nsecs\n", | ||
126 | ktime_to_ns(base->offset)); | ||
127 | #endif | ||
128 | SEQ_printf(m, "active timers:\n"); | ||
129 | print_active_timers(m, base, now); | ||
130 | } | ||
131 | |||
132 | static void print_cpu(struct seq_file *m, int cpu, u64 now) | ||
133 | { | ||
134 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); | ||
135 | int i; | ||
136 | |||
137 | SEQ_printf(m, "\ncpu: %d\n", cpu); | ||
138 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | ||
139 | SEQ_printf(m, " clock %d:\n", i); | ||
140 | print_base(m, cpu_base->clock_base + i, now); | ||
141 | } | ||
142 | #define P(x) \ | ||
143 | SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) | ||
144 | #define P_ns(x) \ | ||
145 | SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ | ||
146 | (u64)(ktime_to_ns(cpu_base->x))) | ||
147 | |||
148 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
149 | P_ns(expires_next); | ||
150 | P(hres_active); | ||
151 | P(nr_events); | ||
152 | #endif | ||
153 | #undef P | ||
154 | #undef P_ns | ||
155 | |||
156 | #ifdef CONFIG_TICK_ONESHOT | ||
157 | # define P(x) \ | ||
158 | SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) | ||
159 | # define P_ns(x) \ | ||
160 | SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ | ||
161 | (u64)(ktime_to_ns(ts->x))) | ||
162 | { | ||
163 | struct tick_sched *ts = tick_get_tick_sched(cpu); | ||
164 | P(nohz_mode); | ||
165 | P_ns(idle_tick); | ||
166 | P(tick_stopped); | ||
167 | P(idle_jiffies); | ||
168 | P(idle_calls); | ||
169 | P(idle_sleeps); | ||
170 | P_ns(idle_entrytime); | ||
171 | P_ns(idle_sleeptime); | ||
172 | P(last_jiffies); | ||
173 | P(next_jiffies); | ||
174 | P_ns(idle_expires); | ||
175 | SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); | ||
176 | } | ||
177 | #endif | ||
178 | |||
179 | #undef P | ||
180 | #undef P_ns | ||
181 | } | ||
182 | |||
183 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | ||
184 | static void | ||
185 | print_tickdevice(struct seq_file *m, struct tick_device *td) | ||
186 | { | ||
187 | struct clock_event_device *dev = td->evtdev; | ||
188 | |||
189 | SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode); | ||
190 | |||
191 | SEQ_printf(m, "Clock Event Device: "); | ||
192 | if (!dev) { | ||
193 | SEQ_printf(m, "<NULL>\n"); | ||
194 | return; | ||
195 | } | ||
196 | SEQ_printf(m, "%s\n", dev->name); | ||
197 | SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns); | ||
198 | SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns); | ||
199 | SEQ_printf(m, " mult: %ld\n", dev->mult); | ||
200 | SEQ_printf(m, " shift: %d\n", dev->shift); | ||
201 | SEQ_printf(m, " mode: %d\n", dev->mode); | ||
202 | SEQ_printf(m, " next_event: %Ld nsecs\n", | ||
203 | (unsigned long long) ktime_to_ns(dev->next_event)); | ||
204 | |||
205 | SEQ_printf(m, " set_next_event: "); | ||
206 | print_name_offset(m, dev->set_next_event); | ||
207 | SEQ_printf(m, "\n"); | ||
208 | |||
209 | SEQ_printf(m, " set_mode: "); | ||
210 | print_name_offset(m, dev->set_mode); | ||
211 | SEQ_printf(m, "\n"); | ||
212 | |||
213 | SEQ_printf(m, " event_handler: "); | ||
214 | print_name_offset(m, dev->event_handler); | ||
215 | SEQ_printf(m, "\n"); | ||
216 | } | ||
217 | |||
218 | static void timer_list_show_tickdevices(struct seq_file *m) | ||
219 | { | ||
220 | int cpu; | ||
221 | |||
222 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
223 | print_tickdevice(m, tick_get_broadcast_device()); | ||
224 | SEQ_printf(m, "tick_broadcast_mask: %08lx\n", | ||
225 | tick_get_broadcast_mask()->bits[0]); | ||
226 | #ifdef CONFIG_TICK_ONESHOT | ||
227 | SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", | ||
228 | tick_get_broadcast_oneshot_mask()->bits[0]); | ||
229 | #endif | ||
230 | SEQ_printf(m, "\n"); | ||
231 | #endif | ||
232 | for_each_online_cpu(cpu) | ||
233 | print_tickdevice(m, tick_get_device(cpu)); | ||
234 | SEQ_printf(m, "\n"); | ||
235 | } | ||
236 | #else | ||
237 | static void timer_list_show_tickdevices(struct seq_file *m) { } | ||
238 | #endif | ||
239 | |||
240 | static int timer_list_show(struct seq_file *m, void *v) | ||
241 | { | ||
242 | u64 now = ktime_to_ns(ktime_get()); | ||
243 | int cpu; | ||
244 | |||
245 | SEQ_printf(m, "Timer List Version: v0.3\n"); | ||
246 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); | ||
247 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); | ||
248 | |||
249 | for_each_online_cpu(cpu) | ||
250 | print_cpu(m, cpu, now); | ||
251 | |||
252 | SEQ_printf(m, "\n"); | ||
253 | timer_list_show_tickdevices(m); | ||
254 | |||
255 | return 0; | ||
256 | } | ||
257 | |||
258 | void sysrq_timer_list_show(void) | ||
259 | { | ||
260 | timer_list_show(NULL, NULL); | ||
261 | } | ||
262 | |||
263 | static int timer_list_open(struct inode *inode, struct file *filp) | ||
264 | { | ||
265 | return single_open(filp, timer_list_show, NULL); | ||
266 | } | ||
267 | |||
268 | static struct file_operations timer_list_fops = { | ||
269 | .open = timer_list_open, | ||
270 | .read = seq_read, | ||
271 | .llseek = seq_lseek, | ||
272 | .release = seq_release, | ||
273 | }; | ||
274 | |||
275 | static int __init init_timer_list_procfs(void) | ||
276 | { | ||
277 | struct proc_dir_entry *pe; | ||
278 | |||
279 | pe = create_proc_entry("timer_list", 0644, NULL); | ||
280 | if (!pe) | ||
281 | return -ENOMEM; | ||
282 | |||
283 | pe->proc_fops = &timer_list_fops; | ||
284 | |||
285 | return 0; | ||
286 | } | ||
287 | __initcall(init_timer_list_procfs); | ||
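Assembled from the SEQ_printf format strings above, the resulting /proc/timer_list output looks roughly like the following abridged excerpt (values invented, field spacing approximate):

	Timer List Version: v0.3
	HRTIMER_MAX_CLOCK_BASES: 2
	now at 92304156250 nsecs

	cpu: 0
	 clock 0:
	  .index:      0
	  .resolution: 1 nsecs
	  .get_time:   ktime_get_real
	active timers:
	 #0: <c04dd4a0>, tick_sched_timer, S:01
	 # expires at 92308000000 nsecs [in 3843750 nsecs]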
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c new file mode 100644 index 000000000000..1bc4882e28e0 --- /dev/null +++ b/kernel/time/timer_stats.c | |||
@@ -0,0 +1,411 @@ | |||
1 | /* | ||
2 | * kernel/time/timer_stats.c | ||
3 | * | ||
4 | * Collect timer usage statistics. | ||
5 | * | ||
6 | * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar | ||
7 | * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * timer_stats is based on timer_top, a similar functionality which was part of | ||
10 | * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the | ||
11 | * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based | ||
12 | * on dynamic allocation of the statistics entries and linear search based | ||
13 | * lookup combined with a global lock, rather than the static array, hash | ||
14 | * and per-CPU locking which is used by timer_stats. It was written for the | ||
15 | * pre hrtimer kernel code and therefore did not take hrtimers into account. | ||
16 | * Nevertheless it provided the base for the timer_stats implementation and | ||
17 | * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks | ||
18 | * for this effort. | ||
19 | * | ||
20 | * timer_top.c is | ||
21 | * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus | ||
22 | * Written by Daniel Petrini <d.pensator@gmail.com> | ||
23 | * timer_top.c was released under the GNU General Public License version 2 | ||
24 | * | ||
25 | * We export the addresses and counting of timer functions being called, | ||
26 | * the pid and cmdline from the owner process if applicable. | ||
27 | * | ||
28 | * Start/stop data collection: | ||
29 | * # echo 1 >/proc/timer_stats   (1 = start, 0 = stop) | ||
30 | * | ||
31 | * Display the information collected so far: | ||
32 | * # cat /proc/timer_stats | ||
33 | * | ||
34 | * This program is free software; you can redistribute it and/or modify | ||
35 | * it under the terms of the GNU General Public License version 2 as | ||
36 | * published by the Free Software Foundation. | ||
37 | */ | ||
38 | |||
39 | #include <linux/proc_fs.h> | ||
40 | #include <linux/module.h> | ||
41 | #include <linux/spinlock.h> | ||
42 | #include <linux/sched.h> | ||
43 | #include <linux/seq_file.h> | ||
44 | #include <linux/kallsyms.h> | ||
45 | |||
46 | #include <asm/uaccess.h> | ||
47 | |||
48 | /* | ||
49 | * This is our basic unit of interest: a timer expiry event identified | ||
50 | * by the timer, its start/expire functions and the PID of the task that | ||
51 | * started the timer. We count the number of times an event happens: | ||
52 | */ | ||
53 | struct entry { | ||
54 | /* | ||
55 | * Hash list: | ||
56 | */ | ||
57 | struct entry *next; | ||
58 | |||
59 | /* | ||
60 | * Hash keys: | ||
61 | */ | ||
62 | void *timer; | ||
63 | void *start_func; | ||
64 | void *expire_func; | ||
65 | pid_t pid; | ||
66 | |||
67 | /* | ||
68 | * Number of timeout events: | ||
69 | */ | ||
70 | unsigned long count; | ||
71 | |||
72 | /* | ||
73 | * We save the command-line string to preserve | ||
74 | * this information past task exit: | ||
75 | */ | ||
76 | char comm[TASK_COMM_LEN + 1]; | ||
77 | |||
78 | } ____cacheline_aligned_in_smp; | ||
79 | |||
80 | /* | ||
81 | * Spinlock protecting the tables - not taken during lookup: | ||
82 | */ | ||
83 | static DEFINE_SPINLOCK(table_lock); | ||
84 | |||
85 | /* | ||
86 | * Per-CPU lookup locks for fast hash lookup: | ||
87 | */ | ||
88 | static DEFINE_PER_CPU(spinlock_t, lookup_lock); | ||
89 | |||
90 | /* | ||
91 | * Mutex to serialize state changes with show-stats activities: | ||
92 | */ | ||
93 | static DEFINE_MUTEX(show_mutex); | ||
94 | |||
95 | /* | ||
96 | * Collection status, active/inactive: | ||
97 | */ | ||
98 | static int __read_mostly active; | ||
99 | |||
100 | /* | ||
101 | * Beginning/end timestamps of measurement: | ||
102 | */ | ||
103 | static ktime_t time_start, time_stop; | ||
104 | |||
105 | /* | ||
106 | * tstat entry structs only get allocated while collection is | ||
107 | * active and never freed during that time - this simplifies | ||
108 | * things quite a bit. | ||
109 | * | ||
110 | * They get freed when a new collection period is started. | ||
111 | */ | ||
112 | #define MAX_ENTRIES_BITS 10 | ||
113 | #define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) | ||
114 | |||
115 | static unsigned long nr_entries; | ||
116 | static struct entry entries[MAX_ENTRIES]; | ||
117 | |||
118 | static atomic_t overflow_count; | ||
119 | |||
120 | static void reset_entries(void) | ||
121 | { | ||
122 | nr_entries = 0; | ||
123 | memset(entries, 0, sizeof(entries)); | ||
124 | atomic_set(&overflow_count, 0); | ||
125 | } | ||
126 | |||
127 | static struct entry *alloc_entry(void) | ||
128 | { | ||
129 | if (nr_entries >= MAX_ENTRIES) | ||
130 | return NULL; | ||
131 | |||
132 | return entries + nr_entries++; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * The entries are in a hash-table, for fast lookup: | ||
137 | */ | ||
138 | #define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) | ||
139 | #define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) | ||
140 | #define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) | ||
141 | |||
142 | #define __tstat_hashfn(entry) \ | ||
143 | (((unsigned long)(entry)->timer ^ \ | ||
144 | (unsigned long)(entry)->start_func ^ \ | ||
145 | (unsigned long)(entry)->expire_func ^ \ | ||
146 | (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) | ||
147 | |||
148 | #define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) | ||
149 | |||
150 | static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; | ||
151 | |||
152 | static int match_entries(struct entry *entry1, struct entry *entry2) | ||
153 | { | ||
154 | return entry1->timer == entry2->timer && | ||
155 | entry1->start_func == entry2->start_func && | ||
156 | entry1->expire_func == entry2->expire_func && | ||
157 | entry1->pid == entry2->pid; | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Look up whether an entry matching this item is present | ||
162 | * in the hash already. Must be called with irqs off and the | ||
163 | * lookup lock held: | ||
164 | */ | ||
165 | static struct entry *tstat_lookup(struct entry *entry, char *comm) | ||
166 | { | ||
167 | struct entry **head, *curr, *prev; | ||
168 | |||
169 | head = tstat_hashentry(entry); | ||
170 | curr = *head; | ||
171 | |||
172 | /* | ||
173 | * The fastpath is when the entry is already hashed; | ||
174 | * we do this with the lookup lock held, but with the | ||
175 | * table lock not held: | ||
176 | */ | ||
177 | while (curr) { | ||
178 | if (match_entries(curr, entry)) | ||
179 | return curr; | ||
180 | |||
181 | curr = curr->next; | ||
182 | } | ||
183 | /* | ||
184 | * Slowpath: allocate, set up and link a new hash entry: | ||
185 | */ | ||
186 | prev = NULL; | ||
187 | curr = *head; | ||
188 | |||
189 | spin_lock(&table_lock); | ||
190 | /* | ||
191 | * Make sure we have not raced with another CPU: | ||
192 | */ | ||
193 | while (curr) { | ||
194 | if (match_entries(curr, entry)) | ||
195 | goto out_unlock; | ||
196 | |||
197 | prev = curr; | ||
198 | curr = curr->next; | ||
199 | } | ||
200 | |||
201 | curr = alloc_entry(); | ||
202 | if (curr) { | ||
203 | *curr = *entry; | ||
204 | curr->count = 0; | ||
205 | memcpy(curr->comm, comm, TASK_COMM_LEN); | ||
206 | if (prev) | ||
207 | prev->next = curr; | ||
208 | else | ||
209 | *head = curr; | ||
210 | curr->next = NULL; | ||
211 | } | ||
212 | out_unlock: | ||
213 | spin_unlock(&table_lock); | ||
214 | |||
215 | return curr; | ||
216 | } | ||
217 | |||
218 | /** | ||
219 | * timer_stats_update_stats - Update the statistics for a timer. | ||
220 | * @timer: pointer to either a timer_list or a hrtimer | ||
221 | * @pid: the pid of the task which set up the timer | ||
222 | * @startf: pointer to the function which did the timer setup | ||
223 | * @timerf: pointer to the timer callback function of the timer | ||
224 | * @comm: name of the process which set up the timer | ||
225 | * | ||
226 | * If the timer is already registered, the event counter is | ||
227 | * incremented. Otherwise the timer is registered in a free slot. | ||
228 | */ | ||
229 | void timer_stats_update_stats(void *timer, pid_t pid, void *startf, | ||
230 | void *timerf, char *comm) | ||
231 | { | ||
232 | /* | ||
233 | * It doesn't matter which lock we take: | ||
234 | */ | ||
235 | spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id()); | ||
236 | struct entry *entry, input; | ||
237 | unsigned long flags; | ||
238 | |||
239 | input.timer = timer; | ||
240 | input.start_func = startf; | ||
241 | input.expire_func = timerf; | ||
242 | input.pid = pid; | ||
243 | |||
244 | spin_lock_irqsave(lock, flags); | ||
245 | if (!active) | ||
246 | goto out_unlock; | ||
247 | |||
248 | entry = tstat_lookup(&input, comm); | ||
249 | if (likely(entry)) | ||
250 | entry->count++; | ||
251 | else | ||
252 | atomic_inc(&overflow_count); | ||
253 | |||
254 | out_unlock: | ||
255 | spin_unlock_irqrestore(lock, flags); | ||
256 | } | ||
257 | |||
258 | static void print_name_offset(struct seq_file *m, unsigned long addr) | ||
259 | { | ||
260 | char namebuf[KSYM_NAME_LEN+1]; | ||
261 | unsigned long size, offset; | ||
262 | const char *sym_name; | ||
263 | char *modname; | ||
264 | |||
265 | sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); | ||
266 | if (sym_name) | ||
267 | seq_printf(m, "%s", sym_name); | ||
268 | else | ||
269 | seq_printf(m, "<%p>", (void *)addr); | ||
270 | } | ||
271 | |||
272 | static int tstats_show(struct seq_file *m, void *v) | ||
273 | { | ||
274 | struct timespec period; | ||
275 | struct entry *entry; | ||
276 | unsigned long ms; | ||
277 | long events = 0; | ||
278 | ktime_t time; | ||
279 | int i; | ||
280 | |||
281 | mutex_lock(&show_mutex); | ||
282 | /* | ||
283 | * If still active then calculate up to now: | ||
284 | */ | ||
285 | if (active) | ||
286 | time_stop = ktime_get(); | ||
287 | |||
288 | time = ktime_sub(time_stop, time_start); | ||
289 | |||
290 | period = ktime_to_timespec(time); | ||
291 | ms = period.tv_nsec / 1000000; | ||
292 | |||
293 | seq_puts(m, "Timer Stats Version: v0.1\n"); | ||
294 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); | ||
295 | if (atomic_read(&overflow_count)) | ||
296 | seq_printf(m, "Overflow: %d entries\n", | ||
297 | atomic_read(&overflow_count)); | ||
298 | |||
299 | for (i = 0; i < nr_entries; i++) { | ||
300 | entry = entries + i; | ||
301 | seq_printf(m, "%4lu, %5d %-16s ", | ||
302 | entry->count, entry->pid, entry->comm); | ||
303 | |||
304 | print_name_offset(m, (unsigned long)entry->start_func); | ||
305 | seq_puts(m, " ("); | ||
306 | print_name_offset(m, (unsigned long)entry->expire_func); | ||
307 | seq_puts(m, ")\n"); | ||
308 | |||
309 | events += entry->count; | ||
310 | } | ||
311 | |||
312 | ms += period.tv_sec * 1000; | ||
313 | if (!ms) | ||
314 | ms = 1; | ||
315 | |||
316 | if (events && period.tv_sec) | ||
317 | seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, | ||
318 | events / period.tv_sec, events * 1000 / ms); | ||
319 | else | ||
320 | seq_printf(m, "%ld total events\n", events); | ||
321 | |||
322 | mutex_unlock(&show_mutex); | ||
323 | |||
324 | return 0; | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | * After a state change, make sure all concurrent lookup/update | ||
329 | * activities have stopped: | ||
330 | */ | ||
331 | static void sync_access(void) | ||
332 | { | ||
333 | unsigned long flags; | ||
334 | int cpu; | ||
335 | |||
336 | for_each_online_cpu(cpu) { | ||
337 | spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); | ||
338 | /* nothing */ | ||
339 | spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); | ||
340 | } | ||
341 | } | ||
342 | |||
343 | static ssize_t tstats_write(struct file *file, const char __user *buf, | ||
344 | size_t count, loff_t *offs) | ||
345 | { | ||
346 | char ctl[2]; | ||
347 | |||
348 | if (count != 2 || *offs) | ||
349 | return -EINVAL; | ||
350 | |||
351 | if (copy_from_user(ctl, buf, count)) | ||
352 | return -EFAULT; | ||
353 | |||
354 | mutex_lock(&show_mutex); | ||
355 | switch (ctl[0]) { | ||
356 | case '0': | ||
357 | if (active) { | ||
358 | active = 0; | ||
359 | time_stop = ktime_get(); | ||
360 | sync_access(); | ||
361 | } | ||
362 | break; | ||
363 | case '1': | ||
364 | if (!active) { | ||
365 | reset_entries(); | ||
366 | time_start = ktime_get(); | ||
367 | active = 1; | ||
368 | } | ||
369 | break; | ||
370 | default: | ||
371 | count = -EINVAL; | ||
372 | } | ||
373 | mutex_unlock(&show_mutex); | ||
374 | |||
375 | return count; | ||
376 | } | ||
377 | |||
378 | static int tstats_open(struct inode *inode, struct file *filp) | ||
379 | { | ||
380 | return single_open(filp, tstats_show, NULL); | ||
381 | } | ||
382 | |||
383 | static struct file_operations tstats_fops = { | ||
384 | .open = tstats_open, | ||
385 | .read = seq_read, | ||
386 | .write = tstats_write, | ||
387 | .llseek = seq_lseek, | ||
388 | .release = seq_release, | ||
389 | }; | ||
390 | |||
391 | void __init init_timer_stats(void) | ||
392 | { | ||
393 | int cpu; | ||
394 | |||
395 | for_each_possible_cpu(cpu) | ||
396 | spin_lock_init(&per_cpu(lookup_lock, cpu)); | ||
397 | } | ||
398 | |||
399 | static int __init init_tstats_procfs(void) | ||
400 | { | ||
401 | struct proc_dir_entry *pe; | ||
402 | |||
403 | pe = create_proc_entry("timer_stats", 0644, NULL); | ||
404 | if (!pe) | ||
405 | return -ENOMEM; | ||
406 | |||
407 | pe->proc_fops = &tstats_fops; | ||
408 | |||
409 | return 0; | ||
410 | } | ||
411 | __initcall(init_tstats_procfs); | ||
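A typical session against the interface described in the file header; the sample entry is illustrative rather than captured output, following the "%4lu, %5d %-16s start_func (expire_func)" format of tstats_show():

	# echo 1 >/proc/timer_stats
	# sleep 10
	# cat /proc/timer_stats
	Timer Stats Version: v0.1
	Sample period: 10.004 s
	  10,     0 swapper          hrtimer_start (tick_sched_timer)
	  ...
	# echo 0 >/proc/timer_stats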
diff --git a/kernel/timer.c b/kernel/timer.c index c2a8ccfc2882..cb1b86a9c52f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -34,6 +34,8 @@ | |||
34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
36 | #include <linux/delay.h> | 36 | #include <linux/delay.h> |
37 | #include <linux/tick.h> | ||
38 | #include <linux/kallsyms.h> | ||
37 | 39 | ||
38 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
39 | #include <asm/unistd.h> | 41 | #include <asm/unistd.h> |
@@ -85,7 +87,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; | |||
85 | * @j: the time in (absolute) jiffies that should be rounded | 87 | * @j: the time in (absolute) jiffies that should be rounded |
86 | * @cpu: the processor number on which the timeout will happen | 88 | * @cpu: the processor number on which the timeout will happen |
87 | * | 89 | * |
88 | * __round_jiffies rounds an absolute time in the future (in jiffies) | 90 | * __round_jiffies() rounds an absolute time in the future (in jiffies) |
89 | * up or down to (approximately) full seconds. This is useful for timers | 91 | * up or down to (approximately) full seconds. This is useful for timers |
90 | * for which the exact time they fire does not matter too much, as long as | 92 | * for which the exact time they fire does not matter too much, as long as |
91 | * they fire approximately every X seconds. | 93 | * they fire approximately every X seconds. |
@@ -98,7 +100,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; | |||
98 | * processors firing at the exact same time, which could lead | 100 | * processors firing at the exact same time, which could lead |
99 | * to lock contention or spurious cache line bouncing. | 101 | * to lock contention or spurious cache line bouncing. |
100 | * | 102 | * |
101 | * The return value is the rounded version of the "j" parameter. | 103 | * The return value is the rounded version of the @j parameter. |
102 | */ | 104 | */ |
103 | unsigned long __round_jiffies(unsigned long j, int cpu) | 105 | unsigned long __round_jiffies(unsigned long j, int cpu) |
104 | { | 106 | { |
@@ -142,7 +144,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies); | |||
142 | * @j: the time in (relative) jiffies that should be rounded | 144 | * @j: the time in (relative) jiffies that should be rounded |
143 | * @cpu: the processor number on which the timeout will happen | 145 | * @cpu: the processor number on which the timeout will happen |
144 | * | 146 | * |
145 | * __round_jiffies_relative rounds a time delta in the future (in jiffies) | 147 | * __round_jiffies_relative() rounds a time delta in the future (in jiffies) |
146 | * up or down to (approximately) full seconds. This is useful for timers | 148 | * up or down to (approximately) full seconds. This is useful for timers |
147 | * for which the exact time they fire does not matter too much, as long as | 149 | * for which the exact time they fire does not matter too much, as long as |
148 | * they fire approximately every X seconds. | 150 | * they fire approximately every X seconds. |
@@ -155,7 +157,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies); | |||
155 | * processors firing at the exact same time, which could lead | 157 | * processors firing at the exact same time, which could lead |
156 | * to lock contention or spurious cache line bouncing. | 158 | * to lock contention or spurious cache line bouncing. |
157 | * | 159 | * |
158 | * The return value is the rounded version of the "j" parameter. | 160 | * The return value is the rounded version of the @j parameter. |
159 | */ | 161 | */ |
160 | unsigned long __round_jiffies_relative(unsigned long j, int cpu) | 162 | unsigned long __round_jiffies_relative(unsigned long j, int cpu) |
161 | { | 163 | { |
@@ -173,7 +175,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative); | |||
173 | * round_jiffies - function to round jiffies to a full second | 175 | * round_jiffies - function to round jiffies to a full second |
174 | * @j: the time in (absolute) jiffies that should be rounded | 176 | * @j: the time in (absolute) jiffies that should be rounded |
175 | * | 177 | * |
176 | * round_jiffies rounds an absolute time in the future (in jiffies) | 178 | * round_jiffies() rounds an absolute time in the future (in jiffies) |
177 | * up or down to (approximately) full seconds. This is useful for timers | 179 | * up or down to (approximately) full seconds. This is useful for timers |
178 | * for which the exact time they fire does not matter too much, as long as | 180 | * for which the exact time they fire does not matter too much, as long as |
179 | * they fire approximately every X seconds. | 181 | * they fire approximately every X seconds. |
@@ -182,7 +184,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative); | |||
182 | * at the same time, rather than at various times spread out. The goal | 184 | * at the same time, rather than at various times spread out. The goal |
183 | * of this is to have the CPU wake up less, which saves power. | 185 | * of this is to have the CPU wake up less, which saves power. |
184 | * | 186 | * |
185 | * The return value is the rounded version of the "j" parameter. | 187 | * The return value is the rounded version of the @j parameter. |
186 | */ | 188 | */ |
187 | unsigned long round_jiffies(unsigned long j) | 189 | unsigned long round_jiffies(unsigned long j) |
188 | { | 190 | { |
@@ -194,7 +196,7 @@ EXPORT_SYMBOL_GPL(round_jiffies); | |||
194 | * round_jiffies_relative - function to round jiffies to a full second | 196 | * round_jiffies_relative - function to round jiffies to a full second |
195 | * @j: the time in (relative) jiffies that should be rounded | 197 | * @j: the time in (relative) jiffies that should be rounded |
196 | * | 198 | * |
197 | * round_jiffies_relative rounds a time delta in the future (in jiffies) | 199 | * round_jiffies_relative() rounds a time delta in the future (in jiffies) |
198 | * up or down to (approximately) full seconds. This is useful for timers | 200 | * up or down to (approximately) full seconds. This is useful for timers |
199 | * for which the exact time they fire does not matter too much, as long as | 201 | * for which the exact time they fire does not matter too much, as long as |
200 | * they fire approximately every X seconds. | 202 | * they fire approximately every X seconds. |
@@ -203,7 +205,7 @@ EXPORT_SYMBOL_GPL(round_jiffies); | |||
203 | * at the same time, rather than at various times spread out. The goal | 205 | * at the same time, rather than at various times spread out. The goal |
204 | * of this is to have the CPU wake up less, which saves power. | 206 | * of this is to have the CPU wake up less, which saves power. |
205 | * | 207 | * |
206 | * The return value is the rounded version of the "j" parameter. | 208 | * The return value is the rounded version of the @j parameter. |
207 | */ | 209 | */ |
208 | unsigned long round_jiffies_relative(unsigned long j) | 210 | unsigned long round_jiffies_relative(unsigned long j) |
209 | { | 211 | { |
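The kernel-doc touch-ups above only add parentheses and @-notation, but they document a useful API: round_jiffies()/round_jiffies_relative() align timer expiries on whole seconds so wakeups from many CPUs batch together. A hedged usage sketch (housekeeping_timer and housekeeping_fn are illustrative names):

	static struct timer_list housekeeping_timer;

	/* Periodic, non-timing-critical work: rounding the expiry lets it
	 * fire on a full second together with other rounded timers. */
	static void housekeeping_fn(unsigned long data)
	{
		/* ... do the periodic work ... */
		mod_timer(&housekeeping_timer, round_jiffies(jiffies + 5 * HZ));
	}

	static void start_housekeeping(void)
	{
		setup_timer(&housekeeping_timer, housekeeping_fn, 0);
		mod_timer(&housekeeping_timer, round_jiffies(jiffies + 5 * HZ));
	}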
@@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | |||
262 | list_add_tail(&timer->entry, vec); | 264 | list_add_tail(&timer->entry, vec); |
263 | } | 265 | } |
264 | 266 | ||
267 | #ifdef CONFIG_TIMER_STATS | ||
268 | void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) | ||
269 | { | ||
270 | if (timer->start_site) | ||
271 | return; | ||
272 | |||
273 | timer->start_site = addr; | ||
274 | memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); | ||
275 | timer->start_pid = current->pid; | ||
276 | } | ||
277 | #endif | ||
278 | |||
265 | /** | 279 | /** |
266 | * init_timer - initialize a timer. | 280 | * init_timer - initialize a timer. |
267 | * @timer: the timer to be initialized | 281 | * @timer: the timer to be initialized |
@@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer) | |||
273 | { | 287 | { |
274 | timer->entry.next = NULL; | 288 | timer->entry.next = NULL; |
275 | timer->base = __raw_get_cpu_var(tvec_bases); | 289 | timer->base = __raw_get_cpu_var(tvec_bases); |
290 | #ifdef CONFIG_TIMER_STATS | ||
291 | timer->start_site = NULL; | ||
292 | timer->start_pid = -1; | ||
293 | memset(timer->start_comm, 0, TASK_COMM_LEN); | ||
294 | #endif | ||
276 | } | 295 | } |
277 | EXPORT_SYMBOL(init_timer); | 296 | EXPORT_SYMBOL(init_timer); |
278 | 297 | ||
279 | static inline void detach_timer(struct timer_list *timer, | 298 | static inline void detach_timer(struct timer_list *timer, |
280 | int clear_pending) | 299 | int clear_pending) |
281 | { | 300 | { |
282 | struct list_head *entry = &timer->entry; | 301 | struct list_head *entry = &timer->entry; |
283 | 302 | ||
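With CONFIG_TIMER_STATS, each timer now remembers where, by whom, and from which task it was last armed (start_site/start_comm/start_pid, initialized in init_timer() above). A sketch of how a reporting path could render those fields; print_timer_owner() is a hypothetical helper, not part of the patch:

	static void print_timer_owner(struct timer_list *timer)
	{
	#ifdef CONFIG_TIMER_STATS
		/* start_site is the code address that armed the timer;
		 * start_comm/start_pid identify the arming task */
		printk(KERN_DEBUG "timer %p armed by %s[%d] from %p\n",
		       timer, timer->start_comm, timer->start_pid,
		       timer->start_site);
	#endif
	}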
@@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
324 | unsigned long flags; | 343 | unsigned long flags; |
325 | int ret = 0; | 344 | int ret = 0; |
326 | 345 | ||
346 | timer_stats_timer_set_start_info(timer); | ||
327 | BUG_ON(!timer->function); | 347 | BUG_ON(!timer->function); |
328 | 348 | ||
329 | base = lock_timer_base(timer, &flags); | 349 | base = lock_timer_base(timer, &flags); |
@@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
374 | tvec_base_t *base = per_cpu(tvec_bases, cpu); | 394 | tvec_base_t *base = per_cpu(tvec_bases, cpu); |
375 | unsigned long flags; | 395 | unsigned long flags; |
376 | 396 | ||
397 | timer_stats_timer_set_start_info(timer); | ||
377 | BUG_ON(timer_pending(timer) || !timer->function); | 398 | BUG_ON(timer_pending(timer) || !timer->function); |
378 | spin_lock_irqsave(&base->lock, flags); | 399 | spin_lock_irqsave(&base->lock, flags); |
379 | timer->base = base; | 400 | timer->base = base; |
@@ -387,7 +408,7 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
387 | * @timer: the timer to be modified | 408 | * @timer: the timer to be modified |
388 | * @expires: new timeout in jiffies | 409 | * @expires: new timeout in jiffies |
389 | * | 410 | * |
390 | * mod_timer is a more efficient way to update the expire field of an | 411 | * mod_timer() is a more efficient way to update the expire field of an |
391 | * active timer (if the timer is inactive it will be activated) | 412 | * active timer (if the timer is inactive it will be activated) |
392 | * | 413 | * |
393 | * mod_timer(timer, expires) is equivalent to: | 414 | * mod_timer(timer, expires) is equivalent to: |
@@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) | |||
406 | { | 427 | { |
407 | BUG_ON(!timer->function); | 428 | BUG_ON(!timer->function); |
408 | 429 | ||
430 | timer_stats_timer_set_start_info(timer); | ||
409 | /* | 431 | /* |
410 | * This is a common optimization triggered by the | 432 | * This is a common optimization triggered by the |
411 | * networking code - if the timer is re-modified | 433 | * networking code - if the timer is re-modified |
@@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer) | |||
436 | unsigned long flags; | 458 | unsigned long flags; |
437 | int ret = 0; | 459 | int ret = 0; |
438 | 460 | ||
461 | timer_stats_timer_clear_start_info(timer); | ||
439 | if (timer_pending(timer)) { | 462 | if (timer_pending(timer)) { |
440 | base = lock_timer_base(timer, &flags); | 463 | base = lock_timer_base(timer, &flags); |
441 | if (timer_pending(timer)) { | 464 | if (timer_pending(timer)) { |
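The timer_stats_timer_set_start_info()/timer_stats_timer_clear_start_info() calls sprinkled into __mod_timer(), add_timer_on(), mod_timer() and del_timer() above must compile away when CONFIG_TIMER_STATS is off. A sketch of the stub pattern such hooks typically use (the real definitions live in the timer headers):

	#ifdef CONFIG_TIMER_STATS
	extern void __timer_stats_timer_set_start_info(struct timer_list *timer,
						       void *addr);

	static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
	{
		/* record the caller as the site that armed the timer */
		__timer_stats_timer_set_start_info(timer,
						   __builtin_return_address(0));
	}
	#else
	static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
	{
	}
	#endif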
@@ -490,7 +513,7 @@ out: | |||
490 | * the timer it also makes sure the handler has finished executing on other | 513 | * the timer it also makes sure the handler has finished executing on other |
491 | * CPUs. | 514 | * CPUs. |
492 | * | 515 | * |
493 | * Synchronization rules: callers must prevent restarting of the timer, | 516 | * Synchronization rules: Callers must prevent restarting of the timer, |
494 | * otherwise this function is meaningless. It must not be called from | 517 | * otherwise this function is meaningless. It must not be called from |
495 | * interrupt contexts. The caller must not hold locks which would prevent | 518 | * interrupt contexts. The caller must not hold locks which would prevent |
496 | * completion of the timer's handler. The timer's handler must not call | 519 | * completion of the timer's handler. The timer's handler must not call |
@@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base) | |||
569 | fn = timer->function; | 592 | fn = timer->function; |
570 | data = timer->data; | 593 | data = timer->data; |
571 | 594 | ||
595 | timer_stats_account_timer(timer); | ||
596 | |||
572 | set_running_timer(base, timer); | 597 | set_running_timer(base, timer); |
573 | detach_timer(timer, 1); | 598 | detach_timer(timer, 1); |
574 | spin_unlock_irq(&base->lock); | 599 | spin_unlock_irq(&base->lock); |
@@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base) | |||
591 | spin_unlock_irq(&base->lock); | 616 | spin_unlock_irq(&base->lock); |
592 | } | 617 | } |
593 | 618 | ||
594 | #ifdef CONFIG_NO_IDLE_HZ | 619 | #if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) |
595 | /* | 620 | /* |
596 | * Find out when the next timer event is due to happen. This | 621 | * Find out when the next timer event is due to happen. This |
597 | * is used on S/390 to stop all activity when a cpu is idle. | 622 | * is used on S/390 to stop all activity when a cpu is idle. |
598 | * This function needs to be called with interrupts disabled. | 623 | * This function needs to be called with interrupts disabled. |
599 | */ | 624 | */ |
600 | unsigned long next_timer_interrupt(void) | 625 | static unsigned long __next_timer_interrupt(tvec_base_t *base) |
601 | { | 626 | { |
602 | tvec_base_t *base; | 627 | unsigned long timer_jiffies = base->timer_jiffies; |
603 | struct list_head *list; | 628 | unsigned long expires = timer_jiffies + (LONG_MAX >> 1); |
629 | int index, slot, array, found = 0; | ||
604 | struct timer_list *nte; | 630 | struct timer_list *nte; |
605 | unsigned long expires; | ||
606 | unsigned long hr_expires = MAX_JIFFY_OFFSET; | ||
607 | ktime_t hr_delta; | ||
608 | tvec_t *varray[4]; | 631 | tvec_t *varray[4]; |
609 | int i, j; | ||
610 | |||
611 | hr_delta = hrtimer_get_next_event(); | ||
612 | if (hr_delta.tv64 != KTIME_MAX) { | ||
613 | struct timespec tsdelta; | ||
614 | tsdelta = ktime_to_timespec(hr_delta); | ||
615 | hr_expires = timespec_to_jiffies(&tsdelta); | ||
616 | if (hr_expires < 3) | ||
617 | return hr_expires + jiffies; | ||
618 | } | ||
619 | hr_expires += jiffies; | ||
620 | |||
621 | base = __get_cpu_var(tvec_bases); | ||
622 | spin_lock(&base->lock); | ||
623 | expires = base->timer_jiffies + (LONG_MAX >> 1); | ||
624 | list = NULL; | ||
625 | 632 | ||
626 | /* Look for timer events in tv1. */ | 633 | /* Look for timer events in tv1. */ |
627 | j = base->timer_jiffies & TVR_MASK; | 634 | index = slot = timer_jiffies & TVR_MASK; |
628 | do { | 635 | do { |
629 | list_for_each_entry(nte, base->tv1.vec + j, entry) { | 636 | list_for_each_entry(nte, base->tv1.vec + slot, entry) { |
637 | found = 1; | ||
630 | expires = nte->expires; | 638 | expires = nte->expires; |
631 | if (j < (base->timer_jiffies & TVR_MASK)) | 639 | /* Look at the cascade bucket(s)? */ |
632 | list = base->tv2.vec + (INDEX(0)); | 640 | if (!index || slot < index) |
633 | goto found; | 641 | goto cascade; |
642 | return expires; | ||
634 | } | 643 | } |
635 | j = (j + 1) & TVR_MASK; | 644 | slot = (slot + 1) & TVR_MASK; |
636 | } while (j != (base->timer_jiffies & TVR_MASK)); | 645 | } while (slot != index); |
646 | |||
647 | cascade: | ||
648 | /* Calculate the next cascade event */ | ||
649 | if (index) | ||
650 | timer_jiffies += TVR_SIZE - index; | ||
651 | timer_jiffies >>= TVR_BITS; | ||
637 | 652 | ||
638 | /* Check tv2-tv5. */ | 653 | /* Check tv2-tv5. */ |
639 | varray[0] = &base->tv2; | 654 | varray[0] = &base->tv2; |
640 | varray[1] = &base->tv3; | 655 | varray[1] = &base->tv3; |
641 | varray[2] = &base->tv4; | 656 | varray[2] = &base->tv4; |
642 | varray[3] = &base->tv5; | 657 | varray[3] = &base->tv5; |
643 | for (i = 0; i < 4; i++) { | 658 | |
644 | j = INDEX(i); | 659 | for (array = 0; array < 4; array++) { |
660 | tvec_t *varp = varray[array]; | ||
661 | |||
662 | index = slot = timer_jiffies & TVN_MASK; | ||
645 | do { | 663 | do { |
646 | if (list_empty(varray[i]->vec + j)) { | 664 | list_for_each_entry(nte, varp->vec + slot, entry) { |
647 | j = (j + 1) & TVN_MASK; | 665 | found = 1; |
648 | continue; | ||
649 | } | ||
650 | list_for_each_entry(nte, varray[i]->vec + j, entry) | ||
651 | if (time_before(nte->expires, expires)) | 666 | if (time_before(nte->expires, expires)) |
652 | expires = nte->expires; | 667 | expires = nte->expires; |
653 | if (j < (INDEX(i)) && i < 3) | 668 | } |
654 | list = varray[i + 1]->vec + (INDEX(i + 1)); | 669 | /* |
655 | goto found; | 670 | * Are we still searching for the first timer, or are |
656 | } while (j != (INDEX(i))); | 671 | * we looking at the cascade buckets? |
657 | } | 672 | */ |
658 | found: | 673 | if (found) { |
659 | if (list) { | 674 | /* Look at the cascade bucket(s)? */ |
660 | /* | 675 | if (!index || slot < index) |
661 | * The search wrapped. We need to look at the next list | 676 | break; |
662 | * from next tv element that would cascade into tv element | 677 | return expires; |
663 | * where we found the timer element. | 678 | } |
664 | */ | 679 | slot = (slot + 1) & TVN_MASK; |
665 | list_for_each_entry(nte, list, entry) { | 680 | } while (slot != index); |
666 | if (time_before(nte->expires, expires)) | 681 | |
667 | expires = nte->expires; | 682 | if (index) |
668 | } | 683 | timer_jiffies += TVN_SIZE - index; |
684 | timer_jiffies >>= TVN_BITS; | ||
669 | } | 685 | } |
670 | spin_unlock(&base->lock); | 686 | return expires; |
687 | } | ||
671 | 688 | ||
672 | /* | 689 | /* |
673 | * It can happen that other CPUs service timer IRQs and increment | 690 | * Check if the next hrtimer event is before the next timer wheel |
674 | * jiffies, but we have not yet got a local timer tick to process | 691 | * event: |
675 | * the timer wheels. In that case, the expiry time can be before | 692 | */ |
676 | * jiffies, but since the high-resolution timer here is relative to | 693 | static unsigned long cmp_next_hrtimer_event(unsigned long now, |
677 | * jiffies, the default expression when high-resolution timers are | 694 | unsigned long expires) |
678 | * not active, | 695 | { |
679 | * | 696 | ktime_t hr_delta = hrtimer_get_next_event(); |
680 | * time_before(MAX_JIFFY_OFFSET + jiffies, expires) | 697 | struct timespec tsdelta; |
681 | * | ||
682 | * would falsely evaluate to true. If that is the case, just | ||
683 | * return jiffies so that we can immediately fire the local timer | ||
684 | */ | ||
685 | if (time_before(expires, jiffies)) | ||
686 | return jiffies; | ||
687 | 698 | ||
688 | if (time_before(hr_expires, expires)) | 699 | if (hr_delta.tv64 == KTIME_MAX) |
689 | return hr_expires; | 700 | return expires; |
690 | 701 | ||
702 | if (hr_delta.tv64 <= TICK_NSEC) | ||
703 | return now; | ||
704 | |||
705 | tsdelta = ktime_to_timespec(hr_delta); | ||
706 | now += timespec_to_jiffies(&tsdelta); | ||
707 | if (time_before(now, expires)) | ||
708 | return now; | ||
691 | return expires; | 709 | return expires; |
692 | } | 710 | } |
711 | |||
712 | /** | ||
713 | * get_next_timer_interrupt - return the jiffy of the next pending timer | ||
714 | */ | ||
715 | unsigned long get_next_timer_interrupt(unsigned long now) | ||
716 | { | ||
717 | tvec_base_t *base = __get_cpu_var(tvec_bases); | ||
718 | unsigned long expires; | ||
719 | |||
720 | spin_lock(&base->lock); | ||
721 | expires = __next_timer_interrupt(base); | ||
722 | spin_unlock(&base->lock); | ||
723 | |||
724 | if (time_before_eq(expires, now)) | ||
725 | return now; | ||
726 | |||
727 | return cmp_next_hrtimer_event(now, expires); | ||
728 | } | ||
729 | |||
730 | #ifdef CONFIG_NO_IDLE_HZ | ||
731 | unsigned long next_timer_interrupt(void) | ||
732 | { | ||
733 | return get_next_timer_interrupt(jiffies); | ||
734 | } | ||
735 | #endif | ||
736 | |||
693 | #endif | 737 | #endif |
694 | 738 | ||
695 | /******************************************************************/ | 739 | /******************************************************************/ |
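__next_timer_interrupt() above replaces the old goto-based search: it walks tv1 from the current slot, and once a timer is found it only keeps scanning slots that would cascade in before that timer, then repeats the game on tv2-tv5. A simplified sketch of the first-level walk, assuming the TVR_* constants from kernel/timer.c; first_level_next() is illustrative and omits the cascade-bucket checks:

	static unsigned long first_level_next(struct list_head *wheel,
					      unsigned long timer_jiffies)
	{
		unsigned long expires = timer_jiffies + (LONG_MAX >> 1);
		int index = timer_jiffies & TVR_MASK, slot = index;
		struct timer_list *nte;

		do {
			/* all timers in one tv1 slot expire on the same jiffy,
			 * so the first entry of the first non-empty slot wins */
			list_for_each_entry(nte, wheel + slot, entry)
				return nte->expires;
			slot = (slot + 1) & TVR_MASK;
		} while (slot != index);

		return expires;		/* no timer on this wheel level */
	}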
@@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday); | |||
832 | * | 876 | * |
833 | * Accumulates current time interval and initializes new clocksource | 877 | * Accumulates current time interval and initializes new clocksource |
834 | */ | 878 | */ |
835 | static int change_clocksource(void) | 879 | static void change_clocksource(void) |
836 | { | 880 | { |
837 | struct clocksource *new; | 881 | struct clocksource *new; |
838 | cycle_t now; | 882 | cycle_t now; |
839 | u64 nsec; | 883 | u64 nsec; |
884 | |||
840 | new = clocksource_get_next(); | 885 | new = clocksource_get_next(); |
841 | if (clock != new) { | 886 | |
842 | now = clocksource_read(new); | 887 | if (clock == new) |
843 | nsec = __get_nsec_offset(); | 888 | return; |
844 | timespec_add_ns(&xtime, nsec); | 889 | |
845 | 890 | now = clocksource_read(new); | |
846 | clock = new; | 891 | nsec = __get_nsec_offset(); |
847 | clock->cycle_last = now; | 892 | timespec_add_ns(&xtime, nsec); |
848 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | 893 | |
849 | clock->name); | 894 | clock = new; |
850 | return 1; | 895 | clock->cycle_last = now; |
851 | } else if (clock->update_callback) { | 896 | |
852 | return clock->update_callback(); | 897 | clock->error = 0; |
853 | } | 898 | clock->xtime_nsec = 0; |
854 | return 0; | 899 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); |
900 | |||
901 | tick_clock_notify(); | ||
902 | |||
903 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | ||
904 | clock->name); | ||
855 | } | 905 | } |
856 | #else | 906 | #else |
857 | static inline int change_clocksource(void) | 907 | static inline void change_clocksource(void) { } |
858 | { | ||
859 | return 0; | ||
860 | } | ||
861 | #endif | 908 | #endif |
862 | 909 | ||
863 | /** | 910 | /** |
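The rewritten change_clocksource() above switches to guard-clause style and, crucially, now resets the NTP error accumulators itself: the nanoseconds already measured on the old clock are flushed into xtime before the new clock starts from a fresh cycle base. An annotated restatement of that sequence, as a sketch rather than the patch verbatim (pick_next_source() stands in for clocksource_get_next()):

	static void switch_clocksource_sketch(void)
	{
		struct clocksource *new = pick_next_source();

		if (clock == new)
			return;				/* nothing to switch */

		/* account everything the old clock has measured so far */
		timespec_add_ns(&xtime, __get_nsec_offset());

		clock = new;
		clock->cycle_last = clocksource_read(clock);	/* fresh base */
		clock->error = 0;		/* restart error accounting */
		clock->xtime_nsec = 0;
		clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
	}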
@@ -871,33 +918,56 @@ int timekeeping_is_continuous(void) | |||
871 | do { | 918 | do { |
872 | seq = read_seqbegin(&xtime_lock); | 919 | seq = read_seqbegin(&xtime_lock); |
873 | 920 | ||
874 | ret = clock->is_continuous; | 921 | ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; |
875 | 922 | ||
876 | } while (read_seqretry(&xtime_lock, seq)); | 923 | } while (read_seqretry(&xtime_lock, seq)); |
877 | 924 | ||
878 | return ret; | 925 | return ret; |
879 | } | 926 | } |
880 | 927 | ||
928 | /** | ||
929 | * read_persistent_clock - Return time in seconds from the persistent clock. | ||
930 | * | ||
931 | * Weak dummy function for arches that do not yet support it. | ||
932 | * Returns seconds from epoch using the battery backed persistent clock. | ||
933 | * Returns zero if unsupported. | ||
934 | * | ||
935 | * XXX - Do be sure to remove it once all arches implement it. | ||
936 | */ | ||
937 | unsigned long __attribute__((weak)) read_persistent_clock(void) | ||
938 | { | ||
939 | return 0; | ||
940 | } | ||
941 | |||
881 | /* | 942 | /* |
882 | * timekeeping_init - Initializes the clocksource and common timekeeping values | 943 | * timekeeping_init - Initializes the clocksource and common timekeeping values |
883 | */ | 944 | */ |
884 | void __init timekeeping_init(void) | 945 | void __init timekeeping_init(void) |
885 | { | 946 | { |
886 | unsigned long flags; | 947 | unsigned long flags; |
948 | unsigned long sec = read_persistent_clock(); | ||
887 | 949 | ||
888 | write_seqlock_irqsave(&xtime_lock, flags); | 950 | write_seqlock_irqsave(&xtime_lock, flags); |
889 | 951 | ||
890 | ntp_clear(); | 952 | ntp_clear(); |
891 | 953 | ||
892 | clock = clocksource_get_next(); | 954 | clock = clocksource_get_next(); |
893 | clocksource_calculate_interval(clock, tick_nsec); | 955 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); |
894 | clock->cycle_last = clocksource_read(clock); | 956 | clock->cycle_last = clocksource_read(clock); |
895 | 957 | ||
958 | xtime.tv_sec = sec; | ||
959 | xtime.tv_nsec = 0; | ||
960 | set_normalized_timespec(&wall_to_monotonic, | ||
961 | -xtime.tv_sec, -xtime.tv_nsec); | ||
962 | |||
896 | write_sequnlock_irqrestore(&xtime_lock, flags); | 963 | write_sequnlock_irqrestore(&xtime_lock, flags); |
897 | } | 964 | } |
898 | 965 | ||
899 | 966 | /* flag indicating whether timekeeping is suspended */ |
900 | static int timekeeping_suspended; | 967 | static int timekeeping_suspended; |
968 | /* time in seconds when suspend began */ | ||
969 | static unsigned long timekeeping_suspend_time; | ||
970 | |||
901 | /** | 971 | /** |
902 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 972 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
903 | * @dev: unused | 973 | * @dev: unused |
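read_persistent_clock() above is declared __attribute__((weak)), so the generic zero-returning dummy links in only until an architecture provides a strong definition of the same name. A hedged sketch of such an override (read_rtc_seconds() is a hypothetical board helper, not a real kernel API):

	/* arch/<arch>/kernel/time.c -- a strong symbol silently replaces
	 * the weak generic dummy at link time */
	unsigned long read_persistent_clock(void)
	{
		return read_rtc_seconds();	/* seconds since the epoch */
	}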
@@ -909,13 +979,26 @@ static int timekeeping_suspended; | |||
909 | static int timekeeping_resume(struct sys_device *dev) | 979 | static int timekeeping_resume(struct sys_device *dev) |
910 | { | 980 | { |
911 | unsigned long flags; | 981 | unsigned long flags; |
982 | unsigned long now = read_persistent_clock(); | ||
912 | 983 | ||
913 | write_seqlock_irqsave(&xtime_lock, flags); | 984 | write_seqlock_irqsave(&xtime_lock, flags); |
914 | /* restart the last cycle value */ | 985 | |
986 | if (now && (now > timekeeping_suspend_time)) { | ||
987 | unsigned long sleep_length = now - timekeeping_suspend_time; | ||
988 | |||
989 | xtime.tv_sec += sleep_length; | ||
990 | wall_to_monotonic.tv_sec -= sleep_length; | ||
991 | } | ||
992 | /* re-base the last cycle value */ | ||
915 | clock->cycle_last = clocksource_read(clock); | 993 | clock->cycle_last = clocksource_read(clock); |
916 | clock->error = 0; | 994 | clock->error = 0; |
917 | timekeeping_suspended = 0; | 995 | timekeeping_suspended = 0; |
918 | write_sequnlock_irqrestore(&xtime_lock, flags); | 996 | write_sequnlock_irqrestore(&xtime_lock, flags); |
997 | |||
998 | touch_softlockup_watchdog(); | ||
999 | /* Resume hrtimers */ | ||
1000 | clock_was_set(); | ||
1001 | |||
919 | return 0; | 1002 | return 0; |
920 | } | 1003 | } |
921 | 1004 | ||
@@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
925 | 1008 | ||
926 | write_seqlock_irqsave(&xtime_lock, flags); | 1009 | write_seqlock_irqsave(&xtime_lock, flags); |
927 | timekeeping_suspended = 1; | 1010 | timekeeping_suspended = 1; |
1011 | timekeeping_suspend_time = read_persistent_clock(); | ||
928 | write_sequnlock_irqrestore(&xtime_lock, flags); | 1012 | write_sequnlock_irqrestore(&xtime_lock, flags); |
929 | return 0; | 1013 | return 0; |
930 | } | 1014 | } |
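The suspend/resume pair above finally accounts for time slept: the persistent clock is sampled on both sides and the difference is added to the wall clock while the monotonic offset absorbs it. A worked restatement with concrete numbers (the values in the comments are illustrative):

	unsigned long now = read_persistent_clock();	/* e.g. 1000042 */

	if (now && now > timekeeping_suspend_time) {	/* e.g. 1000000 */
		unsigned long sleep_length = now - timekeeping_suspend_time;
							/* slept 42s */
		xtime.tv_sec += sleep_length;		/* wall clock jumps ahead 42s */
		wall_to_monotonic.tv_sec -= sleep_length;
							/* CLOCK_MONOTONIC sees none of it */
	}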
@@ -1089,11 +1173,8 @@ static void update_wall_time(void) | |||
1089 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; | 1173 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; |
1090 | 1174 | ||
1091 | /* check to see if there is a new clocksource to use */ | 1175 | /* check to see if there is a new clocksource to use */ |
1092 | if (change_clocksource()) { | 1176 | change_clocksource(); |
1093 | clock->error = 0; | 1177 | update_vsyscall(&xtime, clock); |
1094 | clock->xtime_nsec = 0; | ||
1095 | clocksource_calculate_interval(clock, tick_nsec); | ||
1096 | } | ||
1097 | } | 1178 | } |
1098 | 1179 | ||
1099 | /* | 1180 | /* |
@@ -1162,11 +1243,9 @@ static inline void calc_load(unsigned long ticks) | |||
1162 | * This read-write spinlock protects us from races in SMP while | 1243 | * This read-write spinlock protects us from races in SMP while |
1163 | * playing with xtime and avenrun. | 1244 | * playing with xtime and avenrun. |
1164 | */ | 1245 | */ |
1165 | #ifndef ARCH_HAVE_XTIME_LOCK | 1246 | __attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); |
1166 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | ||
1167 | 1247 | ||
1168 | EXPORT_SYMBOL(xtime_lock); | 1248 | EXPORT_SYMBOL(xtime_lock); |
1169 | #endif | ||
1170 | 1249 | ||
1171 | /* | 1250 | /* |
1172 | * This function runs timers and the timer-tq in bottom half context. | 1251 | * This function runs timers and the timer-tq in bottom half context. |
@@ -1175,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h) | |||
1175 | { | 1254 | { |
1176 | tvec_base_t *base = __get_cpu_var(tvec_bases); | 1255 | tvec_base_t *base = __get_cpu_var(tvec_bases); |
1177 | 1256 | ||
1178 | hrtimer_run_queues(); | 1257 | hrtimer_run_queues(); |
1258 | |||
1179 | if (time_after_eq(jiffies, base->timer_jiffies)) | 1259 | if (time_after_eq(jiffies, base->timer_jiffies)) |
1180 | __run_timers(base); | 1260 | __run_timers(base); |
1181 | } | 1261 | } |
@@ -1392,17 +1472,16 @@ asmlinkage long sys_gettid(void) | |||
1392 | } | 1472 | } |
1393 | 1473 | ||
1394 | /** | 1474 | /** |
1395 | * sys_sysinfo - fill in sysinfo struct | 1475 | * do_sysinfo - fill in sysinfo struct |
1396 | * @info: pointer to buffer to fill | 1476 | * @info: pointer to buffer to fill |
1397 | */ | 1477 | */ |
1398 | asmlinkage long sys_sysinfo(struct sysinfo __user *info) | 1478 | int do_sysinfo(struct sysinfo *info) |
1399 | { | 1479 | { |
1400 | struct sysinfo val; | ||
1401 | unsigned long mem_total, sav_total; | 1480 | unsigned long mem_total, sav_total; |
1402 | unsigned int mem_unit, bitcount; | 1481 | unsigned int mem_unit, bitcount; |
1403 | unsigned long seq; | 1482 | unsigned long seq; |
1404 | 1483 | ||
1405 | memset((char *)&val, 0, sizeof(struct sysinfo)); | 1484 | memset(info, 0, sizeof(struct sysinfo)); |
1406 | 1485 | ||
1407 | do { | 1486 | do { |
1408 | struct timespec tp; | 1487 | struct timespec tp; |
@@ -1422,17 +1501,17 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) | |||
1422 | tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; | 1501 | tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; |
1423 | tp.tv_sec++; | 1502 | tp.tv_sec++; |
1424 | } | 1503 | } |
1425 | val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); | 1504 | info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); |
1426 | 1505 | ||
1427 | val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); | 1506 | info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); |
1428 | val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); | 1507 | info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); |
1429 | val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); | 1508 | info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); |
1430 | 1509 | ||
1431 | val.procs = nr_threads; | 1510 | info->procs = nr_threads; |
1432 | } while (read_seqretry(&xtime_lock, seq)); | 1511 | } while (read_seqretry(&xtime_lock, seq)); |
1433 | 1512 | ||
1434 | si_meminfo(&val); | 1513 | si_meminfo(info); |
1435 | si_swapinfo(&val); | 1514 | si_swapinfo(info); |
1436 | 1515 | ||
1437 | /* | 1516 | /* |
1438 | * If the sum of all the available memory (i.e. ram + swap) | 1517 | * If the sum of all the available memory (i.e. ram + swap) |
@@ -1443,11 +1522,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) | |||
1443 | * -Erik Andersen <andersee@debian.org> | 1522 | * -Erik Andersen <andersee@debian.org> |
1444 | */ | 1523 | */ |
1445 | 1524 | ||
1446 | mem_total = val.totalram + val.totalswap; | 1525 | mem_total = info->totalram + info->totalswap; |
1447 | if (mem_total < val.totalram || mem_total < val.totalswap) | 1526 | if (mem_total < info->totalram || mem_total < info->totalswap) |
1448 | goto out; | 1527 | goto out; |
1449 | bitcount = 0; | 1528 | bitcount = 0; |
1450 | mem_unit = val.mem_unit; | 1529 | mem_unit = info->mem_unit; |
1451 | while (mem_unit > 1) { | 1530 | while (mem_unit > 1) { |
1452 | bitcount++; | 1531 | bitcount++; |
1453 | mem_unit >>= 1; | 1532 | mem_unit >>= 1; |
@@ -1459,22 +1538,31 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) | |||
1459 | 1538 | ||
1460 | /* | 1539 | /* |
1461 | * If mem_total did not overflow, multiply all memory values by | 1540 | * If mem_total did not overflow, multiply all memory values by |
1462 | * val.mem_unit and set it to 1. This leaves things compatible | 1541 | * info->mem_unit and set it to 1. This leaves things compatible |
1463 | * with 2.2.x, and also retains compatibility with earlier 2.4.x | 1542 | * with 2.2.x, and also retains compatibility with earlier 2.4.x |
1464 | * kernels... | 1543 | * kernels... |
1465 | */ | 1544 | */ |
1466 | 1545 | ||
1467 | val.mem_unit = 1; | 1546 | info->mem_unit = 1; |
1468 | val.totalram <<= bitcount; | 1547 | info->totalram <<= bitcount; |
1469 | val.freeram <<= bitcount; | 1548 | info->freeram <<= bitcount; |
1470 | val.sharedram <<= bitcount; | 1549 | info->sharedram <<= bitcount; |
1471 | val.bufferram <<= bitcount; | 1550 | info->bufferram <<= bitcount; |
1472 | val.totalswap <<= bitcount; | 1551 | info->totalswap <<= bitcount; |
1473 | val.freeswap <<= bitcount; | 1552 | info->freeswap <<= bitcount; |
1474 | val.totalhigh <<= bitcount; | 1553 | info->totalhigh <<= bitcount; |
1475 | val.freehigh <<= bitcount; | 1554 | info->freehigh <<= bitcount; |
1555 | |||
1556 | out: | ||
1557 | return 0; | ||
1558 | } | ||
1559 | |||
1560 | asmlinkage long sys_sysinfo(struct sysinfo __user *info) | ||
1561 | { | ||
1562 | struct sysinfo val; | ||
1563 | |||
1564 | do_sysinfo(&val); | ||
1476 | 1565 | ||
1477 | out: | ||
1478 | if (copy_to_user(info, &val, sizeof(struct sysinfo))) | 1566 | if (copy_to_user(info, &val, sizeof(struct sysinfo))) |
1479 | return -EFAULT; | 1567 | return -EFAULT; |
1480 | 1568 | ||
@@ -1613,6 +1701,8 @@ void __init init_timers(void) | |||
1613 | int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1701 | int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, |
1614 | (void *)(long)smp_processor_id()); | 1702 | (void *)(long)smp_processor_id()); |
1615 | 1703 | ||
1704 | init_timer_stats(); | ||
1705 | |||
1616 | BUG_ON(err == NOTIFY_BAD); | 1706 | BUG_ON(err == NOTIFY_BAD); |
1617 | register_cpu_notifier(&timers_nb); | 1707 | register_cpu_notifier(&timers_nb); |
1618 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); | 1708 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); |
@@ -1624,7 +1714,7 @@ struct time_interpolator *time_interpolator __read_mostly; | |||
1624 | static struct time_interpolator *time_interpolator_list __read_mostly; | 1714 | static struct time_interpolator *time_interpolator_list __read_mostly; |
1625 | static DEFINE_SPINLOCK(time_interpolator_lock); | 1715 | static DEFINE_SPINLOCK(time_interpolator_lock); |
1626 | 1716 | ||
1627 | static inline u64 time_interpolator_get_cycles(unsigned int src) | 1717 | static inline cycles_t time_interpolator_get_cycles(unsigned int src) |
1628 | { | 1718 | { |
1629 | unsigned long (*x)(void); | 1719 | unsigned long (*x)(void); |
1630 | 1720 | ||
@@ -1650,8 +1740,8 @@ static inline u64 time_interpolator_get_counter(int writelock) | |||
1650 | 1740 | ||
1651 | if (time_interpolator->jitter) | 1741 | if (time_interpolator->jitter) |
1652 | { | 1742 | { |
1653 | u64 lcycle; | 1743 | cycles_t lcycle; |
1654 | u64 now; | 1744 | cycles_t now; |
1655 | 1745 | ||
1656 | do { | 1746 | do { |
1657 | lcycle = time_interpolator->last_cycle; | 1747 | lcycle = time_interpolator->last_cycle; |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index baacc3691415..658f638c402c 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -22,8 +22,6 @@ | |||
22 | #include <linux/acct.h> | 22 | #include <linux/acct.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | 24 | ||
25 | |||
26 | #define USEC_PER_TICK (USEC_PER_SEC/HZ) | ||
27 | /* | 25 | /* |
28 | * fill in basic accounting fields | 26 | * fill in basic accounting fields |
29 | */ | 27 | */ |
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c new file mode 100644 index 000000000000..f22b9dbd2a9c --- /dev/null +++ b/kernel/utsname_sysctl.c | |||
@@ -0,0 +1,146 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 | ||
3 | * | ||
4 | * Author: Eric Biederman <ebiederm@xmision.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License as | ||
8 | * published by the Free Software Foundation, version 2 of the | ||
9 | * License. | ||
10 | */ | ||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <linux/uts.h> | ||
14 | #include <linux/utsname.h> | ||
15 | #include <linux/version.h> | ||
16 | #include <linux/sysctl.h> | ||
17 | |||
18 | static void *get_uts(ctl_table *table, int write) | ||
19 | { | ||
20 | char *which = table->data; | ||
21 | #ifdef CONFIG_UTS_NS | ||
22 | struct uts_namespace *uts_ns = current->nsproxy->uts_ns; | ||
23 | which = (which - (char *)&init_uts_ns) + (char *)uts_ns; | ||
24 | #endif | ||
25 | if (!write) | ||
26 | down_read(&uts_sem); | ||
27 | else | ||
28 | down_write(&uts_sem); | ||
29 | return which; | ||
30 | } | ||
31 | |||
32 | static void put_uts(ctl_table *table, int write, void *which) | ||
33 | { | ||
34 | if (!write) | ||
35 | up_read(&uts_sem); | ||
36 | else | ||
37 | up_write(&uts_sem); | ||
38 | } | ||
39 | |||
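get_uts() above rebases a pointer: table->data always points at a field inside init_uts_ns, and when UTS namespaces are enabled the same byte offset is applied to the current task's uts_ns, so each namespace sees its own copy of the data. The idiom in isolation (rebase_field() is an illustrative name):

	/* Given a field address inside a template object, return the address
	 * of the same field inside another instance of the same type. */
	static char *rebase_field(char *field_in_template, void *template_base,
				  void *instance_base)
	{
		size_t offset = field_in_template - (char *)template_base;

		return (char *)instance_base + offset;
	}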
40 | #ifdef CONFIG_PROC_FS | ||
41 | /* | ||
42 | * Special case of dostring for the UTS structure. It must take | ||
43 | * uts_sem before touching the data. Should this be in kernel/sys.c? | ||
44 | */ | ||
45 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
46 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
47 | { | ||
48 | struct ctl_table uts_table; | ||
49 | int r; | ||
50 | memcpy(&uts_table, table, sizeof(uts_table)); | ||
51 | uts_table.data = get_uts(table, write); | ||
52 | r = proc_dostring(&uts_table, write, filp, buffer, lenp, ppos); | ||
53 | put_uts(table, write, uts_table.data); | ||
54 | return r; | ||
55 | } | ||
56 | #else | ||
57 | #define proc_do_uts_string NULL | ||
58 | #endif | ||
59 | |||
60 | |||
61 | #ifdef CONFIG_SYSCTL_SYSCALL | ||
62 | /* The generic string strategy routine: */ | ||
63 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
64 | void __user *oldval, size_t __user *oldlenp, | ||
65 | void __user *newval, size_t newlen) | ||
66 | { | ||
67 | struct ctl_table uts_table; | ||
68 | int r, write; | ||
69 | write = newval && newlen; | ||
70 | memcpy(&uts_table, table, sizeof(uts_table)); | ||
71 | uts_table.data = get_uts(table, write); | ||
72 | r = sysctl_string(&uts_table, name, nlen, | ||
73 | oldval, oldlenp, newval, newlen); | ||
74 | put_uts(table, write, uts_table.data); | ||
75 | return r; | ||
76 | } | ||
77 | #else | ||
78 | #define sysctl_uts_string NULL | ||
79 | #endif | ||
80 | |||
81 | static struct ctl_table uts_kern_table[] = { | ||
82 | { | ||
83 | .ctl_name = KERN_OSTYPE, | ||
84 | .procname = "ostype", | ||
85 | .data = init_uts_ns.name.sysname, | ||
86 | .maxlen = sizeof(init_uts_ns.name.sysname), | ||
87 | .mode = 0444, | ||
88 | .proc_handler = proc_do_uts_string, | ||
89 | .strategy = sysctl_uts_string, | ||
90 | }, | ||
91 | { | ||
92 | .ctl_name = KERN_OSRELEASE, | ||
93 | .procname = "osrelease", | ||
94 | .data = init_uts_ns.name.release, | ||
95 | .maxlen = sizeof(init_uts_ns.name.release), | ||
96 | .mode = 0444, | ||
97 | .proc_handler = proc_do_uts_string, | ||
98 | .strategy = sysctl_uts_string, | ||
99 | }, | ||
100 | { | ||
101 | .ctl_name = KERN_VERSION, | ||
102 | .procname = "version", | ||
103 | .data = init_uts_ns.name.version, | ||
104 | .maxlen = sizeof(init_uts_ns.name.version), | ||
105 | .mode = 0444, | ||
106 | .proc_handler = proc_do_uts_string, | ||
107 | .strategy = sysctl_uts_string, | ||
108 | }, | ||
109 | { | ||
110 | .ctl_name = KERN_NODENAME, | ||
111 | .procname = "hostname", | ||
112 | .data = init_uts_ns.name.nodename, | ||
113 | .maxlen = sizeof(init_uts_ns.name.nodename), | ||
114 | .mode = 0644, | ||
115 | .proc_handler = proc_do_uts_string, | ||
116 | .strategy = sysctl_uts_string, | ||
117 | }, | ||
118 | { | ||
119 | .ctl_name = KERN_DOMAINNAME, | ||
120 | .procname = "domainname", | ||
121 | .data = init_uts_ns.name.domainname, | ||
122 | .maxlen = sizeof(init_uts_ns.name.domainname), | ||
123 | .mode = 0644, | ||
124 | .proc_handler = proc_do_uts_string, | ||
125 | .strategy = sysctl_uts_string, | ||
126 | }, | ||
127 | {} | ||
128 | }; | ||
129 | |||
130 | static struct ctl_table uts_root_table[] = { | ||
131 | { | ||
132 | .ctl_name = CTL_KERN, | ||
133 | .procname = "kernel", | ||
134 | .mode = 0555, | ||
135 | .child = uts_kern_table, | ||
136 | }, | ||
137 | {} | ||
138 | }; | ||
139 | |||
140 | static int __init utsname_sysctl_init(void) | ||
141 | { | ||
142 | register_sysctl_table(uts_root_table); | ||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | __initcall(utsname_sysctl_init); | ||
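Once utsname_sysctl_init() has run, the table above surfaces as the familiar files under /proc/sys/kernel. A userspace sanity check, as a sketch (error handling trimmed; the 65-byte buffer assumes the usual 64-character UTS field limit):

	#include <stdio.h>

	int main(void)
	{
		char name[65];
		FILE *f = fopen("/proc/sys/kernel/hostname", "r");

		if (f && fgets(name, sizeof(name), f))
			printf("hostname: %s", name);
		if (f)
			fclose(f);
		return 0;
	}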
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a3da07c5af28..b6fa5e63085d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
218 | } | 218 | } |
219 | EXPORT_SYMBOL_GPL(queue_work); | 219 | EXPORT_SYMBOL_GPL(queue_work); |
220 | 220 | ||
221 | static void delayed_work_timer_fn(unsigned long __data) | 221 | void delayed_work_timer_fn(unsigned long __data) |
222 | { | 222 | { |
223 | struct delayed_work *dwork = (struct delayed_work *)__data; | 223 | struct delayed_work *dwork = (struct delayed_work *)__data; |
224 | struct workqueue_struct *wq = get_wq_data(&dwork->work); | 224 | struct workqueue_struct *wq = get_wq_data(&dwork->work); |
@@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, | |||
245 | struct timer_list *timer = &dwork->timer; | 245 | struct timer_list *timer = &dwork->timer; |
246 | struct work_struct *work = &dwork->work; | 246 | struct work_struct *work = &dwork->work; |
247 | 247 | ||
248 | timer_stats_timer_set_start_info(timer); | ||
248 | if (delay == 0) | 249 | if (delay == 0) |
249 | return queue_work(wq, work); | 250 | return queue_work(wq, work); |
250 | 251 | ||
@@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work); | |||
593 | * After waiting for a given time this puts a job in the kernel-global | 594 | * After waiting for a given time this puts a job in the kernel-global |
594 | * workqueue. | 595 | * workqueue. |
595 | */ | 596 | */ |
596 | int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) | 597 | int fastcall schedule_delayed_work(struct delayed_work *dwork, |
598 | unsigned long delay) | ||
597 | { | 599 | { |
600 | timer_stats_timer_set_start_info(&dwork->timer); | ||
598 | return queue_delayed_work(keventd_wq, dwork, delay); | 601 | return queue_delayed_work(keventd_wq, dwork, delay); |
599 | } | 602 | } |
600 | EXPORT_SYMBOL(schedule_delayed_work); | 603 | EXPORT_SYMBOL(schedule_delayed_work); |
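schedule_delayed_work() above now tags the embedded timer, so with the patch applied /proc/timer_stats attributes delayed-work wakeups to the scheduling site. A hedged usage sketch against the 2.6.20+ workqueue API (my_work and my_work_fn are illustrative names):

	static void my_work_fn(struct work_struct *work)
	{
		/* runs in keventd context roughly two seconds after scheduling */
	}

	static DECLARE_DELAYED_WORK(my_work, my_work_fn);

	static void kick_my_work(void)
	{
		schedule_delayed_work(&my_work, 2 * HZ);	/* timer_stats now sees us */
	}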
@@ -656,8 +659,7 @@ void flush_scheduled_work(void) | |||
656 | EXPORT_SYMBOL(flush_scheduled_work); | 659 | EXPORT_SYMBOL(flush_scheduled_work); |
657 | 660 | ||
658 | /** | 661 | /** |
659 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed | 662 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work. |
660 | * work whose handler rearms the delayed work. | ||
661 | * @wq: the controlling workqueue structure | 663 | * @wq: the controlling workqueue structure |
662 | * @dwork: the delayed work struct | 664 | * @dwork: the delayed work struct |
663 | */ | 665 | */ |
@@ -670,8 +672,7 @@ void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, | |||
670 | EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); | 672 | EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); |
671 | 673 | ||
672 | /** | 674 | /** |
673 | * cancel_rearming_delayed_work - reliably kill off a delayed keventd | 675 | * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work. |
674 | * work whose handler rearms the delayed work. | ||
675 | * @dwork: the delayed work struct | 676 | * @dwork: the delayed work struct |
676 | */ | 677 | */ |
677 | void cancel_rearming_delayed_work(struct delayed_work *dwork) | 678 | void cancel_rearming_delayed_work(struct delayed_work *dwork) |