author     Andrea Bastoni <bastoni@cs.unc.edu>   2011-08-27 09:43:54 -0400
committer  Andrea Bastoni <bastoni@cs.unc.edu>   2011-08-27 10:06:11 -0400
commit     7b1bb388bc879ffcc6c69b567816d5c354afe42b (patch)
tree       5a217fdfb0b5e5a327bdcd624506337c1ae1fe32 /kernel
parent     7d754596756240fa918b94cd0c3011c77a638987 (diff)
parent     02f8c6aee8df3cdc935e9bdd4f2d020306035dbe (diff)
Merge 'Linux v3.0' into Litmus
Some notes:
* Litmus^RT scheduling class is the topmost scheduling class
(above stop_sched_class).
* The scheduler_ipi() function (e.g., in smp_reschedule_interrupt())
may increase IPI latencies.
* Added a path into schedule() to quickly re-evaluate the scheduling
decision without becoming preemptible again (see the sketch below).
This used to be a standard path before the removal of the BKL.
Conflicts:
Makefile
arch/arm/kernel/calls.S
arch/arm/kernel/smp.c
arch/x86/include/asm/unistd_32.h
arch/x86/kernel/smp.c
arch/x86/kernel/syscall_table_32.S
include/linux/hrtimer.h
kernel/printk.c
kernel/sched.c
kernel/sched_fair.c
Diffstat (limited to 'kernel')
196 files changed, 24954 insertions, 11587 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE | |||
199 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE | 199 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE |
200 | 200 | ||
201 | config MUTEX_SPIN_ON_OWNER | 201 | config MUTEX_SPIN_ON_OWNER |
202 | def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES | 202 | def_bool SMP && !DEBUG_MUTEXES |
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a74be0..2d64cfcc8b42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,8 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | |||
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ |
13 | async.o range.o | 13 | async.o range.o jump_label.o |
14 | obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o | ||
15 | obj-y += groups.o | 14 | obj-y += groups.o |
16 | 15 | ||
17 | ifdef CONFIG_FUNCTION_TRACER | 16 | ifdef CONFIG_FUNCTION_TRACER |
@@ -22,7 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg | |||
22 | CFLAGS_REMOVE_rtmutex-debug.o = -pg | 21 | CFLAGS_REMOVE_rtmutex-debug.o = -pg |
23 | CFLAGS_REMOVE_cgroup-debug.o = -pg | 22 | CFLAGS_REMOVE_cgroup-debug.o = -pg |
24 | CFLAGS_REMOVE_sched_clock.o = -pg | 23 | CFLAGS_REMOVE_sched_clock.o = -pg |
25 | CFLAGS_REMOVE_perf_event.o = -pg | 24 | CFLAGS_REMOVE_irq_work.o = -pg |
26 | endif | 25 | endif |
27 | 26 | ||
28 | obj-$(CONFIG_FREEZER) += freezer.o | 27 | obj-$(CONFIG_FREEZER) += freezer.o |
@@ -43,7 +42,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | |||
43 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | 42 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o |
44 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | 43 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o |
45 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 44 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
46 | obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o | 45 | obj-$(CONFIG_SMP) += smp.o |
47 | ifneq ($(CONFIG_SMP),y) | 46 | ifneq ($(CONFIG_SMP),y) |
48 | obj-y += up.o | 47 | obj-y += up.o |
49 | endif | 48 | endif |
@@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o | |||
62 | obj-$(CONFIG_CGROUPS) += cgroup.o | 61 | obj-$(CONFIG_CGROUPS) += cgroup.o |
63 | obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o | 62 | obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o |
64 | obj-$(CONFIG_CPUSETS) += cpuset.o | 63 | obj-$(CONFIG_CPUSETS) += cpuset.o |
65 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o | ||
66 | obj-$(CONFIG_UTS_NS) += utsname.o | 64 | obj-$(CONFIG_UTS_NS) += utsname.o |
67 | obj-$(CONFIG_USER_NS) += user_namespace.o | 65 | obj-$(CONFIG_USER_NS) += user_namespace.o |
68 | obj-$(CONFIG_PID_NS) += pid_namespace.o | 66 | obj-$(CONFIG_PID_NS) += pid_namespace.o |
@@ -86,6 +84,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o | |||
86 | obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o | 84 | obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o |
87 | obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o | 85 | obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o |
88 | obj-$(CONFIG_TINY_RCU) += rcutiny.o | 86 | obj-$(CONFIG_TINY_RCU) += rcutiny.o |
87 | obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o | ||
89 | obj-$(CONFIG_RELAY) += relay.o | 88 | obj-$(CONFIG_RELAY) += relay.o |
90 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 89 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
91 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 90 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
@@ -99,11 +98,15 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ | |||
99 | obj-$(CONFIG_TRACING) += trace/ | 98 | obj-$(CONFIG_TRACING) += trace/ |
100 | obj-$(CONFIG_X86_DS) += trace/ | 99 | obj-$(CONFIG_X86_DS) += trace/ |
101 | obj-$(CONFIG_RING_BUFFER) += trace/ | 100 | obj-$(CONFIG_RING_BUFFER) += trace/ |
101 | obj-$(CONFIG_TRACEPOINTS) += trace/ | ||
102 | obj-$(CONFIG_SMP) += sched_cpupri.o | 102 | obj-$(CONFIG_SMP) += sched_cpupri.o |
103 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o | 103 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
104 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 104 | |
105 | obj-$(CONFIG_PERF_EVENTS) += events/ | ||
106 | |||
105 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | 107 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o |
106 | obj-$(CONFIG_PADATA) += padata.o | 108 | obj-$(CONFIG_PADATA) += padata.o |
109 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | ||
107 | 110 | ||
108 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | 111 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) |
109 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 112 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
@@ -119,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h | |||
119 | # config_data.h contains the same information as ikconfig.h but gzipped. | 122 | # config_data.h contains the same information as ikconfig.h but gzipped. |
120 | # Info from config_data can be extracted from /proc/config* | 123 | # Info from config_data can be extracted from /proc/config* |
121 | targets += config_data.gz | 124 | targets += config_data.gz |
122 | $(obj)/config_data.gz: .config FORCE | 125 | $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE |
123 | $(call if_changed,gzip) | 126 | $(call if_changed,gzip) |
124 | 127 | ||
125 | quiet_cmd_ikconfiggz = IKCFG $@ | 128 | quiet_cmd_ikconfiggz = IKCFG $@ |
diff --git a/kernel/audit.c b/kernel/audit.c
index d96045789b54..939500317066 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized; | |||
74 | int audit_enabled; | 74 | int audit_enabled; |
75 | int audit_ever_enabled; | 75 | int audit_ever_enabled; |
76 | 76 | ||
77 | EXPORT_SYMBOL_GPL(audit_enabled); | ||
78 | |||
77 | /* Default state when kernel boots without any parameters. */ | 79 | /* Default state when kernel boots without any parameters. */ |
78 | static int audit_default; | 80 | static int audit_default; |
79 | 81 | ||
@@ -400,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb) | |||
400 | if (err < 0) { | 402 | if (err < 0) { |
401 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ | 403 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ |
402 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | 404 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); |
403 | audit_log_lost("auditd dissapeared\n"); | 405 | audit_log_lost("auditd disappeared\n"); |
404 | audit_pid = 0; | 406 | audit_pid = 0; |
405 | /* we might get lucky and get this in the next auditd */ | 407 | /* we might get lucky and get this in the next auditd */ |
406 | audit_hold_skb(skb); | 408 | audit_hold_skb(skb); |
@@ -467,23 +469,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) | |||
467 | struct task_struct *tsk; | 469 | struct task_struct *tsk; |
468 | int err; | 470 | int err; |
469 | 471 | ||
470 | read_lock(&tasklist_lock); | 472 | rcu_read_lock(); |
471 | tsk = find_task_by_vpid(pid); | 473 | tsk = find_task_by_vpid(pid); |
472 | err = -ESRCH; | 474 | if (!tsk) { |
473 | if (!tsk) | 475 | rcu_read_unlock(); |
474 | goto out; | 476 | return -ESRCH; |
475 | err = 0; | 477 | } |
476 | 478 | get_task_struct(tsk); | |
477 | spin_lock_irq(&tsk->sighand->siglock); | 479 | rcu_read_unlock(); |
478 | if (!tsk->signal->audit_tty) | 480 | err = tty_audit_push_task(tsk, loginuid, sessionid); |
479 | err = -EPERM; | 481 | put_task_struct(tsk); |
480 | spin_unlock_irq(&tsk->sighand->siglock); | ||
481 | if (err) | ||
482 | goto out; | ||
483 | |||
484 | tty_audit_push_task(tsk, loginuid, sessionid); | ||
485 | out: | ||
486 | read_unlock(&tasklist_lock); | ||
487 | return err; | 482 | return err; |
488 | } | 483 | } |
489 | 484 | ||
@@ -506,7 +501,7 @@ int audit_send_list(void *_dest) | |||
506 | } | 501 | } |
507 | 502 | ||
508 | struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, | 503 | struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, |
509 | int multi, void *payload, int size) | 504 | int multi, const void *payload, int size) |
510 | { | 505 | { |
511 | struct sk_buff *skb; | 506 | struct sk_buff *skb; |
512 | struct nlmsghdr *nlh; | 507 | struct nlmsghdr *nlh; |
@@ -555,8 +550,8 @@ static int audit_send_reply_thread(void *arg) | |||
555 | * Allocates an skb, builds the netlink message, and sends it to the pid. | 550 | * Allocates an skb, builds the netlink message, and sends it to the pid. |
556 | * No failure notifications. | 551 | * No failure notifications. |
557 | */ | 552 | */ |
558 | void audit_send_reply(int pid, int seq, int type, int done, int multi, | 553 | static void audit_send_reply(int pid, int seq, int type, int done, int multi, |
559 | void *payload, int size) | 554 | const void *payload, int size) |
560 | { | 555 | { |
561 | struct sk_buff *skb; | 556 | struct sk_buff *skb; |
562 | struct task_struct *tsk; | 557 | struct task_struct *tsk; |
@@ -678,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
678 | 673 | ||
679 | pid = NETLINK_CREDS(skb)->pid; | 674 | pid = NETLINK_CREDS(skb)->pid; |
680 | uid = NETLINK_CREDS(skb)->uid; | 675 | uid = NETLINK_CREDS(skb)->uid; |
681 | loginuid = NETLINK_CB(skb).loginuid; | 676 | loginuid = audit_get_loginuid(current); |
682 | sessionid = NETLINK_CB(skb).sessionid; | 677 | sessionid = audit_get_sessionid(current); |
683 | sid = NETLINK_CB(skb).sid; | 678 | security_task_getsecid(current, &sid); |
684 | seq = nlh->nlmsg_seq; | 679 | seq = nlh->nlmsg_seq; |
685 | data = NLMSG_DATA(nlh); | 680 | data = NLMSG_DATA(nlh); |
686 | 681 | ||
@@ -880,40 +875,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
880 | case AUDIT_TTY_GET: { | 875 | case AUDIT_TTY_GET: { |
881 | struct audit_tty_status s; | 876 | struct audit_tty_status s; |
882 | struct task_struct *tsk; | 877 | struct task_struct *tsk; |
878 | unsigned long flags; | ||
883 | 879 | ||
884 | read_lock(&tasklist_lock); | 880 | rcu_read_lock(); |
885 | tsk = find_task_by_vpid(pid); | 881 | tsk = find_task_by_vpid(pid); |
886 | if (!tsk) | 882 | if (tsk && lock_task_sighand(tsk, &flags)) { |
887 | err = -ESRCH; | ||
888 | else { | ||
889 | spin_lock_irq(&tsk->sighand->siglock); | ||
890 | s.enabled = tsk->signal->audit_tty != 0; | 883 | s.enabled = tsk->signal->audit_tty != 0; |
891 | spin_unlock_irq(&tsk->sighand->siglock); | 884 | unlock_task_sighand(tsk, &flags); |
892 | } | 885 | } else |
893 | read_unlock(&tasklist_lock); | 886 | err = -ESRCH; |
894 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, | 887 | rcu_read_unlock(); |
895 | &s, sizeof(s)); | 888 | |
889 | if (!err) | ||
890 | audit_send_reply(NETLINK_CB(skb).pid, seq, | ||
891 | AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); | ||
896 | break; | 892 | break; |
897 | } | 893 | } |
898 | case AUDIT_TTY_SET: { | 894 | case AUDIT_TTY_SET: { |
899 | struct audit_tty_status *s; | 895 | struct audit_tty_status *s; |
900 | struct task_struct *tsk; | 896 | struct task_struct *tsk; |
897 | unsigned long flags; | ||
901 | 898 | ||
902 | if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) | 899 | if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) |
903 | return -EINVAL; | 900 | return -EINVAL; |
904 | s = data; | 901 | s = data; |
905 | if (s->enabled != 0 && s->enabled != 1) | 902 | if (s->enabled != 0 && s->enabled != 1) |
906 | return -EINVAL; | 903 | return -EINVAL; |
907 | read_lock(&tasklist_lock); | 904 | rcu_read_lock(); |
908 | tsk = find_task_by_vpid(pid); | 905 | tsk = find_task_by_vpid(pid); |
909 | if (!tsk) | 906 | if (tsk && lock_task_sighand(tsk, &flags)) { |
910 | err = -ESRCH; | ||
911 | else { | ||
912 | spin_lock_irq(&tsk->sighand->siglock); | ||
913 | tsk->signal->audit_tty = s->enabled != 0; | 907 | tsk->signal->audit_tty = s->enabled != 0; |
914 | spin_unlock_irq(&tsk->sighand->siglock); | 908 | unlock_task_sighand(tsk, &flags); |
915 | } | 909 | } else |
916 | read_unlock(&tasklist_lock); | 910 | err = -ESRCH; |
911 | rcu_read_unlock(); | ||
917 | break; | 912 | break; |
918 | } | 913 | } |
919 | default: | 914 | default: |
diff --git a/kernel/audit.h b/kernel/audit.h
index f7206db4e13d..91e7071c4d2c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path, | |||
84 | int *dirlen); | 84 | int *dirlen); |
85 | extern struct sk_buff * audit_make_reply(int pid, int seq, int type, | 85 | extern struct sk_buff * audit_make_reply(int pid, int seq, int type, |
86 | int done, int multi, | 86 | int done, int multi, |
87 | void *payload, int size); | 87 | const void *payload, int size); |
88 | extern void audit_send_reply(int pid, int seq, int type, | ||
89 | int done, int multi, | ||
90 | void *payload, int size); | ||
91 | extern void audit_panic(const char *message); | 88 | extern void audit_panic(const char *message); |
92 | 89 | ||
93 | struct audit_netlink_list { | 90 | struct audit_netlink_list { |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7f18d3a4527e..e99dda04b126 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -223,7 +223,7 @@ static void untag_chunk(struct node *p) | |||
223 | { | 223 | { |
224 | struct audit_chunk *chunk = find_chunk(p); | 224 | struct audit_chunk *chunk = find_chunk(p); |
225 | struct fsnotify_mark *entry = &chunk->mark; | 225 | struct fsnotify_mark *entry = &chunk->mark; |
226 | struct audit_chunk *new; | 226 | struct audit_chunk *new = NULL; |
227 | struct audit_tree *owner; | 227 | struct audit_tree *owner; |
228 | int size = chunk->count - 1; | 228 | int size = chunk->count - 1; |
229 | int i, j; | 229 | int i, j; |
@@ -232,9 +232,14 @@ static void untag_chunk(struct node *p) | |||
232 | 232 | ||
233 | spin_unlock(&hash_lock); | 233 | spin_unlock(&hash_lock); |
234 | 234 | ||
235 | if (size) | ||
236 | new = alloc_chunk(size); | ||
237 | |||
235 | spin_lock(&entry->lock); | 238 | spin_lock(&entry->lock); |
236 | if (chunk->dead || !entry->i.inode) { | 239 | if (chunk->dead || !entry->i.inode) { |
237 | spin_unlock(&entry->lock); | 240 | spin_unlock(&entry->lock); |
241 | if (new) | ||
242 | free_chunk(new); | ||
238 | goto out; | 243 | goto out; |
239 | } | 244 | } |
240 | 245 | ||
@@ -255,9 +260,9 @@ static void untag_chunk(struct node *p) | |||
255 | goto out; | 260 | goto out; |
256 | } | 261 | } |
257 | 262 | ||
258 | new = alloc_chunk(size); | ||
259 | if (!new) | 263 | if (!new) |
260 | goto Fallback; | 264 | goto Fallback; |
265 | |||
261 | fsnotify_duplicate_mark(&new->mark, entry); | 266 | fsnotify_duplicate_mark(&new->mark, entry); |
262 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { | 267 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { |
263 | free_chunk(new); | 268 | free_chunk(new); |
@@ -602,7 +607,7 @@ void audit_trim_trees(void) | |||
602 | spin_lock(&hash_lock); | 607 | spin_lock(&hash_lock); |
603 | list_for_each_entry(node, &tree->chunks, list) { | 608 | list_for_each_entry(node, &tree->chunks, list) { |
604 | struct audit_chunk *chunk = find_chunk(node); | 609 | struct audit_chunk *chunk = find_chunk(node); |
605 | /* this could be NULL if the watch is dieing else where... */ | 610 | /* this could be NULL if the watch is dying else where... */ |
606 | struct inode *inode = chunk->mark.i.inode; | 611 | struct inode *inode = chunk->mark.i.inode; |
607 | node->index |= 1U<<31; | 612 | node->index |= 1U<<31; |
608 | if (iterate_mounts(compare_root, inode, root_mnt)) | 613 | if (iterate_mounts(compare_root, inode, root_mnt)) |
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f0c9b2e7542d..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -60,7 +60,7 @@ struct audit_parent { | |||
60 | }; | 60 | }; |
61 | 61 | ||
62 | /* fsnotify handle. */ | 62 | /* fsnotify handle. */ |
63 | struct fsnotify_group *audit_watch_group; | 63 | static struct fsnotify_group *audit_watch_group; |
64 | 64 | ||
65 | /* fsnotify events we care about. */ | 65 | /* fsnotify events we care about. */ |
66 | #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ | 66 | #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ |
@@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch) | |||
123 | } | 123 | } |
124 | } | 124 | } |
125 | 125 | ||
126 | void audit_remove_watch(struct audit_watch *watch) | 126 | static void audit_remove_watch(struct audit_watch *watch) |
127 | { | 127 | { |
128 | list_del(&watch->wlist); | 128 | list_del(&watch->wlist); |
129 | audit_put_parent(watch->parent); | 129 | audit_put_parent(watch->parent); |
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) | |||
144 | } | 144 | } |
145 | 145 | ||
146 | /* Initialize a parent watch entry. */ | 146 | /* Initialize a parent watch entry. */ |
147 | static struct audit_parent *audit_init_parent(struct nameidata *ndp) | 147 | static struct audit_parent *audit_init_parent(struct path *path) |
148 | { | 148 | { |
149 | struct inode *inode = ndp->path.dentry->d_inode; | 149 | struct inode *inode = path->dentry->d_inode; |
150 | struct audit_parent *parent; | 150 | struct audit_parent *parent; |
151 | int ret; | 151 | int ret; |
152 | 152 | ||
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
353 | } | 353 | } |
354 | 354 | ||
355 | /* Get path information necessary for adding watches. */ | 355 | /* Get path information necessary for adding watches. */ |
356 | static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) | 356 | static int audit_get_nd(struct audit_watch *watch, struct path *parent) |
357 | { | 357 | { |
358 | struct nameidata *ndparent, *ndwatch; | 358 | struct nameidata nd; |
359 | struct dentry *d; | ||
359 | int err; | 360 | int err; |
360 | 361 | ||
361 | ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); | 362 | err = kern_path_parent(watch->path, &nd); |
362 | if (unlikely(!ndparent)) | 363 | if (err) |
363 | return -ENOMEM; | 364 | return err; |
364 | 365 | ||
365 | ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); | 366 | if (nd.last_type != LAST_NORM) { |
366 | if (unlikely(!ndwatch)) { | 367 | path_put(&nd.path); |
367 | kfree(ndparent); | 368 | return -EINVAL; |
368 | return -ENOMEM; | ||
369 | } | 369 | } |
370 | 370 | ||
371 | err = path_lookup(path, LOOKUP_PARENT, ndparent); | 371 | mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); |
372 | if (err) { | 372 | d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); |
373 | kfree(ndparent); | 373 | if (IS_ERR(d)) { |
374 | kfree(ndwatch); | 374 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); |
375 | return err; | 375 | path_put(&nd.path); |
376 | return PTR_ERR(d); | ||
376 | } | 377 | } |
377 | 378 | if (d->d_inode) { | |
378 | err = path_lookup(path, 0, ndwatch); | 379 | /* update watch filter fields */ |
379 | if (err) { | 380 | watch->dev = d->d_inode->i_sb->s_dev; |
380 | kfree(ndwatch); | 381 | watch->ino = d->d_inode->i_ino; |
381 | ndwatch = NULL; | ||
382 | } | 382 | } |
383 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); | ||
383 | 384 | ||
384 | *ndp = ndparent; | 385 | *parent = nd.path; |
385 | *ndw = ndwatch; | 386 | dput(d); |
386 | |||
387 | return 0; | 387 | return 0; |
388 | } | 388 | } |
389 | 389 | ||
390 | /* Release resources used for watch path information. */ | ||
391 | static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) | ||
392 | { | ||
393 | if (ndp) { | ||
394 | path_put(&ndp->path); | ||
395 | kfree(ndp); | ||
396 | } | ||
397 | if (ndw) { | ||
398 | path_put(&ndw->path); | ||
399 | kfree(ndw); | ||
400 | } | ||
401 | } | ||
402 | |||
403 | /* Associate the given rule with an existing parent. | 390 | /* Associate the given rule with an existing parent. |
404 | * Caller must hold audit_filter_mutex. */ | 391 | * Caller must hold audit_filter_mutex. */ |
405 | static void audit_add_to_parent(struct audit_krule *krule, | 392 | static void audit_add_to_parent(struct audit_krule *krule, |
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) | |||
440 | { | 427 | { |
441 | struct audit_watch *watch = krule->watch; | 428 | struct audit_watch *watch = krule->watch; |
442 | struct audit_parent *parent; | 429 | struct audit_parent *parent; |
443 | struct nameidata *ndp = NULL, *ndw = NULL; | 430 | struct path parent_path; |
444 | int h, ret = 0; | 431 | int h, ret = 0; |
445 | 432 | ||
446 | mutex_unlock(&audit_filter_mutex); | 433 | mutex_unlock(&audit_filter_mutex); |
447 | 434 | ||
448 | /* Avoid calling path_lookup under audit_filter_mutex. */ | 435 | /* Avoid calling path_lookup under audit_filter_mutex. */ |
449 | ret = audit_get_nd(watch->path, &ndp, &ndw); | 436 | ret = audit_get_nd(watch, &parent_path); |
450 | if (ret) { | ||
451 | /* caller expects mutex locked */ | ||
452 | mutex_lock(&audit_filter_mutex); | ||
453 | goto error; | ||
454 | } | ||
455 | 437 | ||
438 | /* caller expects mutex locked */ | ||
456 | mutex_lock(&audit_filter_mutex); | 439 | mutex_lock(&audit_filter_mutex); |
457 | 440 | ||
458 | /* update watch filter fields */ | 441 | if (ret) |
459 | if (ndw) { | 442 | return ret; |
460 | watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; | ||
461 | watch->ino = ndw->path.dentry->d_inode->i_ino; | ||
462 | } | ||
463 | 443 | ||
464 | /* either find an old parent or attach a new one */ | 444 | /* either find an old parent or attach a new one */ |
465 | parent = audit_find_parent(ndp->path.dentry->d_inode); | 445 | parent = audit_find_parent(parent_path.dentry->d_inode); |
466 | if (!parent) { | 446 | if (!parent) { |
467 | parent = audit_init_parent(ndp); | 447 | parent = audit_init_parent(&parent_path); |
468 | if (IS_ERR(parent)) { | 448 | if (IS_ERR(parent)) { |
469 | ret = PTR_ERR(parent); | 449 | ret = PTR_ERR(parent); |
470 | goto error; | 450 | goto error; |
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) | |||
479 | h = audit_hash_ino((u32)watch->ino); | 459 | h = audit_hash_ino((u32)watch->ino); |
480 | *list = &audit_inode_hash[h]; | 460 | *list = &audit_inode_hash[h]; |
481 | error: | 461 | error: |
482 | audit_put_nd(ndp, ndw); /* NULL args OK */ | 462 | path_put(&parent_path); |
483 | return ret; | 463 | return ret; |
484 | |||
485 | } | 464 | } |
486 | 465 | ||
487 | void audit_remove_watch_rule(struct audit_krule *krule) | 466 | void audit_remove_watch_rule(struct audit_krule *krule) |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eb7675499fb5..f8277c80d678 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, | |||
1238 | for (i = 0; i < rule->field_count; i++) { | 1238 | for (i = 0; i < rule->field_count; i++) { |
1239 | struct audit_field *f = &rule->fields[i]; | 1239 | struct audit_field *f = &rule->fields[i]; |
1240 | int result = 0; | 1240 | int result = 0; |
1241 | u32 sid; | ||
1241 | 1242 | ||
1242 | switch (f->type) { | 1243 | switch (f->type) { |
1243 | case AUDIT_PID: | 1244 | case AUDIT_PID: |
@@ -1250,7 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, | |||
1250 | result = audit_comparator(cb->creds.gid, f->op, f->val); | 1251 | result = audit_comparator(cb->creds.gid, f->op, f->val); |
1251 | break; | 1252 | break; |
1252 | case AUDIT_LOGINUID: | 1253 | case AUDIT_LOGINUID: |
1253 | result = audit_comparator(cb->loginuid, f->op, f->val); | 1254 | result = audit_comparator(audit_get_loginuid(current), |
1255 | f->op, f->val); | ||
1256 | break; | ||
1257 | case AUDIT_SUBJ_USER: | ||
1258 | case AUDIT_SUBJ_ROLE: | ||
1259 | case AUDIT_SUBJ_TYPE: | ||
1260 | case AUDIT_SUBJ_SEN: | ||
1261 | case AUDIT_SUBJ_CLR: | ||
1262 | if (f->lsm_rule) { | ||
1263 | security_task_getsecid(current, &sid); | ||
1264 | result = security_audit_rule_match(sid, | ||
1265 | f->type, | ||
1266 | f->op, | ||
1267 | f->lsm_rule, | ||
1268 | NULL); | ||
1269 | } | ||
1254 | break; | 1270 | break; |
1255 | } | 1271 | } |
1256 | 1272 | ||
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b31c130d034..00d79df03e76 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -241,6 +241,10 @@ struct audit_context { | |||
241 | pid_t pid; | 241 | pid_t pid; |
242 | struct audit_cap_data cap; | 242 | struct audit_cap_data cap; |
243 | } capset; | 243 | } capset; |
244 | struct { | ||
245 | int fd; | ||
246 | int flags; | ||
247 | } mmap; | ||
244 | }; | 248 | }; |
245 | int fds[2]; | 249 | int fds[2]; |
246 | 250 | ||
@@ -439,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) | |||
439 | 443 | ||
440 | /* Determine if any context name data matches a rule's watch data */ | 444 | /* Determine if any context name data matches a rule's watch data */ |
441 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 | 445 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 |
442 | * otherwise. */ | 446 | * otherwise. |
447 | * | ||
448 | * If task_creation is true, this is an explicit indication that we are | ||
449 | * filtering a task rule at task creation time. This and tsk == current are | ||
450 | * the only situations where tsk->cred may be accessed without an rcu read lock. | ||
451 | */ | ||
443 | static int audit_filter_rules(struct task_struct *tsk, | 452 | static int audit_filter_rules(struct task_struct *tsk, |
444 | struct audit_krule *rule, | 453 | struct audit_krule *rule, |
445 | struct audit_context *ctx, | 454 | struct audit_context *ctx, |
446 | struct audit_names *name, | 455 | struct audit_names *name, |
447 | enum audit_state *state) | 456 | enum audit_state *state, |
457 | bool task_creation) | ||
448 | { | 458 | { |
449 | const struct cred *cred = get_task_cred(tsk); | 459 | const struct cred *cred; |
450 | int i, j, need_sid = 1; | 460 | int i, j, need_sid = 1; |
451 | u32 sid; | 461 | u32 sid; |
452 | 462 | ||
463 | cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); | ||
464 | |||
453 | for (i = 0; i < rule->field_count; i++) { | 465 | for (i = 0; i < rule->field_count; i++) { |
454 | struct audit_field *f = &rule->fields[i]; | 466 | struct audit_field *f = &rule->fields[i]; |
455 | int result = 0; | 467 | int result = 0; |
@@ -633,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
633 | break; | 645 | break; |
634 | } | 646 | } |
635 | 647 | ||
636 | if (!result) { | 648 | if (!result) |
637 | put_cred(cred); | ||
638 | return 0; | 649 | return 0; |
639 | } | ||
640 | } | 650 | } |
641 | 651 | ||
642 | if (ctx) { | 652 | if (ctx) { |
@@ -652,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
652 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | 662 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; |
653 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | 663 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; |
654 | } | 664 | } |
655 | put_cred(cred); | ||
656 | return 1; | 665 | return 1; |
657 | } | 666 | } |
658 | 667 | ||
@@ -667,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) | |||
667 | 676 | ||
668 | rcu_read_lock(); | 677 | rcu_read_lock(); |
669 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { | 678 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { |
670 | if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { | 679 | if (audit_filter_rules(tsk, &e->rule, NULL, NULL, |
680 | &state, true)) { | ||
671 | if (state == AUDIT_RECORD_CONTEXT) | 681 | if (state == AUDIT_RECORD_CONTEXT) |
672 | *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); | 682 | *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); |
673 | rcu_read_unlock(); | 683 | rcu_read_unlock(); |
@@ -701,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
701 | list_for_each_entry_rcu(e, list, list) { | 711 | list_for_each_entry_rcu(e, list, list) { |
702 | if ((e->rule.mask[word] & bit) == bit && | 712 | if ((e->rule.mask[word] & bit) == bit && |
703 | audit_filter_rules(tsk, &e->rule, ctx, NULL, | 713 | audit_filter_rules(tsk, &e->rule, ctx, NULL, |
704 | &state)) { | 714 | &state, false)) { |
705 | rcu_read_unlock(); | 715 | rcu_read_unlock(); |
706 | ctx->current_state = state; | 716 | ctx->current_state = state; |
707 | return state; | 717 | return state; |
@@ -739,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) | |||
739 | 749 | ||
740 | list_for_each_entry_rcu(e, list, list) { | 750 | list_for_each_entry_rcu(e, list, list) { |
741 | if ((e->rule.mask[word] & bit) == bit && | 751 | if ((e->rule.mask[word] & bit) == bit && |
742 | audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { | 752 | audit_filter_rules(tsk, &e->rule, ctx, n, |
753 | &state, false)) { | ||
743 | rcu_read_unlock(); | 754 | rcu_read_unlock(); |
744 | ctx->current_state = state; | 755 | ctx->current_state = state; |
745 | return; | 756 | return; |
@@ -1007,7 +1018,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
1007 | /* | 1018 | /* |
1008 | * to_send and len_sent accounting are very loose estimates. We aren't | 1019 | * to_send and len_sent accounting are very loose estimates. We aren't |
1009 | * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being | 1020 | * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being |
1010 | * within about 500 bytes (next page boundry) | 1021 | * within about 500 bytes (next page boundary) |
1011 | * | 1022 | * |
1012 | * why snprintf? an int is up to 12 digits long. if we just assumed when | 1023 | * why snprintf? an int is up to 12 digits long. if we just assumed when |
1013 | * logging that a[%d]= was going to be 16 characters long we would be wasting | 1024 | * logging that a[%d]= was going to be 16 characters long we would be wasting |
@@ -1305,6 +1316,10 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1305 | audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); | 1316 | audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); |
1306 | audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); | 1317 | audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); |
1307 | break; } | 1318 | break; } |
1319 | case AUDIT_MMAP: { | ||
1320 | audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, | ||
1321 | context->mmap.flags); | ||
1322 | break; } | ||
1308 | } | 1323 | } |
1309 | audit_log_end(ab); | 1324 | audit_log_end(ab); |
1310 | } | 1325 | } |
@@ -2476,6 +2491,14 @@ void __audit_log_capset(pid_t pid, | |||
2476 | context->type = AUDIT_CAPSET; | 2491 | context->type = AUDIT_CAPSET; |
2477 | } | 2492 | } |
2478 | 2493 | ||
2494 | void __audit_mmap_fd(int fd, int flags) | ||
2495 | { | ||
2496 | struct audit_context *context = current->audit_context; | ||
2497 | context->mmap.fd = fd; | ||
2498 | context->mmap.flags = flags; | ||
2499 | context->type = AUDIT_MMAP; | ||
2500 | } | ||
2501 | |||
2479 | /** | 2502 | /** |
2480 | * audit_core_dumps - record information about processes that end abnormally | 2503 | * audit_core_dumps - record information about processes that end abnormally |
2481 | * @signr: signal value | 2504 | * @signr: signal value |
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@ | |||
9 | #include <linux/page-flags.h> | 9 | #include <linux/page-flags.h> |
10 | #include <linux/mmzone.h> | 10 | #include <linux/mmzone.h> |
11 | #include <linux/kbuild.h> | 11 | #include <linux/kbuild.h> |
12 | #include <linux/page_cgroup.h> | ||
12 | 13 | ||
13 | void foo(void) | 14 | void foo(void) |
14 | { | 15 | { |
15 | /* The enum constants to put into include/generated/bounds.h */ | 16 | /* The enum constants to put into include/generated/bounds.h */ |
16 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); | 17 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); |
17 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); | 18 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); |
19 | DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); | ||
18 | /* End of constants */ | 20 | /* End of constants */ |
19 | } | 21 | } |
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/security.h> | 14 | #include <linux/security.h> |
15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
16 | #include <linux/pid_namespace.h> | 16 | #include <linux/pid_namespace.h> |
17 | #include <linux/user_namespace.h> | ||
17 | #include <asm/uaccess.h> | 18 | #include <asm/uaccess.h> |
18 | 19 | ||
19 | /* | 20 | /* |
@@ -21,12 +22,8 @@ | |||
21 | */ | 22 | */ |
22 | 23 | ||
23 | const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; | 24 | const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; |
24 | const kernel_cap_t __cap_full_set = CAP_FULL_SET; | ||
25 | const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET; | ||
26 | 25 | ||
27 | EXPORT_SYMBOL(__cap_empty_set); | 26 | EXPORT_SYMBOL(__cap_empty_set); |
28 | EXPORT_SYMBOL(__cap_full_set); | ||
29 | EXPORT_SYMBOL(__cap_init_eff_set); | ||
30 | 27 | ||
31 | int file_caps_enabled = 1; | 28 | int file_caps_enabled = 1; |
32 | 29 | ||
@@ -290,6 +287,60 @@ error: | |||
290 | } | 287 | } |
291 | 288 | ||
292 | /** | 289 | /** |
290 | * has_capability - Does a task have a capability in init_user_ns | ||
291 | * @t: The task in question | ||
292 | * @cap: The capability to be tested for | ||
293 | * | ||
294 | * Return true if the specified task has the given superior capability | ||
295 | * currently in effect to the initial user namespace, false if not. | ||
296 | * | ||
297 | * Note that this does not set PF_SUPERPRIV on the task. | ||
298 | */ | ||
299 | bool has_capability(struct task_struct *t, int cap) | ||
300 | { | ||
301 | int ret = security_real_capable(t, &init_user_ns, cap); | ||
302 | |||
303 | return (ret == 0); | ||
304 | } | ||
305 | |||
306 | /** | ||
307 | * has_capability - Does a task have a capability in a specific user ns | ||
308 | * @t: The task in question | ||
309 | * @ns: target user namespace | ||
310 | * @cap: The capability to be tested for | ||
311 | * | ||
312 | * Return true if the specified task has the given superior capability | ||
313 | * currently in effect to the specified user namespace, false if not. | ||
314 | * | ||
315 | * Note that this does not set PF_SUPERPRIV on the task. | ||
316 | */ | ||
317 | bool has_ns_capability(struct task_struct *t, | ||
318 | struct user_namespace *ns, int cap) | ||
319 | { | ||
320 | int ret = security_real_capable(t, ns, cap); | ||
321 | |||
322 | return (ret == 0); | ||
323 | } | ||
324 | |||
325 | /** | ||
326 | * has_capability_noaudit - Does a task have a capability (unaudited) | ||
327 | * @t: The task in question | ||
328 | * @cap: The capability to be tested for | ||
329 | * | ||
330 | * Return true if the specified task has the given superior capability | ||
331 | * currently in effect to init_user_ns, false if not. Don't write an | ||
332 | * audit message for the check. | ||
333 | * | ||
334 | * Note that this does not set PF_SUPERPRIV on the task. | ||
335 | */ | ||
336 | bool has_capability_noaudit(struct task_struct *t, int cap) | ||
337 | { | ||
338 | int ret = security_real_capable_noaudit(t, &init_user_ns, cap); | ||
339 | |||
340 | return (ret == 0); | ||
341 | } | ||
342 | |||
343 | /** | ||
293 | * capable - Determine if the current task has a superior capability in effect | 344 | * capable - Determine if the current task has a superior capability in effect |
294 | * @cap: The capability to be tested for | 345 | * @cap: The capability to be tested for |
295 | * | 346 | * |
@@ -299,17 +350,60 @@ error: | |||
299 | * This sets PF_SUPERPRIV on the task if the capability is available on the | 350 | * This sets PF_SUPERPRIV on the task if the capability is available on the |
300 | * assumption that it's about to be used. | 351 | * assumption that it's about to be used. |
301 | */ | 352 | */ |
302 | int capable(int cap) | 353 | bool capable(int cap) |
354 | { | ||
355 | return ns_capable(&init_user_ns, cap); | ||
356 | } | ||
357 | EXPORT_SYMBOL(capable); | ||
358 | |||
359 | /** | ||
360 | * ns_capable - Determine if the current task has a superior capability in effect | ||
361 | * @ns: The usernamespace we want the capability in | ||
362 | * @cap: The capability to be tested for | ||
363 | * | ||
364 | * Return true if the current task has the given superior capability currently | ||
365 | * available for use, false if not. | ||
366 | * | ||
367 | * This sets PF_SUPERPRIV on the task if the capability is available on the | ||
368 | * assumption that it's about to be used. | ||
369 | */ | ||
370 | bool ns_capable(struct user_namespace *ns, int cap) | ||
303 | { | 371 | { |
304 | if (unlikely(!cap_valid(cap))) { | 372 | if (unlikely(!cap_valid(cap))) { |
305 | printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); | 373 | printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); |
306 | BUG(); | 374 | BUG(); |
307 | } | 375 | } |
308 | 376 | ||
309 | if (security_capable(cap) == 0) { | 377 | if (security_capable(ns, current_cred(), cap) == 0) { |
310 | current->flags |= PF_SUPERPRIV; | 378 | current->flags |= PF_SUPERPRIV; |
311 | return 1; | 379 | return true; |
312 | } | 380 | } |
313 | return 0; | 381 | return false; |
382 | } | ||
383 | EXPORT_SYMBOL(ns_capable); | ||
384 | |||
385 | /** | ||
386 | * task_ns_capable - Determine whether current task has a superior | ||
387 | * capability targeted at a specific task's user namespace. | ||
388 | * @t: The task whose user namespace is targeted. | ||
389 | * @cap: The capability in question. | ||
390 | * | ||
391 | * Return true if it does, false otherwise. | ||
392 | */ | ||
393 | bool task_ns_capable(struct task_struct *t, int cap) | ||
394 | { | ||
395 | return ns_capable(task_cred_xxx(t, user)->user_ns, cap); | ||
396 | } | ||
397 | EXPORT_SYMBOL(task_ns_capable); | ||
398 | |||
399 | /** | ||
400 | * nsown_capable - Check superior capability to one's own user_ns | ||
401 | * @cap: The capability in question | ||
402 | * | ||
403 | * Return true if the current task has the given superior capability | ||
404 | * targeted at its own user namespace. | ||
405 | */ | ||
406 | bool nsown_capable(int cap) | ||
407 | { | ||
408 | return ns_capable(current_user_ns(), cap); | ||
314 | } | 409 | } |
315 | EXPORT_SYMBOL(capable); | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8f6140..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,12 +52,12 @@ | |||
52 | #include <linux/cgroupstats.h> | 52 | #include <linux/cgroupstats.h> |
53 | #include <linux/hash.h> | 53 | #include <linux/hash.h> |
54 | #include <linux/namei.h> | 54 | #include <linux/namei.h> |
55 | #include <linux/smp_lock.h> | ||
56 | #include <linux/pid_namespace.h> | 55 | #include <linux/pid_namespace.h> |
57 | #include <linux/idr.h> | 56 | #include <linux/idr.h> |
58 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | 57 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
59 | #include <linux/eventfd.h> | 58 | #include <linux/eventfd.h> |
60 | #include <linux/poll.h> | 59 | #include <linux/poll.h> |
60 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ | ||
61 | 61 | ||
62 | #include <asm/atomic.h> | 62 | #include <asm/atomic.h> |
63 | 63 | ||
@@ -138,7 +138,7 @@ struct css_id { | |||
138 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 138 | * is called after synchronize_rcu(). But for safe use, css_is_removed() |
139 | * css_tryget() should be used for avoiding race. | 139 | * css_tryget() should be used for avoiding race. |
140 | */ | 140 | */ |
141 | struct cgroup_subsys_state *css; | 141 | struct cgroup_subsys_state __rcu *css; |
142 | /* | 142 | /* |
143 | * ID of this css. | 143 | * ID of this css. |
144 | */ | 144 | */ |
@@ -158,7 +158,7 @@ struct css_id { | |||
158 | }; | 158 | }; |
159 | 159 | ||
160 | /* | 160 | /* |
161 | * cgroup_event represents events which userspace want to recieve. | 161 | * cgroup_event represents events which userspace want to receive. |
162 | */ | 162 | */ |
163 | struct cgroup_event { | 163 | struct cgroup_event { |
164 | /* | 164 | /* |
@@ -244,6 +244,11 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
244 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 244 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
245 | } | 245 | } |
246 | 246 | ||
247 | static int clone_children(const struct cgroup *cgrp) | ||
248 | { | ||
249 | return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
250 | } | ||
251 | |||
247 | /* | 252 | /* |
248 | * for_each_subsys() allows you to iterate on each subsystem attached to | 253 | * for_each_subsys() allows you to iterate on each subsystem attached to |
249 | * an active hierarchy | 254 | * an active hierarchy |
@@ -322,12 +327,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) | |||
322 | return &css_set_table[index]; | 327 | return &css_set_table[index]; |
323 | } | 328 | } |
324 | 329 | ||
325 | static void free_css_set_rcu(struct rcu_head *obj) | ||
326 | { | ||
327 | struct css_set *cg = container_of(obj, struct css_set, rcu_head); | ||
328 | kfree(cg); | ||
329 | } | ||
330 | |||
331 | /* We don't maintain the lists running through each css_set to its | 330 | /* We don't maintain the lists running through each css_set to its |
332 | * task until after the first call to cgroup_iter_start(). This | 331 | * task until after the first call to cgroup_iter_start(). This |
333 | * reduces the fork()/exit() overhead for people who have cgroups | 332 | * reduces the fork()/exit() overhead for people who have cgroups |
@@ -371,7 +370,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
371 | } | 370 | } |
372 | 371 | ||
373 | write_unlock(&css_set_lock); | 372 | write_unlock(&css_set_lock); |
374 | call_rcu(&cg->rcu_head, free_css_set_rcu); | 373 | kfree_rcu(cg, rcu_head); |
375 | } | 374 | } |
376 | 375 | ||
377 | /* | 376 | /* |
@@ -760,6 +759,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); | |||
760 | */ | 759 | */ |
761 | 760 | ||
762 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); | 761 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); |
762 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); | ||
763 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 763 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
764 | static int cgroup_populate_dir(struct cgroup *cgrp); | 764 | static int cgroup_populate_dir(struct cgroup *cgrp); |
765 | static const struct inode_operations cgroup_dir_inode_operations; | 765 | static const struct inode_operations cgroup_dir_inode_operations; |
@@ -778,6 +778,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | |||
778 | struct inode *inode = new_inode(sb); | 778 | struct inode *inode = new_inode(sb); |
779 | 779 | ||
780 | if (inode) { | 780 | if (inode) { |
781 | inode->i_ino = get_next_ino(); | ||
781 | inode->i_mode = mode; | 782 | inode->i_mode = mode; |
782 | inode->i_uid = current_fsuid(); | 783 | inode->i_uid = current_fsuid(); |
783 | inode->i_gid = current_fsgid(); | 784 | inode->i_gid = current_fsgid(); |
@@ -806,13 +807,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) | |||
806 | return ret; | 807 | return ret; |
807 | } | 808 | } |
808 | 809 | ||
809 | static void free_cgroup_rcu(struct rcu_head *obj) | ||
810 | { | ||
811 | struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); | ||
812 | |||
813 | kfree(cgrp); | ||
814 | } | ||
815 | |||
816 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 810 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
817 | { | 811 | { |
818 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 812 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
@@ -850,11 +844,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
850 | */ | 844 | */ |
851 | BUG_ON(!list_empty(&cgrp->pidlists)); | 845 | BUG_ON(!list_empty(&cgrp->pidlists)); |
852 | 846 | ||
853 | call_rcu(&cgrp->rcu_head, free_cgroup_rcu); | 847 | kfree_rcu(cgrp, rcu_head); |
854 | } | 848 | } |
855 | iput(inode); | 849 | iput(inode); |
856 | } | 850 | } |
857 | 851 | ||
852 | static int cgroup_delete(const struct dentry *d) | ||
853 | { | ||
854 | return 1; | ||
855 | } | ||
856 | |||
858 | static void remove_dir(struct dentry *d) | 857 | static void remove_dir(struct dentry *d) |
859 | { | 858 | { |
860 | struct dentry *parent = dget(d->d_parent); | 859 | struct dentry *parent = dget(d->d_parent); |
@@ -869,25 +868,29 @@ static void cgroup_clear_directory(struct dentry *dentry) | |||
869 | struct list_head *node; | 868 | struct list_head *node; |
870 | 869 | ||
871 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | 870 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); |
872 | spin_lock(&dcache_lock); | 871 | spin_lock(&dentry->d_lock); |
873 | node = dentry->d_subdirs.next; | 872 | node = dentry->d_subdirs.next; |
874 | while (node != &dentry->d_subdirs) { | 873 | while (node != &dentry->d_subdirs) { |
875 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); | 874 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); |
875 | |||
876 | spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); | ||
876 | list_del_init(node); | 877 | list_del_init(node); |
877 | if (d->d_inode) { | 878 | if (d->d_inode) { |
878 | /* This should never be called on a cgroup | 879 | /* This should never be called on a cgroup |
879 | * directory with child cgroups */ | 880 | * directory with child cgroups */ |
880 | BUG_ON(d->d_inode->i_mode & S_IFDIR); | 881 | BUG_ON(d->d_inode->i_mode & S_IFDIR); |
881 | d = dget_locked(d); | 882 | dget_dlock(d); |
882 | spin_unlock(&dcache_lock); | 883 | spin_unlock(&d->d_lock); |
884 | spin_unlock(&dentry->d_lock); | ||
883 | d_delete(d); | 885 | d_delete(d); |
884 | simple_unlink(dentry->d_inode, d); | 886 | simple_unlink(dentry->d_inode, d); |
885 | dput(d); | 887 | dput(d); |
886 | spin_lock(&dcache_lock); | 888 | spin_lock(&dentry->d_lock); |
887 | } | 889 | } else |
890 | spin_unlock(&d->d_lock); | ||
888 | node = dentry->d_subdirs.next; | 891 | node = dentry->d_subdirs.next; |
889 | } | 892 | } |
890 | spin_unlock(&dcache_lock); | 893 | spin_unlock(&dentry->d_lock); |
891 | } | 894 | } |
892 | 895 | ||
893 | /* | 896 | /* |
@@ -895,11 +898,16 @@ static void cgroup_clear_directory(struct dentry *dentry) | |||
895 | */ | 898 | */ |
896 | static void cgroup_d_remove_dir(struct dentry *dentry) | 899 | static void cgroup_d_remove_dir(struct dentry *dentry) |
897 | { | 900 | { |
901 | struct dentry *parent; | ||
902 | |||
898 | cgroup_clear_directory(dentry); | 903 | cgroup_clear_directory(dentry); |
899 | 904 | ||
900 | spin_lock(&dcache_lock); | 905 | parent = dentry->d_parent; |
906 | spin_lock(&parent->d_lock); | ||
907 | spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); | ||
901 | list_del_init(&dentry->d_u.d_child); | 908 | list_del_init(&dentry->d_u.d_child); |
902 | spin_unlock(&dcache_lock); | 909 | spin_unlock(&dentry->d_lock); |
910 | spin_unlock(&parent->d_lock); | ||
903 | remove_dir(dentry); | 911 | remove_dir(dentry); |
904 | } | 912 | } |
905 | 913 | ||
@@ -1040,6 +1048,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
1040 | seq_puts(seq, ",noprefix"); | 1048 | seq_puts(seq, ",noprefix"); |
1041 | if (strlen(root->release_agent_path)) | 1049 | if (strlen(root->release_agent_path)) |
1042 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1050 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); |
1051 | if (clone_children(&root->top_cgroup)) | ||
1052 | seq_puts(seq, ",clone_children"); | ||
1043 | if (strlen(root->name)) | 1053 | if (strlen(root->name)) |
1044 | seq_printf(seq, ",name=%s", root->name); | 1054 | seq_printf(seq, ",name=%s", root->name); |
1045 | mutex_unlock(&cgroup_mutex); | 1055 | mutex_unlock(&cgroup_mutex); |
@@ -1050,6 +1060,7 @@ struct cgroup_sb_opts { | |||
1050 | unsigned long subsys_bits; | 1060 | unsigned long subsys_bits; |
1051 | unsigned long flags; | 1061 | unsigned long flags; |
1052 | char *release_agent; | 1062 | char *release_agent; |
1063 | bool clone_children; | ||
1053 | char *name; | 1064 | char *name; |
1054 | /* User explicitly requested empty subsystem */ | 1065 | /* User explicitly requested empty subsystem */ |
1055 | bool none; | 1066 | bool none; |
@@ -1066,7 +1077,8 @@ struct cgroup_sb_opts { | |||
1066 | */ | 1077 | */ |
1067 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | 1078 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) |
1068 | { | 1079 | { |
1069 | char *token, *o = data ?: "all"; | 1080 | char *token, *o = data; |
1081 | bool all_ss = false, one_ss = false; | ||
1070 | unsigned long mask = (unsigned long)-1; | 1082 | unsigned long mask = (unsigned long)-1; |
1071 | int i; | 1083 | int i; |
1072 | bool module_pin_failed = false; | 1084 | bool module_pin_failed = false; |
@@ -1082,22 +1094,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1082 | while ((token = strsep(&o, ",")) != NULL) { | 1094 | while ((token = strsep(&o, ",")) != NULL) { |
1083 | if (!*token) | 1095 | if (!*token) |
1084 | return -EINVAL; | 1096 | return -EINVAL; |
1085 | if (!strcmp(token, "all")) { | 1097 | if (!strcmp(token, "none")) { |
1086 | /* Add all non-disabled subsystems */ | ||
1087 | opts->subsys_bits = 0; | ||
1088 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
1089 | struct cgroup_subsys *ss = subsys[i]; | ||
1090 | if (ss == NULL) | ||
1091 | continue; | ||
1092 | if (!ss->disabled) | ||
1093 | opts->subsys_bits |= 1ul << i; | ||
1094 | } | ||
1095 | } else if (!strcmp(token, "none")) { | ||
1096 | /* Explicitly have no subsystems */ | 1098 | /* Explicitly have no subsystems */ |
1097 | opts->none = true; | 1099 | opts->none = true; |
1098 | } else if (!strcmp(token, "noprefix")) { | 1100 | continue; |
1101 | } | ||
1102 | if (!strcmp(token, "all")) { | ||
1103 | /* Mutually exclusive option 'all' + subsystem name */ | ||
1104 | if (one_ss) | ||
1105 | return -EINVAL; | ||
1106 | all_ss = true; | ||
1107 | continue; | ||
1108 | } | ||
1109 | if (!strcmp(token, "noprefix")) { | ||
1099 | set_bit(ROOT_NOPREFIX, &opts->flags); | 1110 | set_bit(ROOT_NOPREFIX, &opts->flags); |
1100 | } else if (!strncmp(token, "release_agent=", 14)) { | 1111 | continue; |
1112 | } | ||
1113 | if (!strcmp(token, "clone_children")) { | ||
1114 | opts->clone_children = true; | ||
1115 | continue; | ||
1116 | } | ||
1117 | if (!strncmp(token, "release_agent=", 14)) { | ||
1101 | /* Specifying two release agents is forbidden */ | 1118 | /* Specifying two release agents is forbidden */ |
1102 | if (opts->release_agent) | 1119 | if (opts->release_agent) |
1103 | return -EINVAL; | 1120 | return -EINVAL; |
@@ -1105,7 +1122,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1105 | kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); | 1122 | kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); |
1106 | if (!opts->release_agent) | 1123 | if (!opts->release_agent) |
1107 | return -ENOMEM; | 1124 | return -ENOMEM; |
1108 | } else if (!strncmp(token, "name=", 5)) { | 1125 | continue; |
1126 | } | ||
1127 | if (!strncmp(token, "name=", 5)) { | ||
1109 | const char *name = token + 5; | 1128 | const char *name = token + 5; |
1110 | /* Can't specify an empty name */ | 1129 | /* Can't specify an empty name */ |
1111 | if (!strlen(name)) | 1130 | if (!strlen(name)) |
@@ -1127,20 +1146,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1127 | GFP_KERNEL); | 1146 | GFP_KERNEL); |
1128 | if (!opts->name) | 1147 | if (!opts->name) |
1129 | return -ENOMEM; | 1148 | return -ENOMEM; |
1130 | } else { | 1149 | |
1131 | struct cgroup_subsys *ss; | 1150 | continue; |
1132 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1151 | } |
1133 | ss = subsys[i]; | 1152 | |
1134 | if (ss == NULL) | 1153 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1135 | continue; | 1154 | struct cgroup_subsys *ss = subsys[i]; |
1136 | if (!strcmp(token, ss->name)) { | 1155 | if (ss == NULL) |
1137 | if (!ss->disabled) | 1156 | continue; |
1138 | set_bit(i, &opts->subsys_bits); | 1157 | if (strcmp(token, ss->name)) |
1139 | break; | 1158 | continue; |
1140 | } | 1159 | if (ss->disabled) |
1141 | } | 1160 | continue; |
1142 | if (i == CGROUP_SUBSYS_COUNT) | 1161 | |
1143 | return -ENOENT; | 1162 | /* Mutually exclusive option 'all' + subsystem name */ |
1163 | if (all_ss) | ||
1164 | return -EINVAL; | ||
1165 | set_bit(i, &opts->subsys_bits); | ||
1166 | one_ss = true; | ||
1167 | |||
1168 | break; | ||
1169 | } | ||
1170 | if (i == CGROUP_SUBSYS_COUNT) | ||
1171 | return -ENOENT; | ||
1172 | } | ||
1173 | |||
1174 | /* | ||
1175 | * If the 'all' option was specified select all the subsystems, | ||
1176 | * otherwise 'all, 'none' and a subsystem name options were not | ||
1177 | * specified, let's default to 'all' | ||
1178 | */ | ||
1179 | if (all_ss || (!all_ss && !one_ss && !opts->none)) { | ||
1180 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
1181 | struct cgroup_subsys *ss = subsys[i]; | ||
1182 | if (ss == NULL) | ||
1183 | continue; | ||
1184 | if (ss->disabled) | ||
1185 | continue; | ||
1186 | set_bit(i, &opts->subsys_bits); | ||
1144 | } | 1187 | } |
1145 | } | 1188 | } |
1146 | 1189 | ||
@@ -1222,7 +1265,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1222 | struct cgroup *cgrp = &root->top_cgroup; | 1265 | struct cgroup *cgrp = &root->top_cgroup; |
1223 | struct cgroup_sb_opts opts; | 1266 | struct cgroup_sb_opts opts; |
1224 | 1267 | ||
1225 | lock_kernel(); | ||
1226 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | 1268 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); |
1227 | mutex_lock(&cgroup_mutex); | 1269 | mutex_lock(&cgroup_mutex); |
1228 | 1270 | ||
@@ -1255,7 +1297,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1255 | kfree(opts.name); | 1297 | kfree(opts.name); |
1256 | mutex_unlock(&cgroup_mutex); | 1298 | mutex_unlock(&cgroup_mutex); |
1257 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 1299 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
1258 | unlock_kernel(); | ||
1259 | return ret; | 1300 | return ret; |
1260 | } | 1301 | } |
1261 | 1302 | ||
@@ -1357,6 +1398,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1357 | strcpy(root->release_agent_path, opts->release_agent); | 1398 | strcpy(root->release_agent_path, opts->release_agent); |
1358 | if (opts->name) | 1399 | if (opts->name) |
1359 | strcpy(root->name, opts->name); | 1400 | strcpy(root->name, opts->name); |
1401 | if (opts->clone_children) | ||
1402 | set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); | ||
1360 | return root; | 1403 | return root; |
1361 | } | 1404 | } |
1362 | 1405 | ||
@@ -1400,6 +1443,11 @@ static int cgroup_set_super(struct super_block *sb, void *data) | |||
1400 | 1443 | ||
1401 | static int cgroup_get_rootdir(struct super_block *sb) | 1444 | static int cgroup_get_rootdir(struct super_block *sb) |
1402 | { | 1445 | { |
1446 | static const struct dentry_operations cgroup_dops = { | ||
1447 | .d_iput = cgroup_diput, | ||
1448 | .d_delete = cgroup_delete, | ||
1449 | }; | ||
1450 | |||
1403 | struct inode *inode = | 1451 | struct inode *inode = |
1404 | cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); | 1452 | cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); |
1405 | struct dentry *dentry; | 1453 | struct dentry *dentry; |
@@ -1417,12 +1465,14 @@ static int cgroup_get_rootdir(struct super_block *sb) | |||
1417 | return -ENOMEM; | 1465 | return -ENOMEM; |
1418 | } | 1466 | } |
1419 | sb->s_root = dentry; | 1467 | sb->s_root = dentry; |
1468 | /* for everything else we want ->d_op set */ | ||
1469 | sb->s_d_op = &cgroup_dops; | ||
1420 | return 0; | 1470 | return 0; |
1421 | } | 1471 | } |
1422 | 1472 | ||
1423 | static int cgroup_get_sb(struct file_system_type *fs_type, | 1473 | static struct dentry *cgroup_mount(struct file_system_type *fs_type, |
1424 | int flags, const char *unused_dev_name, | 1474 | int flags, const char *unused_dev_name, |
1425 | void *data, struct vfsmount *mnt) | 1475 | void *data) |
1426 | { | 1476 | { |
1427 | struct cgroup_sb_opts opts; | 1477 | struct cgroup_sb_opts opts; |
1428 | struct cgroupfs_root *root; | 1478 | struct cgroupfs_root *root; |
@@ -1556,10 +1606,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1556 | drop_parsed_module_refcounts(opts.subsys_bits); | 1606 | drop_parsed_module_refcounts(opts.subsys_bits); |
1557 | } | 1607 | } |
1558 | 1608 | ||
1559 | simple_set_mnt(mnt, sb); | ||
1560 | kfree(opts.release_agent); | 1609 | kfree(opts.release_agent); |
1561 | kfree(opts.name); | 1610 | kfree(opts.name); |
1562 | return 0; | 1611 | return dget(sb->s_root); |
1563 | 1612 | ||
1564 | drop_new_super: | 1613 | drop_new_super: |
1565 | deactivate_locked_super(sb); | 1614 | deactivate_locked_super(sb); |
@@ -1568,8 +1617,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1568 | out_err: | 1617 | out_err: |
1569 | kfree(opts.release_agent); | 1618 | kfree(opts.release_agent); |
1570 | kfree(opts.name); | 1619 | kfree(opts.name); |
1571 | 1620 | return ERR_PTR(ret); | |
1572 | return ret; | ||
1573 | } | 1621 | } |
1574 | 1622 | ||
1575 | static void cgroup_kill_sb(struct super_block *sb) { | 1623 | static void cgroup_kill_sb(struct super_block *sb) { |
@@ -1619,7 +1667,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1619 | 1667 | ||
1620 | static struct file_system_type cgroup_fs_type = { | 1668 | static struct file_system_type cgroup_fs_type = { |
1621 | .name = "cgroup", | 1669 | .name = "cgroup", |
1622 | .get_sb = cgroup_get_sb, | 1670 | .mount = cgroup_mount, |
1623 | .kill_sb = cgroup_kill_sb, | 1671 | .kill_sb = cgroup_kill_sb, |
1624 | }; | 1672 | }; |
1625 | 1673 | ||
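Note: cgroup_get_sb() becomes cgroup_mount() here, following the VFS switch from ->get_sb() (which filled in a vfsmount output parameter) to ->mount() (which returns the root dentry or an ERR_PTR). A sketch of the same conversion for a hypothetical filesystem; foo_fill_super() and the other foo_* names are assumed, only get_sb_nodev()/mount_nodev() and kill_anon_super() are real helpers:

#include <linux/fs.h>

/* Old convention (what cgroup_get_sb() used to follow): */
static int foo_get_sb(struct file_system_type *fs_type, int flags,
		      const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_nodev(fs_type, flags, data, foo_fill_super, mnt);
}

/* New convention, as cgroup_mount() does above: return the root dentry. */
static struct dentry *foo_mount(struct file_system_type *fs_type, int flags,
				const char *dev_name, void *data)
{
	return mount_nodev(fs_type, flags, data, foo_fill_super);
}

static struct file_system_type foo_fs_type = {
	.name		= "foo",
	.mount		= foo_mount,
	.kill_sb	= kill_anon_super,
};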
@@ -1688,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1688 | } | 1736 | } |
1689 | EXPORT_SYMBOL_GPL(cgroup_path); | 1737 | EXPORT_SYMBOL_GPL(cgroup_path); |
1690 | 1738 | ||
1739 | /* | ||
1740 | * cgroup_task_migrate - move a task from one cgroup to another. | ||
1741 | * | ||
1742 | * 'guarantee' is set if the caller promises that a new css_set for the task | ||
1743 | * will already exist. If not set, this function might sleep, and can fail with | ||
1744 | * -ENOMEM. Otherwise, it can only fail with -ESRCH. | ||
1745 | */ | ||
1746 | static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | ||
1747 | struct task_struct *tsk, bool guarantee) | ||
1748 | { | ||
1749 | struct css_set *oldcg; | ||
1750 | struct css_set *newcg; | ||
1751 | |||
1752 | /* | ||
1753 | * get old css_set. we need to take task_lock and refcount it, because | ||
1754 | * an exiting task can change its css_set to init_css_set and drop its | ||
1755 | * old one without taking cgroup_mutex. | ||
1756 | */ | ||
1757 | task_lock(tsk); | ||
1758 | oldcg = tsk->cgroups; | ||
1759 | get_css_set(oldcg); | ||
1760 | task_unlock(tsk); | ||
1761 | |||
1762 | /* locate or allocate a new css_set for this task. */ | ||
1763 | if (guarantee) { | ||
1764 | /* we know the css_set we want already exists. */ | ||
1765 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; | ||
1766 | read_lock(&css_set_lock); | ||
1767 | newcg = find_existing_css_set(oldcg, cgrp, template); | ||
1768 | BUG_ON(!newcg); | ||
1769 | get_css_set(newcg); | ||
1770 | read_unlock(&css_set_lock); | ||
1771 | } else { | ||
1772 | might_sleep(); | ||
1773 | /* find_css_set will give us newcg already referenced. */ | ||
1774 | newcg = find_css_set(oldcg, cgrp); | ||
1775 | if (!newcg) { | ||
1776 | put_css_set(oldcg); | ||
1777 | return -ENOMEM; | ||
1778 | } | ||
1779 | } | ||
1780 | put_css_set(oldcg); | ||
1781 | |||
1782 | /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ | ||
1783 | task_lock(tsk); | ||
1784 | if (tsk->flags & PF_EXITING) { | ||
1785 | task_unlock(tsk); | ||
1786 | put_css_set(newcg); | ||
1787 | return -ESRCH; | ||
1788 | } | ||
1789 | rcu_assign_pointer(tsk->cgroups, newcg); | ||
1790 | task_unlock(tsk); | ||
1791 | |||
1792 | /* Update the css_set linked lists if we're using them */ | ||
1793 | write_lock(&css_set_lock); | ||
1794 | if (!list_empty(&tsk->cg_list)) | ||
1795 | list_move(&tsk->cg_list, &newcg->tasks); | ||
1796 | write_unlock(&css_set_lock); | ||
1797 | |||
1798 | /* | ||
1799 | * We just gained a reference on oldcg by taking it from the task. As | ||
1800 | * trading it for newcg is protected by cgroup_mutex, we're safe to drop | ||
1801 | * it here; it will be freed under RCU. | ||
1802 | */ | ||
1803 | put_css_set(oldcg); | ||
1804 | |||
1805 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | ||
1806 | return 0; | ||
1807 | } | ||
1808 | |||
1691 | /** | 1809 | /** |
1692 | * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' | 1810 | * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' |
1693 | * @cgrp: the cgroup the task is attaching to | 1811 | * @cgrp: the cgroup the task is attaching to |
@@ -1698,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path); | |||
1698 | */ | 1816 | */ |
1699 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 1817 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
1700 | { | 1818 | { |
1701 | int retval = 0; | 1819 | int retval; |
1702 | struct cgroup_subsys *ss, *failed_ss = NULL; | 1820 | struct cgroup_subsys *ss, *failed_ss = NULL; |
1703 | struct cgroup *oldcgrp; | 1821 | struct cgroup *oldcgrp; |
1704 | struct css_set *cg; | ||
1705 | struct css_set *newcg; | ||
1706 | struct cgroupfs_root *root = cgrp->root; | 1822 | struct cgroupfs_root *root = cgrp->root; |
1707 | 1823 | ||
1708 | /* Nothing to do if the task is already in that cgroup */ | 1824 | /* Nothing to do if the task is already in that cgroup */ |
@@ -1712,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1712 | 1828 | ||
1713 | for_each_subsys(root, ss) { | 1829 | for_each_subsys(root, ss) { |
1714 | if (ss->can_attach) { | 1830 | if (ss->can_attach) { |
1715 | retval = ss->can_attach(ss, cgrp, tsk, false); | 1831 | retval = ss->can_attach(ss, cgrp, tsk); |
1716 | if (retval) { | 1832 | if (retval) { |
1717 | /* | 1833 | /* |
1718 | * Remember on which subsystem the can_attach() | 1834 | * Remember on which subsystem the can_attach() |
@@ -1724,48 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1724 | goto out; | 1840 | goto out; |
1725 | } | 1841 | } |
1726 | } | 1842 | } |
1843 | if (ss->can_attach_task) { | ||
1844 | retval = ss->can_attach_task(cgrp, tsk); | ||
1845 | if (retval) { | ||
1846 | failed_ss = ss; | ||
1847 | goto out; | ||
1848 | } | ||
1849 | } | ||
1727 | } | 1850 | } |
1728 | 1851 | ||
1729 | task_lock(tsk); | 1852 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); |
1730 | cg = tsk->cgroups; | 1853 | if (retval) |
1731 | get_css_set(cg); | ||
1732 | task_unlock(tsk); | ||
1733 | /* | ||
1734 | * Locate or allocate a new css_set for this task, | ||
1735 | * based on its final set of cgroups | ||
1736 | */ | ||
1737 | newcg = find_css_set(cg, cgrp); | ||
1738 | put_css_set(cg); | ||
1739 | if (!newcg) { | ||
1740 | retval = -ENOMEM; | ||
1741 | goto out; | ||
1742 | } | ||
1743 | |||
1744 | task_lock(tsk); | ||
1745 | if (tsk->flags & PF_EXITING) { | ||
1746 | task_unlock(tsk); | ||
1747 | put_css_set(newcg); | ||
1748 | retval = -ESRCH; | ||
1749 | goto out; | 1854 | goto out; |
1750 | } | ||
1751 | rcu_assign_pointer(tsk->cgroups, newcg); | ||
1752 | task_unlock(tsk); | ||
1753 | |||
1754 | /* Update the css_set linked lists if we're using them */ | ||
1755 | write_lock(&css_set_lock); | ||
1756 | if (!list_empty(&tsk->cg_list)) { | ||
1757 | list_del(&tsk->cg_list); | ||
1758 | list_add(&tsk->cg_list, &newcg->tasks); | ||
1759 | } | ||
1760 | write_unlock(&css_set_lock); | ||
1761 | 1855 | ||
1762 | for_each_subsys(root, ss) { | 1856 | for_each_subsys(root, ss) { |
1857 | if (ss->pre_attach) | ||
1858 | ss->pre_attach(cgrp); | ||
1859 | if (ss->attach_task) | ||
1860 | ss->attach_task(cgrp, tsk); | ||
1763 | if (ss->attach) | 1861 | if (ss->attach) |
1764 | ss->attach(ss, cgrp, oldcgrp, tsk, false); | 1862 | ss->attach(ss, cgrp, oldcgrp, tsk); |
1765 | } | 1863 | } |
1766 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | 1864 | |
1767 | synchronize_rcu(); | 1865 | synchronize_rcu(); |
1768 | put_css_set(cg); | ||
1769 | 1866 | ||
1770 | /* | 1867 | /* |
1771 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | 1868 | * wake up rmdir() waiter. the rmdir should fail since the cgroup |
@@ -1784,7 +1881,7 @@ out: | |||
1784 | */ | 1881 | */ |
1785 | break; | 1882 | break; |
1786 | if (ss->cancel_attach) | 1883 | if (ss->cancel_attach) |
1787 | ss->cancel_attach(ss, cgrp, tsk, false); | 1884 | ss->cancel_attach(ss, cgrp, tsk); |
1788 | } | 1885 | } |
1789 | } | 1886 | } |
1790 | return retval; | 1887 | return retval; |
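Note: cgroup_attach_task() now drives the finer-grained attach callbacks (can_attach_task, pre_attach, attach_task) and the existing hooks lose their 'bool threadgroup' argument. A sketch of what a subsystem's callback table looks like after this change, mirroring the freezer update further down; every foo_* identifier is a placeholder:

#include <linux/cgroup.h>

struct cgroup_subsys foo_subsys = {
	.name		 = "foo",
	.subsys_id	 = foo_subsys_id,
	.create		 = foo_create,
	.destroy	 = foo_destroy,
	/* veto an attach for the cgroup as a whole ... */
	.can_attach	 = foo_can_attach,
	/* ... and/or per task; called for every thread on a "procs" write */
	.can_attach_task = foo_can_attach_task,
	/* once-per-cgroup setup before any task is moved */
	.pre_attach	 = foo_pre_attach,
	/* per-task commit hook */
	.attach_task	 = foo_attach_task,
	/* expensive, once-per-move work after all tasks have moved */
	.attach		 = foo_attach,
};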
@@ -1815,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
1815 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 1912 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
1816 | 1913 | ||
1817 | /* | 1914 | /* |
1818 | * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex | 1915 | * cgroup_attach_proc works in two stages, the first of which prefetches all |
1819 | * held. May take task_lock of task | 1916 | * new css_sets needed (to make sure we have enough memory before committing |
1917 | * to the move) and stores them in a list of entries of the following type. | ||
1918 | * TODO: possible optimization: use css_set->rcu_head for chaining instead | ||
1919 | */ | ||
1920 | struct cg_list_entry { | ||
1921 | struct css_set *cg; | ||
1922 | struct list_head links; | ||
1923 | }; | ||
1924 | |||
1925 | static bool css_set_check_fetched(struct cgroup *cgrp, | ||
1926 | struct task_struct *tsk, struct css_set *cg, | ||
1927 | struct list_head *newcg_list) | ||
1928 | { | ||
1929 | struct css_set *newcg; | ||
1930 | struct cg_list_entry *cg_entry; | ||
1931 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; | ||
1932 | |||
1933 | read_lock(&css_set_lock); | ||
1934 | newcg = find_existing_css_set(cg, cgrp, template); | ||
1935 | if (newcg) | ||
1936 | get_css_set(newcg); | ||
1937 | read_unlock(&css_set_lock); | ||
1938 | |||
1939 | /* doesn't exist at all? */ | ||
1940 | if (!newcg) | ||
1941 | return false; | ||
1942 | /* see if it's already in the list */ | ||
1943 | list_for_each_entry(cg_entry, newcg_list, links) { | ||
1944 | if (cg_entry->cg == newcg) { | ||
1945 | put_css_set(newcg); | ||
1946 | return true; | ||
1947 | } | ||
1948 | } | ||
1949 | |||
1950 | /* not found */ | ||
1951 | put_css_set(newcg); | ||
1952 | return false; | ||
1953 | } | ||
1954 | |||
1955 | /* | ||
1956 | * Find the new css_set and store it in the list in preparation for moving the | ||
1957 | * given task to the given cgroup. Returns 0 or -ENOMEM. | ||
1958 | */ | ||
1959 | static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, | ||
1960 | struct list_head *newcg_list) | ||
1961 | { | ||
1962 | struct css_set *newcg; | ||
1963 | struct cg_list_entry *cg_entry; | ||
1964 | |||
1965 | /* ensure a new css_set will exist for this thread */ | ||
1966 | newcg = find_css_set(cg, cgrp); | ||
1967 | if (!newcg) | ||
1968 | return -ENOMEM; | ||
1969 | /* add it to the list */ | ||
1970 | cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); | ||
1971 | if (!cg_entry) { | ||
1972 | put_css_set(newcg); | ||
1973 | return -ENOMEM; | ||
1974 | } | ||
1975 | cg_entry->cg = newcg; | ||
1976 | list_add(&cg_entry->links, newcg_list); | ||
1977 | return 0; | ||
1978 | } | ||
1979 | |||
1980 | /** | ||
1981 | * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup | ||
1982 | * @cgrp: the cgroup to attach to | ||
1983 | * @leader: the threadgroup leader task_struct of the group to be attached | ||
1984 | * | ||
1985 | * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will | ||
1986 | * take task_lock of each thread in leader's threadgroup individually in turn. | ||
1820 | */ | 1987 | */ |
1821 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) | 1988 | int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) |
1989 | { | ||
1990 | int retval, i, group_size; | ||
1991 | struct cgroup_subsys *ss, *failed_ss = NULL; | ||
1992 | bool cancel_failed_ss = false; | ||
1993 | /* guaranteed to be initialized later, but the compiler needs this */ | ||
1994 | struct cgroup *oldcgrp = NULL; | ||
1995 | struct css_set *oldcg; | ||
1996 | struct cgroupfs_root *root = cgrp->root; | ||
1997 | /* threadgroup list cursor and array */ | ||
1998 | struct task_struct *tsk; | ||
1999 | struct flex_array *group; | ||
2000 | /* | ||
2001 | * we need to make sure we have css_sets for all the tasks we're | ||
2002 | * going to move -before- we actually start moving them, so that in | ||
2003 | * case we get an ENOMEM we can bail out before making any changes. | ||
2004 | */ | ||
2005 | struct list_head newcg_list; | ||
2006 | struct cg_list_entry *cg_entry, *temp_nobe; | ||
2007 | |||
2008 | /* | ||
2009 | * step 0: in order to do expensive, possibly blocking operations for | ||
2010 | * every thread, we cannot iterate the thread group list, since it needs | ||
2011 | * rcu or tasklist locked. instead, build an array of all threads in the | ||
2012 | * group - threadgroup_fork_lock prevents new threads from appearing, | ||
2013 | * and if threads exit, this will just be an over-estimate. | ||
2014 | */ | ||
2015 | group_size = get_nr_threads(leader); | ||
2016 | /* flex_array supports very large thread-groups better than kmalloc. */ | ||
2017 | group = flex_array_alloc(sizeof(struct task_struct *), group_size, | ||
2018 | GFP_KERNEL); | ||
2019 | if (!group) | ||
2020 | return -ENOMEM; | ||
2021 | /* pre-allocate to guarantee space while iterating in rcu read-side. */ | ||
2022 | retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); | ||
2023 | if (retval) | ||
2024 | goto out_free_group_list; | ||
2025 | |||
2026 | /* prevent changes to the threadgroup list while we take a snapshot. */ | ||
2027 | rcu_read_lock(); | ||
2028 | if (!thread_group_leader(leader)) { | ||
2029 | /* | ||
2030 | * a race with de_thread from another thread's exec() may strip | ||
2031 | * us of our leadership, making while_each_thread unsafe to use | ||
2032 | * on this task. if this happens, there is no choice but to | ||
2033 | * throw this task away and try again (from cgroup_procs_write); | ||
2034 | * this is "double-double-toil-and-trouble-check locking". | ||
2035 | */ | ||
2036 | rcu_read_unlock(); | ||
2037 | retval = -EAGAIN; | ||
2038 | goto out_free_group_list; | ||
2039 | } | ||
2040 | /* take a reference on each task in the group to go in the array. */ | ||
2041 | tsk = leader; | ||
2042 | i = 0; | ||
2043 | do { | ||
2044 | /* as per above, nr_threads may decrease, but not increase. */ | ||
2045 | BUG_ON(i >= group_size); | ||
2046 | get_task_struct(tsk); | ||
2047 | /* | ||
2048 | * saying GFP_ATOMIC has no effect here because we did prealloc | ||
2049 | * earlier, but it's good form to communicate our expectations. | ||
2050 | */ | ||
2051 | retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); | ||
2052 | BUG_ON(retval != 0); | ||
2053 | i++; | ||
2054 | } while_each_thread(leader, tsk); | ||
2055 | /* remember the number of threads in the array for later. */ | ||
2056 | group_size = i; | ||
2057 | rcu_read_unlock(); | ||
2058 | |||
2059 | /* | ||
2060 | * step 1: check that we can legitimately attach to the cgroup. | ||
2061 | */ | ||
2062 | for_each_subsys(root, ss) { | ||
2063 | if (ss->can_attach) { | ||
2064 | retval = ss->can_attach(ss, cgrp, leader); | ||
2065 | if (retval) { | ||
2066 | failed_ss = ss; | ||
2067 | goto out_cancel_attach; | ||
2068 | } | ||
2069 | } | ||
2070 | /* a callback to be run on every thread in the threadgroup. */ | ||
2071 | if (ss->can_attach_task) { | ||
2072 | /* run on each task in the threadgroup. */ | ||
2073 | for (i = 0; i < group_size; i++) { | ||
2074 | tsk = flex_array_get_ptr(group, i); | ||
2075 | retval = ss->can_attach_task(cgrp, tsk); | ||
2076 | if (retval) { | ||
2077 | failed_ss = ss; | ||
2078 | cancel_failed_ss = true; | ||
2079 | goto out_cancel_attach; | ||
2080 | } | ||
2081 | } | ||
2082 | } | ||
2083 | } | ||
2084 | |||
2085 | /* | ||
2086 | * step 2: make sure css_sets exist for all threads to be migrated. | ||
2087 | * we use find_css_set, which allocates a new one if necessary. | ||
2088 | */ | ||
2089 | INIT_LIST_HEAD(&newcg_list); | ||
2090 | for (i = 0; i < group_size; i++) { | ||
2091 | tsk = flex_array_get_ptr(group, i); | ||
2092 | /* nothing to do if this task is already in the cgroup */ | ||
2093 | oldcgrp = task_cgroup_from_root(tsk, root); | ||
2094 | if (cgrp == oldcgrp) | ||
2095 | continue; | ||
2096 | /* get old css_set pointer */ | ||
2097 | task_lock(tsk); | ||
2098 | if (tsk->flags & PF_EXITING) { | ||
2099 | /* ignore this task if it's going away */ | ||
2100 | task_unlock(tsk); | ||
2101 | continue; | ||
2102 | } | ||
2103 | oldcg = tsk->cgroups; | ||
2104 | get_css_set(oldcg); | ||
2105 | task_unlock(tsk); | ||
2106 | /* see if the new one for us is already in the list? */ | ||
2107 | if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { | ||
2108 | /* was already there, nothing to do. */ | ||
2109 | put_css_set(oldcg); | ||
2110 | } else { | ||
2111 | /* we don't already have it. get new one. */ | ||
2112 | retval = css_set_prefetch(cgrp, oldcg, &newcg_list); | ||
2113 | put_css_set(oldcg); | ||
2114 | if (retval) | ||
2115 | goto out_list_teardown; | ||
2116 | } | ||
2117 | } | ||
2118 | |||
2119 | /* | ||
2120 | * step 3: now that we're guaranteed success wrt the css_sets, proceed | ||
2121 | * to move all tasks to the new cgroup, calling ss->attach_task for each | ||
2122 | * one along the way. there are no failure cases after here, so this is | ||
2123 | * the commit point. | ||
2124 | */ | ||
2125 | for_each_subsys(root, ss) { | ||
2126 | if (ss->pre_attach) | ||
2127 | ss->pre_attach(cgrp); | ||
2128 | } | ||
2129 | for (i = 0; i < group_size; i++) { | ||
2130 | tsk = flex_array_get_ptr(group, i); | ||
2131 | /* leave current thread as it is if it's already there */ | ||
2132 | oldcgrp = task_cgroup_from_root(tsk, root); | ||
2133 | if (cgrp == oldcgrp) | ||
2134 | continue; | ||
2135 | /* attach each task to each subsystem */ | ||
2136 | for_each_subsys(root, ss) { | ||
2137 | if (ss->attach_task) | ||
2138 | ss->attach_task(cgrp, tsk); | ||
2139 | } | ||
2140 | /* if the thread is PF_EXITING, it can just get skipped. */ | ||
2141 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); | ||
2142 | BUG_ON(retval != 0 && retval != -ESRCH); | ||
2143 | } | ||
2144 | /* nothing is sensitive to fork() after this point. */ | ||
2145 | |||
2146 | /* | ||
2147 | * step 4: do expensive, non-thread-specific subsystem callbacks. | ||
2148 | * TODO: if ever a subsystem needs to know the oldcgrp for each task | ||
2149 | * being moved, this call will need to be reworked to communicate that. | ||
2150 | */ | ||
2151 | for_each_subsys(root, ss) { | ||
2152 | if (ss->attach) | ||
2153 | ss->attach(ss, cgrp, oldcgrp, leader); | ||
2154 | } | ||
2155 | |||
2156 | /* | ||
2157 | * step 5: success! and cleanup | ||
2158 | */ | ||
2159 | synchronize_rcu(); | ||
2160 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
2161 | retval = 0; | ||
2162 | out_list_teardown: | ||
2163 | /* clean up the list of prefetched css_sets. */ | ||
2164 | list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { | ||
2165 | list_del(&cg_entry->links); | ||
2166 | put_css_set(cg_entry->cg); | ||
2167 | kfree(cg_entry); | ||
2168 | } | ||
2169 | out_cancel_attach: | ||
2170 | /* same deal as in cgroup_attach_task */ | ||
2171 | if (retval) { | ||
2172 | for_each_subsys(root, ss) { | ||
2173 | if (ss == failed_ss) { | ||
2174 | if (cancel_failed_ss && ss->cancel_attach) | ||
2175 | ss->cancel_attach(ss, cgrp, leader); | ||
2176 | break; | ||
2177 | } | ||
2178 | if (ss->cancel_attach) | ||
2179 | ss->cancel_attach(ss, cgrp, leader); | ||
2180 | } | ||
2181 | } | ||
2182 | /* clean up the array of referenced threads in the group. */ | ||
2183 | for (i = 0; i < group_size; i++) { | ||
2184 | tsk = flex_array_get_ptr(group, i); | ||
2185 | put_task_struct(tsk); | ||
2186 | } | ||
2187 | out_free_group_list: | ||
2188 | flex_array_free(group); | ||
2189 | return retval; | ||
2190 | } | ||
2191 | |||
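Note: cgroup_attach_proc() above is structured as a fallible prefetch phase (reserve every css_set that might be needed) followed by a commit phase that cannot fail (cgroup_task_migrate() with 'guarantee' set). The shape of that pattern, reduced to self-contained user-space C purely for illustration:

#include <stdio.h>
#include <stdlib.h>

#define NTASKS 4

int main(void)
{
	void *reserved[NTASKS] = { NULL };
	int i;

	/* Phase 1: reserve everything that can fail before touching any state
	 * (the kernel code prefetches css_sets into newcg_list here). */
	for (i = 0; i < NTASKS; i++) {
		reserved[i] = malloc(128);
		if (!reserved[i])
			goto unwind;	/* nothing has been committed yet */
	}

	/* Phase 2: commit. No allocation happens here, so it cannot fail
	 * (cgroup_task_migrate() is called with guarantee == true). */
	for (i = 0; i < NTASKS; i++)
		printf("task %d migrated using prefetched slot %p\n",
		       i, reserved[i]);

unwind:
	/* Prefetched entries are dropped again in both outcomes. */
	for (i = 0; i < NTASKS; i++)
		free(reserved[i]);
	return 0;
}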
2192 | /* | ||
2193 | * Find the task_struct of the task to attach by vpid and pass it along to the | ||
2194 | * function to attach either it or all tasks in its threadgroup. Will take | ||
2195 | * cgroup_mutex; may take task_lock of task. | ||
2196 | */ | ||
2197 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | ||
1822 | { | 2198 | { |
1823 | struct task_struct *tsk; | 2199 | struct task_struct *tsk; |
1824 | const struct cred *cred = current_cred(), *tcred; | 2200 | const struct cred *cred = current_cred(), *tcred; |
1825 | int ret; | 2201 | int ret; |
1826 | 2202 | ||
2203 | if (!cgroup_lock_live_group(cgrp)) | ||
2204 | return -ENODEV; | ||
2205 | |||
1827 | if (pid) { | 2206 | if (pid) { |
1828 | rcu_read_lock(); | 2207 | rcu_read_lock(); |
1829 | tsk = find_task_by_vpid(pid); | 2208 | tsk = find_task_by_vpid(pid); |
1830 | if (!tsk || tsk->flags & PF_EXITING) { | 2209 | if (!tsk) { |
2210 | rcu_read_unlock(); | ||
2211 | cgroup_unlock(); | ||
2212 | return -ESRCH; | ||
2213 | } | ||
2214 | if (threadgroup) { | ||
2215 | /* | ||
2216 | * RCU protects this access, since tsk was found in the | ||
2217 | * tid map. a race with de_thread may cause group_leader | ||
2218 | * to stop being the leader, but cgroup_attach_proc will | ||
2219 | * detect it later. | ||
2220 | */ | ||
2221 | tsk = tsk->group_leader; | ||
2222 | } else if (tsk->flags & PF_EXITING) { | ||
2223 | /* optimization for the single-task-only case */ | ||
1831 | rcu_read_unlock(); | 2224 | rcu_read_unlock(); |
2225 | cgroup_unlock(); | ||
1832 | return -ESRCH; | 2226 | return -ESRCH; |
1833 | } | 2227 | } |
1834 | 2228 | ||
2229 | /* | ||
2230 | * even if we're attaching all tasks in the thread group, we | ||
2231 | * only need to check permissions on one of them. | ||
2232 | */ | ||
1835 | tcred = __task_cred(tsk); | 2233 | tcred = __task_cred(tsk); |
1836 | if (cred->euid && | 2234 | if (cred->euid && |
1837 | cred->euid != tcred->uid && | 2235 | cred->euid != tcred->uid && |
1838 | cred->euid != tcred->suid) { | 2236 | cred->euid != tcred->suid) { |
1839 | rcu_read_unlock(); | 2237 | rcu_read_unlock(); |
2238 | cgroup_unlock(); | ||
1840 | return -EACCES; | 2239 | return -EACCES; |
1841 | } | 2240 | } |
1842 | get_task_struct(tsk); | 2241 | get_task_struct(tsk); |
1843 | rcu_read_unlock(); | 2242 | rcu_read_unlock(); |
1844 | } else { | 2243 | } else { |
1845 | tsk = current; | 2244 | if (threadgroup) |
2245 | tsk = current->group_leader; | ||
2246 | else | ||
2247 | tsk = current; | ||
1846 | get_task_struct(tsk); | 2248 | get_task_struct(tsk); |
1847 | } | 2249 | } |
1848 | 2250 | ||
1849 | ret = cgroup_attach_task(cgrp, tsk); | 2251 | if (threadgroup) { |
2252 | threadgroup_fork_write_lock(tsk); | ||
2253 | ret = cgroup_attach_proc(cgrp, tsk); | ||
2254 | threadgroup_fork_write_unlock(tsk); | ||
2255 | } else { | ||
2256 | ret = cgroup_attach_task(cgrp, tsk); | ||
2257 | } | ||
1850 | put_task_struct(tsk); | 2258 | put_task_struct(tsk); |
2259 | cgroup_unlock(); | ||
1851 | return ret; | 2260 | return ret; |
1852 | } | 2261 | } |
1853 | 2262 | ||
1854 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) | 2263 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) |
1855 | { | 2264 | { |
2265 | return attach_task_by_pid(cgrp, pid, false); | ||
2266 | } | ||
2267 | |||
2268 | static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) | ||
2269 | { | ||
1856 | int ret; | 2270 | int ret; |
1857 | if (!cgroup_lock_live_group(cgrp)) | 2271 | do { |
1858 | return -ENODEV; | 2272 | /* |
1859 | ret = attach_task_by_pid(cgrp, pid); | 2273 | * attach_proc fails with -EAGAIN if threadgroup leadership |
1860 | cgroup_unlock(); | 2274 | * changes in the middle of the operation, in which case we need |
2275 | * to find the task_struct for the new leader and start over. | ||
2276 | */ | ||
2277 | ret = attach_task_by_pid(cgrp, tgid, true); | ||
2278 | } while (ret == -EAGAIN); | ||
1861 | return ret; | 2279 | return ret; |
1862 | } | 2280 | } |
1863 | 2281 | ||
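Note: with cgroup_procs_write() wired up, writing a TGID to the cgroup.procs file moves the whole thread group, and the -EAGAIN retry after a leadership change is handled inside the kernel. A minimal user-space sketch; the mount point is an assumption:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumes a cgroup hierarchy mounted with a "mygroup" child at this path. */
	FILE *f = fopen("/mnt/cgroup/mygroup/cgroup.procs", "w");

	if (!f) {
		perror("cgroup.procs");
		return 1;
	}
	/* One write of the TGID migrates every thread of this process;
	 * writing to "tasks" instead would move only a single thread. */
	fprintf(f, "%d\n", (int)getpid());
	return fclose(f) ? 1 : 0;
}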
@@ -1883,6 +2301,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | |||
1883 | const char *buffer) | 2301 | const char *buffer) |
1884 | { | 2302 | { |
1885 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); | 2303 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
2304 | if (strlen(buffer) >= PATH_MAX) | ||
2305 | return -EINVAL; | ||
1886 | if (!cgroup_lock_live_group(cgrp)) | 2306 | if (!cgroup_lock_live_group(cgrp)) |
1887 | return -ENODEV; | 2307 | return -ENODEV; |
1888 | strcpy(cgrp->root->release_agent_path, buffer); | 2308 | strcpy(cgrp->root->release_agent_path, buffer); |
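Note: cgroup_release_agent_write() now rejects over-long paths up front rather than strcpy()ing them into the fixed release_agent_path buffer. Seen from user space, such a write simply fails; a sketch with an assumed mount point:

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char path[PATH_MAX + 16];
	int fd = open("/mnt/cgroup/release_agent", O_WRONLY);	/* assumed mount */

	if (fd < 0) {
		perror("open release_agent");
		return 1;
	}
	memset(path, 'a', sizeof(path) - 1);
	path[sizeof(path) - 1] = '\0';
	/* An over-long path is now refused by the kernel instead of being
	 * strcpy()'d into the fixed-size release_agent_path buffer. */
	if (write(fd, path, strlen(path)) < 0)
		perror("write release_agent");
	close(fd);
	return 0;
}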
@@ -2140,12 +2560,20 @@ static const struct file_operations cgroup_file_operations = { | |||
2140 | }; | 2560 | }; |
2141 | 2561 | ||
2142 | static const struct inode_operations cgroup_dir_inode_operations = { | 2562 | static const struct inode_operations cgroup_dir_inode_operations = { |
2143 | .lookup = simple_lookup, | 2563 | .lookup = cgroup_lookup, |
2144 | .mkdir = cgroup_mkdir, | 2564 | .mkdir = cgroup_mkdir, |
2145 | .rmdir = cgroup_rmdir, | 2565 | .rmdir = cgroup_rmdir, |
2146 | .rename = cgroup_rename, | 2566 | .rename = cgroup_rename, |
2147 | }; | 2567 | }; |
2148 | 2568 | ||
2569 | static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) | ||
2570 | { | ||
2571 | if (dentry->d_name.len > NAME_MAX) | ||
2572 | return ERR_PTR(-ENAMETOOLONG); | ||
2573 | d_add(dentry, NULL); | ||
2574 | return NULL; | ||
2575 | } | ||
2576 | |||
2149 | /* | 2577 | /* |
2150 | * Check if a file is a control file | 2578 | * Check if a file is a control file |
2151 | */ | 2579 | */ |
@@ -2159,10 +2587,6 @@ static inline struct cftype *__file_cft(struct file *file) | |||
2159 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, | 2587 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, |
2160 | struct super_block *sb) | 2588 | struct super_block *sb) |
2161 | { | 2589 | { |
2162 | static const struct dentry_operations cgroup_dops = { | ||
2163 | .d_iput = cgroup_diput, | ||
2164 | }; | ||
2165 | |||
2166 | struct inode *inode; | 2590 | struct inode *inode; |
2167 | 2591 | ||
2168 | if (!dentry) | 2592 | if (!dentry) |
@@ -2188,7 +2612,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode, | |||
2188 | inode->i_size = 0; | 2612 | inode->i_size = 0; |
2189 | inode->i_fop = &cgroup_file_operations; | 2613 | inode->i_fop = &cgroup_file_operations; |
2190 | } | 2614 | } |
2191 | dentry->d_op = &cgroup_dops; | ||
2192 | d_instantiate(dentry, inode); | 2615 | d_instantiate(dentry, inode); |
2193 | dget(dentry); /* Extra count - pin the dentry in core */ | 2616 | dget(dentry); /* Extra count - pin the dentry in core */ |
2194 | return 0; | 2617 | return 0; |
@@ -3176,6 +3599,23 @@ fail: | |||
3176 | return ret; | 3599 | return ret; |
3177 | } | 3600 | } |
3178 | 3601 | ||
3602 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | ||
3603 | struct cftype *cft) | ||
3604 | { | ||
3605 | return clone_children(cgrp); | ||
3606 | } | ||
3607 | |||
3608 | static int cgroup_clone_children_write(struct cgroup *cgrp, | ||
3609 | struct cftype *cft, | ||
3610 | u64 val) | ||
3611 | { | ||
3612 | if (val) | ||
3613 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
3614 | else | ||
3615 | clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
3616 | return 0; | ||
3617 | } | ||
3618 | |||
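Note: the new cgroup.clone_children flag (handlers above, honoured in cgroup_create() further down via ->post_clone()) lets a freshly created child start with a copy of its parent's configuration, which matters mainly for cpuset. A user-space sketch with an assumed mount point:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(void)
{
	/* Assumes a cpuset hierarchy mounted at /mnt/cpuset. */
	FILE *f = fopen("/mnt/cpuset/cgroup.clone_children", "w");

	if (!f) {
		perror("clone_children");
		return 1;
	}
	fputs("1\n", f);
	fclose(f);

	/* With the flag set, the new child starts with a copy of the parent's
	 * cpuset.cpus and cpuset.mems via ->post_clone() instead of empty. */
	if (mkdir("/mnt/cpuset/child", 0755))
		perror("mkdir child");
	return 0;
}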
3179 | /* | 3619 | /* |
3180 | * for the common functions, 'private' gives the type of file | 3620 | * for the common functions, 'private' gives the type of file |
3181 | */ | 3621 | */ |
@@ -3192,9 +3632,9 @@ static struct cftype files[] = { | |||
3192 | { | 3632 | { |
3193 | .name = CGROUP_FILE_GENERIC_PREFIX "procs", | 3633 | .name = CGROUP_FILE_GENERIC_PREFIX "procs", |
3194 | .open = cgroup_procs_open, | 3634 | .open = cgroup_procs_open, |
3195 | /* .write_u64 = cgroup_procs_write, TODO */ | 3635 | .write_u64 = cgroup_procs_write, |
3196 | .release = cgroup_pidlist_release, | 3636 | .release = cgroup_pidlist_release, |
3197 | .mode = S_IRUGO, | 3637 | .mode = S_IRUGO | S_IWUSR, |
3198 | }, | 3638 | }, |
3199 | { | 3639 | { |
3200 | .name = "notify_on_release", | 3640 | .name = "notify_on_release", |
@@ -3206,6 +3646,11 @@ static struct cftype files[] = { | |||
3206 | .write_string = cgroup_write_event_control, | 3646 | .write_string = cgroup_write_event_control, |
3207 | .mode = S_IWUGO, | 3647 | .mode = S_IWUGO, |
3208 | }, | 3648 | }, |
3649 | { | ||
3650 | .name = "cgroup.clone_children", | ||
3651 | .read_u64 = cgroup_clone_children_read, | ||
3652 | .write_u64 = cgroup_clone_children_write, | ||
3653 | }, | ||
3209 | }; | 3654 | }; |
3210 | 3655 | ||
3211 | static struct cftype cft_release_agent = { | 3656 | static struct cftype cft_release_agent = { |
@@ -3335,6 +3780,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3335 | if (notify_on_release(parent)) | 3780 | if (notify_on_release(parent)) |
3336 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3781 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
3337 | 3782 | ||
3783 | if (clone_children(parent)) | ||
3784 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
3785 | |||
3338 | for_each_subsys(root, ss) { | 3786 | for_each_subsys(root, ss) { |
3339 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); | 3787 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); |
3340 | 3788 | ||
@@ -3349,6 +3797,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3349 | goto err_destroy; | 3797 | goto err_destroy; |
3350 | } | 3798 | } |
3351 | /* At error, ->destroy() callback has to free assigned ID. */ | 3799 | /* At error, ->destroy() callback has to free assigned ID. */ |
3800 | if (clone_children(parent) && ss->post_clone) | ||
3801 | ss->post_clone(ss, cgrp); | ||
3352 | } | 3802 | } |
3353 | 3803 | ||
3354 | cgroup_lock_hierarchy(root); | 3804 | cgroup_lock_hierarchy(root); |
@@ -3563,17 +4013,15 @@ again: | |||
3563 | spin_lock(&release_list_lock); | 4013 | spin_lock(&release_list_lock); |
3564 | set_bit(CGRP_REMOVED, &cgrp->flags); | 4014 | set_bit(CGRP_REMOVED, &cgrp->flags); |
3565 | if (!list_empty(&cgrp->release_list)) | 4015 | if (!list_empty(&cgrp->release_list)) |
3566 | list_del(&cgrp->release_list); | 4016 | list_del_init(&cgrp->release_list); |
3567 | spin_unlock(&release_list_lock); | 4017 | spin_unlock(&release_list_lock); |
3568 | 4018 | ||
3569 | cgroup_lock_hierarchy(cgrp->root); | 4019 | cgroup_lock_hierarchy(cgrp->root); |
3570 | /* delete this cgroup from parent->children */ | 4020 | /* delete this cgroup from parent->children */ |
3571 | list_del(&cgrp->sibling); | 4021 | list_del_init(&cgrp->sibling); |
3572 | cgroup_unlock_hierarchy(cgrp->root); | 4022 | cgroup_unlock_hierarchy(cgrp->root); |
3573 | 4023 | ||
3574 | spin_lock(&cgrp->dentry->d_lock); | ||
3575 | d = dget(cgrp->dentry); | 4024 | d = dget(cgrp->dentry); |
3576 | spin_unlock(&d->d_lock); | ||
3577 | 4025 | ||
3578 | cgroup_d_remove_dir(d); | 4026 | cgroup_d_remove_dir(d); |
3579 | dput(d); | 4027 | dput(d); |
@@ -3789,7 +4237,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
3789 | subsys[ss->subsys_id] = NULL; | 4237 | subsys[ss->subsys_id] = NULL; |
3790 | 4238 | ||
3791 | /* remove subsystem from rootnode's list of subsystems */ | 4239 | /* remove subsystem from rootnode's list of subsystems */ |
3792 | list_del(&ss->sibling); | 4240 | list_del_init(&ss->sibling); |
3793 | 4241 | ||
3794 | /* | 4242 | /* |
3795 | * disentangle the css from all css_sets attached to the dummytop. as | 4243 | * disentangle the css from all css_sets attached to the dummytop. as |
@@ -4140,20 +4588,8 @@ void cgroup_post_fork(struct task_struct *child) | |||
4140 | */ | 4588 | */ |
4141 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) | 4589 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) |
4142 | { | 4590 | { |
4143 | int i; | ||
4144 | struct css_set *cg; | 4591 | struct css_set *cg; |
4145 | 4592 | int i; | |
4146 | if (run_callbacks && need_forkexit_callback) { | ||
4147 | /* | ||
4148 | * modular subsystems can't use callbacks, so no need to lock | ||
4149 | * the subsys array | ||
4150 | */ | ||
4151 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
4152 | struct cgroup_subsys *ss = subsys[i]; | ||
4153 | if (ss->exit) | ||
4154 | ss->exit(ss, tsk); | ||
4155 | } | ||
4156 | } | ||
4157 | 4593 | ||
4158 | /* | 4594 | /* |
4159 | * Unlink from the css_set task list if necessary. | 4595 | * Unlink from the css_set task list if necessary. |
@@ -4163,7 +4599,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
4163 | if (!list_empty(&tsk->cg_list)) { | 4599 | if (!list_empty(&tsk->cg_list)) { |
4164 | write_lock(&css_set_lock); | 4600 | write_lock(&css_set_lock); |
4165 | if (!list_empty(&tsk->cg_list)) | 4601 | if (!list_empty(&tsk->cg_list)) |
4166 | list_del(&tsk->cg_list); | 4602 | list_del_init(&tsk->cg_list); |
4167 | write_unlock(&css_set_lock); | 4603 | write_unlock(&css_set_lock); |
4168 | } | 4604 | } |
4169 | 4605 | ||
@@ -4171,125 +4607,26 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
4171 | task_lock(tsk); | 4607 | task_lock(tsk); |
4172 | cg = tsk->cgroups; | 4608 | cg = tsk->cgroups; |
4173 | tsk->cgroups = &init_css_set; | 4609 | tsk->cgroups = &init_css_set; |
4174 | task_unlock(tsk); | ||
4175 | if (cg) | ||
4176 | put_css_set_taskexit(cg); | ||
4177 | } | ||
4178 | |||
4179 | /** | ||
4180 | * cgroup_clone - clone the cgroup the given subsystem is attached to | ||
4181 | * @tsk: the task to be moved | ||
4182 | * @subsys: the given subsystem | ||
4183 | * @nodename: the name for the new cgroup | ||
4184 | * | ||
4185 | * Duplicate the current cgroup in the hierarchy that the given | ||
4186 | * subsystem is attached to, and move this task into the new | ||
4187 | * child. | ||
4188 | */ | ||
4189 | int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, | ||
4190 | char *nodename) | ||
4191 | { | ||
4192 | struct dentry *dentry; | ||
4193 | int ret = 0; | ||
4194 | struct cgroup *parent, *child; | ||
4195 | struct inode *inode; | ||
4196 | struct css_set *cg; | ||
4197 | struct cgroupfs_root *root; | ||
4198 | struct cgroup_subsys *ss; | ||
4199 | |||
4200 | /* We shouldn't be called by an unregistered subsystem */ | ||
4201 | BUG_ON(!subsys->active); | ||
4202 | |||
4203 | /* First figure out what hierarchy and cgroup we're dealing | ||
4204 | * with, and pin them so we can drop cgroup_mutex */ | ||
4205 | mutex_lock(&cgroup_mutex); | ||
4206 | again: | ||
4207 | root = subsys->root; | ||
4208 | if (root == &rootnode) { | ||
4209 | mutex_unlock(&cgroup_mutex); | ||
4210 | return 0; | ||
4211 | } | ||
4212 | 4610 | ||
4213 | /* Pin the hierarchy */ | 4611 | if (run_callbacks && need_forkexit_callback) { |
4214 | if (!atomic_inc_not_zero(&root->sb->s_active)) { | 4612 | /* |
4215 | /* We race with the final deactivate_super() */ | 4613 | * modular subsystems can't use callbacks, so no need to lock |
4216 | mutex_unlock(&cgroup_mutex); | 4614 | * the subsys array |
4217 | return 0; | 4615 | */ |
4616 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
4617 | struct cgroup_subsys *ss = subsys[i]; | ||
4618 | if (ss->exit) { | ||
4619 | struct cgroup *old_cgrp = | ||
4620 | rcu_dereference_raw(cg->subsys[i])->cgroup; | ||
4621 | struct cgroup *cgrp = task_cgroup(tsk, i); | ||
4622 | ss->exit(ss, cgrp, old_cgrp, tsk); | ||
4623 | } | ||
4624 | } | ||
4218 | } | 4625 | } |
4219 | |||
4220 | /* Keep the cgroup alive */ | ||
4221 | task_lock(tsk); | ||
4222 | parent = task_cgroup(tsk, subsys->subsys_id); | ||
4223 | cg = tsk->cgroups; | ||
4224 | get_css_set(cg); | ||
4225 | task_unlock(tsk); | 4626 | task_unlock(tsk); |
4226 | 4627 | ||
4227 | mutex_unlock(&cgroup_mutex); | 4628 | if (cg) |
4228 | 4629 | put_css_set_taskexit(cg); | |
4229 | /* Now do the VFS work to create a cgroup */ | ||
4230 | inode = parent->dentry->d_inode; | ||
4231 | |||
4232 | /* Hold the parent directory mutex across this operation to | ||
4233 | * stop anyone else deleting the new cgroup */ | ||
4234 | mutex_lock(&inode->i_mutex); | ||
4235 | dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); | ||
4236 | if (IS_ERR(dentry)) { | ||
4237 | printk(KERN_INFO | ||
4238 | "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, | ||
4239 | PTR_ERR(dentry)); | ||
4240 | ret = PTR_ERR(dentry); | ||
4241 | goto out_release; | ||
4242 | } | ||
4243 | |||
4244 | /* Create the cgroup directory, which also creates the cgroup */ | ||
4245 | ret = vfs_mkdir(inode, dentry, 0755); | ||
4246 | child = __d_cgrp(dentry); | ||
4247 | dput(dentry); | ||
4248 | if (ret) { | ||
4249 | printk(KERN_INFO | ||
4250 | "Failed to create cgroup %s: %d\n", nodename, | ||
4251 | ret); | ||
4252 | goto out_release; | ||
4253 | } | ||
4254 | |||
4255 | /* The cgroup now exists. Retake cgroup_mutex and check | ||
4256 | * that we're still in the same state that we thought we | ||
4257 | * were. */ | ||
4258 | mutex_lock(&cgroup_mutex); | ||
4259 | if ((root != subsys->root) || | ||
4260 | (parent != task_cgroup(tsk, subsys->subsys_id))) { | ||
4261 | /* Aargh, we raced ... */ | ||
4262 | mutex_unlock(&inode->i_mutex); | ||
4263 | put_css_set(cg); | ||
4264 | |||
4265 | deactivate_super(root->sb); | ||
4266 | /* The cgroup is still accessible in the VFS, but | ||
4267 | * we're not going to try to rmdir() it at this | ||
4268 | * point. */ | ||
4269 | printk(KERN_INFO | ||
4270 | "Race in cgroup_clone() - leaking cgroup %s\n", | ||
4271 | nodename); | ||
4272 | goto again; | ||
4273 | } | ||
4274 | |||
4275 | /* do any required auto-setup */ | ||
4276 | for_each_subsys(root, ss) { | ||
4277 | if (ss->post_clone) | ||
4278 | ss->post_clone(ss, child); | ||
4279 | } | ||
4280 | |||
4281 | /* All seems fine. Finish by moving the task into the new cgroup */ | ||
4282 | ret = cgroup_attach_task(child, tsk); | ||
4283 | mutex_unlock(&cgroup_mutex); | ||
4284 | |||
4285 | out_release: | ||
4286 | mutex_unlock(&inode->i_mutex); | ||
4287 | |||
4288 | mutex_lock(&cgroup_mutex); | ||
4289 | put_css_set(cg); | ||
4290 | mutex_unlock(&cgroup_mutex); | ||
4291 | deactivate_super(root->sb); | ||
4292 | return ret; | ||
4293 | } | 4630 | } |
4294 | 4631 | ||
4295 | /** | 4632 | /** |
@@ -4530,14 +4867,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, | |||
4530 | return ret; | 4867 | return ret; |
4531 | } | 4868 | } |
4532 | 4869 | ||
4533 | static void __free_css_id_cb(struct rcu_head *head) | ||
4534 | { | ||
4535 | struct css_id *id; | ||
4536 | |||
4537 | id = container_of(head, struct css_id, rcu_head); | ||
4538 | kfree(id); | ||
4539 | } | ||
4540 | |||
4541 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | 4870 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) |
4542 | { | 4871 | { |
4543 | struct css_id *id = css->id; | 4872 | struct css_id *id = css->id; |
@@ -4552,7 +4881,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | |||
4552 | spin_lock(&ss->id_lock); | 4881 | spin_lock(&ss->id_lock); |
4553 | idr_remove(&ss->idr, id->id); | 4882 | idr_remove(&ss->idr, id->id); |
4554 | spin_unlock(&ss->id_lock); | 4883 | spin_unlock(&ss->id_lock); |
4555 | call_rcu(&id->rcu_head, __free_css_id_cb); | 4884 | kfree_rcu(id, rcu_head); |
4556 | } | 4885 | } |
4557 | EXPORT_SYMBOL_GPL(free_css_id); | 4886 | EXPORT_SYMBOL_GPL(free_css_id); |
4558 | 4887 | ||
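Note: the call_rcu() + __free_css_id_cb() pair collapses into kfree_rcu(), which only needs the name of the embedded rcu_head. The general conversion, sketched on a hypothetical structure:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu_head;
};

/* Before: a callback whose only job is to kfree() the container. */
static void foo_free_cb(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu_head));
}

static void foo_release_old(struct foo *f)
{
	call_rcu(&f->rcu_head, foo_free_cb);
}

/* After: kfree_rcu() takes the pointer and the rcu_head member name. */
static void foo_release_new(struct foo *f)
{
	kfree_rcu(f, rcu_head);
}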
@@ -4723,6 +5052,29 @@ css_get_next(struct cgroup_subsys *ss, int id, | |||
4723 | return ret; | 5052 | return ret; |
4724 | } | 5053 | } |
4725 | 5054 | ||
5055 | /* | ||
5056 | * get corresponding css from file open on cgroupfs directory | ||
5057 | */ | ||
5058 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | ||
5059 | { | ||
5060 | struct cgroup *cgrp; | ||
5061 | struct inode *inode; | ||
5062 | struct cgroup_subsys_state *css; | ||
5063 | |||
5064 | inode = f->f_dentry->d_inode; | ||
5065 | /* check in cgroup filesystem dir */ | ||
5066 | if (inode->i_op != &cgroup_dir_inode_operations) | ||
5067 | return ERR_PTR(-EBADF); | ||
5068 | |||
5069 | if (id < 0 || id >= CGROUP_SUBSYS_COUNT) | ||
5070 | return ERR_PTR(-EINVAL); | ||
5071 | |||
5072 | /* get cgroup */ | ||
5073 | cgrp = __d_cgrp(f->f_dentry); | ||
5074 | css = cgrp->subsys[id]; | ||
5075 | return css ? css : ERR_PTR(-ENOENT); | ||
5076 | } | ||
5077 | |||
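Note: cgroup_css_from_dir() gives kernel code that holds an open file on a cgroup directory a way to look up the attached subsystem state. A hypothetical caller, for illustration only (css_from_user_fd() is not an existing kernel function):

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/file.h>

/* Hypothetical helper: resolve the css of subsystem 'subsys_id' from a
 * cgroup directory fd handed in by user space. */
static struct cgroup_subsys_state *css_from_user_fd(int fd, int subsys_id)
{
	struct file *file = fget(fd);
	struct cgroup_subsys_state *css;

	if (!file)
		return ERR_PTR(-EBADF);
	css = cgroup_css_from_dir(file, subsys_id);
	/* A real caller would pin the css before dropping the file ref. */
	fput(file);
	return css;
}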
4726 | #ifdef CONFIG_CGROUP_DEBUG | 5078 | #ifdef CONFIG_CGROUP_DEBUG |
4727 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, | 5079 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, |
4728 | struct cgroup *cont) | 5080 | struct cgroup *cont) |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ce71ed53e88f..e691818d7e45 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
48 | struct freezer, css); | 48 | struct freezer, css); |
49 | } | 49 | } |
50 | 50 | ||
51 | int cgroup_freezing_or_frozen(struct task_struct *task) | 51 | static inline int __cgroup_freezing_or_frozen(struct task_struct *task) |
52 | { | 52 | { |
53 | struct freezer *freezer; | 53 | enum freezer_state state = task_freezer(task)->state; |
54 | enum freezer_state state; | 54 | return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); |
55 | } | ||
55 | 56 | ||
57 | int cgroup_freezing_or_frozen(struct task_struct *task) | ||
58 | { | ||
59 | int result; | ||
56 | task_lock(task); | 60 | task_lock(task); |
57 | freezer = task_freezer(task); | 61 | result = __cgroup_freezing_or_frozen(task); |
58 | if (!freezer->css.cgroup->parent) | ||
59 | state = CGROUP_THAWED; /* root cgroup can't be frozen */ | ||
60 | else | ||
61 | state = freezer->state; | ||
62 | task_unlock(task); | 62 | task_unlock(task); |
63 | 63 | return result; | |
64 | return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); | ||
65 | } | 64 | } |
66 | 65 | ||
67 | /* | 66 | /* |
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss, | |||
154 | kfree(cgroup_freezer(cgroup)); | 153 | kfree(cgroup_freezer(cgroup)); |
155 | } | 154 | } |
156 | 155 | ||
157 | /* Task is frozen or will freeze immediately when next it gets woken */ | ||
158 | static bool is_task_frozen_enough(struct task_struct *task) | ||
159 | { | ||
160 | return frozen(task) || | ||
161 | (task_is_stopped_or_traced(task) && freezing(task)); | ||
162 | } | ||
163 | |||
164 | /* | 156 | /* |
165 | * The call to cgroup_lock() in the freezer.state write method prevents | 157 | * The call to cgroup_lock() in the freezer.state write method prevents |
166 | * a write to that file racing against an attach, and hence the | 158 | * a write to that file racing against an attach, and hence the |
@@ -168,37 +160,29 @@ static bool is_task_frozen_enough(struct task_struct *task) | |||
168 | */ | 160 | */ |
169 | static int freezer_can_attach(struct cgroup_subsys *ss, | 161 | static int freezer_can_attach(struct cgroup_subsys *ss, |
170 | struct cgroup *new_cgroup, | 162 | struct cgroup *new_cgroup, |
171 | struct task_struct *task, bool threadgroup) | 163 | struct task_struct *task) |
172 | { | 164 | { |
173 | struct freezer *freezer; | 165 | struct freezer *freezer; |
174 | 166 | ||
175 | /* | 167 | /* |
176 | * Anything frozen can't move or be moved to/from. | 168 | * Anything frozen can't move or be moved to/from. |
177 | * | ||
178 | * Since orig_freezer->state == FROZEN means that @task has been | ||
179 | * frozen, so it's sufficient to check the latter condition. | ||
180 | */ | 169 | */ |
181 | 170 | ||
182 | if (is_task_frozen_enough(task)) | ||
183 | return -EBUSY; | ||
184 | |||
185 | freezer = cgroup_freezer(new_cgroup); | 171 | freezer = cgroup_freezer(new_cgroup); |
186 | if (freezer->state == CGROUP_FROZEN) | 172 | if (freezer->state != CGROUP_THAWED) |
187 | return -EBUSY; | 173 | return -EBUSY; |
188 | 174 | ||
189 | if (threadgroup) { | 175 | return 0; |
190 | struct task_struct *c; | 176 | } |
191 | 177 | ||
192 | rcu_read_lock(); | 178 | static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
193 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | 179 | { |
194 | if (is_task_frozen_enough(c)) { | 180 | rcu_read_lock(); |
195 | rcu_read_unlock(); | 181 | if (__cgroup_freezing_or_frozen(tsk)) { |
196 | return -EBUSY; | ||
197 | } | ||
198 | } | ||
199 | rcu_read_unlock(); | 182 | rcu_read_unlock(); |
183 | return -EBUSY; | ||
200 | } | 184 | } |
201 | 185 | rcu_read_unlock(); | |
202 | return 0; | 186 | return 0; |
203 | } | 187 | } |
204 | 188 | ||
@@ -236,31 +220,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) | |||
236 | /* | 220 | /* |
237 | * caller must hold freezer->lock | 221 | * caller must hold freezer->lock |
238 | */ | 222 | */ |
239 | static void update_freezer_state(struct cgroup *cgroup, | 223 | static void update_if_frozen(struct cgroup *cgroup, |
240 | struct freezer *freezer) | 224 | struct freezer *freezer) |
241 | { | 225 | { |
242 | struct cgroup_iter it; | 226 | struct cgroup_iter it; |
243 | struct task_struct *task; | 227 | struct task_struct *task; |
244 | unsigned int nfrozen = 0, ntotal = 0; | 228 | unsigned int nfrozen = 0, ntotal = 0; |
229 | enum freezer_state old_state = freezer->state; | ||
245 | 230 | ||
246 | cgroup_iter_start(cgroup, &it); | 231 | cgroup_iter_start(cgroup, &it); |
247 | while ((task = cgroup_iter_next(cgroup, &it))) { | 232 | while ((task = cgroup_iter_next(cgroup, &it))) { |
248 | ntotal++; | 233 | ntotal++; |
249 | if (is_task_frozen_enough(task)) | 234 | if (frozen(task)) |
250 | nfrozen++; | 235 | nfrozen++; |
251 | } | 236 | } |
252 | 237 | ||
253 | /* | 238 | if (old_state == CGROUP_THAWED) { |
254 | * Transition to FROZEN when no new tasks can be added ensures | 239 | BUG_ON(nfrozen > 0); |
255 | * that we never exist in the FROZEN state while there are unfrozen | 240 | } else if (old_state == CGROUP_FREEZING) { |
256 | * tasks. | 241 | if (nfrozen == ntotal) |
257 | */ | 242 | freezer->state = CGROUP_FROZEN; |
258 | if (nfrozen == ntotal) | 243 | } else { /* old_state == CGROUP_FROZEN */ |
259 | freezer->state = CGROUP_FROZEN; | 244 | BUG_ON(nfrozen != ntotal); |
260 | else if (nfrozen > 0) | 245 | } |
261 | freezer->state = CGROUP_FREEZING; | 246 | |
262 | else | ||
263 | freezer->state = CGROUP_THAWED; | ||
264 | cgroup_iter_end(cgroup, &it); | 247 | cgroup_iter_end(cgroup, &it); |
265 | } | 248 | } |
266 | 249 | ||
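Note: update_if_frozen() now only moves the cgroup forward from FREEZING to FROZEN and asserts the invariants of the other two states, instead of recomputing the state from scratch. The decision, restated as a small stand-alone C helper (not kernel code; the enum mirrors the kernel's freezer states):

enum freezer_state { CGROUP_THAWED, CGROUP_FREEZING, CGROUP_FROZEN };

/* Mirror of the update_if_frozen() decision, given the per-cgroup counts. */
static enum freezer_state next_state(enum freezer_state old,
				     unsigned int nfrozen, unsigned int ntotal)
{
	switch (old) {
	case CGROUP_THAWED:
		/* BUG_ON(nfrozen > 0): no task may be frozen while thawed */
		return CGROUP_THAWED;
	case CGROUP_FREEZING:
		return nfrozen == ntotal ? CGROUP_FROZEN : CGROUP_FREEZING;
	case CGROUP_FROZEN:
	default:
		/* BUG_ON(nfrozen != ntotal): every task must stay frozen */
		return CGROUP_FROZEN;
	}
}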
@@ -279,7 +262,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | |||
279 | if (state == CGROUP_FREEZING) { | 262 | if (state == CGROUP_FREEZING) { |
280 | /* We change from FREEZING to FROZEN lazily if the cgroup was | 263 | /* We change from FREEZING to FROZEN lazily if the cgroup was |
281 | * only partially frozen when we exited the write. */ | 264 | * only partially frozen when we exited the write. */ |
282 | update_freezer_state(cgroup, freezer); | 265 | update_if_frozen(cgroup, freezer); |
283 | state = freezer->state; | 266 | state = freezer->state; |
284 | } | 267 | } |
285 | spin_unlock_irq(&freezer->lock); | 268 | spin_unlock_irq(&freezer->lock); |
@@ -301,7 +284,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
301 | while ((task = cgroup_iter_next(cgroup, &it))) { | 284 | while ((task = cgroup_iter_next(cgroup, &it))) { |
302 | if (!freeze_task(task, true)) | 285 | if (!freeze_task(task, true)) |
303 | continue; | 286 | continue; |
304 | if (is_task_frozen_enough(task)) | 287 | if (frozen(task)) |
305 | continue; | 288 | continue; |
306 | if (!freezing(task) && !freezer_should_skip(task)) | 289 | if (!freezing(task) && !freezer_should_skip(task)) |
307 | num_cant_freeze_now++; | 290 | num_cant_freeze_now++; |
@@ -335,7 +318,7 @@ static int freezer_change_state(struct cgroup *cgroup, | |||
335 | 318 | ||
336 | spin_lock_irq(&freezer->lock); | 319 | spin_lock_irq(&freezer->lock); |
337 | 320 | ||
338 | update_freezer_state(cgroup, freezer); | 321 | update_if_frozen(cgroup, freezer); |
339 | if (goal_state == freezer->state) | 322 | if (goal_state == freezer->state) |
340 | goto out; | 323 | goto out; |
341 | 324 | ||
@@ -398,6 +381,9 @@ struct cgroup_subsys freezer_subsys = { | |||
398 | .populate = freezer_populate, | 381 | .populate = freezer_populate, |
399 | .subsys_id = freezer_subsys_id, | 382 | .subsys_id = freezer_subsys_id, |
400 | .can_attach = freezer_can_attach, | 383 | .can_attach = freezer_can_attach, |
384 | .can_attach_task = freezer_can_attach_task, | ||
385 | .pre_attach = NULL, | ||
386 | .attach_task = NULL, | ||
401 | .attach = NULL, | 387 | .attach = NULL, |
402 | .fork = freezer_fork, | 388 | .fork = freezer_fork, |
403 | .exit = NULL, | 389 | .exit = NULL, |
diff --git a/kernel/compat.c b/kernel/compat.c index c9e2ec0b34a8..fc9eb093acd5 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o, | |||
52 | put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; | 52 | put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; |
53 | } | 53 | } |
54 | 54 | ||
55 | static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) | ||
56 | { | ||
57 | memset(txc, 0, sizeof(struct timex)); | ||
58 | |||
59 | if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || | ||
60 | __get_user(txc->modes, &utp->modes) || | ||
61 | __get_user(txc->offset, &utp->offset) || | ||
62 | __get_user(txc->freq, &utp->freq) || | ||
63 | __get_user(txc->maxerror, &utp->maxerror) || | ||
64 | __get_user(txc->esterror, &utp->esterror) || | ||
65 | __get_user(txc->status, &utp->status) || | ||
66 | __get_user(txc->constant, &utp->constant) || | ||
67 | __get_user(txc->precision, &utp->precision) || | ||
68 | __get_user(txc->tolerance, &utp->tolerance) || | ||
69 | __get_user(txc->time.tv_sec, &utp->time.tv_sec) || | ||
70 | __get_user(txc->time.tv_usec, &utp->time.tv_usec) || | ||
71 | __get_user(txc->tick, &utp->tick) || | ||
72 | __get_user(txc->ppsfreq, &utp->ppsfreq) || | ||
73 | __get_user(txc->jitter, &utp->jitter) || | ||
74 | __get_user(txc->shift, &utp->shift) || | ||
75 | __get_user(txc->stabil, &utp->stabil) || | ||
76 | __get_user(txc->jitcnt, &utp->jitcnt) || | ||
77 | __get_user(txc->calcnt, &utp->calcnt) || | ||
78 | __get_user(txc->errcnt, &utp->errcnt) || | ||
79 | __get_user(txc->stbcnt, &utp->stbcnt)) | ||
80 | return -EFAULT; | ||
81 | |||
82 | return 0; | ||
83 | } | ||
84 | |||
85 | static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) | ||
86 | { | ||
87 | if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || | ||
88 | __put_user(txc->modes, &utp->modes) || | ||
89 | __put_user(txc->offset, &utp->offset) || | ||
90 | __put_user(txc->freq, &utp->freq) || | ||
91 | __put_user(txc->maxerror, &utp->maxerror) || | ||
92 | __put_user(txc->esterror, &utp->esterror) || | ||
93 | __put_user(txc->status, &utp->status) || | ||
94 | __put_user(txc->constant, &utp->constant) || | ||
95 | __put_user(txc->precision, &utp->precision) || | ||
96 | __put_user(txc->tolerance, &utp->tolerance) || | ||
97 | __put_user(txc->time.tv_sec, &utp->time.tv_sec) || | ||
98 | __put_user(txc->time.tv_usec, &utp->time.tv_usec) || | ||
99 | __put_user(txc->tick, &utp->tick) || | ||
100 | __put_user(txc->ppsfreq, &utp->ppsfreq) || | ||
101 | __put_user(txc->jitter, &utp->jitter) || | ||
102 | __put_user(txc->shift, &utp->shift) || | ||
103 | __put_user(txc->stabil, &utp->stabil) || | ||
104 | __put_user(txc->jitcnt, &utp->jitcnt) || | ||
105 | __put_user(txc->calcnt, &utp->calcnt) || | ||
106 | __put_user(txc->errcnt, &utp->errcnt) || | ||
107 | __put_user(txc->stbcnt, &utp->stbcnt) || | ||
108 | __put_user(txc->tai, &utp->tai)) | ||
109 | return -EFAULT; | ||
110 | return 0; | ||
111 | } | ||
112 | |||
55 | asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, | 113 | asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, |
56 | struct timezone __user *tz) | 114 | struct timezone __user *tz) |
57 | { | 115 | { |
@@ -235,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) | |||
235 | return compat_jiffies_to_clock_t(jiffies); | 293 | return compat_jiffies_to_clock_t(jiffies); |
236 | } | 294 | } |
237 | 295 | ||
296 | #ifdef __ARCH_WANT_SYS_SIGPENDING | ||
297 | |||
238 | /* | 298 | /* |
239 | * Assumption: old_sigset_t and compat_old_sigset_t are both | 299 | * Assumption: old_sigset_t and compat_old_sigset_t are both |
240 | * types that can be passed to put_user()/get_user(). | 300 | * types that can be passed to put_user()/get_user(). |
@@ -254,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) | |||
254 | return ret; | 314 | return ret; |
255 | } | 315 | } |
256 | 316 | ||
317 | #endif | ||
318 | |||
319 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | ||
320 | |||
257 | asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, | 321 | asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, |
258 | compat_old_sigset_t __user *oset) | 322 | compat_old_sigset_t __user *oset) |
259 | { | 323 | { |
@@ -275,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, | |||
275 | return ret; | 339 | return ret; |
276 | } | 340 | } |
277 | 341 | ||
342 | #endif | ||
343 | |||
278 | asmlinkage long compat_sys_setrlimit(unsigned int resource, | 344 | asmlinkage long compat_sys_setrlimit(unsigned int resource, |
279 | struct compat_rlimit __user *rlim) | 345 | struct compat_rlimit __user *rlim) |
280 | { | 346 | { |
@@ -617,6 +683,29 @@ long compat_sys_clock_gettime(clockid_t which_clock, | |||
617 | return err; | 683 | return err; |
618 | } | 684 | } |
619 | 685 | ||
686 | long compat_sys_clock_adjtime(clockid_t which_clock, | ||
687 | struct compat_timex __user *utp) | ||
688 | { | ||
689 | struct timex txc; | ||
690 | mm_segment_t oldfs; | ||
691 | int err, ret; | ||
692 | |||
693 | err = compat_get_timex(&txc, utp); | ||
694 | if (err) | ||
695 | return err; | ||
696 | |||
697 | oldfs = get_fs(); | ||
698 | set_fs(KERNEL_DS); | ||
699 | ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); | ||
700 | set_fs(oldfs); | ||
701 | |||
702 | err = compat_put_timex(utp, &txc); | ||
703 | if (err) | ||
704 | return err; | ||
705 | |||
706 | return ret; | ||
707 | } | ||
708 | |||
620 | long compat_sys_clock_getres(clockid_t which_clock, | 709 | long compat_sys_clock_getres(clockid_t which_clock, |
621 | struct compat_timespec __user *tp) | 710 | struct compat_timespec __user *tp) |
622 | { | 711 | { |
@@ -809,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | |||
809 | { | 898 | { |
810 | compat_sigset_t s32; | 899 | compat_sigset_t s32; |
811 | sigset_t s; | 900 | sigset_t s; |
812 | int sig; | ||
813 | struct timespec t; | 901 | struct timespec t; |
814 | siginfo_t info; | 902 | siginfo_t info; |
815 | long ret, timeout = 0; | 903 | long ret; |
816 | 904 | ||
817 | if (sigsetsize != sizeof(sigset_t)) | 905 | if (sigsetsize != sizeof(sigset_t)) |
818 | return -EINVAL; | 906 | return -EINVAL; |
@@ -820,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | |||
820 | if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) | 908 | if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) |
821 | return -EFAULT; | 909 | return -EFAULT; |
822 | sigset_from_compat(&s, &s32); | 910 | sigset_from_compat(&s, &s32); |
823 | sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
824 | signotset(&s); | ||
825 | 911 | ||
826 | if (uts) { | 912 | if (uts) { |
827 | if (get_compat_timespec (&t, uts)) | 913 | if (get_compat_timespec(&t, uts)) |
828 | return -EFAULT; | 914 | return -EFAULT; |
829 | if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 | ||
830 | || t.tv_sec < 0) | ||
831 | return -EINVAL; | ||
832 | } | 915 | } |
833 | 916 | ||
834 | spin_lock_irq(¤t->sighand->siglock); | 917 | ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); |
835 | sig = dequeue_signal(current, &s, &info); | ||
836 | if (!sig) { | ||
837 | timeout = MAX_SCHEDULE_TIMEOUT; | ||
838 | if (uts) | ||
839 | timeout = timespec_to_jiffies(&t) | ||
840 | +(t.tv_sec || t.tv_nsec); | ||
841 | if (timeout) { | ||
842 | current->real_blocked = current->blocked; | ||
843 | sigandsets(¤t->blocked, ¤t->blocked, &s); | ||
844 | |||
845 | recalc_sigpending(); | ||
846 | spin_unlock_irq(¤t->sighand->siglock); | ||
847 | |||
848 | timeout = schedule_timeout_interruptible(timeout); | ||
849 | |||
850 | spin_lock_irq(¤t->sighand->siglock); | ||
851 | sig = dequeue_signal(current, &s, &info); | ||
852 | current->blocked = current->real_blocked; | ||
853 | siginitset(¤t->real_blocked, 0); | ||
854 | recalc_sigpending(); | ||
855 | } | ||
856 | } | ||
857 | spin_unlock_irq(¤t->sighand->siglock); | ||
858 | 918 | ||
859 | if (sig) { | 919 | if (ret > 0 && uinfo) { |
860 | ret = sig; | 920 | if (copy_siginfo_to_user32(uinfo, &info)) |
861 | if (uinfo) { | 921 | ret = -EFAULT; |
862 | if (copy_siginfo_to_user32(uinfo, &info)) | ||
863 | ret = -EFAULT; | ||
864 | } | ||
865 | }else { | ||
866 | ret = timeout?-EINTR:-EAGAIN; | ||
867 | } | 922 | } |
923 | |||
868 | return ret; | 924 | return ret; |
869 | 925 | ||
870 | } | 926 | } |
@@ -951,58 +1007,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat | |||
951 | asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) | 1007 | asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) |
952 | { | 1008 | { |
953 | struct timex txc; | 1009 | struct timex txc; |
954 | int ret; | 1010 | int err, ret; |
955 | |||
956 | memset(&txc, 0, sizeof(struct timex)); | ||
957 | 1011 | ||
958 | if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || | 1012 | err = compat_get_timex(&txc, utp); |
959 | __get_user(txc.modes, &utp->modes) || | 1013 | if (err) |
960 | __get_user(txc.offset, &utp->offset) || | 1014 | return err; |
961 | __get_user(txc.freq, &utp->freq) || | ||
962 | __get_user(txc.maxerror, &utp->maxerror) || | ||
963 | __get_user(txc.esterror, &utp->esterror) || | ||
964 | __get_user(txc.status, &utp->status) || | ||
965 | __get_user(txc.constant, &utp->constant) || | ||
966 | __get_user(txc.precision, &utp->precision) || | ||
967 | __get_user(txc.tolerance, &utp->tolerance) || | ||
968 | __get_user(txc.time.tv_sec, &utp->time.tv_sec) || | ||
969 | __get_user(txc.time.tv_usec, &utp->time.tv_usec) || | ||
970 | __get_user(txc.tick, &utp->tick) || | ||
971 | __get_user(txc.ppsfreq, &utp->ppsfreq) || | ||
972 | __get_user(txc.jitter, &utp->jitter) || | ||
973 | __get_user(txc.shift, &utp->shift) || | ||
974 | __get_user(txc.stabil, &utp->stabil) || | ||
975 | __get_user(txc.jitcnt, &utp->jitcnt) || | ||
976 | __get_user(txc.calcnt, &utp->calcnt) || | ||
977 | __get_user(txc.errcnt, &utp->errcnt) || | ||
978 | __get_user(txc.stbcnt, &utp->stbcnt)) | ||
979 | return -EFAULT; | ||
980 | 1015 | ||
981 | ret = do_adjtimex(&txc); | 1016 | ret = do_adjtimex(&txc); |
982 | 1017 | ||
983 | if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || | 1018 | err = compat_put_timex(utp, &txc); |
984 | __put_user(txc.modes, &utp->modes) || | 1019 | if (err) |
985 | __put_user(txc.offset, &utp->offset) || | 1020 | return err; |
986 | __put_user(txc.freq, &utp->freq) || | ||
987 | __put_user(txc.maxerror, &utp->maxerror) || | ||
988 | __put_user(txc.esterror, &utp->esterror) || | ||
989 | __put_user(txc.status, &utp->status) || | ||
990 | __put_user(txc.constant, &utp->constant) || | ||
991 | __put_user(txc.precision, &utp->precision) || | ||
992 | __put_user(txc.tolerance, &utp->tolerance) || | ||
993 | __put_user(txc.time.tv_sec, &utp->time.tv_sec) || | ||
994 | __put_user(txc.time.tv_usec, &utp->time.tv_usec) || | ||
995 | __put_user(txc.tick, &utp->tick) || | ||
996 | __put_user(txc.ppsfreq, &utp->ppsfreq) || | ||
997 | __put_user(txc.jitter, &utp->jitter) || | ||
998 | __put_user(txc.shift, &utp->shift) || | ||
999 | __put_user(txc.stabil, &utp->stabil) || | ||
1000 | __put_user(txc.jitcnt, &utp->jitcnt) || | ||
1001 | __put_user(txc.calcnt, &utp->calcnt) || | ||
1002 | __put_user(txc.errcnt, &utp->errcnt) || | ||
1003 | __put_user(txc.stbcnt, &utp->stbcnt) || | ||
1004 | __put_user(txc.tai, &utp->tai)) | ||
1005 | ret = -EFAULT; | ||
1006 | 1021 | ||
1007 | return ret; | 1022 | return ret; |
1008 | } | 1023 | } |
diff --git a/kernel/configs.c b/kernel/configs.c index abaee684ecbf..b4066b44a99d 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf, | |||
66 | static const struct file_operations ikconfig_file_ops = { | 66 | static const struct file_operations ikconfig_file_ops = { |
67 | .owner = THIS_MODULE, | 67 | .owner = THIS_MODULE, |
68 | .read = ikconfig_read_current, | 68 | .read = ikconfig_read_current, |
69 | .llseek = default_llseek, | ||
69 | }; | 70 | }; |
70 | 71 | ||
71 | static int __init ikconfig_init(void) | 72 | static int __init ikconfig_init(void) |
diff --git a/kernel/cpu.c b/kernel/cpu.c index f6e726f18491..12b7458f23b1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void) | |||
126 | #else /* #if CONFIG_HOTPLUG_CPU */ | 126 | #else /* #if CONFIG_HOTPLUG_CPU */ |
127 | static void cpu_hotplug_begin(void) {} | 127 | static void cpu_hotplug_begin(void) {} |
128 | static void cpu_hotplug_done(void) {} | 128 | static void cpu_hotplug_done(void) {} |
129 | #endif /* #esle #if CONFIG_HOTPLUG_CPU */ | 129 | #endif /* #else #if CONFIG_HOTPLUG_CPU */ |
130 | 130 | ||
131 | /* Need to know about CPUs going up/down? */ | 131 | /* Need to know about CPUs going up/down? */ |
132 | int __ref register_cpu_notifier(struct notifier_block *nb) | 132 | int __ref register_cpu_notifier(struct notifier_block *nb) |
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v) | |||
160 | { | 160 | { |
161 | BUG_ON(cpu_notify(val, v)); | 161 | BUG_ON(cpu_notify(val, v)); |
162 | } | 162 | } |
163 | |||
164 | EXPORT_SYMBOL(register_cpu_notifier); | 163 | EXPORT_SYMBOL(register_cpu_notifier); |
165 | 164 | ||
166 | void __ref unregister_cpu_notifier(struct notifier_block *nb) | 165 | void __ref unregister_cpu_notifier(struct notifier_block *nb) |
@@ -189,7 +188,6 @@ static inline void check_for_tasks(int cpu) | |||
189 | } | 188 | } |
190 | 189 | ||
191 | struct take_cpu_down_param { | 190 | struct take_cpu_down_param { |
192 | struct task_struct *caller; | ||
193 | unsigned long mod; | 191 | unsigned long mod; |
194 | void *hcpu; | 192 | void *hcpu; |
195 | }; | 193 | }; |
@@ -198,7 +196,6 @@ struct take_cpu_down_param { | |||
198 | static int __ref take_cpu_down(void *_param) | 196 | static int __ref take_cpu_down(void *_param) |
199 | { | 197 | { |
200 | struct take_cpu_down_param *param = _param; | 198 | struct take_cpu_down_param *param = _param; |
201 | unsigned int cpu = (unsigned long)param->hcpu; | ||
202 | int err; | 199 | int err; |
203 | 200 | ||
204 | /* Ensure this CPU doesn't handle any more interrupts. */ | 201 | /* Ensure this CPU doesn't handle any more interrupts. */ |
@@ -207,12 +204,6 @@ static int __ref take_cpu_down(void *_param) | |||
207 | return err; | 204 | return err; |
208 | 205 | ||
209 | cpu_notify(CPU_DYING | param->mod, param->hcpu); | 206 | cpu_notify(CPU_DYING | param->mod, param->hcpu); |
210 | |||
211 | if (task_cpu(param->caller) == cpu) | ||
212 | move_task_off_dead_cpu(cpu, param->caller); | ||
213 | /* Force idle task to run as soon as we yield: it should | ||
214 | immediately notice cpu is offline and die quickly. */ | ||
215 | sched_idle_next(); | ||
216 | return 0; | 207 | return 0; |
217 | } | 208 | } |
218 | 209 | ||
@@ -223,7 +214,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
223 | void *hcpu = (void *)(long)cpu; | 214 | void *hcpu = (void *)(long)cpu; |
224 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 215 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
225 | struct take_cpu_down_param tcd_param = { | 216 | struct take_cpu_down_param tcd_param = { |
226 | .caller = current, | ||
227 | .mod = mod, | 217 | .mod = mod, |
228 | .hcpu = hcpu, | 218 | .hcpu = hcpu, |
229 | }; | 219 | }; |
@@ -235,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
235 | return -EINVAL; | 225 | return -EINVAL; |
236 | 226 | ||
237 | cpu_hotplug_begin(); | 227 | cpu_hotplug_begin(); |
228 | |||
238 | err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); | 229 | err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); |
239 | if (err) { | 230 | if (err) { |
240 | nr_calls--; | 231 | nr_calls--; |
@@ -253,9 +244,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
253 | } | 244 | } |
254 | BUG_ON(cpu_online(cpu)); | 245 | BUG_ON(cpu_online(cpu)); |
255 | 246 | ||
256 | /* Wait for it to sleep (leaving idle task). */ | 247 | /* |
248 | * The migration_call() CPU_DYING callback will have removed all | ||
249 | * runnable tasks from the cpu, there's only the idle task left now | ||
250 | * that the migration thread is done doing the stop_machine thing. | ||
251 | * | ||
252 | * Wait for the stop thread to go away. | ||
253 | */ | ||
257 | while (!idle_cpu(cpu)) | 254 | while (!idle_cpu(cpu)) |
258 | yield(); | 255 | cpu_relax(); |
259 | 256 | ||
260 | /* This actually kills the CPU. */ | 257 | /* This actually kills the CPU. */ |
261 | __cpu_die(cpu); | 258 | __cpu_die(cpu); |
@@ -306,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
306 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 303 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
307 | if (ret) { | 304 | if (ret) { |
308 | nr_calls--; | 305 | nr_calls--; |
309 | printk("%s: attempt to bring up CPU %u failed\n", | 306 | printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", |
310 | __func__, cpu); | 307 | __func__, cpu); |
311 | goto out_notify; | 308 | goto out_notify; |
312 | } | 309 | } |
@@ -386,6 +383,14 @@ out: | |||
386 | #ifdef CONFIG_PM_SLEEP_SMP | 383 | #ifdef CONFIG_PM_SLEEP_SMP |
387 | static cpumask_var_t frozen_cpus; | 384 | static cpumask_var_t frozen_cpus; |
388 | 385 | ||
386 | void __weak arch_disable_nonboot_cpus_begin(void) | ||
387 | { | ||
388 | } | ||
389 | |||
390 | void __weak arch_disable_nonboot_cpus_end(void) | ||
391 | { | ||
392 | } | ||
393 | |||
389 | int disable_nonboot_cpus(void) | 394 | int disable_nonboot_cpus(void) |
390 | { | 395 | { |
391 | int cpu, first_cpu, error = 0; | 396 | int cpu, first_cpu, error = 0; |
@@ -397,6 +402,7 @@ int disable_nonboot_cpus(void) | |||
397 | * with the userspace trying to use the CPU hotplug at the same time | 402 | * with the userspace trying to use the CPU hotplug at the same time |
398 | */ | 403 | */ |
399 | cpumask_clear(frozen_cpus); | 404 | cpumask_clear(frozen_cpus); |
405 | arch_disable_nonboot_cpus_begin(); | ||
400 | 406 | ||
401 | printk("Disabling non-boot CPUs ...\n"); | 407 | printk("Disabling non-boot CPUs ...\n"); |
402 | for_each_online_cpu(cpu) { | 408 | for_each_online_cpu(cpu) { |
@@ -412,6 +418,8 @@ int disable_nonboot_cpus(void) | |||
412 | } | 418 | } |
413 | } | 419 | } |
414 | 420 | ||
421 | arch_disable_nonboot_cpus_end(); | ||
422 | |||
415 | if (!error) { | 423 | if (!error) { |
416 | BUG_ON(num_online_cpus() > 1); | 424 | BUG_ON(num_online_cpus() > 1); |
417 | /* Make sure the CPUs won't be enabled by someone else */ | 425 | /* Make sure the CPUs won't be enabled by someone else */ |
@@ -441,14 +449,14 @@ void __ref enable_nonboot_cpus(void) | |||
441 | if (cpumask_empty(frozen_cpus)) | 449 | if (cpumask_empty(frozen_cpus)) |
442 | goto out; | 450 | goto out; |
443 | 451 | ||
444 | printk("Enabling non-boot CPUs ...\n"); | 452 | printk(KERN_INFO "Enabling non-boot CPUs ...\n"); |
445 | 453 | ||
446 | arch_enable_nonboot_cpus_begin(); | 454 | arch_enable_nonboot_cpus_begin(); |
447 | 455 | ||
448 | for_each_cpu(cpu, frozen_cpus) { | 456 | for_each_cpu(cpu, frozen_cpus) { |
449 | error = _cpu_up(cpu, 1); | 457 | error = _cpu_up(cpu, 1); |
450 | if (!error) { | 458 | if (!error) { |
451 | printk("CPU%d is up\n", cpu); | 459 | printk(KERN_INFO "CPU%d is up\n", cpu); |
452 | continue; | 460 | continue; |
453 | } | 461 | } |
454 | printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); | 462 | printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); |
@@ -500,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) | |||
500 | */ | 508 | */ |
501 | 509 | ||
502 | /* cpu_bit_bitmap[0] is empty - so we can back into it */ | 510 | /* cpu_bit_bitmap[0] is empty - so we can back into it */ |
503 | #define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) | 511 | #define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) |
504 | #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) | 512 | #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) |
505 | #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) | 513 | #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) |
506 | #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) | 514 | #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b23c0979bbe7..9c9b7545c810 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock); | |||
231 | * users. If someone tries to mount the "cpuset" filesystem, we | 231 | * users. If someone tries to mount the "cpuset" filesystem, we |
232 | * silently switch it to mount "cgroup" instead | 232 | * silently switch it to mount "cgroup" instead |
233 | */ | 233 | */ |
234 | static int cpuset_get_sb(struct file_system_type *fs_type, | 234 | static struct dentry *cpuset_mount(struct file_system_type *fs_type, |
235 | int flags, const char *unused_dev_name, | 235 | int flags, const char *unused_dev_name, void *data) |
236 | void *data, struct vfsmount *mnt) | ||
237 | { | 236 | { |
238 | struct file_system_type *cgroup_fs = get_fs_type("cgroup"); | 237 | struct file_system_type *cgroup_fs = get_fs_type("cgroup"); |
239 | int ret = -ENODEV; | 238 | struct dentry *ret = ERR_PTR(-ENODEV); |
240 | if (cgroup_fs) { | 239 | if (cgroup_fs) { |
241 | char mountopts[] = | 240 | char mountopts[] = |
242 | "cpuset,noprefix," | 241 | "cpuset,noprefix," |
243 | "release_agent=/sbin/cpuset_release_agent"; | 242 | "release_agent=/sbin/cpuset_release_agent"; |
244 | ret = cgroup_fs->get_sb(cgroup_fs, flags, | 243 | ret = cgroup_fs->mount(cgroup_fs, flags, |
245 | unused_dev_name, mountopts, mnt); | 244 | unused_dev_name, mountopts); |
246 | put_filesystem(cgroup_fs); | 245 | put_filesystem(cgroup_fs); |
247 | } | 246 | } |
248 | return ret; | 247 | return ret; |
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type, | |||
250 | 249 | ||
251 | static struct file_system_type cpuset_fs_type = { | 250 | static struct file_system_type cpuset_fs_type = { |
252 | .name = "cpuset", | 251 | .name = "cpuset", |
253 | .get_sb = cpuset_get_sb, | 252 | .mount = cpuset_mount, |
254 | }; | 253 | }; |
255 | 254 | ||
256 | /* | 255 | /* |
@@ -1016,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
1016 | struct cpuset *cs; | 1015 | struct cpuset *cs; |
1017 | int migrate; | 1016 | int migrate; |
1018 | const nodemask_t *oldmem = scan->data; | 1017 | const nodemask_t *oldmem = scan->data; |
1019 | NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); | 1018 | static nodemask_t newmems; /* protected by cgroup_mutex */ |
1020 | |||
1021 | if (!newmems) | ||
1022 | return; | ||
1023 | 1019 | ||
1024 | cs = cgroup_cs(scan->cg); | 1020 | cs = cgroup_cs(scan->cg); |
1025 | guarantee_online_mems(cs, newmems); | 1021 | guarantee_online_mems(cs, &newmems); |
1026 | 1022 | ||
1027 | cpuset_change_task_nodemask(p, newmems); | 1023 | cpuset_change_task_nodemask(p, &newmems); |
1028 | |||
1029 | NODEMASK_FREE(newmems); | ||
1030 | 1024 | ||
1031 | mm = get_task_mm(p); | 1025 | mm = get_task_mm(p); |
1032 | if (!mm) | 1026 | if (!mm) |
@@ -1165,7 +1159,7 @@ int current_cpuset_is_being_rebound(void) | |||
1165 | static int update_relax_domain_level(struct cpuset *cs, s64 val) | 1159 | static int update_relax_domain_level(struct cpuset *cs, s64 val) |
1166 | { | 1160 | { |
1167 | #ifdef CONFIG_SMP | 1161 | #ifdef CONFIG_SMP |
1168 | if (val < -1 || val >= SD_LV_MAX) | 1162 | if (val < -1 || val >= sched_domain_level_max) |
1169 | return -EINVAL; | 1163 | return -EINVAL; |
1170 | #endif | 1164 | #endif |
1171 | 1165 | ||
@@ -1373,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1373 | return val; | 1367 | return val; |
1374 | } | 1368 | } |
1375 | 1369 | ||
1376 | /* Protected by cgroup_lock */ | ||
1377 | static cpumask_var_t cpus_attach; | ||
1378 | |||
1379 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | 1370 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ |
1380 | static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | 1371 | static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, |
1381 | struct task_struct *tsk, bool threadgroup) | 1372 | struct task_struct *tsk) |
1382 | { | 1373 | { |
1383 | int ret; | ||
1384 | struct cpuset *cs = cgroup_cs(cont); | 1374 | struct cpuset *cs = cgroup_cs(cont); |
1385 | 1375 | ||
1386 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1376 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
@@ -1397,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
1397 | if (tsk->flags & PF_THREAD_BOUND) | 1387 | if (tsk->flags & PF_THREAD_BOUND) |
1398 | return -EINVAL; | 1388 | return -EINVAL; |
1399 | 1389 | ||
1400 | ret = security_task_setscheduler(tsk, 0, NULL); | ||
1401 | if (ret) | ||
1402 | return ret; | ||
1403 | if (threadgroup) { | ||
1404 | struct task_struct *c; | ||
1405 | |||
1406 | rcu_read_lock(); | ||
1407 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
1408 | ret = security_task_setscheduler(c, 0, NULL); | ||
1409 | if (ret) { | ||
1410 | rcu_read_unlock(); | ||
1411 | return ret; | ||
1412 | } | ||
1413 | } | ||
1414 | rcu_read_unlock(); | ||
1415 | } | ||
1416 | return 0; | 1390 | return 0; |
1417 | } | 1391 | } |
1418 | 1392 | ||
1419 | static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, | 1393 | static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) |
1420 | struct cpuset *cs) | 1394 | { |
1395 | return security_task_setscheduler(task); | ||
1396 | } | ||
1397 | |||
1398 | /* | ||
1399 | * Protected by cgroup_lock. The nodemasks must be stored globally because | ||
1400 | * dynamically allocating them is not allowed in pre_attach, and they must | ||
1401 | * persist among pre_attach, attach_task, and attach. | ||
1402 | */ | ||
1403 | static cpumask_var_t cpus_attach; | ||
1404 | static nodemask_t cpuset_attach_nodemask_from; | ||
1405 | static nodemask_t cpuset_attach_nodemask_to; | ||
1406 | |||
1407 | /* Set-up work for before attaching each task. */ | ||
1408 | static void cpuset_pre_attach(struct cgroup *cont) | ||
1409 | { | ||
1410 | struct cpuset *cs = cgroup_cs(cont); | ||
1411 | |||
1412 | if (cs == &top_cpuset) | ||
1413 | cpumask_copy(cpus_attach, cpu_possible_mask); | ||
1414 | else | ||
1415 | guarantee_online_cpus(cs, cpus_attach); | ||
1416 | |||
1417 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | ||
1418 | } | ||
1419 | |||
1420 | /* Per-thread attachment work. */ | ||
1421 | static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) | ||
1421 | { | 1422 | { |
1422 | int err; | 1423 | int err; |
1424 | struct cpuset *cs = cgroup_cs(cont); | ||
1425 | |||
1423 | /* | 1426 | /* |
1424 | * can_attach beforehand should guarantee that this doesn't fail. | 1427 | * can_attach beforehand should guarantee that this doesn't fail. |
1425 | * TODO: have a better way to handle failure here | 1428 | * TODO: have a better way to handle failure here |
@@ -1427,56 +1430,31 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, | |||
1427 | err = set_cpus_allowed_ptr(tsk, cpus_attach); | 1430 | err = set_cpus_allowed_ptr(tsk, cpus_attach); |
1428 | WARN_ON_ONCE(err); | 1431 | WARN_ON_ONCE(err); |
1429 | 1432 | ||
1430 | cpuset_change_task_nodemask(tsk, to); | 1433 | cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); |
1431 | cpuset_update_task_spread_flag(cs, tsk); | 1434 | cpuset_update_task_spread_flag(cs, tsk); |
1432 | |||
1433 | } | 1435 | } |
1434 | 1436 | ||
1435 | static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, | 1437 | static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, |
1436 | struct cgroup *oldcont, struct task_struct *tsk, | 1438 | struct cgroup *oldcont, struct task_struct *tsk) |
1437 | bool threadgroup) | ||
1438 | { | 1439 | { |
1439 | struct mm_struct *mm; | 1440 | struct mm_struct *mm; |
1440 | struct cpuset *cs = cgroup_cs(cont); | 1441 | struct cpuset *cs = cgroup_cs(cont); |
1441 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1442 | struct cpuset *oldcs = cgroup_cs(oldcont); |
1442 | NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); | ||
1443 | NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); | ||
1444 | |||
1445 | if (from == NULL || to == NULL) | ||
1446 | goto alloc_fail; | ||
1447 | 1443 | ||
1448 | if (cs == &top_cpuset) { | 1444 | /* |
1449 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1445 | * Change mm, possibly for multiple threads in a threadgroup. This is |
1450 | } else { | 1446 | * expensive and may sleep. |
1451 | guarantee_online_cpus(cs, cpus_attach); | 1447 | */ |
1452 | } | 1448 | cpuset_attach_nodemask_from = oldcs->mems_allowed; |
1453 | guarantee_online_mems(cs, to); | 1449 | cpuset_attach_nodemask_to = cs->mems_allowed; |
1454 | |||
1455 | /* do per-task migration stuff possibly for each in the threadgroup */ | ||
1456 | cpuset_attach_task(tsk, to, cs); | ||
1457 | if (threadgroup) { | ||
1458 | struct task_struct *c; | ||
1459 | rcu_read_lock(); | ||
1460 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
1461 | cpuset_attach_task(c, to, cs); | ||
1462 | } | ||
1463 | rcu_read_unlock(); | ||
1464 | } | ||
1465 | |||
1466 | /* change mm; only needs to be done once even if threadgroup */ | ||
1467 | *from = oldcs->mems_allowed; | ||
1468 | *to = cs->mems_allowed; | ||
1469 | mm = get_task_mm(tsk); | 1450 | mm = get_task_mm(tsk); |
1470 | if (mm) { | 1451 | if (mm) { |
1471 | mpol_rebind_mm(mm, to); | 1452 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); |
1472 | if (is_memory_migrate(cs)) | 1453 | if (is_memory_migrate(cs)) |
1473 | cpuset_migrate_mm(mm, from, to); | 1454 | cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, |
1455 | &cpuset_attach_nodemask_to); | ||
1474 | mmput(mm); | 1456 | mmput(mm); |
1475 | } | 1457 | } |
1476 | |||
1477 | alloc_fail: | ||
1478 | NODEMASK_FREE(from); | ||
1479 | NODEMASK_FREE(to); | ||
1480 | } | 1458 | } |
1481 | 1459 | ||
1482 | /* The various types of files and directories in a cpuset file system */ | 1460 | /* The various types of files and directories in a cpuset file system */ |
@@ -1576,8 +1554,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
1576 | return -ENODEV; | 1554 | return -ENODEV; |
1577 | 1555 | ||
1578 | trialcs = alloc_trial_cpuset(cs); | 1556 | trialcs = alloc_trial_cpuset(cs); |
1579 | if (!trialcs) | 1557 | if (!trialcs) { |
1580 | return -ENOMEM; | 1558 | retval = -ENOMEM; |
1559 | goto out; | ||
1560 | } | ||
1581 | 1561 | ||
1582 | switch (cft->private) { | 1562 | switch (cft->private) { |
1583 | case FILE_CPULIST: | 1563 | case FILE_CPULIST: |
@@ -1592,6 +1572,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
1592 | } | 1572 | } |
1593 | 1573 | ||
1594 | free_trial_cpuset(trialcs); | 1574 | free_trial_cpuset(trialcs); |
1575 | out: | ||
1595 | cgroup_unlock(); | 1576 | cgroup_unlock(); |
1596 | return retval; | 1577 | return retval; |
1597 | } | 1578 | } |
@@ -1608,34 +1589,26 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
1608 | * across a page fault. | 1589 | * across a page fault. |
1609 | */ | 1590 | */ |
1610 | 1591 | ||
1611 | static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | 1592 | static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) |
1612 | { | 1593 | { |
1613 | int ret; | 1594 | size_t count; |
1614 | 1595 | ||
1615 | mutex_lock(&callback_mutex); | 1596 | mutex_lock(&callback_mutex); |
1616 | ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); | 1597 | count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); |
1617 | mutex_unlock(&callback_mutex); | 1598 | mutex_unlock(&callback_mutex); |
1618 | 1599 | ||
1619 | return ret; | 1600 | return count; |
1620 | } | 1601 | } |
1621 | 1602 | ||
1622 | static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | 1603 | static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) |
1623 | { | 1604 | { |
1624 | NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); | 1605 | size_t count; |
1625 | int retval; | ||
1626 | |||
1627 | if (mask == NULL) | ||
1628 | return -ENOMEM; | ||
1629 | 1606 | ||
1630 | mutex_lock(&callback_mutex); | 1607 | mutex_lock(&callback_mutex); |
1631 | *mask = cs->mems_allowed; | 1608 | count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); |
1632 | mutex_unlock(&callback_mutex); | 1609 | mutex_unlock(&callback_mutex); |
1633 | 1610 | ||
1634 | retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); | 1611 | return count; |
1635 | |||
1636 | NODEMASK_FREE(mask); | ||
1637 | |||
1638 | return retval; | ||
1639 | } | 1612 | } |
1640 | 1613 | ||
1641 | static ssize_t cpuset_common_file_read(struct cgroup *cont, | 1614 | static ssize_t cpuset_common_file_read(struct cgroup *cont, |
@@ -1829,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1829 | } | 1802 | } |
1830 | 1803 | ||
1831 | /* | 1804 | /* |
1832 | * post_clone() is called at the end of cgroup_clone(). | 1805 | * post_clone() is called during cgroup_create() when the |
1833 | * 'cgroup' was just created automatically as a result of | 1806 | * clone_children mount argument was specified. The cgroup |
1834 | * a cgroup_clone(), and the current task is about to | 1807 | * can not yet have any tasks. |
1835 | * be moved into 'cgroup'. | ||
1836 | * | 1808 | * |
1837 | * Currently we refuse to set up the cgroup - thereby | 1809 | * Currently we refuse to set up the cgroup - thereby |
1838 | * refusing the task to be entered, and as a result refusing | 1810 | * refusing the task to be entered, and as a result refusing |
@@ -1860,8 +1832,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, | |||
1860 | cs = cgroup_cs(cgroup); | 1832 | cs = cgroup_cs(cgroup); |
1861 | parent_cs = cgroup_cs(parent); | 1833 | parent_cs = cgroup_cs(parent); |
1862 | 1834 | ||
1835 | mutex_lock(&callback_mutex); | ||
1863 | cs->mems_allowed = parent_cs->mems_allowed; | 1836 | cs->mems_allowed = parent_cs->mems_allowed; |
1864 | cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); | 1837 | cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); |
1838 | mutex_unlock(&callback_mutex); | ||
1865 | return; | 1839 | return; |
1866 | } | 1840 | } |
1867 | 1841 | ||
@@ -1929,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = { | |||
1929 | .create = cpuset_create, | 1903 | .create = cpuset_create, |
1930 | .destroy = cpuset_destroy, | 1904 | .destroy = cpuset_destroy, |
1931 | .can_attach = cpuset_can_attach, | 1905 | .can_attach = cpuset_can_attach, |
1906 | .can_attach_task = cpuset_can_attach_task, | ||
1907 | .pre_attach = cpuset_pre_attach, | ||
1908 | .attach_task = cpuset_attach_task, | ||
1932 | .attach = cpuset_attach, | 1909 | .attach = cpuset_attach, |
1933 | .populate = cpuset_populate, | 1910 | .populate = cpuset_populate, |
1934 | .post_clone = cpuset_post_clone, | 1911 | .post_clone = cpuset_post_clone, |
@@ -2064,10 +2041,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
2064 | struct cpuset *cp; /* scans cpusets being updated */ | 2041 | struct cpuset *cp; /* scans cpusets being updated */ |
2065 | struct cpuset *child; /* scans child cpusets of cp */ | 2042 | struct cpuset *child; /* scans child cpusets of cp */ |
2066 | struct cgroup *cont; | 2043 | struct cgroup *cont; |
2067 | NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); | 2044 | static nodemask_t oldmems; /* protected by cgroup_mutex */ |
2068 | |||
2069 | if (oldmems == NULL) | ||
2070 | return; | ||
2071 | 2045 | ||
2072 | list_add_tail((struct list_head *)&root->stack_list, &queue); | 2046 | list_add_tail((struct list_head *)&root->stack_list, &queue); |
2073 | 2047 | ||
@@ -2084,7 +2058,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
2084 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) | 2058 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) |
2085 | continue; | 2059 | continue; |
2086 | 2060 | ||
2087 | *oldmems = cp->mems_allowed; | 2061 | oldmems = cp->mems_allowed; |
2088 | 2062 | ||
2089 | /* Remove offline cpus and mems from this cpuset. */ | 2063 | /* Remove offline cpus and mems from this cpuset. */ |
2090 | mutex_lock(&callback_mutex); | 2064 | mutex_lock(&callback_mutex); |
@@ -2100,10 +2074,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
2100 | remove_tasks_in_empty_cpuset(cp); | 2074 | remove_tasks_in_empty_cpuset(cp); |
2101 | else { | 2075 | else { |
2102 | update_tasks_cpumask(cp, NULL); | 2076 | update_tasks_cpumask(cp, NULL); |
2103 | update_tasks_nodemask(cp, oldmems, NULL); | 2077 | update_tasks_nodemask(cp, &oldmems, NULL); |
2104 | } | 2078 | } |
2105 | } | 2079 | } |
2106 | NODEMASK_FREE(oldmems); | ||
2107 | } | 2080 | } |
2108 | 2081 | ||
2109 | /* | 2082 | /* |
@@ -2145,19 +2118,16 @@ void cpuset_update_active_cpus(void) | |||
2145 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2118 | static int cpuset_track_online_nodes(struct notifier_block *self, |
2146 | unsigned long action, void *arg) | 2119 | unsigned long action, void *arg) |
2147 | { | 2120 | { |
2148 | NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); | 2121 | static nodemask_t oldmems; /* protected by cgroup_mutex */ |
2149 | |||
2150 | if (oldmems == NULL) | ||
2151 | return NOTIFY_DONE; | ||
2152 | 2122 | ||
2153 | cgroup_lock(); | 2123 | cgroup_lock(); |
2154 | switch (action) { | 2124 | switch (action) { |
2155 | case MEM_ONLINE: | 2125 | case MEM_ONLINE: |
2156 | *oldmems = top_cpuset.mems_allowed; | 2126 | oldmems = top_cpuset.mems_allowed; |
2157 | mutex_lock(&callback_mutex); | 2127 | mutex_lock(&callback_mutex); |
2158 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2128 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
2159 | mutex_unlock(&callback_mutex); | 2129 | mutex_unlock(&callback_mutex); |
2160 | update_tasks_nodemask(&top_cpuset, oldmems, NULL); | 2130 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); |
2161 | break; | 2131 | break; |
2162 | case MEM_OFFLINE: | 2132 | case MEM_OFFLINE: |
2163 | /* | 2133 | /* |
@@ -2171,7 +2141,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2171 | } | 2141 | } |
2172 | cgroup_unlock(); | 2142 | cgroup_unlock(); |
2173 | 2143 | ||
2174 | NODEMASK_FREE(oldmems); | ||
2175 | return NOTIFY_OK; | 2144 | return NOTIFY_OK; |
2176 | } | 2145 | } |
2177 | #endif | 2146 | #endif |
@@ -2221,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) | |||
2221 | rcu_read_lock(); | 2190 | rcu_read_lock(); |
2222 | cs = task_cs(tsk); | 2191 | cs = task_cs(tsk); |
2223 | if (cs) | 2192 | if (cs) |
2224 | cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); | 2193 | do_set_cpus_allowed(tsk, cs->cpus_allowed); |
2225 | rcu_read_unlock(); | 2194 | rcu_read_unlock(); |
2226 | 2195 | ||
2227 | /* | 2196 | /* |
@@ -2248,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) | |||
2248 | * Like above we can temporary set any mask and rely on | 2217 | * Like above we can temporary set any mask and rely on |
2249 | * set_cpus_allowed_ptr() as synchronization point. | 2218 | * set_cpus_allowed_ptr() as synchronization point. |
2250 | */ | 2219 | */ |
2251 | cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); | 2220 | do_set_cpus_allowed(tsk, cpu_possible_mask); |
2252 | cpu = cpumask_any(cpu_active_mask); | 2221 | cpu = cpumask_any(cpu_active_mask); |
2253 | } | 2222 | } |
2254 | 2223 | ||
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 000000000000..5f85690285d4 --- /dev/null +++ b/kernel/crash_dump.c | |||
@@ -0,0 +1,34 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/crash_dump.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/errno.h> | ||
5 | #include <linux/module.h> | ||
6 | |||
7 | /* | ||
8 | * If we have booted due to a crash, max_pfn will be a very low value. We need | ||
9 | * to know the amount of memory that the previous kernel used. | ||
10 | */ | ||
11 | unsigned long saved_max_pfn; | ||
12 | |||
13 | /* | ||
14 | * stores the physical address of elf header of crash image | ||
15 | * | ||
16 | * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by | ||
17 | * is_kdump_kernel() to determine if we are booting after a panic. Hence put | ||
18 | * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. | ||
19 | */ | ||
20 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | ||
21 | |||
22 | /* | ||
23 | * elfcorehdr= specifies the location of elf core header stored by the crashed | ||
24 | * kernel. This option will be passed by kexec loader to the capture kernel. | ||
25 | */ | ||
26 | static int __init setup_elfcorehdr(char *arg) | ||
27 | { | ||
28 | char *end; | ||
29 | if (!arg) | ||
30 | return -EINVAL; | ||
31 | elfcorehdr_addr = memparse(arg, &end); | ||
32 | return end > arg ? 0 : -EINVAL; | ||
33 | } | ||
34 | early_param("elfcorehdr", setup_elfcorehdr); | ||
diff --git a/kernel/cred.c b/kernel/cred.c index 9a3e22641fe7..174fa84eca30 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* Task credentials management - see Documentation/credentials.txt | 1 | /* Task credentials management - see Documentation/security/credentials.txt |
2 | * | 2 | * |
3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar; | |||
35 | static struct thread_group_cred init_tgcred = { | 35 | static struct thread_group_cred init_tgcred = { |
36 | .usage = ATOMIC_INIT(2), | 36 | .usage = ATOMIC_INIT(2), |
37 | .tgid = 0, | 37 | .tgid = 0, |
38 | .lock = SPIN_LOCK_UNLOCKED, | 38 | .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), |
39 | }; | 39 | }; |
40 | #endif | 40 | #endif |
41 | 41 | ||
@@ -49,11 +49,12 @@ struct cred init_cred = { | |||
49 | .magic = CRED_MAGIC, | 49 | .magic = CRED_MAGIC, |
50 | #endif | 50 | #endif |
51 | .securebits = SECUREBITS_DEFAULT, | 51 | .securebits = SECUREBITS_DEFAULT, |
52 | .cap_inheritable = CAP_INIT_INH_SET, | 52 | .cap_inheritable = CAP_EMPTY_SET, |
53 | .cap_permitted = CAP_FULL_SET, | 53 | .cap_permitted = CAP_FULL_SET, |
54 | .cap_effective = CAP_INIT_EFF_SET, | 54 | .cap_effective = CAP_FULL_SET, |
55 | .cap_bset = CAP_INIT_BSET, | 55 | .cap_bset = CAP_FULL_SET, |
56 | .user = INIT_USER, | 56 | .user = INIT_USER, |
57 | .user_ns = &init_user_ns, | ||
57 | .group_info = &init_groups, | 58 | .group_info = &init_groups, |
58 | #ifdef CONFIG_KEYS | 59 | #ifdef CONFIG_KEYS |
59 | .tgcred = &init_tgcred, | 60 | .tgcred = &init_tgcred, |
@@ -252,13 +253,13 @@ struct cred *cred_alloc_blank(void) | |||
252 | #endif | 253 | #endif |
253 | 254 | ||
254 | atomic_set(&new->usage, 1); | 255 | atomic_set(&new->usage, 1); |
256 | #ifdef CONFIG_DEBUG_CREDENTIALS | ||
257 | new->magic = CRED_MAGIC; | ||
258 | #endif | ||
255 | 259 | ||
256 | if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) | 260 | if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) |
257 | goto error; | 261 | goto error; |
258 | 262 | ||
259 | #ifdef CONFIG_DEBUG_CREDENTIALS | ||
260 | new->magic = CRED_MAGIC; | ||
261 | #endif | ||
262 | return new; | 263 | return new; |
263 | 264 | ||
264 | error: | 265 | error: |
@@ -325,7 +326,7 @@ EXPORT_SYMBOL(prepare_creds); | |||
325 | 326 | ||
326 | /* | 327 | /* |
327 | * Prepare credentials for current to perform an execve() | 328 | * Prepare credentials for current to perform an execve() |
328 | * - The caller must hold current->cred_guard_mutex | 329 | * - The caller must hold ->cred_guard_mutex |
329 | */ | 330 | */ |
330 | struct cred *prepare_exec_creds(void) | 331 | struct cred *prepare_exec_creds(void) |
331 | { | 332 | { |
@@ -384,8 +385,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
384 | struct cred *new; | 385 | struct cred *new; |
385 | int ret; | 386 | int ret; |
386 | 387 | ||
387 | mutex_init(&p->cred_guard_mutex); | ||
388 | |||
389 | if ( | 388 | if ( |
390 | #ifdef CONFIG_KEYS | 389 | #ifdef CONFIG_KEYS |
391 | !p->cred->thread_keyring && | 390 | !p->cred->thread_keyring && |
@@ -412,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
412 | goto error_put; | 411 | goto error_put; |
413 | } | 412 | } |
414 | 413 | ||
414 | /* cache user_ns in cred. Doesn't need a refcount because it will | ||
415 | * stay pinned by cred->user | ||
416 | */ | ||
417 | new->user_ns = new->user->user_ns; | ||
418 | |||
415 | #ifdef CONFIG_KEYS | 419 | #ifdef CONFIG_KEYS |
416 | /* new threads get their own thread keyrings if their parent already | 420 | /* new threads get their own thread keyrings if their parent already |
417 | * had one */ | 421 | * had one */ |
@@ -659,6 +663,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
659 | validate_creds(old); | 663 | validate_creds(old); |
660 | 664 | ||
661 | *new = *old; | 665 | *new = *old; |
666 | atomic_set(&new->usage, 1); | ||
667 | set_cred_subscribers(new, 0); | ||
662 | get_uid(new->user); | 668 | get_uid(new->user); |
663 | get_group_info(new->group_info); | 669 | get_group_info(new->group_info); |
664 | 670 | ||
@@ -676,8 +682,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
676 | if (security_prepare_creds(new, old, GFP_KERNEL) < 0) | 682 | if (security_prepare_creds(new, old, GFP_KERNEL) < 0) |
677 | goto error; | 683 | goto error; |
678 | 684 | ||
679 | atomic_set(&new->usage, 1); | ||
680 | set_cred_subscribers(new, 0); | ||
681 | put_cred(old); | 685 | put_cred(old); |
682 | validate_creds(new); | 686 | validate_creds(new); |
683 | return new; | 687 | return new; |
@@ -750,7 +754,11 @@ bool creds_are_invalid(const struct cred *cred) | |||
750 | if (cred->magic != CRED_MAGIC) | 754 | if (cred->magic != CRED_MAGIC) |
751 | return true; | 755 | return true; |
752 | #ifdef CONFIG_SECURITY_SELINUX | 756 | #ifdef CONFIG_SECURITY_SELINUX |
753 | if (selinux_is_enabled()) { | 757 | /* |
758 | * cred->security == NULL if security_cred_alloc_blank() or | ||
759 | * security_prepare_creds() returned an error. | ||
760 | */ | ||
761 | if (selinux_is_enabled() && cred->security) { | ||
754 | if ((unsigned long) cred->security < PAGE_SIZE) | 762 | if ((unsigned long) cred->security < PAGE_SIZE) |
755 | return true; | 763 | return true; |
756 | if ((*(u32 *)cred->security & 0xffffff00) == | 764 | if ((*(u32 *)cred->security & 0xffffff00) == |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index de407c78178d..bad6786dee88 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/pid.h> | 47 | #include <linux/pid.h> |
48 | #include <linux/smp.h> | 48 | #include <linux/smp.h> |
49 | #include <linux/mm.h> | 49 | #include <linux/mm.h> |
50 | #include <linux/rcupdate.h> | ||
50 | 51 | ||
51 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
52 | #include <asm/byteorder.h> | 53 | #include <asm/byteorder.h> |
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { | |||
109 | */ | 110 | */ |
110 | atomic_t kgdb_active = ATOMIC_INIT(-1); | 111 | atomic_t kgdb_active = ATOMIC_INIT(-1); |
111 | EXPORT_SYMBOL_GPL(kgdb_active); | 112 | EXPORT_SYMBOL_GPL(kgdb_active); |
113 | static DEFINE_RAW_SPINLOCK(dbg_master_lock); | ||
114 | static DEFINE_RAW_SPINLOCK(dbg_slave_lock); | ||
112 | 115 | ||
113 | /* | 116 | /* |
114 | * We use NR_CPUs not PERCPU, in case kgdb is used to debug early | 117 | * We use NR_CPUs not PERCPU, in case kgdb is used to debug early |
115 | * bootup code (which might not have percpu set up yet): | 118 | * bootup code (which might not have percpu set up yet): |
116 | */ | 119 | */ |
117 | static atomic_t passive_cpu_wait[NR_CPUS]; | 120 | static atomic_t masters_in_kgdb; |
118 | static atomic_t cpu_in_kgdb[NR_CPUS]; | 121 | static atomic_t slaves_in_kgdb; |
119 | static atomic_t kgdb_break_tasklet_var; | 122 | static atomic_t kgdb_break_tasklet_var; |
120 | atomic_t kgdb_setting_breakpoint; | 123 | atomic_t kgdb_setting_breakpoint; |
121 | 124 | ||
@@ -206,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) | |||
206 | return 0; | 209 | return 0; |
207 | } | 210 | } |
208 | 211 | ||
209 | /** | ||
210 | * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. | ||
211 | * @regs: Current &struct pt_regs. | ||
212 | * | ||
213 | * This function will be called if the particular architecture must | ||
214 | * disable hardware debugging while it is processing gdb packets or | ||
215 | * handling exception. | ||
216 | */ | ||
217 | void __weak kgdb_disable_hw_debug(struct pt_regs *regs) | ||
218 | { | ||
219 | } | ||
220 | |||
221 | /* | 212 | /* |
222 | * Some architectures need cache flushes when we set/clear a | 213 | * Some architectures need cache flushes when we set/clear a |
223 | * breakpoint: | 214 | * breakpoint: |
@@ -457,26 +448,34 @@ static int kgdb_reenter_check(struct kgdb_state *ks) | |||
457 | return 1; | 448 | return 1; |
458 | } | 449 | } |
459 | 450 | ||
460 | static void dbg_cpu_switch(int cpu, int next_cpu) | 451 | static void dbg_touch_watchdogs(void) |
461 | { | 452 | { |
462 | /* Mark the cpu we are switching away from as a slave when it | 453 | touch_softlockup_watchdog_sync(); |
463 | * holds the kgdb_active token. This must be done so that the | 454 | clocksource_touch_watchdog(); |
464 | * that all the cpus wait in for the debug core will not enter | 455 | rcu_cpu_stall_reset(); |
465 | * again as the master. */ | ||
466 | if (cpu == atomic_read(&kgdb_active)) { | ||
467 | kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; | ||
468 | kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER; | ||
469 | } | ||
470 | kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER; | ||
471 | } | 456 | } |
472 | 457 | ||
473 | static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) | 458 | static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, |
459 | int exception_state) | ||
474 | { | 460 | { |
475 | unsigned long flags; | 461 | unsigned long flags; |
476 | int sstep_tries = 100; | 462 | int sstep_tries = 100; |
477 | int error; | 463 | int error; |
478 | int i, cpu; | 464 | int cpu; |
479 | int trace_on = 0; | 465 | int trace_on = 0; |
466 | int online_cpus = num_online_cpus(); | ||
467 | |||
468 | kgdb_info[ks->cpu].enter_kgdb++; | ||
469 | kgdb_info[ks->cpu].exception_state |= exception_state; | ||
470 | |||
471 | if (exception_state == DCPU_WANT_MASTER) | ||
472 | atomic_inc(&masters_in_kgdb); | ||
473 | else | ||
474 | atomic_inc(&slaves_in_kgdb); | ||
475 | |||
476 | if (arch_kgdb_ops.disable_hw_break) | ||
477 | arch_kgdb_ops.disable_hw_break(regs); | ||
478 | |||
480 | acquirelock: | 479 | acquirelock: |
481 | /* | 480 | /* |
482 | * Interrupts will be restored by the 'trap return' code, except when | 481 | * Interrupts will be restored by the 'trap return' code, except when |
@@ -489,14 +488,15 @@ acquirelock: | |||
489 | kgdb_info[cpu].task = current; | 488 | kgdb_info[cpu].task = current; |
490 | kgdb_info[cpu].ret_state = 0; | 489 | kgdb_info[cpu].ret_state = 0; |
491 | kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; | 490 | kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; |
492 | /* | ||
493 | * Make sure the above info reaches the primary CPU before | ||
494 | * our cpu_in_kgdb[] flag setting does: | ||
495 | */ | ||
496 | atomic_inc(&cpu_in_kgdb[cpu]); | ||
497 | 491 | ||
498 | if (exception_level == 1) | 492 | /* Make sure the above info reaches the primary CPU */ |
493 | smp_mb(); | ||
494 | |||
495 | if (exception_level == 1) { | ||
496 | if (raw_spin_trylock(&dbg_master_lock)) | ||
497 | atomic_xchg(&kgdb_active, cpu); | ||
499 | goto cpu_master_loop; | 498 | goto cpu_master_loop; |
499 | } | ||
500 | 500 | ||
501 | /* | 501 | /* |
502 | * CPU will loop if it is a slave or request to become a kgdb | 502 | * CPU will loop if it is a slave or request to become a kgdb |
@@ -508,10 +508,12 @@ cpu_loop: | |||
508 | kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; | 508 | kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; |
509 | goto cpu_master_loop; | 509 | goto cpu_master_loop; |
510 | } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { | 510 | } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { |
511 | if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) | 511 | if (raw_spin_trylock(&dbg_master_lock)) { |
512 | atomic_xchg(&kgdb_active, cpu); | ||
512 | break; | 513 | break; |
514 | } | ||
513 | } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { | 515 | } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { |
514 | if (!atomic_read(&passive_cpu_wait[cpu])) | 516 | if (!raw_spin_is_locked(&dbg_slave_lock)) |
515 | goto return_normal; | 517 | goto return_normal; |
516 | } else { | 518 | } else { |
517 | return_normal: | 519 | return_normal: |
@@ -522,9 +524,12 @@ return_normal: | |||
522 | arch_kgdb_ops.correct_hw_break(); | 524 | arch_kgdb_ops.correct_hw_break(); |
523 | if (trace_on) | 525 | if (trace_on) |
524 | tracing_on(); | 526 | tracing_on(); |
525 | atomic_dec(&cpu_in_kgdb[cpu]); | 527 | kgdb_info[cpu].exception_state &= |
526 | touch_softlockup_watchdog_sync(); | 528 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); |
527 | clocksource_touch_watchdog(); | 529 | kgdb_info[cpu].enter_kgdb--; |
530 | smp_mb__before_atomic_dec(); | ||
531 | atomic_dec(&slaves_in_kgdb); | ||
532 | dbg_touch_watchdogs(); | ||
528 | local_irq_restore(flags); | 533 | local_irq_restore(flags); |
529 | return 0; | 534 | return 0; |
530 | } | 535 | } |
@@ -533,7 +538,7 @@ return_normal: | |||
533 | 538 | ||
534 | /* | 539 | /* |
535 | * For single stepping, try to only enter on the processor | 540 | * For single stepping, try to only enter on the processor |
536 | * that was single stepping. To gaurd against a deadlock, the | 541 | * that was single stepping. To guard against a deadlock, the |
537 | * kernel will only try for the value of sstep_tries before | 542 | * kernel will only try for the value of sstep_tries before |
538 | * giving up and continuing on. | 543 | * giving up and continuing on. |
539 | */ | 544 | */ |
@@ -541,8 +546,8 @@ return_normal: | |||
541 | (kgdb_info[cpu].task && | 546 | (kgdb_info[cpu].task && |
542 | kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { | 547 | kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { |
543 | atomic_set(&kgdb_active, -1); | 548 | atomic_set(&kgdb_active, -1); |
544 | touch_softlockup_watchdog_sync(); | 549 | raw_spin_unlock(&dbg_master_lock); |
545 | clocksource_touch_watchdog(); | 550 | dbg_touch_watchdogs(); |
546 | local_irq_restore(flags); | 551 | local_irq_restore(flags); |
547 | 552 | ||
548 | goto acquirelock; | 553 | goto acquirelock; |
@@ -563,16 +568,12 @@ return_normal: | |||
563 | if (dbg_io_ops->pre_exception) | 568 | if (dbg_io_ops->pre_exception) |
564 | dbg_io_ops->pre_exception(); | 569 | dbg_io_ops->pre_exception(); |
565 | 570 | ||
566 | kgdb_disable_hw_debug(ks->linux_regs); | ||
567 | |||
568 | /* | 571 | /* |
569 | * Get the passive CPU lock which will hold all the non-primary | 572 | * Get the passive CPU lock which will hold all the non-primary |
570 | * CPU in a spin state while the debugger is active | 573 | * CPU in a spin state while the debugger is active |
571 | */ | 574 | */ |
572 | if (!kgdb_single_step) { | 575 | if (!kgdb_single_step) |
573 | for (i = 0; i < NR_CPUS; i++) | 576 | raw_spin_lock(&dbg_slave_lock); |
574 | atomic_inc(&passive_cpu_wait[i]); | ||
575 | } | ||
576 | 577 | ||
577 | #ifdef CONFIG_SMP | 578 | #ifdef CONFIG_SMP |
578 | /* Signal the other CPUs to enter kgdb_wait() */ | 579 | /* Signal the other CPUs to enter kgdb_wait() */ |
@@ -583,10 +584,9 @@ return_normal: | |||
583 | /* | 584 | /* |
584 | * Wait for the other CPUs to be notified and be waiting for us: | 585 | * Wait for the other CPUs to be notified and be waiting for us: |
585 | */ | 586 | */ |
586 | for_each_online_cpu(i) { | 587 | while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + |
587 | while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) | 588 | atomic_read(&slaves_in_kgdb)) != online_cpus) |
588 | cpu_relax(); | 589 | cpu_relax(); |
589 | } | ||
590 | 590 | ||
591 | /* | 591 | /* |
592 | * At this point the primary processor is completely | 592 | * At this point the primary processor is completely |
@@ -615,7 +615,8 @@ cpu_master_loop: | |||
615 | if (error == DBG_PASS_EVENT) { | 615 | if (error == DBG_PASS_EVENT) { |
616 | dbg_kdb_mode = !dbg_kdb_mode; | 616 | dbg_kdb_mode = !dbg_kdb_mode; |
617 | } else if (error == DBG_SWITCH_CPU_EVENT) { | 617 | } else if (error == DBG_SWITCH_CPU_EVENT) { |
618 | dbg_cpu_switch(cpu, dbg_switch_cpu); | 618 | kgdb_info[dbg_switch_cpu].exception_state |= |
619 | DCPU_NEXT_MASTER; | ||
619 | goto cpu_loop; | 620 | goto cpu_loop; |
620 | } else { | 621 | } else { |
621 | kgdb_info[cpu].ret_state = error; | 622 | kgdb_info[cpu].ret_state = error; |
@@ -627,24 +628,11 @@ cpu_master_loop: | |||
627 | if (dbg_io_ops->post_exception) | 628 | if (dbg_io_ops->post_exception) |
628 | dbg_io_ops->post_exception(); | 629 | dbg_io_ops->post_exception(); |
629 | 630 | ||
630 | atomic_dec(&cpu_in_kgdb[ks->cpu]); | ||
631 | |||
632 | if (!kgdb_single_step) { | 631 | if (!kgdb_single_step) { |
633 | for (i = NR_CPUS-1; i >= 0; i--) | 632 | raw_spin_unlock(&dbg_slave_lock); |
634 | atomic_dec(&passive_cpu_wait[i]); | 633 | /* Wait till all the CPUs have quit from the debugger. */ |
635 | /* | 634 | while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb)) |
636 | * Wait till all the CPUs have quit from the debugger, | 635 | cpu_relax(); |
637 | * but allow a CPU that hit an exception and is | ||
638 | * waiting to become the master to remain in the debug | ||
639 | * core. | ||
640 | */ | ||
641 | for_each_online_cpu(i) { | ||
642 | while (kgdb_do_roundup && | ||
643 | atomic_read(&cpu_in_kgdb[i]) && | ||
644 | !(kgdb_info[i].exception_state & | ||
645 | DCPU_WANT_MASTER)) | ||
646 | cpu_relax(); | ||
647 | } | ||
648 | } | 636 | } |
649 | 637 | ||
650 | kgdb_restore: | 638 | kgdb_restore: |
@@ -655,12 +643,20 @@ kgdb_restore: | |||
655 | else | 643 | else |
656 | kgdb_sstep_pid = 0; | 644 | kgdb_sstep_pid = 0; |
657 | } | 645 | } |
646 | if (arch_kgdb_ops.correct_hw_break) | ||
647 | arch_kgdb_ops.correct_hw_break(); | ||
658 | if (trace_on) | 648 | if (trace_on) |
659 | tracing_on(); | 649 | tracing_on(); |
650 | |||
651 | kgdb_info[cpu].exception_state &= | ||
652 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); | ||
653 | kgdb_info[cpu].enter_kgdb--; | ||
654 | smp_mb__before_atomic_dec(); | ||
655 | atomic_dec(&masters_in_kgdb); | ||
660 | /* Free kgdb_active */ | 656 | /* Free kgdb_active */ |
661 | atomic_set(&kgdb_active, -1); | 657 | atomic_set(&kgdb_active, -1); |
662 | touch_softlockup_watchdog_sync(); | 658 | raw_spin_unlock(&dbg_master_lock); |
663 | clocksource_touch_watchdog(); | 659 | dbg_touch_watchdogs(); |
664 | local_irq_restore(flags); | 660 | local_irq_restore(flags); |
665 | 661 | ||
666 | return kgdb_info[cpu].ret_state; | 662 | return kgdb_info[cpu].ret_state; |
@@ -678,7 +674,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
678 | { | 674 | { |
679 | struct kgdb_state kgdb_var; | 675 | struct kgdb_state kgdb_var; |
680 | struct kgdb_state *ks = &kgdb_var; | 676 | struct kgdb_state *ks = &kgdb_var; |
681 | int ret; | ||
682 | 677 | ||
683 | ks->cpu = raw_smp_processor_id(); | 678 | ks->cpu = raw_smp_processor_id(); |
684 | ks->ex_vector = evector; | 679 | ks->ex_vector = evector; |
@@ -689,11 +684,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
689 | 684 | ||
690 | if (kgdb_reenter_check(ks)) | 685 | if (kgdb_reenter_check(ks)) |
691 | return 0; /* Ouch, double exception ! */ | 686 | return 0; /* Ouch, double exception ! */ |
692 | kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; | 687 | if (kgdb_info[ks->cpu].enter_kgdb != 0) |
693 | ret = kgdb_cpu_enter(ks, regs); | 688 | return 0; |
694 | kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | | 689 | |
695 | DCPU_IS_SLAVE); | 690 | return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); |
696 | return ret; | ||
697 | } | 691 | } |
698 | 692 | ||
699 | int kgdb_nmicallback(int cpu, void *regs) | 693 | int kgdb_nmicallback(int cpu, void *regs) |
@@ -706,12 +700,9 @@ int kgdb_nmicallback(int cpu, void *regs) | |||
706 | ks->cpu = cpu; | 700 | ks->cpu = cpu; |
707 | ks->linux_regs = regs; | 701 | ks->linux_regs = regs; |
708 | 702 | ||
709 | if (!atomic_read(&cpu_in_kgdb[cpu]) && | 703 | if (kgdb_info[ks->cpu].enter_kgdb == 0 && |
710 | atomic_read(&kgdb_active) != -1 && | 704 | raw_spin_is_locked(&dbg_master_lock)) { |
711 | atomic_read(&kgdb_active) != cpu) { | 705 | kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE); |
712 | kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; | ||
713 | kgdb_cpu_enter(ks, regs); | ||
714 | kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; | ||
715 | return 0; | 706 | return 0; |
716 | } | 707 | } |
717 | #endif | 708 | #endif |
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index c5d753d80f67..3494c28a7e7a 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h | |||
@@ -40,6 +40,7 @@ struct debuggerinfo_struct { | |||
40 | int exception_state; | 40 | int exception_state; |
41 | int ret_state; | 41 | int ret_state; |
42 | int irq_depth; | 42 | int irq_depth; |
43 | int enter_kgdb; | ||
43 | }; | 44 | }; |
44 | 45 | ||
45 | extern struct debuggerinfo_struct kgdb_info[]; | 46 | extern struct debuggerinfo_struct kgdb_info[]; |
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 481a7bd2dfe7..a11db956dd62 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c | |||
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) | |||
1093 | put_packet(remcom_out_buffer); | 1093 | put_packet(remcom_out_buffer); |
1094 | return 0; | 1094 | return 0; |
1095 | } | 1095 | } |
1096 | |||
1097 | /** | ||
1098 | * gdbstub_exit - Send an exit message to GDB | ||
1099 | * @status: The exit code to report. | ||
1100 | */ | ||
1101 | void gdbstub_exit(int status) | ||
1102 | { | ||
1103 | unsigned char checksum, ch, buffer[3]; | ||
1104 | int loop; | ||
1105 | |||
1106 | buffer[0] = 'W'; | ||
1107 | buffer[1] = hex_asc_hi(status); | ||
1108 | buffer[2] = hex_asc_lo(status); | ||
1109 | |||
1110 | dbg_io_ops->write_char('$'); | ||
1111 | checksum = 0; | ||
1112 | |||
1113 | for (loop = 0; loop < 3; loop++) { | ||
1114 | ch = buffer[loop]; | ||
1115 | checksum += ch; | ||
1116 | dbg_io_ops->write_char(ch); | ||
1117 | } | ||
1118 | |||
1119 | dbg_io_ops->write_char('#'); | ||
1120 | dbg_io_ops->write_char(hex_asc_hi(checksum)); | ||
1121 | dbg_io_ops->write_char(hex_asc_lo(checksum)); | ||
1122 | |||
1123 | /* make sure the output is flushed, lest the bootloader clobber it */ | ||
1124 | dbg_io_ops->flush(); | ||
1125 | } | ||
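gdbstub_exit() above emits a standard GDB remote-serial-protocol packet: the payload is 'W' plus the two-digit hex exit status, framed as $...#ck where ck is the modulo-256 sum of the payload bytes in hex. A small userspace sketch (function name and values invented for illustration) reproduces the exact bytes; for status 0 the wire image is $W00#b7:

    #include <stdio.h>

    /* Mirror of the packet built above, using lowercase hex like the
     * kernel's hex_asc_hi()/hex_asc_lo() helpers. */
    static void gdb_exit_packet(unsigned char status)
    {
            const char hex[] = "0123456789abcdef";
            unsigned char buf[3] = { 'W', hex[status >> 4], hex[status & 0xf] };
            unsigned char csum = 0;
            int i;

            putchar('$');
            for (i = 0; i < 3; i++) {
                    csum += buf[i];         /* checksum covers payload only */
                    putchar(buf[i]);
            }
            printf("#%c%c\n", hex[csum >> 4], hex[csum & 0xf]);
    }

    int main(void)
    {
            gdb_exit_packet(0);     /* prints $W00#b7 */
            return 0;
    }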
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index bf6e8270e957..dd0b1b7dd02c 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c | |||
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks) | |||
86 | } | 86 | } |
87 | /* Set initial kdb state variables */ | 87 | /* Set initial kdb state variables */ |
88 | KDB_STATE_CLEAR(KGDB_TRANS); | 88 | KDB_STATE_CLEAR(KGDB_TRANS); |
89 | kdb_initial_cpu = ks->cpu; | 89 | kdb_initial_cpu = atomic_read(&kgdb_active); |
90 | kdb_current_task = kgdb_info[ks->cpu].task; | 90 | kdb_current_task = kgdb_info[ks->cpu].task; |
91 | kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; | 91 | kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; |
92 | /* Remove any breakpoints as needed by kdb and clear single step */ | 92 | /* Remove any breakpoints as needed by kdb and clear single step */ |
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks) | |||
105 | ks->pass_exception = 1; | 105 | ks->pass_exception = 1; |
106 | KDB_FLAG_SET(CATASTROPHIC); | 106 | KDB_FLAG_SET(CATASTROPHIC); |
107 | } | 107 | } |
108 | kdb_initial_cpu = ks->cpu; | ||
109 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { | 108 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { |
110 | KDB_STATE_CLEAR(SSBPT); | 109 | KDB_STATE_CLEAR(SSBPT); |
111 | KDB_STATE_CLEAR(DOING_SS); | 110 | KDB_STATE_CLEAR(DOING_SS); |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index c9b7f4f90bba..96fdaac46a80 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...) | |||
823 | 823 | ||
824 | return r; | 824 | return r; |
825 | } | 825 | } |
826 | 826 | EXPORT_SYMBOL_GPL(kdb_printf); | |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index caf057a3de0e..be14779bcef6 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -78,11 +78,11 @@ static unsigned int kdb_continue_catastrophic; | |||
78 | static kdbtab_t *kdb_commands; | 78 | static kdbtab_t *kdb_commands; |
79 | #define KDB_BASE_CMD_MAX 50 | 79 | #define KDB_BASE_CMD_MAX 50 |
80 | static int kdb_max_commands = KDB_BASE_CMD_MAX; | 80 | static int kdb_max_commands = KDB_BASE_CMD_MAX; |
81 | static kdbtab_t kdb_base_commands[50]; | 81 | static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; |
82 | #define for_each_kdbcmd(cmd, num) \ | 82 | #define for_each_kdbcmd(cmd, num) \ |
83 | for ((cmd) = kdb_base_commands, (num) = 0; \ | 83 | for ((cmd) = kdb_base_commands, (num) = 0; \ |
84 | num < kdb_max_commands; \ | 84 | num < kdb_max_commands; \ |
85 | num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) | 85 | num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++) |
86 | 86 | ||
87 | typedef struct _kdbmsg { | 87 | typedef struct _kdbmsg { |
88 | int km_diag; /* kdb diagnostic */ | 88 | int km_diag; /* kdb diagnostic */ |
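The for_each_kdbcmd() change above fixes the order of the increment expression: the old form advanced cmd before testing num against KDB_BASE_CMD_MAX, so the walk read one slot past the static kdb_base_commands[] table and never reached the last dynamically registered command. A small userspace model (array sizes shrunk to 4 + 3 purely for illustration) shows the corrected iteration:

    #include <stdio.h>

    #define BASE_MAX 4      /* stand-in for KDB_BASE_CMD_MAX */

    int main(void)
    {
            int base[BASE_MAX] = { 0, 1, 2, 3 };    /* static table */
            int extra[3] = { 100, 101, 102 };       /* kmalloc'd table */
            int max_commands = BASE_MAX + 3;        /* kdb_max_commands */
            int *cmd, num;

            /* fixed order: bump num first, then decide where cmd points */
            for (cmd = base, num = 0; num < max_commands;
                 num++, num == BASE_MAX ? cmd = extra : cmd++)
                    printf("num=%d -> %d\n", num, *cmd);

            /* prints 0 1 2 3 100 101 102; the old order would have read
             * base[4] (out of bounds) and never visited extra[2] */
            return 0;
    }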
@@ -441,9 +441,9 @@ static int kdb_check_regs(void) | |||
441 | * symbol name, and offset to the caller. | 441 | * symbol name, and offset to the caller. |
442 | * | 442 | * |
443 | * The argument may consist of a numeric value (decimal or | 443 | * The argument may consist of a numeric value (decimal or |
444 | * hexidecimal), a symbol name, a register name (preceeded by the | 444 | * hexidecimal), a symbol name, a register name (preceded by the |
445 | * percent sign), an environment variable with a numeric value | 445 | * percent sign), an environment variable with a numeric value |
446 | * (preceeded by a dollar sign) or a simple arithmetic expression | 446 | * (preceded by a dollar sign) or a simple arithmetic expression |
447 | * consisting of a symbol name, +/-, and a numeric constant value | 447 | * consisting of a symbol name, +/-, and a numeric constant value |
448 | * (offset). | 448 | * (offset). |
449 | * Parameters: | 449 | * Parameters: |
@@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) | |||
646 | } | 646 | } |
647 | if (!s->usable) | 647 | if (!s->usable) |
648 | return KDB_NOTIMP; | 648 | return KDB_NOTIMP; |
649 | s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); | 649 | s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); |
650 | if (!s->command) { | 650 | if (!s->command) { |
651 | kdb_printf("Could not allocate new kdb_defcmd table for %s\n", | 651 | kdb_printf("Could not allocate new kdb_defcmd table for %s\n", |
652 | cmdstr); | 652 | cmdstr); |
@@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, | |||
1127 | /* special case below */ | 1127 | /* special case below */ |
1128 | } else { | 1128 | } else { |
1129 | kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", | 1129 | kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", |
1130 | kdb_current, kdb_current->pid); | 1130 | kdb_current, kdb_current ? kdb_current->pid : 0); |
1131 | #if defined(CONFIG_SMP) | 1131 | #if defined(CONFIG_SMP) |
1132 | kdb_printf("on processor %d ", raw_smp_processor_id()); | 1132 | kdb_printf("on processor %d ", raw_smp_processor_id()); |
1133 | #endif | 1133 | #endif |
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value) | |||
1335 | * error The hardware-defined error code | 1335 | * error The hardware-defined error code |
1336 | * reason2 kdb's current reason code. | 1336 | * reason2 kdb's current reason code. |
1337 | * Initially error but can change | 1337 | * Initially error but can change |
1338 | * acording to kdb state. | 1338 | * according to kdb state. |
1339 | * db_result Result code from break or debug point. | 1339 | * db_result Result code from break or debug point. |
1340 | * regs The exception frame at time of fault/breakpoint. | 1340 | * regs The exception frame at time of fault/breakpoint. |
1341 | * should always be valid. | 1341 | * should always be valid. |
@@ -1749,13 +1749,13 @@ static int kdb_go(int argc, const char **argv) | |||
1749 | int nextarg; | 1749 | int nextarg; |
1750 | long offset; | 1750 | long offset; |
1751 | 1751 | ||
1752 | if (raw_smp_processor_id() != kdb_initial_cpu) { | ||
1753 | kdb_printf("go must execute on the entry cpu, " | ||
1754 | "please use \"cpu %d\" and then execute go\n", | ||
1755 | kdb_initial_cpu); | ||
1756 | return KDB_BADCPUNUM; | ||
1757 | } | ||
1752 | if (argc == 1) { | 1758 | if (argc == 1) { |
1753 | if (raw_smp_processor_id() != kdb_initial_cpu) { | ||
1754 | kdb_printf("go <address> must be issued from the " | ||
1755 | "initial cpu, do cpu %d first\n", | ||
1756 | kdb_initial_cpu); | ||
1757 | return KDB_ARGCOUNT; | ||
1758 | } | ||
1759 | nextarg = 1; | 1759 | nextarg = 1; |
1760 | diag = kdbgetaddrarg(argc, argv, &nextarg, | 1760 | diag = kdbgetaddrarg(argc, argv, &nextarg, |
1761 | &addr, &offset, NULL); | 1761 | &addr, &offset, NULL); |
@@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv) | |||
2361 | */ | 2361 | */ |
2362 | static int kdb_ll(int argc, const char **argv) | 2362 | static int kdb_ll(int argc, const char **argv) |
2363 | { | 2363 | { |
2364 | int diag; | 2364 | int diag = 0; |
2365 | unsigned long addr; | 2365 | unsigned long addr; |
2366 | long offset = 0; | 2366 | long offset = 0; |
2367 | unsigned long va; | 2367 | unsigned long va; |
@@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv) | |||
2400 | char buf[80]; | 2400 | char buf[80]; |
2401 | 2401 | ||
2402 | if (KDB_FLAG(CMD_INTERRUPT)) | 2402 | if (KDB_FLAG(CMD_INTERRUPT)) |
2403 | return 0; | 2403 | goto out; |
2404 | 2404 | ||
2405 | sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); | 2405 | sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); |
2406 | diag = kdb_parse(buf); | 2406 | diag = kdb_parse(buf); |
2407 | if (diag) | 2407 | if (diag) |
2408 | return diag; | 2408 | goto out; |
2409 | 2409 | ||
2410 | addr = va + linkoffset; | 2410 | addr = va + linkoffset; |
2411 | if (kdb_getword(&va, addr, sizeof(va))) | 2411 | if (kdb_getword(&va, addr, sizeof(va))) |
2412 | return 0; | 2412 | goto out; |
2413 | } | 2413 | } |
2414 | kfree(command); | ||
2415 | 2414 | ||
2416 | return 0; | 2415 | out: |
2416 | kfree(command); | ||
2417 | return diag; | ||
2417 | } | 2418 | } |
2418 | 2419 | ||
2419 | static int kdb_kgdb(int argc, const char **argv) | 2420 | static int kdb_kgdb(int argc, const char **argv) |
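The kdb_ll() hunk just above plugs a leak: the heap-allocated command string was freed only on the normal exit path, so the early returns inside the loop leaked it. The rewrite routes every exit through a single out: label that frees the string and returns the last diagnostic. A minimal userspace illustration of the same single-exit pattern (names invented for the example):

    #include <stdlib.h>
    #include <string.h>

    int walk_list(const char *arg)
    {
            int diag = 0;
            char *command = strdup(arg);    /* stands in for kdb_ll()'s copy */

            if (!command)
                    return -1;

            while (*arg) {
                    if (*arg == '!') {      /* stand-in for a failing step */
                            diag = -2;
                            goto out;       /* old code returned here, leaking */
                    }
                    arg++;
            }
    out:
            free(command);                  /* single exit: always released */
            return diag;
    }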
@@ -2603,20 +2604,17 @@ static int kdb_summary(int argc, const char **argv) | |||
2603 | */ | 2604 | */ |
2604 | static int kdb_per_cpu(int argc, const char **argv) | 2605 | static int kdb_per_cpu(int argc, const char **argv) |
2605 | { | 2606 | { |
2606 | char buf[256], fmtstr[64]; | 2607 | char fmtstr[64]; |
2607 | kdb_symtab_t symtab; | 2608 | int cpu, diag, nextarg = 1; |
2608 | cpumask_t suppress = CPU_MASK_NONE; | 2609 | unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL; |
2609 | int cpu, diag; | ||
2610 | unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL; | ||
2611 | 2610 | ||
2612 | if (argc < 1 || argc > 3) | 2611 | if (argc < 1 || argc > 3) |
2613 | return KDB_ARGCOUNT; | 2612 | return KDB_ARGCOUNT; |
2614 | 2613 | ||
2615 | snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); | 2614 | diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL); |
2616 | if (!kdbgetsymval(buf, &symtab)) { | 2615 | if (diag) |
2617 | kdb_printf("%s is not a per_cpu variable\n", argv[1]); | 2616 | return diag; |
2618 | return KDB_BADADDR; | 2617 | |
2619 | } | ||
2620 | if (argc >= 2) { | 2618 | if (argc >= 2) { |
2621 | diag = kdbgetularg(argv[2], &bytesperword); | 2619 | diag = kdbgetularg(argv[2], &bytesperword); |
2622 | if (diag) | 2620 | if (diag) |
@@ -2649,46 +2647,25 @@ static int kdb_per_cpu(int argc, const char **argv) | |||
2649 | #define KDB_PCU(cpu) 0 | 2647 | #define KDB_PCU(cpu) 0 |
2650 | #endif | 2648 | #endif |
2651 | #endif | 2649 | #endif |
2652 | |||
2653 | for_each_online_cpu(cpu) { | 2650 | for_each_online_cpu(cpu) { |
2651 | if (KDB_FLAG(CMD_INTERRUPT)) | ||
2652 | return 0; | ||
2653 | |||
2654 | if (whichcpu != ~0UL && whichcpu != cpu) | 2654 | if (whichcpu != ~0UL && whichcpu != cpu) |
2655 | continue; | 2655 | continue; |
2656 | addr = symtab.sym_start + KDB_PCU(cpu); | 2656 | addr = symaddr + KDB_PCU(cpu); |
2657 | diag = kdb_getword(&val, addr, bytesperword); | 2657 | diag = kdb_getword(&val, addr, bytesperword); |
2658 | if (diag) { | 2658 | if (diag) { |
2659 | kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " | 2659 | kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " |
2660 | "read, diag=%d\n", cpu, addr, diag); | 2660 | "read, diag=%d\n", cpu, addr, diag); |
2661 | continue; | 2661 | continue; |
2662 | } | 2662 | } |
2663 | #ifdef CONFIG_SMP | ||
2664 | if (!val) { | ||
2665 | cpu_set(cpu, suppress); | ||
2666 | continue; | ||
2667 | } | ||
2668 | #endif /* CONFIG_SMP */ | ||
2669 | kdb_printf("%5d ", cpu); | 2663 | kdb_printf("%5d ", cpu); |
2670 | kdb_md_line(fmtstr, addr, | 2664 | kdb_md_line(fmtstr, addr, |
2671 | bytesperword == KDB_WORD_SIZE, | 2665 | bytesperword == KDB_WORD_SIZE, |
2672 | 1, bytesperword, 1, 1, 0); | 2666 | 1, bytesperword, 1, 1, 0); |
2673 | } | 2667 | } |
2674 | if (cpus_weight(suppress) == 0) | ||
2675 | return 0; | ||
2676 | kdb_printf("Zero suppressed cpu(s):"); | ||
2677 | for (cpu = first_cpu(suppress); cpu < num_possible_cpus(); | ||
2678 | cpu = next_cpu(cpu, suppress)) { | ||
2679 | kdb_printf(" %d", cpu); | ||
2680 | if (cpu == num_possible_cpus() - 1 || | ||
2681 | next_cpu(cpu, suppress) != cpu + 1) | ||
2682 | continue; | ||
2683 | while (cpu < num_possible_cpus() && | ||
2684 | next_cpu(cpu, suppress) == cpu + 1) | ||
2685 | ++cpu; | ||
2686 | kdb_printf("-%d", cpu); | ||
2687 | } | ||
2688 | kdb_printf("\n"); | ||
2689 | |||
2690 | #undef KDB_PCU | 2668 | #undef KDB_PCU |
2691 | |||
2692 | return 0; | 2669 | return 0; |
2693 | } | 2670 | } |
2694 | 2671 | ||
@@ -2763,13 +2740,13 @@ int kdb_register_repeat(char *cmd, | |||
2763 | } | 2740 | } |
2764 | if (kdb_commands) { | 2741 | if (kdb_commands) { |
2765 | memcpy(new, kdb_commands, | 2742 | memcpy(new, kdb_commands, |
2766 | kdb_max_commands * sizeof(*new)); | 2743 | (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); |
2767 | kfree(kdb_commands); | 2744 | kfree(kdb_commands); |
2768 | } | 2745 | } |
2769 | memset(new + kdb_max_commands, 0, | 2746 | memset(new + kdb_max_commands, 0, |
2770 | kdb_command_extend * sizeof(*new)); | 2747 | kdb_command_extend * sizeof(*new)); |
2771 | kdb_commands = new; | 2748 | kdb_commands = new; |
2772 | kp = kdb_commands + kdb_max_commands; | 2749 | kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; |
2773 | kdb_max_commands += kdb_command_extend; | 2750 | kdb_max_commands += kdb_command_extend; |
2774 | } | 2751 | } |
2775 | 2752 | ||
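The arithmetic fix above reflects the fact that kdb_commands holds only the entries beyond the static kdb_base_commands[] table, while kdb_max_commands counts both. A worked example with the values shown (KDB_BASE_CMD_MAX = 50): suppose kdb_max_commands has already grown to 100.

    /*
     * kdb_commands currently holds kdb_max_commands - KDB_BASE_CMD_MAX
     *   = 100 - 50 = 50 dynamic entries.
     *
     * old code on the next growth:
     *   memcpy(new, kdb_commands, 100 * sizeof(*new))  -> reads 50 entries
     *                                                     past the buffer
     *   kp = kdb_commands + 100                        -> new command lands
     *                                                     50 slots too far
     * fixed code:
     *   memcpy(new, kdb_commands, (100 - 50) * sizeof(*new))
     *   kp = kdb_commands + 100 - 50                   -> first free slot
     */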
@@ -2783,6 +2760,8 @@ int kdb_register_repeat(char *cmd, | |||
2783 | 2760 | ||
2784 | return 0; | 2761 | return 0; |
2785 | } | 2762 | } |
2763 | EXPORT_SYMBOL_GPL(kdb_register_repeat); | ||
2764 | |||
2786 | 2765 | ||
2787 | /* | 2766 | /* |
2788 | * kdb_register - Compatibility register function for commands that do | 2767 | * kdb_register - Compatibility register function for commands that do |
@@ -2805,6 +2784,7 @@ int kdb_register(char *cmd, | |||
2805 | return kdb_register_repeat(cmd, func, usage, help, minlen, | 2784 | return kdb_register_repeat(cmd, func, usage, help, minlen, |
2806 | KDB_REPEAT_NONE); | 2785 | KDB_REPEAT_NONE); |
2807 | } | 2786 | } |
2787 | EXPORT_SYMBOL_GPL(kdb_register); | ||
2808 | 2788 | ||
2809 | /* | 2789 | /* |
2810 | * kdb_unregister - This function is used to unregister a kernel | 2790 | * kdb_unregister - This function is used to unregister a kernel |
@@ -2823,7 +2803,7 @@ int kdb_unregister(char *cmd) | |||
2823 | /* | 2803 | /* |
2824 | * find the command. | 2804 | * find the command. |
2825 | */ | 2805 | */ |
2826 | for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { | 2806 | for_each_kdbcmd(kp, i) { |
2827 | if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { | 2807 | if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { |
2828 | kp->cmd_name = NULL; | 2808 | kp->cmd_name = NULL; |
2829 | return 0; | 2809 | return 0; |
@@ -2833,6 +2813,7 @@ int kdb_unregister(char *cmd) | |||
2833 | /* Couldn't find it. */ | 2813 | /* Couldn't find it. */ |
2834 | return 1; | 2814 | return 1; |
2835 | } | 2815 | } |
2816 | EXPORT_SYMBOL_GPL(kdb_unregister); | ||
2836 | 2817 | ||
2837 | /* Initialize the kdb command table. */ | 2818 | /* Initialize the kdb command table. */ |
2838 | static void __init kdb_inittab(void) | 2819 | static void __init kdb_inittab(void) |
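With kdb_printf(), kdb_register(), kdb_register_repeat() and kdb_unregister() now exported GPL (and their declarations leaving kdb_private.h further down), loadable modules can add their own kdb commands. A sketch of what that enables, with made-up module and command names, assuming CONFIG_KGDB_KDB=y and the now-public declarations in <linux/kdb.h>:

    #include <linux/module.h>
    #include <linux/kdb.h>

    static int kdb_hello(int argc, const char **argv)
    {
            kdb_printf("hello from a module, argc=%d\n", argc);
            return 0;
    }

    static int __init kdbhello_init(void)
    {
            /* cmd, handler, usage string, help text, minimum match length */
            if (kdb_register("hello", kdb_hello, "", "Print a greeting", 0))
                    return -EINVAL;
            return 0;
    }

    static void __exit kdbhello_exit(void)
    {
            kdb_unregister("hello");
    }

    module_init(kdbhello_init);
    module_exit(kdbhello_exit);
    MODULE_LICENSE("GPL");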
@@ -2911,7 +2892,7 @@ static void __init kdb_inittab(void) | |||
2911 | "Send a signal to a process", 0, KDB_REPEAT_NONE); | 2892 | "Send a signal to a process", 0, KDB_REPEAT_NONE); |
2912 | kdb_register_repeat("summary", kdb_summary, "", | 2893 | kdb_register_repeat("summary", kdb_summary, "", |
2913 | "Summarize the system", 4, KDB_REPEAT_NONE); | 2894 | "Summarize the system", 4, KDB_REPEAT_NONE); |
2914 | kdb_register_repeat("per_cpu", kdb_per_cpu, "", | 2895 | kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", |
2915 | "Display per_cpu variables", 3, KDB_REPEAT_NONE); | 2896 | "Display per_cpu variables", 3, KDB_REPEAT_NONE); |
2916 | kdb_register_repeat("grephelp", kdb_grep_help, "", | 2897 | kdb_register_repeat("grephelp", kdb_grep_help, "", |
2917 | "Display help on | grep", 0, KDB_REPEAT_NONE); | 2898 | "Display help on | grep", 0, KDB_REPEAT_NONE); |
@@ -2933,7 +2914,7 @@ static void __init kdb_cmd_init(void) | |||
2933 | } | 2914 | } |
2934 | } | 2915 | } |
2935 | 2916 | ||
2936 | /* Intialize kdb_printf, breakpoint tables and kdb state */ | 2917 | /* Initialize kdb_printf, breakpoint tables and kdb state */ |
2937 | void __init kdb_init(int lvl) | 2918 | void __init kdb_init(int lvl) |
2938 | { | 2919 | { |
2939 | static int kdb_init_lvl = KDB_NOT_INITIALIZED; | 2920 | static int kdb_init_lvl = KDB_NOT_INITIALIZED; |
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index be775f7e81e0..35d69ed1dfb5 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
@@ -15,29 +15,6 @@ | |||
15 | #include <linux/kgdb.h> | 15 | #include <linux/kgdb.h> |
16 | #include "../debug_core.h" | 16 | #include "../debug_core.h" |
17 | 17 | ||
18 | /* Kernel Debugger Error codes. Must not overlap with command codes. */ | ||
19 | #define KDB_NOTFOUND (-1) | ||
20 | #define KDB_ARGCOUNT (-2) | ||
21 | #define KDB_BADWIDTH (-3) | ||
22 | #define KDB_BADRADIX (-4) | ||
23 | #define KDB_NOTENV (-5) | ||
24 | #define KDB_NOENVVALUE (-6) | ||
25 | #define KDB_NOTIMP (-7) | ||
26 | #define KDB_ENVFULL (-8) | ||
27 | #define KDB_ENVBUFFULL (-9) | ||
28 | #define KDB_TOOMANYBPT (-10) | ||
29 | #define KDB_TOOMANYDBREGS (-11) | ||
30 | #define KDB_DUPBPT (-12) | ||
31 | #define KDB_BPTNOTFOUND (-13) | ||
32 | #define KDB_BADMODE (-14) | ||
33 | #define KDB_BADINT (-15) | ||
34 | #define KDB_INVADDRFMT (-16) | ||
35 | #define KDB_BADREG (-17) | ||
36 | #define KDB_BADCPUNUM (-18) | ||
37 | #define KDB_BADLENGTH (-19) | ||
38 | #define KDB_NOBP (-20) | ||
39 | #define KDB_BADADDR (-21) | ||
40 | |||
41 | /* Kernel Debugger Command codes. Must not overlap with error codes. */ | 18 | /* Kernel Debugger Command codes. Must not overlap with error codes. */ |
42 | #define KDB_CMD_GO (-1001) | 19 | #define KDB_CMD_GO (-1001) |
43 | #define KDB_CMD_CPU (-1002) | 20 | #define KDB_CMD_CPU (-1002) |
@@ -93,17 +70,6 @@ | |||
93 | */ | 70 | */ |
94 | #define KDB_MAXBPT 16 | 71 | #define KDB_MAXBPT 16 |
95 | 72 | ||
96 | /* Maximum number of arguments to a function */ | ||
97 | #define KDB_MAXARGS 16 | ||
98 | |||
99 | typedef enum { | ||
100 | KDB_REPEAT_NONE = 0, /* Do not repeat this command */ | ||
101 | KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */ | ||
102 | KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */ | ||
103 | } kdb_repeat_t; | ||
104 | |||
105 | typedef int (*kdb_func_t)(int, const char **); | ||
106 | |||
107 | /* Symbol table format returned by kallsyms. */ | 73 | /* Symbol table format returned by kallsyms. */ |
108 | typedef struct __ksymtab { | 74 | typedef struct __ksymtab { |
109 | unsigned long value; /* Address of symbol */ | 75 | unsigned long value; /* Address of symbol */ |
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag); | |||
123 | extern int kallsyms_symbol_complete(char *prefix_name, int max_len); | 89 | extern int kallsyms_symbol_complete(char *prefix_name, int max_len); |
124 | 90 | ||
125 | /* Exported Symbols for kernel loadable modules to use. */ | 91 | /* Exported Symbols for kernel loadable modules to use. */ |
126 | extern int kdb_register(char *, kdb_func_t, char *, char *, short); | ||
127 | extern int kdb_register_repeat(char *, kdb_func_t, char *, char *, | ||
128 | short, kdb_repeat_t); | ||
129 | extern int kdb_unregister(char *); | ||
130 | |||
131 | extern int kdb_getarea_size(void *, unsigned long, size_t); | 92 | extern int kdb_getarea_size(void *, unsigned long, size_t); |
132 | extern int kdb_putarea_size(unsigned long, void *, size_t); | 93 | extern int kdb_putarea_size(unsigned long, void *, size_t); |
133 | 94 | ||
@@ -144,6 +105,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); | |||
144 | extern int kdb_putword(unsigned long, unsigned long, size_t); | 105 | extern int kdb_putword(unsigned long, unsigned long, size_t); |
145 | 106 | ||
146 | extern int kdbgetularg(const char *, unsigned long *); | 107 | extern int kdbgetularg(const char *, unsigned long *); |
108 | extern int kdbgetu64arg(const char *, u64 *); | ||
147 | extern char *kdbgetenv(const char *); | 109 | extern char *kdbgetenv(const char *); |
148 | extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, | 110 | extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, |
149 | long *, char **); | 111 | long *, char **); |
@@ -255,14 +217,6 @@ extern void kdb_ps1(const struct task_struct *p); | |||
255 | extern void kdb_print_nameval(const char *name, unsigned long val); | 217 | extern void kdb_print_nameval(const char *name, unsigned long val); |
256 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); | 218 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); |
257 | extern void kdb_meminfo_proc_show(void); | 219 | extern void kdb_meminfo_proc_show(void); |
258 | #ifdef CONFIG_KALLSYMS | ||
259 | extern const char *kdb_walk_kallsyms(loff_t *pos); | ||
260 | #else /* ! CONFIG_KALLSYMS */ | ||
261 | static inline const char *kdb_walk_kallsyms(loff_t *pos) | ||
262 | { | ||
263 | return NULL; | ||
264 | } | ||
265 | #endif /* ! CONFIG_KALLSYMS */ | ||
266 | extern char *kdb_getstr(char *, size_t, char *); | 220 | extern char *kdb_getstr(char *, size_t, char *); |
267 | 221 | ||
268 | /* Defines for kdb_symbol_print */ | 222 | /* Defines for kdb_symbol_print */ |
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 6b2485dcb050..5532dd37aa86 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c | |||
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) | |||
545 | * Mask for process state. | 545 | * Mask for process state. |
546 | * Notes: | 546 | * Notes: |
547 | * The mask folds data from several sources into a single long value, so | 547 | * The mask folds data from several sources into a single long value, so |
548 | * be carefull not to overlap the bits. TASK_* bits are in the LSB, | 548 | * be careful not to overlap the bits. TASK_* bits are in the LSB, |
549 | * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there | 549 | * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there |
550 | * is no overlap between TASK_* and EXIT_* but that may not always be | 550 | * is no overlap between TASK_* and EXIT_* but that may not always be |
551 | * true, so EXIT_* bits are shifted left 16 bits before being stored in | 551 | * true, so EXIT_* bits are shifted left 16 bits before being stored in |
diff --git a/kernel/early_res.c b/kernel/early_res.c deleted file mode 100644 index 7bfae887f211..000000000000 --- a/kernel/early_res.c +++ /dev/null | |||
@@ -1,590 +0,0 @@ | |||
1 | /* | ||
2 | * early_res, could be used to replace bootmem | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/bootmem.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/early_res.h> | ||
10 | #include <linux/slab.h> | ||
11 | #include <linux/kmemleak.h> | ||
12 | |||
13 | /* | ||
14 | * Early reserved memory areas. | ||
15 | */ | ||
16 | /* | ||
17 | * need to make sure this one is bigger enough before | ||
18 | * find_fw_memmap_area could be used | ||
19 | */ | ||
20 | #define MAX_EARLY_RES_X 32 | ||
21 | |||
22 | struct early_res { | ||
23 | u64 start, end; | ||
24 | char name[15]; | ||
25 | char overlap_ok; | ||
26 | }; | ||
27 | static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; | ||
28 | |||
29 | static int max_early_res __initdata = MAX_EARLY_RES_X; | ||
30 | static struct early_res *early_res __initdata = &early_res_x[0]; | ||
31 | static int early_res_count __initdata; | ||
32 | |||
33 | static int __init find_overlapped_early(u64 start, u64 end) | ||
34 | { | ||
35 | int i; | ||
36 | struct early_res *r; | ||
37 | |||
38 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
39 | r = &early_res[i]; | ||
40 | if (end > r->start && start < r->end) | ||
41 | break; | ||
42 | } | ||
43 | |||
44 | return i; | ||
45 | } | ||
46 | |||
47 | /* | ||
48 | * Drop the i-th range from the early reservation map, | ||
49 | * by copying any higher ranges down one over it, and | ||
50 | * clearing what had been the last slot. | ||
51 | */ | ||
52 | static void __init drop_range(int i) | ||
53 | { | ||
54 | int j; | ||
55 | |||
56 | for (j = i + 1; j < max_early_res && early_res[j].end; j++) | ||
57 | ; | ||
58 | |||
59 | memmove(&early_res[i], &early_res[i + 1], | ||
60 | (j - 1 - i) * sizeof(struct early_res)); | ||
61 | |||
62 | early_res[j - 1].end = 0; | ||
63 | early_res_count--; | ||
64 | } | ||
65 | |||
66 | static void __init drop_range_partial(int i, u64 start, u64 end) | ||
67 | { | ||
68 | u64 common_start, common_end; | ||
69 | u64 old_start, old_end; | ||
70 | |||
71 | old_start = early_res[i].start; | ||
72 | old_end = early_res[i].end; | ||
73 | common_start = max(old_start, start); | ||
74 | common_end = min(old_end, end); | ||
75 | |||
76 | /* no overlap ? */ | ||
77 | if (common_start >= common_end) | ||
78 | return; | ||
79 | |||
80 | if (old_start < common_start) { | ||
81 | /* make head segment */ | ||
82 | early_res[i].end = common_start; | ||
83 | if (old_end > common_end) { | ||
84 | char name[15]; | ||
85 | |||
86 | /* | ||
87 | * Save a local copy of the name, since the | ||
88 | * early_res array could get resized inside | ||
89 | * reserve_early_without_check() -> | ||
90 | * __check_and_double_early_res(), which would | ||
91 | * make the current name pointer invalid. | ||
92 | */ | ||
93 | strncpy(name, early_res[i].name, | ||
94 | sizeof(early_res[i].name) - 1); | ||
95 | /* add another for left over on tail */ | ||
96 | reserve_early_without_check(common_end, old_end, name); | ||
97 | } | ||
98 | return; | ||
99 | } else { | ||
100 | if (old_end > common_end) { | ||
101 | /* reuse the entry for tail left */ | ||
102 | early_res[i].start = common_end; | ||
103 | return; | ||
104 | } | ||
105 | /* all covered */ | ||
106 | drop_range(i); | ||
107 | } | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * Split any existing ranges that: | ||
112 | * 1) are marked 'overlap_ok', and | ||
113 | * 2) overlap with the stated range [start, end) | ||
114 | * into whatever portion (if any) of the existing range is entirely | ||
115 | * below or entirely above the stated range. Drop the portion | ||
116 | * of the existing range that overlaps with the stated range, | ||
117 | * which will allow the caller of this routine to then add that | ||
118 | * stated range without conflicting with any existing range. | ||
119 | */ | ||
120 | static void __init drop_overlaps_that_are_ok(u64 start, u64 end) | ||
121 | { | ||
122 | int i; | ||
123 | struct early_res *r; | ||
124 | u64 lower_start, lower_end; | ||
125 | u64 upper_start, upper_end; | ||
126 | char name[15]; | ||
127 | |||
128 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
129 | r = &early_res[i]; | ||
130 | |||
131 | /* Continue past non-overlapping ranges */ | ||
132 | if (end <= r->start || start >= r->end) | ||
133 | continue; | ||
134 | |||
135 | /* | ||
136 | * Leave non-ok overlaps as is; let caller | ||
137 | * panic "Overlapping early reservations" | ||
138 | * when it hits this overlap. | ||
139 | */ | ||
140 | if (!r->overlap_ok) | ||
141 | return; | ||
142 | |||
143 | /* | ||
144 | * We have an ok overlap. We will drop it from the early | ||
145 | * reservation map, and add back in any non-overlapping | ||
146 | * portions (lower or upper) as separate, overlap_ok, | ||
147 | * non-overlapping ranges. | ||
148 | */ | ||
149 | |||
150 | /* 1. Note any non-overlapping (lower or upper) ranges. */ | ||
151 | strncpy(name, r->name, sizeof(name) - 1); | ||
152 | |||
153 | lower_start = lower_end = 0; | ||
154 | upper_start = upper_end = 0; | ||
155 | if (r->start < start) { | ||
156 | lower_start = r->start; | ||
157 | lower_end = start; | ||
158 | } | ||
159 | if (r->end > end) { | ||
160 | upper_start = end; | ||
161 | upper_end = r->end; | ||
162 | } | ||
163 | |||
164 | /* 2. Drop the original ok overlapping range */ | ||
165 | drop_range(i); | ||
166 | |||
167 | i--; /* resume for-loop on copied down entry */ | ||
168 | |||
169 | /* 3. Add back in any non-overlapping ranges. */ | ||
170 | if (lower_end) | ||
171 | reserve_early_overlap_ok(lower_start, lower_end, name); | ||
172 | if (upper_end) | ||
173 | reserve_early_overlap_ok(upper_start, upper_end, name); | ||
174 | } | ||
175 | } | ||
176 | |||
177 | static void __init __reserve_early(u64 start, u64 end, char *name, | ||
178 | int overlap_ok) | ||
179 | { | ||
180 | int i; | ||
181 | struct early_res *r; | ||
182 | |||
183 | i = find_overlapped_early(start, end); | ||
184 | if (i >= max_early_res) | ||
185 | panic("Too many early reservations"); | ||
186 | r = &early_res[i]; | ||
187 | if (r->end) | ||
188 | panic("Overlapping early reservations " | ||
189 | "%llx-%llx %s to %llx-%llx %s\n", | ||
190 | start, end - 1, name ? name : "", r->start, | ||
191 | r->end - 1, r->name); | ||
192 | r->start = start; | ||
193 | r->end = end; | ||
194 | r->overlap_ok = overlap_ok; | ||
195 | if (name) | ||
196 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
197 | early_res_count++; | ||
198 | } | ||
199 | |||
200 | /* | ||
201 | * A few early reservtations come here. | ||
202 | * | ||
203 | * The 'overlap_ok' in the name of this routine does -not- mean it | ||
204 | * is ok for these reservations to overlap an earlier reservation. | ||
205 | * Rather it means that it is ok for subsequent reservations to | ||
206 | * overlap this one. | ||
207 | * | ||
208 | * Use this entry point to reserve early ranges when you are doing | ||
209 | * so out of "Paranoia", reserving perhaps more memory than you need, | ||
210 | * just in case, and don't mind a subsequent overlapping reservation | ||
211 | * that is known to be needed. | ||
212 | * | ||
213 | * The drop_overlaps_that_are_ok() call here isn't really needed. | ||
214 | * It would be needed if we had two colliding 'overlap_ok' | ||
215 | * reservations, so that the second such would not panic on the | ||
216 | * overlap with the first. We don't have any such as of this | ||
217 | * writing, but might as well tolerate such if it happens in | ||
218 | * the future. | ||
219 | */ | ||
220 | void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) | ||
221 | { | ||
222 | drop_overlaps_that_are_ok(start, end); | ||
223 | __reserve_early(start, end, name, 1); | ||
224 | } | ||
225 | |||
226 | static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) | ||
227 | { | ||
228 | u64 start, end, size, mem; | ||
229 | struct early_res *new; | ||
230 | |||
231 | /* do we have enough slots left ? */ | ||
232 | if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) | ||
233 | return; | ||
234 | |||
235 | /* double it */ | ||
236 | mem = -1ULL; | ||
237 | size = sizeof(struct early_res) * max_early_res * 2; | ||
238 | if (early_res == early_res_x) | ||
239 | start = 0; | ||
240 | else | ||
241 | start = early_res[0].end; | ||
242 | end = ex_start; | ||
243 | if (start + size < end) | ||
244 | mem = find_fw_memmap_area(start, end, size, | ||
245 | sizeof(struct early_res)); | ||
246 | if (mem == -1ULL) { | ||
247 | start = ex_end; | ||
248 | end = get_max_mapped(); | ||
249 | if (start + size < end) | ||
250 | mem = find_fw_memmap_area(start, end, size, | ||
251 | sizeof(struct early_res)); | ||
252 | } | ||
253 | if (mem == -1ULL) | ||
254 | panic("can not find more space for early_res array"); | ||
255 | |||
256 | new = __va(mem); | ||
257 | /* save the first one for own */ | ||
258 | new[0].start = mem; | ||
259 | new[0].end = mem + size; | ||
260 | new[0].overlap_ok = 0; | ||
261 | /* copy old to new */ | ||
262 | if (early_res == early_res_x) { | ||
263 | memcpy(&new[1], &early_res[0], | ||
264 | sizeof(struct early_res) * max_early_res); | ||
265 | memset(&new[max_early_res+1], 0, | ||
266 | sizeof(struct early_res) * (max_early_res - 1)); | ||
267 | early_res_count++; | ||
268 | } else { | ||
269 | memcpy(&new[1], &early_res[1], | ||
270 | sizeof(struct early_res) * (max_early_res - 1)); | ||
271 | memset(&new[max_early_res], 0, | ||
272 | sizeof(struct early_res) * max_early_res); | ||
273 | } | ||
274 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
275 | early_res = new; | ||
276 | max_early_res *= 2; | ||
277 | printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", | ||
278 | max_early_res, mem, mem + size - 1); | ||
279 | } | ||
280 | |||
281 | /* | ||
282 | * Most early reservations come here. | ||
283 | * | ||
284 | * We first have drop_overlaps_that_are_ok() drop any pre-existing | ||
285 | * 'overlap_ok' ranges, so that we can then reserve this memory | ||
286 | * range without risk of panic'ing on an overlapping overlap_ok | ||
287 | * early reservation. | ||
288 | */ | ||
289 | void __init reserve_early(u64 start, u64 end, char *name) | ||
290 | { | ||
291 | if (start >= end) | ||
292 | return; | ||
293 | |||
294 | __check_and_double_early_res(start, end); | ||
295 | |||
296 | drop_overlaps_that_are_ok(start, end); | ||
297 | __reserve_early(start, end, name, 0); | ||
298 | } | ||
299 | |||
300 | void __init reserve_early_without_check(u64 start, u64 end, char *name) | ||
301 | { | ||
302 | struct early_res *r; | ||
303 | |||
304 | if (start >= end) | ||
305 | return; | ||
306 | |||
307 | __check_and_double_early_res(start, end); | ||
308 | |||
309 | r = &early_res[early_res_count]; | ||
310 | |||
311 | r->start = start; | ||
312 | r->end = end; | ||
313 | r->overlap_ok = 0; | ||
314 | if (name) | ||
315 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
316 | early_res_count++; | ||
317 | } | ||
318 | |||
319 | void __init free_early(u64 start, u64 end) | ||
320 | { | ||
321 | struct early_res *r; | ||
322 | int i; | ||
323 | |||
324 | kmemleak_free_part(__va(start), end - start); | ||
325 | |||
326 | i = find_overlapped_early(start, end); | ||
327 | r = &early_res[i]; | ||
328 | if (i >= max_early_res || r->end != end || r->start != start) | ||
329 | panic("free_early on not reserved area: %llx-%llx!", | ||
330 | start, end - 1); | ||
331 | |||
332 | drop_range(i); | ||
333 | } | ||
334 | |||
335 | void __init free_early_partial(u64 start, u64 end) | ||
336 | { | ||
337 | struct early_res *r; | ||
338 | int i; | ||
339 | |||
340 | kmemleak_free_part(__va(start), end - start); | ||
341 | |||
342 | if (start == end) | ||
343 | return; | ||
344 | |||
345 | if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) | ||
346 | return; | ||
347 | |||
348 | try_next: | ||
349 | i = find_overlapped_early(start, end); | ||
350 | if (i >= max_early_res) | ||
351 | return; | ||
352 | |||
353 | r = &early_res[i]; | ||
354 | /* hole ? */ | ||
355 | if (r->end >= end && r->start <= start) { | ||
356 | drop_range_partial(i, start, end); | ||
357 | return; | ||
358 | } | ||
359 | |||
360 | drop_range_partial(i, start, end); | ||
361 | goto try_next; | ||
362 | } | ||
363 | |||
364 | #ifdef CONFIG_NO_BOOTMEM | ||
365 | static void __init subtract_early_res(struct range *range, int az) | ||
366 | { | ||
367 | int i, count; | ||
368 | u64 final_start, final_end; | ||
369 | int idx = 0; | ||
370 | |||
371 | count = 0; | ||
372 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
373 | count++; | ||
374 | |||
375 | /* need to skip first one ?*/ | ||
376 | if (early_res != early_res_x) | ||
377 | idx = 1; | ||
378 | |||
379 | #define DEBUG_PRINT_EARLY_RES 1 | ||
380 | |||
381 | #if DEBUG_PRINT_EARLY_RES | ||
382 | printk(KERN_INFO "Subtract (%d early reservations)\n", count); | ||
383 | #endif | ||
384 | for (i = idx; i < count; i++) { | ||
385 | struct early_res *r = &early_res[i]; | ||
386 | #if DEBUG_PRINT_EARLY_RES | ||
387 | printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, | ||
388 | r->start, r->end, r->name); | ||
389 | #endif | ||
390 | final_start = PFN_DOWN(r->start); | ||
391 | final_end = PFN_UP(r->end); | ||
392 | if (final_start >= final_end) | ||
393 | continue; | ||
394 | subtract_range(range, az, final_start, final_end); | ||
395 | } | ||
396 | |||
397 | } | ||
398 | |||
399 | int __init get_free_all_memory_range(struct range **rangep, int nodeid) | ||
400 | { | ||
401 | int i, count; | ||
402 | u64 start = 0, end; | ||
403 | u64 size; | ||
404 | u64 mem; | ||
405 | struct range *range; | ||
406 | int nr_range; | ||
407 | |||
408 | count = 0; | ||
409 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
410 | count++; | ||
411 | |||
412 | count *= 2; | ||
413 | |||
414 | size = sizeof(struct range) * count; | ||
415 | end = get_max_mapped(); | ||
416 | #ifdef MAX_DMA32_PFN | ||
417 | if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) | ||
418 | start = MAX_DMA32_PFN << PAGE_SHIFT; | ||
419 | #endif | ||
420 | mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); | ||
421 | if (mem == -1ULL) | ||
422 | panic("can not find more space for range free"); | ||
423 | |||
424 | range = __va(mem); | ||
425 | /* use early_node_map[] and early_res to get range array at first */ | ||
426 | memset(range, 0, size); | ||
427 | nr_range = 0; | ||
428 | |||
429 | /* need to go over early_node_map to find out good range for node */ | ||
430 | nr_range = add_from_early_node_map(range, count, nr_range, nodeid); | ||
431 | #ifdef CONFIG_X86_32 | ||
432 | subtract_range(range, count, max_low_pfn, -1ULL); | ||
433 | #endif | ||
434 | subtract_early_res(range, count); | ||
435 | nr_range = clean_sort_range(range, count); | ||
436 | |||
437 | /* need to clear it ? */ | ||
438 | if (nodeid == MAX_NUMNODES) { | ||
439 | memset(&early_res[0], 0, | ||
440 | sizeof(struct early_res) * max_early_res); | ||
441 | early_res = NULL; | ||
442 | max_early_res = 0; | ||
443 | } | ||
444 | |||
445 | *rangep = range; | ||
446 | return nr_range; | ||
447 | } | ||
448 | #else | ||
449 | void __init early_res_to_bootmem(u64 start, u64 end) | ||
450 | { | ||
451 | int i, count; | ||
452 | u64 final_start, final_end; | ||
453 | int idx = 0; | ||
454 | |||
455 | count = 0; | ||
456 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
457 | count++; | ||
458 | |||
459 | /* need to skip first one ?*/ | ||
460 | if (early_res != early_res_x) | ||
461 | idx = 1; | ||
462 | |||
463 | printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", | ||
464 | count - idx, max_early_res, start, end); | ||
465 | for (i = idx; i < count; i++) { | ||
466 | struct early_res *r = &early_res[i]; | ||
467 | printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, | ||
468 | r->start, r->end, r->name); | ||
469 | final_start = max(start, r->start); | ||
470 | final_end = min(end, r->end); | ||
471 | if (final_start >= final_end) { | ||
472 | printk(KERN_CONT "\n"); | ||
473 | continue; | ||
474 | } | ||
475 | printk(KERN_CONT " ==> [%010llx - %010llx]\n", | ||
476 | final_start, final_end); | ||
477 | reserve_bootmem_generic(final_start, final_end - final_start, | ||
478 | BOOTMEM_DEFAULT); | ||
479 | } | ||
480 | /* clear them */ | ||
481 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
482 | early_res = NULL; | ||
483 | max_early_res = 0; | ||
484 | early_res_count = 0; | ||
485 | } | ||
486 | #endif | ||
487 | |||
488 | /* Check for already reserved areas */ | ||
489 | static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) | ||
490 | { | ||
491 | int i; | ||
492 | u64 addr = *addrp; | ||
493 | int changed = 0; | ||
494 | struct early_res *r; | ||
495 | again: | ||
496 | i = find_overlapped_early(addr, addr + size); | ||
497 | r = &early_res[i]; | ||
498 | if (i < max_early_res && r->end) { | ||
499 | *addrp = addr = round_up(r->end, align); | ||
500 | changed = 1; | ||
501 | goto again; | ||
502 | } | ||
503 | return changed; | ||
504 | } | ||
505 | |||
506 | /* Check for already reserved areas */ | ||
507 | static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) | ||
508 | { | ||
509 | int i; | ||
510 | u64 addr = *addrp, last; | ||
511 | u64 size = *sizep; | ||
512 | int changed = 0; | ||
513 | again: | ||
514 | last = addr + size; | ||
515 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
516 | struct early_res *r = &early_res[i]; | ||
517 | if (last > r->start && addr < r->start) { | ||
518 | size = r->start - addr; | ||
519 | changed = 1; | ||
520 | goto again; | ||
521 | } | ||
522 | if (last > r->end && addr < r->end) { | ||
523 | addr = round_up(r->end, align); | ||
524 | size = last - addr; | ||
525 | changed = 1; | ||
526 | goto again; | ||
527 | } | ||
528 | if (last <= r->end && addr >= r->start) { | ||
529 | (*sizep)++; | ||
530 | return 0; | ||
531 | } | ||
532 | } | ||
533 | if (changed) { | ||
534 | *addrp = addr; | ||
535 | *sizep = size; | ||
536 | } | ||
537 | return changed; | ||
538 | } | ||
539 | |||
540 | /* | ||
541 | * Find a free area with specified alignment in a specific range. | ||
542 | * only with the area.between start to end is active range from early_node_map | ||
543 | * so they are good as RAM | ||
544 | */ | ||
545 | u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, | ||
546 | u64 size, u64 align) | ||
547 | { | ||
548 | u64 addr, last; | ||
549 | |||
550 | addr = round_up(ei_start, align); | ||
551 | if (addr < start) | ||
552 | addr = round_up(start, align); | ||
553 | if (addr >= ei_last) | ||
554 | goto out; | ||
555 | while (bad_addr(&addr, size, align) && addr+size <= ei_last) | ||
556 | ; | ||
557 | last = addr + size; | ||
558 | if (last > ei_last) | ||
559 | goto out; | ||
560 | if (last > end) | ||
561 | goto out; | ||
562 | |||
563 | return addr; | ||
564 | |||
565 | out: | ||
566 | return -1ULL; | ||
567 | } | ||
568 | |||
569 | u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, | ||
570 | u64 *sizep, u64 align) | ||
571 | { | ||
572 | u64 addr, last; | ||
573 | |||
574 | addr = round_up(ei_start, align); | ||
575 | if (addr < start) | ||
576 | addr = round_up(start, align); | ||
577 | if (addr >= ei_last) | ||
578 | goto out; | ||
579 | *sizep = ei_last - addr; | ||
580 | while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) | ||
581 | ; | ||
582 | last = addr + *sizep; | ||
583 | if (last > ei_last) | ||
584 | goto out; | ||
585 | |||
586 | return addr; | ||
587 | |||
588 | out: | ||
589 | return -1ULL; | ||
590 | } | ||
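The early_res allocator is deleted outright above; its x86 users appear to have moved to the memblock allocator by this kernel, though that happens outside this hunk. For readers following the removed logic, a worked example of the splitting behaviour in drop_range_partial()/free_early_partial():

    /*
     * Start with one reservation [0x1000, 0x5000) named "X" and call
     * free_early_partial(0x2000, 0x3000).
     *
     *   common_start = max(0x1000, 0x2000) = 0x2000
     *   common_end   = min(0x5000, 0x3000) = 0x3000
     *
     * old_start < common_start, so the entry is trimmed to the head
     * [0x1000, 0x2000); old_end > common_end, so the tail [0x3000, 0x5000)
     * is re-added via reserve_early_without_check(), leaving two entries.
     */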
diff --git a/kernel/events/Makefile b/kernel/events/Makefile new file mode 100644 index 000000000000..1ce23d3d8394 --- /dev/null +++ b/kernel/events/Makefile | |||
@@ -0,0 +1,6 @@ | |||
1 | ifdef CONFIG_FUNCTION_TRACER | ||
2 | CFLAGS_REMOVE_core.o = -pg | ||
3 | endif | ||
4 | |||
5 | obj-y := core.o | ||
6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | ||
diff --git a/kernel/perf_event.c b/kernel/events/core.c index b98bed3d8182..9efe7108ccaf 100644 --- a/kernel/perf_event.c +++ b/kernel/events/core.c | |||
@@ -2,8 +2,8 @@ | |||
2 | * Performance events core code: | 2 | * Performance events core code: |
3 | * | 3 | * |
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar | 5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar |
6 | * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | 7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
8 | * | 8 | * |
9 | * For licensing details see kernel-base/COPYING | 9 | * For licensing details see kernel-base/COPYING |
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/smp.h> | 15 | #include <linux/smp.h> |
16 | #include <linux/idr.h> | ||
16 | #include <linux/file.h> | 17 | #include <linux/file.h> |
17 | #include <linux/poll.h> | 18 | #include <linux/poll.h> |
18 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
@@ -21,7 +22,9 @@ | |||
21 | #include <linux/dcache.h> | 22 | #include <linux/dcache.h> |
22 | #include <linux/percpu.h> | 23 | #include <linux/percpu.h> |
23 | #include <linux/ptrace.h> | 24 | #include <linux/ptrace.h> |
25 | #include <linux/reboot.h> | ||
24 | #include <linux/vmstat.h> | 26 | #include <linux/vmstat.h> |
27 | #include <linux/device.h> | ||
25 | #include <linux/vmalloc.h> | 28 | #include <linux/vmalloc.h> |
26 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
27 | #include <linux/rculist.h> | 30 | #include <linux/rculist.h> |
@@ -35,20 +38,104 @@ | |||
35 | 38 | ||
36 | #include <asm/irq_regs.h> | 39 | #include <asm/irq_regs.h> |
37 | 40 | ||
38 | /* | 41 | struct remote_function_call { |
39 | * Each CPU has a list of per CPU events: | 42 | struct task_struct *p; |
43 | int (*func)(void *info); | ||
44 | void *info; | ||
45 | int ret; | ||
46 | }; | ||
47 | |||
48 | static void remote_function(void *data) | ||
49 | { | ||
50 | struct remote_function_call *tfc = data; | ||
51 | struct task_struct *p = tfc->p; | ||
52 | |||
53 | if (p) { | ||
54 | tfc->ret = -EAGAIN; | ||
55 | if (task_cpu(p) != smp_processor_id() || !task_curr(p)) | ||
56 | return; | ||
57 | } | ||
58 | |||
59 | tfc->ret = tfc->func(tfc->info); | ||
60 | } | ||
61 | |||
62 | /** | ||
63 | * task_function_call - call a function on the cpu on which a task runs | ||
64 | * @p: the task to evaluate | ||
65 | * @func: the function to be called | ||
66 | * @info: the function call argument | ||
67 | * | ||
68 | * Calls the function @func when the task is currently running. This might | ||
69 | * be on the current CPU, which just calls the function directly | ||
70 | * | ||
71 | * returns: @func return value, or | ||
72 | * -ESRCH - when the process isn't running | ||
73 | * -EAGAIN - when the process moved away | ||
40 | */ | 74 | */ |
41 | static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | 75 | static int |
76 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | ||
77 | { | ||
78 | struct remote_function_call data = { | ||
79 | .p = p, | ||
80 | .func = func, | ||
81 | .info = info, | ||
82 | .ret = -ESRCH, /* No such (running) process */ | ||
83 | }; | ||
42 | 84 | ||
43 | int perf_max_events __read_mostly = 1; | 85 | if (task_curr(p)) |
44 | static int perf_reserved_percpu __read_mostly; | 86 | smp_call_function_single(task_cpu(p), remote_function, &data, 1); |
45 | static int perf_overcommit __read_mostly = 1; | 87 | |
88 | return data.ret; | ||
89 | } | ||
90 | |||
91 | /** | ||
92 | * cpu_function_call - call a function on the cpu | ||
93 | * @func: the function to be called | ||
94 | * @info: the function call argument | ||
95 | * | ||
96 | * Calls the function @func on the remote cpu. | ||
97 | * | ||
98 | * returns: @func return value or -ENXIO when the cpu is offline | ||
99 | */ | ||
100 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | ||
101 | { | ||
102 | struct remote_function_call data = { | ||
103 | .p = NULL, | ||
104 | .func = func, | ||
105 | .info = info, | ||
106 | .ret = -ENXIO, /* No such CPU */ | ||
107 | }; | ||
108 | |||
109 | smp_call_function_single(cpu, remote_function, &data, 1); | ||
110 | |||
111 | return data.ret; | ||
112 | } | ||
113 | |||
114 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | ||
115 | PERF_FLAG_FD_OUTPUT |\ | ||
116 | PERF_FLAG_PID_CGROUP) | ||
117 | |||
118 | enum event_type_t { | ||
119 | EVENT_FLEXIBLE = 0x1, | ||
120 | EVENT_PINNED = 0x2, | ||
121 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | ||
122 | }; | ||
123 | |||
124 | /* | ||
125 | * perf_sched_events : >0 events exist | ||
126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | ||
127 | */ | ||
128 | struct jump_label_key perf_sched_events __read_mostly; | ||
129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | ||
46 | 130 | ||
47 | static atomic_t nr_events __read_mostly; | ||
48 | static atomic_t nr_mmap_events __read_mostly; | 131 | static atomic_t nr_mmap_events __read_mostly; |
49 | static atomic_t nr_comm_events __read_mostly; | 132 | static atomic_t nr_comm_events __read_mostly; |
50 | static atomic_t nr_task_events __read_mostly; | 133 | static atomic_t nr_task_events __read_mostly; |
51 | 134 | ||
135 | static LIST_HEAD(pmus); | ||
136 | static DEFINE_MUTEX(pmus_lock); | ||
137 | static struct srcu_struct pmus_srcu; | ||
138 | |||
52 | /* | 139 | /* |
53 | * perf event paranoia level: | 140 | * perf event paranoia level: |
54 | * -1 - not paranoid at all | 141 | * -1 - not paranoid at all |
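The new helpers above, remote_function(), task_function_call() and cpu_function_call(), let perf run a callback on whichever CPU a task is currently executing on (or on a fixed CPU), reporting -ESRCH when the task is not running and -EAGAIN when it migrated before the IPI landed. They are static to this file; the sketch below (do_poke() and poke_task() are invented names) shows the general retry pattern a caller inside kernel/events/core.c would follow:

    /*
     * The callback runs via smp_call_function_single(), i.e. with
     * interrupts disabled on the target task's CPU.
     */
    static int do_poke(void *info)
    {
            /* act on 'info' while the task cannot be scheduled away */
            return 0;
    }

    static void poke_task(struct task_struct *p)
    {
            int err;

            do {
                    err = task_function_call(p, do_poke, NULL);
            } while (err == -EAGAIN);       /* task migrated: try its new CPU */

            if (err == -ESRCH) {
                    /* task is not running anywhere: update its state
                     * locally instead, e.g. under the relevant ctx lock */
            }
    }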
@@ -58,58 +145,445 @@ static atomic_t nr_task_events __read_mostly; | |||
58 | */ | 145 | */ |
59 | int sysctl_perf_event_paranoid __read_mostly = 1; | 146 | int sysctl_perf_event_paranoid __read_mostly = 1; |
60 | 147 | ||
61 | int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ | 148 | /* Minimum for 512 kiB + 1 user control page */ |
149 | int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ | ||
62 | 150 | ||
63 | /* | 151 | /* |
64 | * max perf event sample rate | 152 | * max perf event sample rate |
65 | */ | 153 | */ |
66 | int sysctl_perf_event_sample_rate __read_mostly = 100000; | 154 | #define DEFAULT_MAX_SAMPLE_RATE 100000 |
155 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | ||
156 | static int max_samples_per_tick __read_mostly = | ||
157 | DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | ||
158 | |||
159 | int perf_proc_update_handler(struct ctl_table *table, int write, | ||
160 | void __user *buffer, size_t *lenp, | ||
161 | loff_t *ppos) | ||
162 | { | ||
163 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
164 | |||
165 | if (ret || !write) | ||
166 | return ret; | ||
167 | |||
168 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | ||
169 | |||
170 | return 0; | ||
171 | } | ||
67 | 172 | ||
68 | static atomic64_t perf_event_id; | 173 | static atomic64_t perf_event_id; |
69 | 174 | ||
175 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | ||
176 | enum event_type_t event_type); | ||
177 | |||
178 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | ||
179 | enum event_type_t event_type, | ||
180 | struct task_struct *task); | ||
181 | |||
182 | static void update_context_time(struct perf_event_context *ctx); | ||
183 | static u64 perf_event_time(struct perf_event *event); | ||
184 | |||
185 | void __weak perf_event_print_debug(void) { } | ||
186 | |||
187 | extern __weak const char *perf_pmu_name(void) | ||
188 | { | ||
189 | return "pmu"; | ||
190 | } | ||
191 | |||
192 | static inline u64 perf_clock(void) | ||
193 | { | ||
194 | return local_clock(); | ||
195 | } | ||
196 | |||
197 | static inline struct perf_cpu_context * | ||
198 | __get_cpu_context(struct perf_event_context *ctx) | ||
199 | { | ||
200 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
201 | } | ||
202 | |||
203 | #ifdef CONFIG_CGROUP_PERF | ||
204 | |||
70 | /* | 205 | /* |
71 | * Lock for (sysadmin-configurable) event reservations: | 206 | * Must ensure cgroup is pinned (css_get) before calling |
207 | * this function. In other words, we cannot call this function | ||
208 | * if there is no cgroup event for the current CPU context. | ||
72 | */ | 209 | */ |
73 | static DEFINE_SPINLOCK(perf_resource_lock); | 210 | static inline struct perf_cgroup * |
211 | perf_cgroup_from_task(struct task_struct *task) | ||
212 | { | ||
213 | return container_of(task_subsys_state(task, perf_subsys_id), | ||
214 | struct perf_cgroup, css); | ||
215 | } | ||
216 | |||
217 | static inline bool | ||
218 | perf_cgroup_match(struct perf_event *event) | ||
219 | { | ||
220 | struct perf_event_context *ctx = event->ctx; | ||
221 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
222 | |||
223 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | ||
224 | } | ||
225 | |||
226 | static inline void perf_get_cgroup(struct perf_event *event) | ||
227 | { | ||
228 | css_get(&event->cgrp->css); | ||
229 | } | ||
230 | |||
231 | static inline void perf_put_cgroup(struct perf_event *event) | ||
232 | { | ||
233 | css_put(&event->cgrp->css); | ||
234 | } | ||
235 | |||
236 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
237 | { | ||
238 | perf_put_cgroup(event); | ||
239 | event->cgrp = NULL; | ||
240 | } | ||
241 | |||
242 | static inline int is_cgroup_event(struct perf_event *event) | ||
243 | { | ||
244 | return event->cgrp != NULL; | ||
245 | } | ||
246 | |||
247 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
248 | { | ||
249 | struct perf_cgroup_info *t; | ||
250 | |||
251 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
252 | return t->time; | ||
253 | } | ||
254 | |||
255 | static inline void __update_cgrp_time(struct perf_cgroup *cgrp) | ||
256 | { | ||
257 | struct perf_cgroup_info *info; | ||
258 | u64 now; | ||
259 | |||
260 | now = perf_clock(); | ||
261 | |||
262 | info = this_cpu_ptr(cgrp->info); | ||
263 | |||
264 | info->time += now - info->timestamp; | ||
265 | info->timestamp = now; | ||
266 | } | ||
267 | |||
268 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
269 | { | ||
270 | struct perf_cgroup *cgrp_out = cpuctx->cgrp; | ||
271 | if (cgrp_out) | ||
272 | __update_cgrp_time(cgrp_out); | ||
273 | } | ||
274 | |||
275 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
276 | { | ||
277 | struct perf_cgroup *cgrp; | ||
278 | |||
279 | /* | ||
280 | * ensure we access cgroup data only when needed and | ||
281 | * when we know the cgroup is pinned (css_get) | ||
282 | */ | ||
283 | if (!is_cgroup_event(event)) | ||
284 | return; | ||
285 | |||
286 | cgrp = perf_cgroup_from_task(current); | ||
287 | /* | ||
288 | * Do not update time when cgroup is not active | ||
289 | */ | ||
290 | if (cgrp == event->cgrp) | ||
291 | __update_cgrp_time(event->cgrp); | ||
292 | } | ||
293 | |||
294 | static inline void | ||
295 | perf_cgroup_set_timestamp(struct task_struct *task, | ||
296 | struct perf_event_context *ctx) | ||
297 | { | ||
298 | struct perf_cgroup *cgrp; | ||
299 | struct perf_cgroup_info *info; | ||
300 | |||
301 | /* | ||
302 | * ctx->lock held by caller | ||
303 | * ensure we do not access cgroup data | ||
304 | * unless we have the cgroup pinned (css_get) | ||
305 | */ | ||
306 | if (!task || !ctx->nr_cgroups) | ||
307 | return; | ||
308 | |||
309 | cgrp = perf_cgroup_from_task(task); | ||
310 | info = this_cpu_ptr(cgrp->info); | ||
311 | info->timestamp = ctx->timestamp; | ||
312 | } | ||
313 | |||
314 | #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ | ||
315 | #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ | ||
74 | 316 | ||
75 | /* | 317 | /* |
76 | * Architecture provided APIs - weak aliases: | 318 | * reschedule events based on the cgroup constraint of task. |
319 | * | ||
320 | * mode SWOUT : schedule out everything | ||
321 | * mode SWIN : schedule in based on cgroup for next | ||
77 | */ | 322 | */ |
78 | extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) | 323 | void perf_cgroup_switch(struct task_struct *task, int mode) |
79 | { | 324 | { |
80 | return NULL; | 325 | struct perf_cpu_context *cpuctx; |
326 | struct pmu *pmu; | ||
327 | unsigned long flags; | ||
328 | |||
329 | /* | ||
330 | * disable interrupts to avoid geting nr_cgroup | ||
331 | * changes via __perf_event_disable(). Also | ||
332 | * avoids preemption. | ||
333 | */ | ||
334 | local_irq_save(flags); | ||
335 | |||
336 | /* | ||
337 | * we reschedule only in the presence of cgroup | ||
338 | * constrained events. | ||
339 | */ | ||
340 | rcu_read_lock(); | ||
341 | |||
342 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
343 | |||
344 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
345 | |||
346 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
347 | |||
348 | /* | ||
349 | * perf_cgroup_events says at least one | ||
350 | * context on this CPU has cgroup events. | ||
351 | * | ||
352 | * ctx->nr_cgroups reports the number of cgroup | ||
353 | * events for a context. | ||
354 | */ | ||
355 | if (cpuctx->ctx.nr_cgroups > 0) { | ||
356 | |||
357 | if (mode & PERF_CGROUP_SWOUT) { | ||
358 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | ||
359 | /* | ||
360 | * must not be done before ctxswout due | ||
361 | * to event_filter_match() in event_sched_out() | ||
362 | */ | ||
363 | cpuctx->cgrp = NULL; | ||
364 | } | ||
365 | |||
366 | if (mode & PERF_CGROUP_SWIN) { | ||
367 | WARN_ON_ONCE(cpuctx->cgrp); | ||
368 | /* set cgrp before ctxsw in to | ||
369 | * allow event_filter_match() to not | ||
370 | * have to pass task around | ||
371 | */ | ||
372 | cpuctx->cgrp = perf_cgroup_from_task(task); | ||
373 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); | ||
374 | } | ||
375 | } | ||
376 | |||
377 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
378 | } | ||
379 | |||
380 | rcu_read_unlock(); | ||
381 | |||
382 | local_irq_restore(flags); | ||
81 | } | 383 | } |
82 | 384 | ||
83 | void __weak hw_perf_disable(void) { barrier(); } | 385 | static inline void perf_cgroup_sched_out(struct task_struct *task) |
84 | void __weak hw_perf_enable(void) { barrier(); } | 386 | { |
387 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT); | ||
388 | } | ||
85 | 389 | ||
86 | void __weak perf_event_print_debug(void) { } | 390 | static inline void perf_cgroup_sched_in(struct task_struct *task) |
391 | { | ||
392 | perf_cgroup_switch(task, PERF_CGROUP_SWIN); | ||
393 | } | ||
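The SWOUT/SWIN flags above simply select which half of a context switch perf_cgroup_switch() performs, and the two wrappers are fixed-mode callers. A minimal self-contained C sketch of that bitmask dispatch follows; sched_out_all() and sched_in_for() are hypothetical stand-ins for the real cpu_ctx_sched_out()/cpu_ctx_sched_in() calls, and the flag values mirror PERF_CGROUP_SWOUT/PERF_CGROUP_SWIN.

#include <stdio.h>

#define SWOUT 0x1   /* schedule out every cgroup-constrained event */
#define SWIN  0x2   /* schedule events back in for the incoming task */

/* hypothetical stand-ins for the per-CPU context operations */
static void sched_out_all(void)            { printf("sched out all events\n"); }
static void sched_in_for(const char *task) { printf("sched in events for %s\n", task); }

static void cgroup_switch(const char *next, int mode)
{
        if (mode & SWOUT)
                sched_out_all();
        if (mode & SWIN)
                sched_in_for(next);
}

int main(void)
{
        cgroup_switch(NULL,   SWOUT);         /* task switching out */
        cgroup_switch("next", SWIN);          /* task switching in */
        cgroup_switch("next", SWOUT | SWIN);  /* both halves in one call */
        return 0;
}

Combining both flags in one call does a complete out-then-in switch in the sketch; the kernel code above takes the same two branches, but under a single IRQ-disabled, RCU-protected walk of the registered PMUs.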
394 | |||
395 | static inline int perf_cgroup_connect(int fd, struct perf_event *event, | ||
396 | struct perf_event_attr *attr, | ||
397 | struct perf_event *group_leader) | ||
398 | { | ||
399 | struct perf_cgroup *cgrp; | ||
400 | struct cgroup_subsys_state *css; | ||
401 | struct file *file; | ||
402 | int ret = 0, fput_needed; | ||
403 | |||
404 | file = fget_light(fd, &fput_needed); | ||
405 | if (!file) | ||
406 | return -EBADF; | ||
407 | |||
408 | css = cgroup_css_from_dir(file, perf_subsys_id); | ||
409 | if (IS_ERR(css)) { | ||
410 | ret = PTR_ERR(css); | ||
411 | goto out; | ||
412 | } | ||
413 | |||
414 | cgrp = container_of(css, struct perf_cgroup, css); | ||
415 | event->cgrp = cgrp; | ||
416 | |||
417 | /* must be done before we fput() the file */ | ||
418 | perf_get_cgroup(event); | ||
87 | 419 | ||
88 | static DEFINE_PER_CPU(int, perf_disable_count); | 420 | /* |
421 | * all events in a group must monitor | ||
422 | * the same cgroup because a task belongs | ||
423 | * to only one perf cgroup at a time | ||
424 | */ | ||
425 | if (group_leader && group_leader->cgrp != cgrp) { | ||
426 | perf_detach_cgroup(event); | ||
427 | ret = -EINVAL; | ||
428 | } | ||
429 | out: | ||
430 | fput_light(file, fput_needed); | ||
431 | return ret; | ||
432 | } | ||
89 | 433 | ||
90 | void perf_disable(void) | 434 | static inline void |
435 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
91 | { | 436 | { |
92 | if (!__get_cpu_var(perf_disable_count)++) | 437 | struct perf_cgroup_info *t; |
93 | hw_perf_disable(); | 438 | t = per_cpu_ptr(event->cgrp->info, event->cpu); |
439 | event->shadow_ctx_time = now - t->timestamp; | ||
94 | } | 440 | } |
95 | 441 | ||
96 | void perf_enable(void) | 442 | static inline void |
443 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
97 | { | 444 | { |
98 | if (!--__get_cpu_var(perf_disable_count)) | 445 | /* |
99 | hw_perf_enable(); | 446 | * when the current task's perf cgroup does not match |
447 | * the event's, we need to remember to call the | ||
448 | * perf_mark_enable() function the first time a task with | ||
449 | * a matching perf cgroup is scheduled in. | ||
450 | */ | ||
451 | if (is_cgroup_event(event) && !perf_cgroup_match(event)) | ||
452 | event->cgrp_defer_enabled = 1; | ||
100 | } | 453 | } |
101 | 454 | ||
102 | static void get_ctx(struct perf_event_context *ctx) | 455 | static inline void |
456 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
457 | struct perf_event_context *ctx) | ||
103 | { | 458 | { |
104 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); | 459 | struct perf_event *sub; |
460 | u64 tstamp = perf_event_time(event); | ||
461 | |||
462 | if (!event->cgrp_defer_enabled) | ||
463 | return; | ||
464 | |||
465 | event->cgrp_defer_enabled = 0; | ||
466 | |||
467 | event->tstamp_enabled = tstamp - event->total_time_enabled; | ||
468 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | ||
469 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { | ||
470 | sub->tstamp_enabled = tstamp - sub->total_time_enabled; | ||
471 | sub->cgrp_defer_enabled = 0; | ||
472 | } | ||
473 | } | ||
105 | } | 474 | } |
475 | #else /* !CONFIG_CGROUP_PERF */ | ||
106 | 476 | ||
107 | static void free_ctx(struct rcu_head *head) | 477 | static inline bool |
478 | perf_cgroup_match(struct perf_event *event) | ||
108 | { | 479 | { |
109 | struct perf_event_context *ctx; | 480 | return true; |
481 | } | ||
482 | |||
483 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
484 | {} | ||
485 | |||
486 | static inline int is_cgroup_event(struct perf_event *event) | ||
487 | { | ||
488 | return 0; | ||
489 | } | ||
490 | |||
491 | static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) | ||
492 | { | ||
493 | return 0; | ||
494 | } | ||
495 | |||
496 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
497 | { | ||
498 | } | ||
110 | 499 | ||
111 | ctx = container_of(head, struct perf_event_context, rcu_head); | 500 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) |
112 | kfree(ctx); | 501 | { |
502 | } | ||
503 | |||
504 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
505 | { | ||
506 | } | ||
507 | |||
508 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
509 | { | ||
510 | } | ||
511 | |||
512 | static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, | ||
513 | struct perf_event_attr *attr, | ||
514 | struct perf_event *group_leader) | ||
515 | { | ||
516 | return -EINVAL; | ||
517 | } | ||
518 | |||
519 | static inline void | ||
520 | perf_cgroup_set_timestamp(struct task_struct *task, | ||
521 | struct perf_event_context *ctx) | ||
522 | { | ||
523 | } | ||
524 | |||
525 | void | ||
526 | perf_cgroup_switch(struct task_struct *task, struct task_struct *next) | ||
527 | { | ||
528 | } | ||
529 | |||
530 | static inline void | ||
531 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
532 | { | ||
533 | } | ||
534 | |||
535 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
536 | { | ||
537 | return 0; | ||
538 | } | ||
539 | |||
540 | static inline void | ||
541 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
542 | { | ||
543 | } | ||
544 | |||
545 | static inline void | ||
546 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
547 | struct perf_event_context *ctx) | ||
548 | { | ||
549 | } | ||
550 | #endif | ||
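Everything between the #else and the #endif above is a set of empty inline stubs, so code paths that call the cgroup helpers compile unchanged when CONFIG_CGROUP_PERF is off. A tiny illustration of that convention, using a hypothetical FEATURE_X option and do_feature() helper rather than the kernel's names:

#include <stdio.h>

/* #define FEATURE_X */   /* uncomment to compile the real implementation */

#ifdef FEATURE_X
static inline int do_feature(int v) { return v * 2; }        /* real version */
#else
static inline int do_feature(int v) { (void)v; return 0; }   /* empty stub */
#endif

int main(void)
{
        /* the call site needs no #ifdef of its own */
        printf("do_feature(21) = %d\n", do_feature(21));
        return 0;
}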
551 | |||
552 | void perf_pmu_disable(struct pmu *pmu) | ||
553 | { | ||
554 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | ||
555 | if (!(*count)++) | ||
556 | pmu->pmu_disable(pmu); | ||
557 | } | ||
558 | |||
559 | void perf_pmu_enable(struct pmu *pmu) | ||
560 | { | ||
561 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | ||
562 | if (!--(*count)) | ||
563 | pmu->pmu_enable(pmu); | ||
564 | } | ||
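perf_pmu_disable() and perf_pmu_enable() above take over from the old perf_disable()/perf_enable() pair, keeping a per-PMU, per-CPU nesting count so that only the outermost disable and the matching final enable reach the hardware callbacks. A simplified single-CPU sketch of that counting pattern; hw_disable()/hw_enable() are hypothetical stand-ins for pmu->pmu_disable()/pmu->pmu_enable().

#include <stdio.h>

static int disable_count;   /* per-CPU in the kernel; a single global here */

static void hw_disable(void) { printf("hardware disabled\n"); }
static void hw_enable(void)  { printf("hardware enabled\n"); }

static void pmu_disable(void)
{
        if (!disable_count++)   /* only the first (outermost) caller touches the hardware */
                hw_disable();
}

static void pmu_enable(void)
{
        if (!--disable_count)   /* only the last (outermost) caller re-enables it */
                hw_enable();
}

int main(void)
{
        pmu_disable();   /* prints "hardware disabled" */
        pmu_disable();   /* nested: no output */
        pmu_enable();    /* nested: no output */
        pmu_enable();    /* prints "hardware enabled" */
        return 0;
}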
565 | |||
566 | static DEFINE_PER_CPU(struct list_head, rotation_list); | ||
567 | |||
568 | /* | ||
569 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
570 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
571 | * disabled, while rotate_context is called from IRQ context. | ||
572 | */ | ||
573 | static void perf_pmu_rotate_start(struct pmu *pmu) | ||
574 | { | ||
575 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
576 | struct list_head *head = &__get_cpu_var(rotation_list); | ||
577 | |||
578 | WARN_ON(!irqs_disabled()); | ||
579 | |||
580 | if (list_empty(&cpuctx->rotation_list)) | ||
581 | list_add(&cpuctx->rotation_list, head); | ||
582 | } | ||
583 | |||
584 | static void get_ctx(struct perf_event_context *ctx) | ||
585 | { | ||
586 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); | ||
113 | } | 587 | } |
114 | 588 | ||
115 | static void put_ctx(struct perf_event_context *ctx) | 589 | static void put_ctx(struct perf_event_context *ctx) |
@@ -119,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx) | |||
119 | put_ctx(ctx->parent_ctx); | 593 | put_ctx(ctx->parent_ctx); |
120 | if (ctx->task) | 594 | if (ctx->task) |
121 | put_task_struct(ctx->task); | 595 | put_task_struct(ctx->task); |
122 | call_rcu(&ctx->rcu_head, free_ctx); | 596 | kfree_rcu(ctx, rcu_head); |
123 | } | 597 | } |
124 | } | 598 | } |
125 | 599 | ||
@@ -131,6 +605,28 @@ static void unclone_ctx(struct perf_event_context *ctx) | |||
131 | } | 605 | } |
132 | } | 606 | } |
133 | 607 | ||
608 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | ||
609 | { | ||
610 | /* | ||
611 | * only top level events have the pid namespace they were created in | ||
612 | */ | ||
613 | if (event->parent) | ||
614 | event = event->parent; | ||
615 | |||
616 | return task_tgid_nr_ns(p, event->ns); | ||
617 | } | ||
618 | |||
619 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | ||
620 | { | ||
621 | /* | ||
622 | * only top level events have the pid namespace they were created in | ||
623 | */ | ||
624 | if (event->parent) | ||
625 | event = event->parent; | ||
626 | |||
627 | return task_pid_nr_ns(p, event->ns); | ||
628 | } | ||
629 | |||
134 | /* | 630 | /* |
135 | * If we inherit events we want to return the parent event id | 631 | * If we inherit events we want to return the parent event id |
136 | * to userspace. | 632 | * to userspace. |
@@ -151,13 +647,13 @@ static u64 primary_event_id(struct perf_event *event) | |||
151 | * the context could get moved to another task. | 647 | * the context could get moved to another task. |
152 | */ | 648 | */ |
153 | static struct perf_event_context * | 649 | static struct perf_event_context * |
154 | perf_lock_task_context(struct task_struct *task, unsigned long *flags) | 650 | perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) |
155 | { | 651 | { |
156 | struct perf_event_context *ctx; | 652 | struct perf_event_context *ctx; |
157 | 653 | ||
158 | rcu_read_lock(); | 654 | rcu_read_lock(); |
159 | retry: | 655 | retry: |
160 | ctx = rcu_dereference(task->perf_event_ctxp); | 656 | ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); |
161 | if (ctx) { | 657 | if (ctx) { |
162 | /* | 658 | /* |
163 | * If this context is a clone of another, it might | 659 | * If this context is a clone of another, it might |
@@ -170,7 +666,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
170 | * can't get swapped on us any more. | 666 | * can't get swapped on us any more. |
171 | */ | 667 | */ |
172 | raw_spin_lock_irqsave(&ctx->lock, *flags); | 668 | raw_spin_lock_irqsave(&ctx->lock, *flags); |
173 | if (ctx != rcu_dereference(task->perf_event_ctxp)) { | 669 | if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { |
174 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); | 670 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); |
175 | goto retry; | 671 | goto retry; |
176 | } | 672 | } |
@@ -189,12 +685,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
189 | * can't get swapped to another task. This also increments its | 685 | * can't get swapped to another task. This also increments its |
190 | * reference count so that the context can't get freed. | 686 | * reference count so that the context can't get freed. |
191 | */ | 687 | */ |
192 | static struct perf_event_context *perf_pin_task_context(struct task_struct *task) | 688 | static struct perf_event_context * |
689 | perf_pin_task_context(struct task_struct *task, int ctxn) | ||
193 | { | 690 | { |
194 | struct perf_event_context *ctx; | 691 | struct perf_event_context *ctx; |
195 | unsigned long flags; | 692 | unsigned long flags; |
196 | 693 | ||
197 | ctx = perf_lock_task_context(task, &flags); | 694 | ctx = perf_lock_task_context(task, ctxn, &flags); |
198 | if (ctx) { | 695 | if (ctx) { |
199 | ++ctx->pin_count; | 696 | ++ctx->pin_count; |
200 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 697 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
@@ -209,12 +706,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
209 | raw_spin_lock_irqsave(&ctx->lock, flags); | 706 | raw_spin_lock_irqsave(&ctx->lock, flags); |
210 | --ctx->pin_count; | 707 | --ctx->pin_count; |
211 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 708 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
212 | put_ctx(ctx); | ||
213 | } | ||
214 | |||
215 | static inline u64 perf_clock(void) | ||
216 | { | ||
217 | return local_clock(); | ||
218 | } | 709 | } |
219 | 710 | ||
220 | /* | 711 | /* |
@@ -228,6 +719,16 @@ static void update_context_time(struct perf_event_context *ctx) | |||
228 | ctx->timestamp = now; | 719 | ctx->timestamp = now; |
229 | } | 720 | } |
230 | 721 | ||
722 | static u64 perf_event_time(struct perf_event *event) | ||
723 | { | ||
724 | struct perf_event_context *ctx = event->ctx; | ||
725 | |||
726 | if (is_cgroup_event(event)) | ||
727 | return perf_cgroup_event_time(event); | ||
728 | |||
729 | return ctx ? ctx->time : 0; | ||
730 | } | ||
731 | |||
231 | /* | 732 | /* |
232 | * Update the total_time_enabled and total_time_running fields for an event. | 733 | * Update the total_time_enabled and total_time_running fields for an event.
233 | */ | 734 | */ |
@@ -239,8 +740,19 @@ static void update_event_times(struct perf_event *event) | |||
239 | if (event->state < PERF_EVENT_STATE_INACTIVE || | 740 | if (event->state < PERF_EVENT_STATE_INACTIVE || |
240 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) | 741 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) |
241 | return; | 742 | return; |
242 | 743 | /* | |
243 | if (ctx->is_active) | 744 | * in cgroup mode, time_enabled represents |
745 | * the time the event was enabled AND active | ||
746 | * tasks were in the monitored cgroup. This is | ||
747 | * independent of the activity of the context as | ||
748 | * there may be a mix of cgroup and non-cgroup events. | ||
749 | * | ||
750 | * That is why we treat cgroup events differently | ||
751 | * here. | ||
752 | */ | ||
753 | if (is_cgroup_event(event)) | ||
754 | run_end = perf_event_time(event); | ||
755 | else if (ctx->is_active) | ||
244 | run_end = ctx->time; | 756 | run_end = ctx->time; |
245 | else | 757 | else |
246 | run_end = event->tstamp_stopped; | 758 | run_end = event->tstamp_stopped; |
@@ -250,9 +762,10 @@ static void update_event_times(struct perf_event *event) | |||
250 | if (event->state == PERF_EVENT_STATE_INACTIVE) | 762 | if (event->state == PERF_EVENT_STATE_INACTIVE) |
251 | run_end = event->tstamp_stopped; | 763 | run_end = event->tstamp_stopped; |
252 | else | 764 | else |
253 | run_end = ctx->time; | 765 | run_end = perf_event_time(event); |
254 | 766 | ||
255 | event->total_time_running = run_end - event->tstamp_running; | 767 | event->total_time_running = run_end - event->tstamp_running; |
768 | |||
256 | } | 769 | } |
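The tstamp_enabled/tstamp_running/tstamp_stopped bookkeeping that update_event_times() relies on recurs throughout this patch: tstamp_running is kept adjusted so that run_end minus tstamp_running equals the total time the event spent on the PMU. A small self-contained sketch with made-up timestamps; struct toy_event and the numbers are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* toy event carrying the three bookkeeping timestamps used above */
struct toy_event {
        uint64_t tstamp_enabled;   /* when the event was enabled */
        uint64_t tstamp_running;   /* adjusted so run_end - tstamp_running = time on PMU */
        uint64_t tstamp_stopped;   /* last time it was scheduled out */
        int      active;
};

static void update_times(const struct toy_event *e, uint64_t now,
                         uint64_t *enabled, uint64_t *running)
{
        uint64_t run_end = e->active ? now : e->tstamp_stopped;

        *enabled = now - e->tstamp_enabled;
        *running = run_end - e->tstamp_running;
}

int main(void)
{
        /* enabled and first scheduled in at t=100, scheduled out at t=140 */
        struct toy_event e = {
                .tstamp_enabled = 100,
                .tstamp_running = 100,
                .tstamp_stopped = 140,
                .active = 0,
        };
        uint64_t enabled, running;

        update_times(&e, 200, &enabled, &running);
        printf("enabled=%llu running=%llu\n",
               (unsigned long long)enabled, (unsigned long long)running);   /* 100 and 40 */
        return 0;
}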
257 | 770 | ||
258 | /* | 771 | /* |
@@ -301,17 +814,102 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
301 | list_add_tail(&event->group_entry, list); | 814 | list_add_tail(&event->group_entry, list); |
302 | } | 815 | } |
303 | 816 | ||
817 | if (is_cgroup_event(event)) | ||
818 | ctx->nr_cgroups++; | ||
819 | |||
304 | list_add_rcu(&event->event_entry, &ctx->event_list); | 820 | list_add_rcu(&event->event_entry, &ctx->event_list); |
821 | if (!ctx->nr_events) | ||
822 | perf_pmu_rotate_start(ctx->pmu); | ||
305 | ctx->nr_events++; | 823 | ctx->nr_events++; |
306 | if (event->attr.inherit_stat) | 824 | if (event->attr.inherit_stat) |
307 | ctx->nr_stat++; | 825 | ctx->nr_stat++; |
308 | } | 826 | } |
309 | 827 | ||
828 | /* | ||
829 | * Called at perf_event creation and when events are attached/detached from a | ||
830 | * group. | ||
831 | */ | ||
832 | static void perf_event__read_size(struct perf_event *event) | ||
833 | { | ||
834 | int entry = sizeof(u64); /* value */ | ||
835 | int size = 0; | ||
836 | int nr = 1; | ||
837 | |||
838 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | ||
839 | size += sizeof(u64); | ||
840 | |||
841 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
842 | size += sizeof(u64); | ||
843 | |||
844 | if (event->attr.read_format & PERF_FORMAT_ID) | ||
845 | entry += sizeof(u64); | ||
846 | |||
847 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | ||
848 | nr += event->group_leader->nr_siblings; | ||
849 | size += sizeof(u64); | ||
850 | } | ||
851 | |||
852 | size += entry * nr; | ||
853 | event->read_size = size; | ||
854 | } | ||
855 | |||
856 | static void perf_event__header_size(struct perf_event *event) | ||
857 | { | ||
858 | struct perf_sample_data *data; | ||
859 | u64 sample_type = event->attr.sample_type; | ||
860 | u16 size = 0; | ||
861 | |||
862 | perf_event__read_size(event); | ||
863 | |||
864 | if (sample_type & PERF_SAMPLE_IP) | ||
865 | size += sizeof(data->ip); | ||
866 | |||
867 | if (sample_type & PERF_SAMPLE_ADDR) | ||
868 | size += sizeof(data->addr); | ||
869 | |||
870 | if (sample_type & PERF_SAMPLE_PERIOD) | ||
871 | size += sizeof(data->period); | ||
872 | |||
873 | if (sample_type & PERF_SAMPLE_READ) | ||
874 | size += event->read_size; | ||
875 | |||
876 | event->header_size = size; | ||
877 | } | ||
878 | |||
879 | static void perf_event__id_header_size(struct perf_event *event) | ||
880 | { | ||
881 | struct perf_sample_data *data; | ||
882 | u64 sample_type = event->attr.sample_type; | ||
883 | u16 size = 0; | ||
884 | |||
885 | if (sample_type & PERF_SAMPLE_TID) | ||
886 | size += sizeof(data->tid_entry); | ||
887 | |||
888 | if (sample_type & PERF_SAMPLE_TIME) | ||
889 | size += sizeof(data->time); | ||
890 | |||
891 | if (sample_type & PERF_SAMPLE_ID) | ||
892 | size += sizeof(data->id); | ||
893 | |||
894 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
895 | size += sizeof(data->stream_id); | ||
896 | |||
897 | if (sample_type & PERF_SAMPLE_CPU) | ||
898 | size += sizeof(data->cpu_entry); | ||
899 | |||
900 | event->id_header_size = size; | ||
901 | } | ||
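perf_event__read_size() above turns the read_format bits into a byte count: one u64 per value, an extra u64 per entry when IDs are requested, optional u64s for the enabled/running times, and a leading count word plus one entry per sibling for group reads. A standalone sketch of the same arithmetic; the FMT_* values match the perf ABI's PERF_FORMAT_* flags, while nr_siblings is a hypothetical argument standing in for the group leader's sibling count.

#include <stdint.h>
#include <stdio.h>

/* flag values as defined by the perf ABI */
#define FMT_TOTAL_TIME_ENABLED  (1U << 0)
#define FMT_TOTAL_TIME_RUNNING  (1U << 1)
#define FMT_ID                  (1U << 2)
#define FMT_GROUP               (1U << 3)

/* mirrors the size computation above, with nr_siblings replacing the leader pointer */
static int read_size(uint64_t read_format, int nr_siblings)
{
        int entry = sizeof(uint64_t);   /* value */
        int size = 0;
        int nr = 1;

        if (read_format & FMT_TOTAL_TIME_ENABLED)
                size += sizeof(uint64_t);
        if (read_format & FMT_TOTAL_TIME_RUNNING)
                size += sizeof(uint64_t);
        if (read_format & FMT_ID)
                entry += sizeof(uint64_t);
        if (read_format & FMT_GROUP) {
                nr += nr_siblings;
                size += sizeof(uint64_t);   /* the nr field itself */
        }

        return size + entry * nr;
}

int main(void)
{
        /* leader + 2 siblings, group read with IDs and both time fields:     */
        /* 3 * (8 + 8) value/id pairs + 8 (nr) + 8 + 8 (times) = 72 bytes     */
        printf("%d\n", read_size(FMT_TOTAL_TIME_ENABLED | FMT_TOTAL_TIME_RUNNING |
                                 FMT_ID | FMT_GROUP, 2));
        return 0;
}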
902 | |||
310 | static void perf_group_attach(struct perf_event *event) | 903 | static void perf_group_attach(struct perf_event *event) |
311 | { | 904 | { |
312 | struct perf_event *group_leader = event->group_leader; | 905 | struct perf_event *group_leader = event->group_leader, *pos; |
906 | |||
907 | /* | ||
908 | * We can have double attach due to group movement in perf_event_open. | ||
909 | */ | ||
910 | if (event->attach_state & PERF_ATTACH_GROUP) | ||
911 | return; | ||
313 | 912 | ||
314 | WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); | ||
315 | event->attach_state |= PERF_ATTACH_GROUP; | 913 | event->attach_state |= PERF_ATTACH_GROUP; |
316 | 914 | ||
317 | if (group_leader == event) | 915 | if (group_leader == event) |
@@ -323,6 +921,11 @@ static void perf_group_attach(struct perf_event *event) | |||
323 | 921 | ||
324 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 922 | list_add_tail(&event->group_entry, &group_leader->sibling_list); |
325 | group_leader->nr_siblings++; | 923 | group_leader->nr_siblings++; |
924 | |||
925 | perf_event__header_size(group_leader); | ||
926 | |||
927 | list_for_each_entry(pos, &group_leader->sibling_list, group_entry) | ||
928 | perf_event__header_size(pos); | ||
326 | } | 929 | } |
327 | 930 | ||
328 | /* | 931 | /* |
@@ -332,6 +935,7 @@ static void perf_group_attach(struct perf_event *event) | |||
332 | static void | 935 | static void |
333 | list_del_event(struct perf_event *event, struct perf_event_context *ctx) | 936 | list_del_event(struct perf_event *event, struct perf_event_context *ctx) |
334 | { | 937 | { |
938 | struct perf_cpu_context *cpuctx; | ||
335 | /* | 939 | /* |
336 | * We can have double detach due to exit/hot-unplug + close. | 940 | * We can have double detach due to exit/hot-unplug + close. |
337 | */ | 941 | */ |
@@ -340,6 +944,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
340 | 944 | ||
341 | event->attach_state &= ~PERF_ATTACH_CONTEXT; | 945 | event->attach_state &= ~PERF_ATTACH_CONTEXT; |
342 | 946 | ||
947 | if (is_cgroup_event(event)) { | ||
948 | ctx->nr_cgroups--; | ||
949 | cpuctx = __get_cpu_context(ctx); | ||
950 | /* | ||
951 | * if there are no more cgroup events | ||
952 | * then clear cgrp to avoid stale pointer | ||
953 | * in update_cgrp_time_from_cpuctx() | ||
954 | */ | ||
955 | if (!ctx->nr_cgroups) | ||
956 | cpuctx->cgrp = NULL; | ||
957 | } | ||
958 | |||
343 | ctx->nr_events--; | 959 | ctx->nr_events--; |
344 | if (event->attr.inherit_stat) | 960 | if (event->attr.inherit_stat) |
345 | ctx->nr_stat--; | 961 | ctx->nr_stat--; |
@@ -381,7 +997,7 @@ static void perf_group_detach(struct perf_event *event) | |||
381 | if (event->group_leader != event) { | 997 | if (event->group_leader != event) { |
382 | list_del_init(&event->group_entry); | 998 | list_del_init(&event->group_entry); |
383 | event->group_leader->nr_siblings--; | 999 | event->group_leader->nr_siblings--; |
384 | return; | 1000 | goto out; |
385 | } | 1001 | } |
386 | 1002 | ||
387 | if (!list_empty(&event->group_entry)) | 1003 | if (!list_empty(&event->group_entry)) |
@@ -400,12 +1016,19 @@ static void perf_group_detach(struct perf_event *event) | |||
400 | /* Inherit group flags from the previous leader */ | 1016 | /* Inherit group flags from the previous leader */ |
401 | sibling->group_flags = event->group_flags; | 1017 | sibling->group_flags = event->group_flags; |
402 | } | 1018 | } |
1019 | |||
1020 | out: | ||
1021 | perf_event__header_size(event->group_leader); | ||
1022 | |||
1023 | list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) | ||
1024 | perf_event__header_size(tmp); | ||
403 | } | 1025 | } |
404 | 1026 | ||
405 | static inline int | 1027 | static inline int |
406 | event_filter_match(struct perf_event *event) | 1028 | event_filter_match(struct perf_event *event) |
407 | { | 1029 | { |
408 | return event->cpu == -1 || event->cpu == smp_processor_id(); | 1030 | return (event->cpu == -1 || event->cpu == smp_processor_id()) |
1031 | && perf_cgroup_match(event); | ||
409 | } | 1032 | } |
410 | 1033 | ||
411 | static void | 1034 | static void |
@@ -413,6 +1036,7 @@ event_sched_out(struct perf_event *event, | |||
413 | struct perf_cpu_context *cpuctx, | 1036 | struct perf_cpu_context *cpuctx, |
414 | struct perf_event_context *ctx) | 1037 | struct perf_event_context *ctx) |
415 | { | 1038 | { |
1039 | u64 tstamp = perf_event_time(event); | ||
416 | u64 delta; | 1040 | u64 delta; |
417 | /* | 1041 | /* |
418 | * An event which could not be activated because of | 1042 | * An event which could not be activated because of |
@@ -422,9 +1046,9 @@ event_sched_out(struct perf_event *event, | |||
422 | */ | 1046 | */ |
423 | if (event->state == PERF_EVENT_STATE_INACTIVE | 1047 | if (event->state == PERF_EVENT_STATE_INACTIVE |
424 | && !event_filter_match(event)) { | 1048 | && !event_filter_match(event)) { |
425 | delta = ctx->time - event->tstamp_stopped; | 1049 | delta = tstamp - event->tstamp_stopped; |
426 | event->tstamp_running += delta; | 1050 | event->tstamp_running += delta; |
427 | event->tstamp_stopped = ctx->time; | 1051 | event->tstamp_stopped = tstamp; |
428 | } | 1052 | } |
429 | 1053 | ||
430 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 1054 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
@@ -435,8 +1059,8 @@ event_sched_out(struct perf_event *event, | |||
435 | event->pending_disable = 0; | 1059 | event->pending_disable = 0; |
436 | event->state = PERF_EVENT_STATE_OFF; | 1060 | event->state = PERF_EVENT_STATE_OFF; |
437 | } | 1061 | } |
438 | event->tstamp_stopped = ctx->time; | 1062 | event->tstamp_stopped = tstamp; |
439 | event->pmu->disable(event); | 1063 | event->pmu->del(event, 0); |
440 | event->oncpu = -1; | 1064 | event->oncpu = -1; |
441 | 1065 | ||
442 | if (!is_software_event(event)) | 1066 | if (!is_software_event(event)) |
@@ -472,51 +1096,24 @@ group_sched_out(struct perf_event *group_event, | |||
472 | * We disable the event on the hardware level first. After that we | 1096 | * We disable the event on the hardware level first. After that we |
473 | * remove it from the context list. | 1097 | * remove it from the context list. |
474 | */ | 1098 | */ |
475 | static void __perf_event_remove_from_context(void *info) | 1099 | static int __perf_remove_from_context(void *info) |
476 | { | 1100 | { |
477 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
478 | struct perf_event *event = info; | 1101 | struct perf_event *event = info; |
479 | struct perf_event_context *ctx = event->ctx; | 1102 | struct perf_event_context *ctx = event->ctx; |
480 | 1103 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | |
481 | /* | ||
482 | * If this is a task context, we need to check whether it is | ||
483 | * the current task context of this cpu. If not it has been | ||
484 | * scheduled out before the smp call arrived. | ||
485 | */ | ||
486 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
487 | return; | ||
488 | 1104 | ||
489 | raw_spin_lock(&ctx->lock); | 1105 | raw_spin_lock(&ctx->lock); |
490 | /* | ||
491 | * Protect the list operation against NMI by disabling the | ||
492 | * events on a global level. | ||
493 | */ | ||
494 | perf_disable(); | ||
495 | |||
496 | event_sched_out(event, cpuctx, ctx); | 1106 | event_sched_out(event, cpuctx, ctx); |
497 | |||
498 | list_del_event(event, ctx); | 1107 | list_del_event(event, ctx); |
499 | |||
500 | if (!ctx->task) { | ||
501 | /* | ||
502 | * Allow more per task events with respect to the | ||
503 | * reservation: | ||
504 | */ | ||
505 | cpuctx->max_pertask = | ||
506 | min(perf_max_events - ctx->nr_events, | ||
507 | perf_max_events - perf_reserved_percpu); | ||
508 | } | ||
509 | |||
510 | perf_enable(); | ||
511 | raw_spin_unlock(&ctx->lock); | 1108 | raw_spin_unlock(&ctx->lock); |
1109 | |||
1110 | return 0; | ||
512 | } | 1111 | } |
513 | 1112 | ||
514 | 1113 | ||
515 | /* | 1114 | /* |
516 | * Remove the event from a task's (or a CPU's) list of events. | 1115 | * Remove the event from a task's (or a CPU's) list of events. |
517 | * | 1116 | * |
518 | * Must be called with ctx->mutex held. | ||
519 | * | ||
520 | * CPU events are removed with a smp call. For task events we only | 1117 | * CPU events are removed with a smp call. For task events we only |
521 | * call when the task is on a CPU. | 1118 | * call when the task is on a CPU. |
522 | * | 1119 | * |
@@ -527,60 +1124,62 @@ static void __perf_event_remove_from_context(void *info) | |||
527 | * When called from perf_event_exit_task, it's OK because the | 1124 | * When called from perf_event_exit_task, it's OK because the |
528 | * context has been detached from its task. | 1125 | * context has been detached from its task. |
529 | */ | 1126 | */ |
530 | static void perf_event_remove_from_context(struct perf_event *event) | 1127 | static void perf_remove_from_context(struct perf_event *event) |
531 | { | 1128 | { |
532 | struct perf_event_context *ctx = event->ctx; | 1129 | struct perf_event_context *ctx = event->ctx; |
533 | struct task_struct *task = ctx->task; | 1130 | struct task_struct *task = ctx->task; |
534 | 1131 | ||
1132 | lockdep_assert_held(&ctx->mutex); | ||
1133 | |||
535 | if (!task) { | 1134 | if (!task) { |
536 | /* | 1135 | /* |
537 | * Per cpu events are removed via an smp call and | 1136 | * Per cpu events are removed via an smp call and |
538 | * the removal is always successful. | 1137 | * the removal is always successful. |
539 | */ | 1138 | */ |
540 | smp_call_function_single(event->cpu, | 1139 | cpu_function_call(event->cpu, __perf_remove_from_context, event); |
541 | __perf_event_remove_from_context, | ||
542 | event, 1); | ||
543 | return; | 1140 | return; |
544 | } | 1141 | } |
545 | 1142 | ||
546 | retry: | 1143 | retry: |
547 | task_oncpu_function_call(task, __perf_event_remove_from_context, | 1144 | if (!task_function_call(task, __perf_remove_from_context, event)) |
548 | event); | 1145 | return; |
549 | 1146 | ||
550 | raw_spin_lock_irq(&ctx->lock); | 1147 | raw_spin_lock_irq(&ctx->lock); |
551 | /* | 1148 | /* |
552 | * If the context is active we need to retry the smp call. | 1149 | * If we failed to find a running task, but find the context active now |
1150 | * that we've acquired the ctx->lock, retry. | ||
553 | */ | 1151 | */ |
554 | if (ctx->nr_active && !list_empty(&event->group_entry)) { | 1152 | if (ctx->is_active) { |
555 | raw_spin_unlock_irq(&ctx->lock); | 1153 | raw_spin_unlock_irq(&ctx->lock); |
556 | goto retry; | 1154 | goto retry; |
557 | } | 1155 | } |
558 | 1156 | ||
559 | /* | 1157 | /* |
560 | * The lock prevents that this context is scheduled in so we | 1158 | * Since the task isn't running, it's safe to remove the event, us
561 | * can remove the event safely, if the call above did not | 1159 | * holding the ctx->lock ensures the task won't get scheduled in. |
562 | * succeed. | ||
563 | */ | 1160 | */ |
564 | if (!list_empty(&event->group_entry)) | 1161 | list_del_event(event, ctx); |
565 | list_del_event(event, ctx); | ||
566 | raw_spin_unlock_irq(&ctx->lock); | 1162 | raw_spin_unlock_irq(&ctx->lock); |
567 | } | 1163 | } |
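perf_remove_from_context() above shows the retry idiom this patch introduces for several paths: try to do the work via task_function_call() on the CPU where the task runs, and if the task is not running, fall back to doing it under ctx->lock, retrying if the context turns out to be active after all. A toy single-threaded model of that control flow; run_on_task() is a hypothetical stand-in for task_function_call(), and ctx_is_active stands in for ctx->is_active.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* build with: cc -pthread retry.c */

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;
static bool ctx_is_active;      /* stands in for ctx->is_active */
static int nr_events = 1;

/* hypothetical stand-in for task_function_call(): returns true only if it ran */
static bool run_on_task(void (*fn)(void))
{
        if (!ctx_is_active)
                return false;   /* "task not running": caller must fall back */
        fn();
        return true;
}

static void do_remove(void)
{
        nr_events--;
}

static void remove_event(void)
{
        for (;;) {
                if (run_on_task(do_remove))
                        return;                 /* fast path: work done remotely */

                pthread_mutex_lock(&ctx_lock);
                if (ctx_is_active) {
                        /* context became active again after the call failed: retry */
                        pthread_mutex_unlock(&ctx_lock);
                        continue;
                }
                do_remove();                    /* safe: the lock keeps the context inactive */
                pthread_mutex_unlock(&ctx_lock);
                return;
        }
}

int main(void)
{
        remove_event();
        printf("nr_events = %d\n", nr_events);  /* prints 0 */
        return 0;
}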
568 | 1164 | ||
569 | /* | 1165 | /* |
570 | * Cross CPU call to disable a performance event | 1166 | * Cross CPU call to disable a performance event |
571 | */ | 1167 | */ |
572 | static void __perf_event_disable(void *info) | 1168 | static int __perf_event_disable(void *info) |
573 | { | 1169 | { |
574 | struct perf_event *event = info; | 1170 | struct perf_event *event = info; |
575 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
576 | struct perf_event_context *ctx = event->ctx; | 1171 | struct perf_event_context *ctx = event->ctx; |
1172 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
577 | 1173 | ||
578 | /* | 1174 | /* |
579 | * If this is a per-task event, need to check whether this | 1175 | * If this is a per-task event, need to check whether this |
580 | * event's task is the current task on this cpu. | 1176 | * event's task is the current task on this cpu. |
1177 | * | ||
1178 | * Can trigger due to concurrent perf_event_context_sched_out() | ||
1179 | * flipping contexts around. | ||
581 | */ | 1180 | */ |
582 | if (ctx->task && cpuctx->task_ctx != ctx) | 1181 | if (ctx->task && cpuctx->task_ctx != ctx) |
583 | return; | 1182 | return -EINVAL; |
584 | 1183 | ||
585 | raw_spin_lock(&ctx->lock); | 1184 | raw_spin_lock(&ctx->lock); |
586 | 1185 | ||
@@ -590,6 +1189,7 @@ static void __perf_event_disable(void *info) | |||
590 | */ | 1189 | */ |
591 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { | 1190 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { |
592 | update_context_time(ctx); | 1191 | update_context_time(ctx); |
1192 | update_cgrp_time_from_event(event); | ||
593 | update_group_times(event); | 1193 | update_group_times(event); |
594 | if (event == event->group_leader) | 1194 | if (event == event->group_leader) |
595 | group_sched_out(event, cpuctx, ctx); | 1195 | group_sched_out(event, cpuctx, ctx); |
@@ -599,6 +1199,8 @@ static void __perf_event_disable(void *info) | |||
599 | } | 1199 | } |
600 | 1200 | ||
601 | raw_spin_unlock(&ctx->lock); | 1201 | raw_spin_unlock(&ctx->lock); |
1202 | |||
1203 | return 0; | ||
602 | } | 1204 | } |
603 | 1205 | ||
604 | /* | 1206 | /* |
@@ -623,13 +1225,13 @@ void perf_event_disable(struct perf_event *event) | |||
623 | /* | 1225 | /* |
624 | * Disable the event on the cpu that it's on | 1226 | * Disable the event on the cpu that it's on |
625 | */ | 1227 | */ |
626 | smp_call_function_single(event->cpu, __perf_event_disable, | 1228 | cpu_function_call(event->cpu, __perf_event_disable, event); |
627 | event, 1); | ||
628 | return; | 1229 | return; |
629 | } | 1230 | } |
630 | 1231 | ||
631 | retry: | 1232 | retry: |
632 | task_oncpu_function_call(task, __perf_event_disable, event); | 1233 | if (!task_function_call(task, __perf_event_disable, event)) |
1234 | return; | ||
633 | 1235 | ||
634 | raw_spin_lock_irq(&ctx->lock); | 1236 | raw_spin_lock_irq(&ctx->lock); |
635 | /* | 1237 | /* |
@@ -637,6 +1239,11 @@ void perf_event_disable(struct perf_event *event) | |||
637 | */ | 1239 | */ |
638 | if (event->state == PERF_EVENT_STATE_ACTIVE) { | 1240 | if (event->state == PERF_EVENT_STATE_ACTIVE) { |
639 | raw_spin_unlock_irq(&ctx->lock); | 1241 | raw_spin_unlock_irq(&ctx->lock); |
1242 | /* | ||
1243 | * Reload the task pointer, it might have been changed by | ||
1244 | * a concurrent perf_event_context_sched_out(). | ||
1245 | */ | ||
1246 | task = ctx->task; | ||
640 | goto retry; | 1247 | goto retry; |
641 | } | 1248 | } |
642 | 1249 | ||
@@ -648,32 +1255,85 @@ void perf_event_disable(struct perf_event *event) | |||
648 | update_group_times(event); | 1255 | update_group_times(event); |
649 | event->state = PERF_EVENT_STATE_OFF; | 1256 | event->state = PERF_EVENT_STATE_OFF; |
650 | } | 1257 | } |
651 | |||
652 | raw_spin_unlock_irq(&ctx->lock); | 1258 | raw_spin_unlock_irq(&ctx->lock); |
653 | } | 1259 | } |
654 | 1260 | ||
1261 | static void perf_set_shadow_time(struct perf_event *event, | ||
1262 | struct perf_event_context *ctx, | ||
1263 | u64 tstamp) | ||
1264 | { | ||
1265 | /* | ||
1266 | * use the correct time source for the time snapshot | ||
1267 | * | ||
1268 | * We could get by without this by leveraging the | ||
1269 | * fact that to get to this function, the caller | ||
1270 | * has most likely already called update_context_time() | ||
1271 | * and update_cgrp_time_xx() and thus both timestamp | ||
1272 | * are identical (or very close). Given that tstamp is, | ||
1273 | * already adjusted for cgroup, we could say that: | ||
1274 | * tstamp - ctx->timestamp | ||
1275 | * is equivalent to | ||
1276 | * tstamp - cgrp->timestamp. | ||
1277 | * | ||
1278 | * Then, in perf_output_read(), the calculation would | ||
1279 | * work with no changes because: | ||
1280 | * - event is guaranteed scheduled in | ||
1281 | * - no scheduled out in between | ||
1282 | * - thus the timestamp would be the same | ||
1283 | * | ||
1284 | * But this is a bit hairy. | ||
1285 | * | ||
1286 | * So instead, we have an explicit cgroup call to remain | ||
1287 | * within the time source all along. We believe it | ||
1288 | * is cleaner and simpler to understand. | ||
1289 | */ | ||
1290 | if (is_cgroup_event(event)) | ||
1291 | perf_cgroup_set_shadow_time(event, tstamp); | ||
1292 | else | ||
1293 | event->shadow_ctx_time = tstamp - ctx->timestamp; | ||
1294 | } | ||
1295 | |||
1296 | #define MAX_INTERRUPTS (~0ULL) | ||
1297 | |||
1298 | static void perf_log_throttle(struct perf_event *event, int enable); | ||
1299 | |||
655 | static int | 1300 | static int |
656 | event_sched_in(struct perf_event *event, | 1301 | event_sched_in(struct perf_event *event, |
657 | struct perf_cpu_context *cpuctx, | 1302 | struct perf_cpu_context *cpuctx, |
658 | struct perf_event_context *ctx) | 1303 | struct perf_event_context *ctx) |
659 | { | 1304 | { |
1305 | u64 tstamp = perf_event_time(event); | ||
1306 | |||
660 | if (event->state <= PERF_EVENT_STATE_OFF) | 1307 | if (event->state <= PERF_EVENT_STATE_OFF) |
661 | return 0; | 1308 | return 0; |
662 | 1309 | ||
663 | event->state = PERF_EVENT_STATE_ACTIVE; | 1310 | event->state = PERF_EVENT_STATE_ACTIVE; |
664 | event->oncpu = smp_processor_id(); | 1311 | event->oncpu = smp_processor_id(); |
1312 | |||
1313 | /* | ||
1314 | * Unthrottle events, since we scheduled we might have missed several | ||
1315 | * ticks already, also for a heavily scheduling task there is little | ||
1316 | * guarantee it'll get a tick in a timely manner. | ||
1317 | */ | ||
1318 | if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { | ||
1319 | perf_log_throttle(event, 1); | ||
1320 | event->hw.interrupts = 0; | ||
1321 | } | ||
1322 | |||
665 | /* | 1323 | /* |
666 | * The new state must be visible before we turn it on in the hardware: | 1324 | * The new state must be visible before we turn it on in the hardware: |
667 | */ | 1325 | */ |
668 | smp_wmb(); | 1326 | smp_wmb(); |
669 | 1327 | ||
670 | if (event->pmu->enable(event)) { | 1328 | if (event->pmu->add(event, PERF_EF_START)) { |
671 | event->state = PERF_EVENT_STATE_INACTIVE; | 1329 | event->state = PERF_EVENT_STATE_INACTIVE; |
672 | event->oncpu = -1; | 1330 | event->oncpu = -1; |
673 | return -EAGAIN; | 1331 | return -EAGAIN; |
674 | } | 1332 | } |
675 | 1333 | ||
676 | event->tstamp_running += ctx->time - event->tstamp_stopped; | 1334 | event->tstamp_running += tstamp - event->tstamp_stopped; |
1335 | |||
1336 | perf_set_shadow_time(event, ctx, tstamp); | ||
677 | 1337 | ||
678 | if (!is_software_event(event)) | 1338 | if (!is_software_event(event)) |
679 | cpuctx->active_oncpu++; | 1339 | cpuctx->active_oncpu++; |
@@ -691,22 +1351,17 @@ group_sched_in(struct perf_event *group_event, | |||
691 | struct perf_event_context *ctx) | 1351 | struct perf_event_context *ctx) |
692 | { | 1352 | { |
693 | struct perf_event *event, *partial_group = NULL; | 1353 | struct perf_event *event, *partial_group = NULL; |
694 | const struct pmu *pmu = group_event->pmu; | 1354 | struct pmu *pmu = group_event->pmu; |
695 | bool txn = false; | 1355 | u64 now = ctx->time; |
1356 | bool simulate = false; | ||
696 | 1357 | ||
697 | if (group_event->state == PERF_EVENT_STATE_OFF) | 1358 | if (group_event->state == PERF_EVENT_STATE_OFF) |
698 | return 0; | 1359 | return 0; |
699 | 1360 | ||
700 | /* Check if group transaction availabe */ | 1361 | pmu->start_txn(pmu); |
701 | if (pmu->start_txn) | ||
702 | txn = true; | ||
703 | |||
704 | if (txn) | ||
705 | pmu->start_txn(pmu); | ||
706 | 1362 | ||
707 | if (event_sched_in(group_event, cpuctx, ctx)) { | 1363 | if (event_sched_in(group_event, cpuctx, ctx)) { |
708 | if (txn) | 1364 | pmu->cancel_txn(pmu); |
709 | pmu->cancel_txn(pmu); | ||
710 | return -EAGAIN; | 1365 | return -EAGAIN; |
711 | } | 1366 | } |
712 | 1367 | ||
@@ -720,23 +1375,38 @@ group_sched_in(struct perf_event *group_event, | |||
720 | } | 1375 | } |
721 | } | 1376 | } |
722 | 1377 | ||
723 | if (!txn || !pmu->commit_txn(pmu)) | 1378 | if (!pmu->commit_txn(pmu)) |
724 | return 0; | 1379 | return 0; |
725 | 1380 | ||
726 | group_error: | 1381 | group_error: |
727 | /* | 1382 | /* |
728 | * Groups can be scheduled in as one unit only, so undo any | 1383 | * Groups can be scheduled in as one unit only, so undo any |
729 | * partial group before returning: | 1384 | * partial group before returning: |
1385 | * The events up to the failed event are scheduled out normally, | ||
1386 | * tstamp_stopped will be updated. | ||
1387 | * | ||
1388 | * The failed events and the remaining siblings need to have | ||
1389 | * their timings updated as if they had gone thru event_sched_in() | ||
1390 | * and event_sched_out(). This is required to get consistent timings | ||
1391 | * across the group. This also takes care of the case where the group | ||
1392 | * could never be scheduled by ensuring tstamp_stopped is set to mark | ||
1393 | * the time the event was actually stopped, such that time delta | ||
1394 | * calculation in update_event_times() is correct. | ||
730 | */ | 1395 | */ |
731 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 1396 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
732 | if (event == partial_group) | 1397 | if (event == partial_group) |
733 | break; | 1398 | simulate = true; |
734 | event_sched_out(event, cpuctx, ctx); | 1399 | |
1400 | if (simulate) { | ||
1401 | event->tstamp_running += now - event->tstamp_stopped; | ||
1402 | event->tstamp_stopped = now; | ||
1403 | } else { | ||
1404 | event_sched_out(event, cpuctx, ctx); | ||
1405 | } | ||
735 | } | 1406 | } |
736 | event_sched_out(group_event, cpuctx, ctx); | 1407 | event_sched_out(group_event, cpuctx, ctx); |
737 | 1408 | ||
738 | if (txn) | 1409 | pmu->cancel_txn(pmu); |
739 | pmu->cancel_txn(pmu); | ||
740 | 1410 | ||
741 | return -EAGAIN; | 1411 | return -EAGAIN; |
742 | } | 1412 | } |
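With this change group_sched_in() unconditionally brackets the group with the pmu's transaction hooks: start_txn(), one add per member, then commit_txn() on success or cancel_txn() to roll back a partial group. A toy, self-contained model of that all-or-nothing pattern; the two-counter budget is invented for illustration, and unlike the real hooks this commit_txn() returns true on success rather than 0.

#include <stdbool.h>
#include <stdio.h>

/* toy "PMU": either every event in a group gets a counter, or none do */

#define NCOUNTERS 2
static int used, staged;

static void start_txn(void)  { staged = used; }        /* snapshot current usage */
static void cancel_txn(void) { staged = used; }        /* drop the staged additions */
static bool commit_txn(void) { used = staged; return true; }

static bool add_event(void)
{
        if (staged >= NCOUNTERS)
                return false;   /* no free counter: the group cannot go on */
        staged++;
        return true;
}

static bool group_sched_in(int group_size)
{
        start_txn();
        for (int i = 0; i < group_size; i++) {
                if (!add_event()) {
                        cancel_txn();   /* undo the partial group */
                        return false;
                }
        }
        return commit_txn();
}

int main(void)
{
        printf("group of 2: %s\n", group_sched_in(2) ? "ok" : "rejected");
        printf("group of 3: %s\n", group_sched_in(3) ? "ok" : "rejected");
        return 0;
}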
@@ -775,52 +1445,52 @@ static int group_can_go_on(struct perf_event *event, | |||
775 | static void add_event_to_ctx(struct perf_event *event, | 1445 | static void add_event_to_ctx(struct perf_event *event, |
776 | struct perf_event_context *ctx) | 1446 | struct perf_event_context *ctx) |
777 | { | 1447 | { |
1448 | u64 tstamp = perf_event_time(event); | ||
1449 | |||
778 | list_add_event(event, ctx); | 1450 | list_add_event(event, ctx); |
779 | perf_group_attach(event); | 1451 | perf_group_attach(event); |
780 | event->tstamp_enabled = ctx->time; | 1452 | event->tstamp_enabled = tstamp; |
781 | event->tstamp_running = ctx->time; | 1453 | event->tstamp_running = tstamp; |
782 | event->tstamp_stopped = ctx->time; | 1454 | event->tstamp_stopped = tstamp; |
783 | } | 1455 | } |
784 | 1456 | ||
1457 | static void perf_event_context_sched_in(struct perf_event_context *ctx, | ||
1458 | struct task_struct *tsk); | ||
1459 | |||
785 | /* | 1460 | /* |
786 | * Cross CPU call to install and enable a performance event | 1461 | * Cross CPU call to install and enable a performance event |
787 | * | 1462 | * |
788 | * Must be called with ctx->mutex held | 1463 | * Must be called with ctx->mutex held |
789 | */ | 1464 | */ |
790 | static void __perf_install_in_context(void *info) | 1465 | static int __perf_install_in_context(void *info) |
791 | { | 1466 | { |
792 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
793 | struct perf_event *event = info; | 1467 | struct perf_event *event = info; |
794 | struct perf_event_context *ctx = event->ctx; | 1468 | struct perf_event_context *ctx = event->ctx; |
795 | struct perf_event *leader = event->group_leader; | 1469 | struct perf_event *leader = event->group_leader; |
1470 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
796 | int err; | 1471 | int err; |
797 | 1472 | ||
798 | /* | 1473 | /* |
799 | * If this is a task context, we need to check whether it is | 1474 | * In case we're installing a new context to an already running task, |
800 | * the current task context of this cpu. If not it has been | 1475 | * could also happen before perf_event_task_sched_in() on architectures |
801 | * scheduled out before the smp call arrived. | 1476 | * which do context switches with IRQs enabled. |
802 | * Or possibly this is the right context but it isn't | ||
803 | * on this cpu because it had no events. | ||
804 | */ | 1477 | */ |
805 | if (ctx->task && cpuctx->task_ctx != ctx) { | 1478 | if (ctx->task && !cpuctx->task_ctx) |
806 | if (cpuctx->task_ctx || ctx->task != current) | 1479 | perf_event_context_sched_in(ctx, ctx->task); |
807 | return; | ||
808 | cpuctx->task_ctx = ctx; | ||
809 | } | ||
810 | 1480 | ||
811 | raw_spin_lock(&ctx->lock); | 1481 | raw_spin_lock(&ctx->lock); |
812 | ctx->is_active = 1; | 1482 | ctx->is_active = 1; |
813 | update_context_time(ctx); | 1483 | update_context_time(ctx); |
814 | |||
815 | /* | 1484 | /* |
816 | * Protect the list operation against NMI by disabling the | 1485 | * update cgrp time only if current cgrp |
817 | * events on a global level. NOP for non NMI based events. | 1486 | * matches event->cgrp. Must be done before |
1487 | * calling add_event_to_ctx() | ||
818 | */ | 1488 | */ |
819 | perf_disable(); | 1489 | update_cgrp_time_from_event(event); |
820 | 1490 | ||
821 | add_event_to_ctx(event, ctx); | 1491 | add_event_to_ctx(event, ctx); |
822 | 1492 | ||
823 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1493 | if (!event_filter_match(event)) |
824 | goto unlock; | 1494 | goto unlock; |
825 | 1495 | ||
826 | /* | 1496 | /* |
@@ -855,13 +1525,10 @@ static void __perf_install_in_context(void *info) | |||
855 | } | 1525 | } |
856 | } | 1526 | } |
857 | 1527 | ||
858 | if (!err && !ctx->task && cpuctx->max_pertask) | 1528 | unlock: |
859 | cpuctx->max_pertask--; | ||
860 | |||
861 | unlock: | ||
862 | perf_enable(); | ||
863 | |||
864 | raw_spin_unlock(&ctx->lock); | 1529 | raw_spin_unlock(&ctx->lock); |
1530 | |||
1531 | return 0; | ||
865 | } | 1532 | } |
866 | 1533 | ||
867 | /* | 1534 | /* |
@@ -873,8 +1540,6 @@ static void __perf_install_in_context(void *info) | |||
873 | * If the event is attached to a task which is on a CPU we use a smp | 1540 | * If the event is attached to a task which is on a CPU we use a smp |
874 | * call to enable it in the task context. The task might have been | 1541 | * call to enable it in the task context. The task might have been |
875 | * scheduled away, but we check this in the smp call again. | 1542 | * scheduled away, but we check this in the smp call again. |
876 | * | ||
877 | * Must be called with ctx->mutex held. | ||
878 | */ | 1543 | */ |
879 | static void | 1544 | static void |
880 | perf_install_in_context(struct perf_event_context *ctx, | 1545 | perf_install_in_context(struct perf_event_context *ctx, |
@@ -883,36 +1548,38 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
883 | { | 1548 | { |
884 | struct task_struct *task = ctx->task; | 1549 | struct task_struct *task = ctx->task; |
885 | 1550 | ||
1551 | lockdep_assert_held(&ctx->mutex); | ||
1552 | |||
1553 | event->ctx = ctx; | ||
1554 | |||
886 | if (!task) { | 1555 | if (!task) { |
887 | /* | 1556 | /* |
888 | * Per cpu events are installed via an smp call and | 1557 | * Per cpu events are installed via an smp call and |
889 | * the install is always successful. | 1558 | * the install is always successful. |
890 | */ | 1559 | */ |
891 | smp_call_function_single(cpu, __perf_install_in_context, | 1560 | cpu_function_call(cpu, __perf_install_in_context, event); |
892 | event, 1); | ||
893 | return; | 1561 | return; |
894 | } | 1562 | } |
895 | 1563 | ||
896 | retry: | 1564 | retry: |
897 | task_oncpu_function_call(task, __perf_install_in_context, | 1565 | if (!task_function_call(task, __perf_install_in_context, event)) |
898 | event); | 1566 | return; |
899 | 1567 | ||
900 | raw_spin_lock_irq(&ctx->lock); | 1568 | raw_spin_lock_irq(&ctx->lock); |
901 | /* | 1569 | /* |
902 | * we need to retry the smp call. | 1570 | * If we failed to find a running task, but find the context active now |
1571 | * that we've acquired the ctx->lock, retry. | ||
903 | */ | 1572 | */ |
904 | if (ctx->is_active && list_empty(&event->group_entry)) { | 1573 | if (ctx->is_active) { |
905 | raw_spin_unlock_irq(&ctx->lock); | 1574 | raw_spin_unlock_irq(&ctx->lock); |
906 | goto retry; | 1575 | goto retry; |
907 | } | 1576 | } |
908 | 1577 | ||
909 | /* | 1578 | /* |
910 | * The lock prevents that this context is scheduled in so we | 1579 | * Since the task isn't running, it's safe to add the event, us holding
911 | * can add the event safely, if it the call above did not | 1580 | * the ctx->lock ensures the task won't get scheduled in. |
912 | * succeed. | ||
913 | */ | 1581 | */ |
914 | if (list_empty(&event->group_entry)) | 1582 | add_event_to_ctx(event, ctx); |
915 | add_event_to_ctx(event, ctx); | ||
916 | raw_spin_unlock_irq(&ctx->lock); | 1583 | raw_spin_unlock_irq(&ctx->lock); |
917 | } | 1584 | } |
918 | 1585 | ||
@@ -928,46 +1595,48 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
928 | struct perf_event_context *ctx) | 1595 | struct perf_event_context *ctx) |
929 | { | 1596 | { |
930 | struct perf_event *sub; | 1597 | struct perf_event *sub; |
1598 | u64 tstamp = perf_event_time(event); | ||
931 | 1599 | ||
932 | event->state = PERF_EVENT_STATE_INACTIVE; | 1600 | event->state = PERF_EVENT_STATE_INACTIVE; |
933 | event->tstamp_enabled = ctx->time - event->total_time_enabled; | 1601 | event->tstamp_enabled = tstamp - event->total_time_enabled; |
934 | list_for_each_entry(sub, &event->sibling_list, group_entry) | 1602 | list_for_each_entry(sub, &event->sibling_list, group_entry) { |
935 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) | 1603 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) |
936 | sub->tstamp_enabled = | 1604 | sub->tstamp_enabled = tstamp - sub->total_time_enabled; |
937 | ctx->time - sub->total_time_enabled; | 1605 | } |
938 | } | 1606 | } |
939 | 1607 | ||
940 | /* | 1608 | /* |
941 | * Cross CPU call to enable a performance event | 1609 | * Cross CPU call to enable a performance event |
942 | */ | 1610 | */ |
943 | static void __perf_event_enable(void *info) | 1611 | static int __perf_event_enable(void *info) |
944 | { | 1612 | { |
945 | struct perf_event *event = info; | 1613 | struct perf_event *event = info; |
946 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
947 | struct perf_event_context *ctx = event->ctx; | 1614 | struct perf_event_context *ctx = event->ctx; |
948 | struct perf_event *leader = event->group_leader; | 1615 | struct perf_event *leader = event->group_leader; |
1616 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
949 | int err; | 1617 | int err; |
950 | 1618 | ||
951 | /* | 1619 | if (WARN_ON_ONCE(!ctx->is_active)) |
952 | * If this is a per-task event, need to check whether this | 1620 | return -EINVAL; |
953 | * event's task is the current task on this cpu. | ||
954 | */ | ||
955 | if (ctx->task && cpuctx->task_ctx != ctx) { | ||
956 | if (cpuctx->task_ctx || ctx->task != current) | ||
957 | return; | ||
958 | cpuctx->task_ctx = ctx; | ||
959 | } | ||
960 | 1621 | ||
961 | raw_spin_lock(&ctx->lock); | 1622 | raw_spin_lock(&ctx->lock); |
962 | ctx->is_active = 1; | ||
963 | update_context_time(ctx); | 1623 | update_context_time(ctx); |
964 | 1624 | ||
965 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | 1625 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
966 | goto unlock; | 1626 | goto unlock; |
1627 | |||
1628 | /* | ||
1629 | * set current task's cgroup time reference point | ||
1630 | */ | ||
1631 | perf_cgroup_set_timestamp(current, ctx); | ||
1632 | |||
967 | __perf_event_mark_enabled(event, ctx); | 1633 | __perf_event_mark_enabled(event, ctx); |
968 | 1634 | ||
969 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1635 | if (!event_filter_match(event)) { |
1636 | if (is_cgroup_event(event)) | ||
1637 | perf_cgroup_defer_enabled(event); | ||
970 | goto unlock; | 1638 | goto unlock; |
1639 | } | ||
971 | 1640 | ||
972 | /* | 1641 | /* |
973 | * If the event is in a group and isn't the group leader, | 1642 | * If the event is in a group and isn't the group leader, |
@@ -979,12 +1648,10 @@ static void __perf_event_enable(void *info) | |||
979 | if (!group_can_go_on(event, cpuctx, 1)) { | 1648 | if (!group_can_go_on(event, cpuctx, 1)) { |
980 | err = -EEXIST; | 1649 | err = -EEXIST; |
981 | } else { | 1650 | } else { |
982 | perf_disable(); | ||
983 | if (event == leader) | 1651 | if (event == leader) |
984 | err = group_sched_in(event, cpuctx, ctx); | 1652 | err = group_sched_in(event, cpuctx, ctx); |
985 | else | 1653 | else |
986 | err = event_sched_in(event, cpuctx, ctx); | 1654 | err = event_sched_in(event, cpuctx, ctx); |
987 | perf_enable(); | ||
988 | } | 1655 | } |
989 | 1656 | ||
990 | if (err) { | 1657 | if (err) { |
@@ -1000,8 +1667,10 @@ static void __perf_event_enable(void *info) | |||
1000 | } | 1667 | } |
1001 | } | 1668 | } |
1002 | 1669 | ||
1003 | unlock: | 1670 | unlock: |
1004 | raw_spin_unlock(&ctx->lock); | 1671 | raw_spin_unlock(&ctx->lock); |
1672 | |||
1673 | return 0; | ||
1005 | } | 1674 | } |
1006 | 1675 | ||
1007 | /* | 1676 | /* |
@@ -1022,8 +1691,7 @@ void perf_event_enable(struct perf_event *event) | |||
1022 | /* | 1691 | /* |
1023 | * Enable the event on the cpu that it's on | 1692 | * Enable the event on the cpu that it's on |
1024 | */ | 1693 | */ |
1025 | smp_call_function_single(event->cpu, __perf_event_enable, | 1694 | cpu_function_call(event->cpu, __perf_event_enable, event); |
1026 | event, 1); | ||
1027 | return; | 1695 | return; |
1028 | } | 1696 | } |
1029 | 1697 | ||
@@ -1041,9 +1709,16 @@ void perf_event_enable(struct perf_event *event) | |||
1041 | if (event->state == PERF_EVENT_STATE_ERROR) | 1709 | if (event->state == PERF_EVENT_STATE_ERROR) |
1042 | event->state = PERF_EVENT_STATE_OFF; | 1710 | event->state = PERF_EVENT_STATE_OFF; |
1043 | 1711 | ||
1044 | retry: | 1712 | retry: |
1713 | if (!ctx->is_active) { | ||
1714 | __perf_event_mark_enabled(event, ctx); | ||
1715 | goto out; | ||
1716 | } | ||
1717 | |||
1045 | raw_spin_unlock_irq(&ctx->lock); | 1718 | raw_spin_unlock_irq(&ctx->lock); |
1046 | task_oncpu_function_call(task, __perf_event_enable, event); | 1719 | |
1720 | if (!task_function_call(task, __perf_event_enable, event)) | ||
1721 | return; | ||
1047 | 1722 | ||
1048 | raw_spin_lock_irq(&ctx->lock); | 1723 | raw_spin_lock_irq(&ctx->lock); |
1049 | 1724 | ||
@@ -1051,17 +1726,16 @@ void perf_event_enable(struct perf_event *event) | |||
1051 | * If the context is active and the event is still off, | 1726 | * If the context is active and the event is still off, |
1052 | * we need to retry the cross-call. | 1727 | * we need to retry the cross-call. |
1053 | */ | 1728 | */ |
1054 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) | 1729 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { |
1730 | /* | ||
1731 | * task could have been flipped by a concurrent | ||
1732 | * perf_event_context_sched_out() | ||
1733 | */ | ||
1734 | task = ctx->task; | ||
1055 | goto retry; | 1735 | goto retry; |
1736 | } | ||
1056 | 1737 | ||
1057 | /* | 1738 | out: |
1058 | * Since we have the lock this context can't be scheduled | ||
1059 | * in, so we can change the state safely. | ||
1060 | */ | ||
1061 | if (event->state == PERF_EVENT_STATE_OFF) | ||
1062 | __perf_event_mark_enabled(event, ctx); | ||
1063 | |||
1064 | out: | ||
1065 | raw_spin_unlock_irq(&ctx->lock); | 1739 | raw_spin_unlock_irq(&ctx->lock); |
1066 | } | 1740 | } |
1067 | 1741 | ||
@@ -1070,7 +1744,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
1070 | /* | 1744 | /* |
1071 | * not supported on inherited events | 1745 | * not supported on inherited events |
1072 | */ | 1746 | */ |
1073 | if (event->attr.inherit) | 1747 | if (event->attr.inherit || !is_sampling_event(event)) |
1074 | return -EINVAL; | 1748 | return -EINVAL; |
1075 | 1749 | ||
1076 | atomic_add(refresh, &event->event_limit); | 1750 | atomic_add(refresh, &event->event_limit); |
@@ -1079,12 +1753,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
1079 | return 0; | 1753 | return 0; |
1080 | } | 1754 | } |
1081 | 1755 | ||
1082 | enum event_type_t { | ||
1083 | EVENT_FLEXIBLE = 0x1, | ||
1084 | EVENT_PINNED = 0x2, | ||
1085 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | ||
1086 | }; | ||
1087 | |||
1088 | static void ctx_sched_out(struct perf_event_context *ctx, | 1756 | static void ctx_sched_out(struct perf_event_context *ctx, |
1089 | struct perf_cpu_context *cpuctx, | 1757 | struct perf_cpu_context *cpuctx, |
1090 | enum event_type_t event_type) | 1758 | enum event_type_t event_type) |
@@ -1092,26 +1760,27 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
1092 | struct perf_event *event; | 1760 | struct perf_event *event; |
1093 | 1761 | ||
1094 | raw_spin_lock(&ctx->lock); | 1762 | raw_spin_lock(&ctx->lock); |
1763 | perf_pmu_disable(ctx->pmu); | ||
1095 | ctx->is_active = 0; | 1764 | ctx->is_active = 0; |
1096 | if (likely(!ctx->nr_events)) | 1765 | if (likely(!ctx->nr_events)) |
1097 | goto out; | 1766 | goto out; |
1098 | update_context_time(ctx); | 1767 | update_context_time(ctx); |
1768 | update_cgrp_time_from_cpuctx(cpuctx); | ||
1099 | 1769 | ||
1100 | perf_disable(); | ||
1101 | if (!ctx->nr_active) | 1770 | if (!ctx->nr_active) |
1102 | goto out_enable; | 1771 | goto out; |
1103 | 1772 | ||
1104 | if (event_type & EVENT_PINNED) | 1773 | if (event_type & EVENT_PINNED) { |
1105 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | 1774 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) |
1106 | group_sched_out(event, cpuctx, ctx); | 1775 | group_sched_out(event, cpuctx, ctx); |
1776 | } | ||
1107 | 1777 | ||
1108 | if (event_type & EVENT_FLEXIBLE) | 1778 | if (event_type & EVENT_FLEXIBLE) { |
1109 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | 1779 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) |
1110 | group_sched_out(event, cpuctx, ctx); | 1780 | group_sched_out(event, cpuctx, ctx); |
1111 | 1781 | } | |
1112 | out_enable: | 1782 | out: |
1113 | perf_enable(); | 1783 | perf_pmu_enable(ctx->pmu); |
1114 | out: | ||
1115 | raw_spin_unlock(&ctx->lock); | 1784 | raw_spin_unlock(&ctx->lock); |
1116 | } | 1785 | } |
1117 | 1786 | ||
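In this hunk the global perf_disable()/perf_enable() bracket becomes the per-PMU perf_pmu_disable(ctx->pmu)/perf_pmu_enable(ctx->pmu) pair, taken before the early-exit checks so a single out: label can restore the PMU on every path. The helper bodies are not part of this diff; a minimal sketch of the usual nesting-counter shape such a bracket takes (an assumption for illustration, not the kernel implementation):

#include <stdio.h>

/* Assumed shape: a per-PMU nesting count, real hardware stop/start
 * only on the 0 <-> 1 transitions. */
struct pmu { int disable_count; };

static void pmu_disable(struct pmu *p)
{
        if (p->disable_count++ == 0)
                printf("pmu: stop counters\n");
}

static void pmu_enable(struct pmu *p)
{
        if (--p->disable_count == 0)
                printf("pmu: restart counters\n");
}

int main(void)
{
        struct pmu p = { 0 };

        pmu_disable(&p);        /* outer bracket, e.g. ctx_sched_out()   */
        pmu_disable(&p);        /* nested bracket is a no-op             */
        pmu_enable(&p);
        pmu_enable(&p);         /* last unbalanced enable restarts it    */
        return 0;
}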
@@ -1209,34 +1878,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
1209 | } | 1878 | } |
1210 | } | 1879 | } |
1211 | 1880 | ||
1212 | /* | 1881 | static void perf_event_context_sched_out(struct task_struct *task, int ctxn, |
1213 | * Called from scheduler to remove the events of the current task, | 1882 | struct task_struct *next) |
1214 | * with interrupts disabled. | ||
1215 | * | ||
1216 | * We stop each event and update the event value in event->count. | ||
1217 | * | ||
1218 | * This does not protect us against NMI, but disable() | ||
1219 | * sets the disabled bit in the control field of event _before_ | ||
1220 | * accessing the event control register. If a NMI hits, then it will | ||
1221 | * not restart the event. | ||
1222 | */ | ||
1223 | void perf_event_task_sched_out(struct task_struct *task, | ||
1224 | struct task_struct *next) | ||
1225 | { | 1883 | { |
1226 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1884 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
1227 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1228 | struct perf_event_context *next_ctx; | 1885 | struct perf_event_context *next_ctx; |
1229 | struct perf_event_context *parent; | 1886 | struct perf_event_context *parent; |
1887 | struct perf_cpu_context *cpuctx; | ||
1230 | int do_switch = 1; | 1888 | int do_switch = 1; |
1231 | 1889 | ||
1232 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | 1890 | if (likely(!ctx)) |
1891 | return; | ||
1233 | 1892 | ||
1234 | if (likely(!ctx || !cpuctx->task_ctx)) | 1893 | cpuctx = __get_cpu_context(ctx); |
1894 | if (!cpuctx->task_ctx) | ||
1235 | return; | 1895 | return; |
1236 | 1896 | ||
1237 | rcu_read_lock(); | 1897 | rcu_read_lock(); |
1238 | parent = rcu_dereference(ctx->parent_ctx); | 1898 | parent = rcu_dereference(ctx->parent_ctx); |
1239 | next_ctx = next->perf_event_ctxp; | 1899 | next_ctx = next->perf_event_ctxp[ctxn]; |
1240 | if (parent && next_ctx && | 1900 | if (parent && next_ctx && |
1241 | rcu_dereference(next_ctx->parent_ctx) == parent) { | 1901 | rcu_dereference(next_ctx->parent_ctx) == parent) { |
1242 | /* | 1902 | /* |
@@ -1255,8 +1915,8 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1255 | * XXX do we need a memory barrier of sorts | 1915 | * XXX do we need a memory barrier of sorts |
1256 | * wrt to rcu_dereference() of perf_event_ctxp | 1916 | * wrt to rcu_dereference() of perf_event_ctxp |
1257 | */ | 1917 | */ |
1258 | task->perf_event_ctxp = next_ctx; | 1918 | task->perf_event_ctxp[ctxn] = next_ctx; |
1259 | next->perf_event_ctxp = ctx; | 1919 | next->perf_event_ctxp[ctxn] = ctx; |
1260 | ctx->task = next; | 1920 | ctx->task = next; |
1261 | next_ctx->task = task; | 1921 | next_ctx->task = task; |
1262 | do_switch = 0; | 1922 | do_switch = 0; |
@@ -1274,10 +1934,41 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1274 | } | 1934 | } |
1275 | } | 1935 | } |
1276 | 1936 | ||
1937 | #define for_each_task_context_nr(ctxn) \ | ||
1938 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | ||
1939 | |||
1940 | /* | ||
1941 | * Called from scheduler to remove the events of the current task, | ||
1942 | * with interrupts disabled. | ||
1943 | * | ||
1944 | * We stop each event and update the event value in event->count. | ||
1945 | * | ||
1946 | * This does not protect us against NMI, but disable() | ||
1947 | * sets the disabled bit in the control field of event _before_ | ||
1948 | * accessing the event control register. If a NMI hits, then it will | ||
1949 | * not restart the event. | ||
1950 | */ | ||
1951 | void __perf_event_task_sched_out(struct task_struct *task, | ||
1952 | struct task_struct *next) | ||
1953 | { | ||
1954 | int ctxn; | ||
1955 | |||
1956 | for_each_task_context_nr(ctxn) | ||
1957 | perf_event_context_sched_out(task, ctxn, next); | ||
1958 | |||
1959 | /* | ||
1960 | * if cgroup events exist on this CPU, then we need | ||
1961 | * to check if we have to switch out PMU state. | ||
1962 | * cgroup event are system-wide mode only | ||
1963 | */ | ||
1964 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
1965 | perf_cgroup_sched_out(task); | ||
1966 | } | ||
1967 | |||
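The new __perf_event_task_sched_out() walks task->perf_event_ctxp[], which is now an array with one slot per task-context class (bounded by perf_nr_task_contexts) instead of a single pointer. A small standalone sketch of that per-task array walk, with the slot count and names invented for illustration:

#include <stdio.h>
#include <stddef.h>

#define NR_TASK_CONTEXTS 2              /* assumed small bound, like perf_nr_task_contexts */

struct context { const char *name; };

struct task {
        struct context *ctxp[NR_TASK_CONTEXTS];   /* one slot per context class */
};

static void sched_out_one(struct task *t, int ctxn)
{
        if (!t->ctxp[ctxn])             /* most tasks have no events at all */
                return;
        printf("switching out %s context\n", t->ctxp[ctxn]->name);
}

static void task_sched_out(struct task *t)
{
        for (int ctxn = 0; ctxn < NR_TASK_CONTEXTS; ctxn++)
                sched_out_one(t, ctxn);
}

int main(void)
{
        struct context hw = { "hardware" };
        struct task t = { .ctxp = { &hw, NULL } };

        task_sched_out(&t);
        return 0;
}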
1277 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1968 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
1278 | enum event_type_t event_type) | 1969 | enum event_type_t event_type) |
1279 | { | 1970 | { |
1280 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1971 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1281 | 1972 | ||
1282 | if (!cpuctx->task_ctx) | 1973 | if (!cpuctx->task_ctx) |
1283 | return; | 1974 | return; |
@@ -1292,14 +1983,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, | |||
1292 | /* | 1983 | /* |
1293 | * Called with IRQs disabled | 1984 | * Called with IRQs disabled |
1294 | */ | 1985 | */ |
1295 | static void __perf_event_task_sched_out(struct perf_event_context *ctx) | ||
1296 | { | ||
1297 | task_ctx_sched_out(ctx, EVENT_ALL); | ||
1298 | } | ||
1299 | |||
1300 | /* | ||
1301 | * Called with IRQs disabled | ||
1302 | */ | ||
1303 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | 1986 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, |
1304 | enum event_type_t event_type) | 1987 | enum event_type_t event_type) |
1305 | { | 1988 | { |
@@ -1315,9 +1998,13 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, | |||
1315 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 1998 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
1316 | if (event->state <= PERF_EVENT_STATE_OFF) | 1999 | if (event->state <= PERF_EVENT_STATE_OFF) |
1317 | continue; | 2000 | continue; |
1318 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 2001 | if (!event_filter_match(event)) |
1319 | continue; | 2002 | continue; |
1320 | 2003 | ||
2004 | /* may need to reset tstamp_enabled */ | ||
2005 | if (is_cgroup_event(event)) | ||
2006 | perf_cgroup_mark_enabled(event, ctx); | ||
2007 | |||
1321 | if (group_can_go_on(event, cpuctx, 1)) | 2008 | if (group_can_go_on(event, cpuctx, 1)) |
1322 | group_sched_in(event, cpuctx, ctx); | 2009 | group_sched_in(event, cpuctx, ctx); |
1323 | 2010 | ||
@@ -1347,29 +2034,36 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1347 | * Listen to the 'cpu' scheduling filter constraint | 2034 | * Listen to the 'cpu' scheduling filter constraint |
1348 | * of events: | 2035 | * of events: |
1349 | */ | 2036 | */ |
1350 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 2037 | if (!event_filter_match(event)) |
1351 | continue; | 2038 | continue; |
1352 | 2039 | ||
1353 | if (group_can_go_on(event, cpuctx, can_add_hw)) | 2040 | /* may need to reset tstamp_enabled */ |
2041 | if (is_cgroup_event(event)) | ||
2042 | perf_cgroup_mark_enabled(event, ctx); | ||
2043 | |||
2044 | if (group_can_go_on(event, cpuctx, can_add_hw)) { | ||
1354 | if (group_sched_in(event, cpuctx, ctx)) | 2045 | if (group_sched_in(event, cpuctx, ctx)) |
1355 | can_add_hw = 0; | 2046 | can_add_hw = 0; |
2047 | } | ||
1356 | } | 2048 | } |
1357 | } | 2049 | } |
1358 | 2050 | ||
1359 | static void | 2051 | static void |
1360 | ctx_sched_in(struct perf_event_context *ctx, | 2052 | ctx_sched_in(struct perf_event_context *ctx, |
1361 | struct perf_cpu_context *cpuctx, | 2053 | struct perf_cpu_context *cpuctx, |
1362 | enum event_type_t event_type) | 2054 | enum event_type_t event_type, |
2055 | struct task_struct *task) | ||
1363 | { | 2056 | { |
2057 | u64 now; | ||
2058 | |||
1364 | raw_spin_lock(&ctx->lock); | 2059 | raw_spin_lock(&ctx->lock); |
1365 | ctx->is_active = 1; | 2060 | ctx->is_active = 1; |
1366 | if (likely(!ctx->nr_events)) | 2061 | if (likely(!ctx->nr_events)) |
1367 | goto out; | 2062 | goto out; |
1368 | 2063 | ||
1369 | ctx->timestamp = perf_clock(); | 2064 | now = perf_clock(); |
1370 | 2065 | ctx->timestamp = now; | |
1371 | perf_disable(); | 2066 | perf_cgroup_set_timestamp(task, ctx); |
1372 | |||
1373 | /* | 2067 | /* |
1374 | * First go through the list and put on any pinned groups | 2068 | * First go through the list and put on any pinned groups |
1375 | * in order to give them the best chance of going on. | 2069 | * in order to give them the best chance of going on. |
@@ -1381,56 +2075,42 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1381 | if (event_type & EVENT_FLEXIBLE) | 2075 | if (event_type & EVENT_FLEXIBLE) |
1382 | ctx_flexible_sched_in(ctx, cpuctx); | 2076 | ctx_flexible_sched_in(ctx, cpuctx); |
1383 | 2077 | ||
1384 | perf_enable(); | 2078 | out: |
1385 | out: | ||
1386 | raw_spin_unlock(&ctx->lock); | 2079 | raw_spin_unlock(&ctx->lock); |
1387 | } | 2080 | } |
1388 | 2081 | ||
1389 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 2082 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
1390 | enum event_type_t event_type) | 2083 | enum event_type_t event_type, |
2084 | struct task_struct *task) | ||
1391 | { | 2085 | { |
1392 | struct perf_event_context *ctx = &cpuctx->ctx; | 2086 | struct perf_event_context *ctx = &cpuctx->ctx; |
1393 | 2087 | ||
1394 | ctx_sched_in(ctx, cpuctx, event_type); | 2088 | ctx_sched_in(ctx, cpuctx, event_type, task); |
1395 | } | 2089 | } |
1396 | 2090 | ||
1397 | static void task_ctx_sched_in(struct task_struct *task, | 2091 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
1398 | enum event_type_t event_type) | 2092 | enum event_type_t event_type) |
1399 | { | 2093 | { |
1400 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 2094 | struct perf_cpu_context *cpuctx; |
1401 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1402 | 2095 | ||
1403 | if (likely(!ctx)) | 2096 | cpuctx = __get_cpu_context(ctx); |
1404 | return; | ||
1405 | if (cpuctx->task_ctx == ctx) | 2097 | if (cpuctx->task_ctx == ctx) |
1406 | return; | 2098 | return; |
1407 | ctx_sched_in(ctx, cpuctx, event_type); | 2099 | |
2100 | ctx_sched_in(ctx, cpuctx, event_type, NULL); | ||
1408 | cpuctx->task_ctx = ctx; | 2101 | cpuctx->task_ctx = ctx; |
1409 | } | 2102 | } |
1410 | /* | ||
1411 | * Called from scheduler to add the events of the current task | ||
1412 | * with interrupts disabled. | ||
1413 | * | ||
1414 | * We restore the event value and then enable it. | ||
1415 | * | ||
1416 | * This does not protect us against NMI, but enable() | ||
1417 | * sets the enabled bit in the control field of event _before_ | ||
1418 | * accessing the event control register. If a NMI hits, then it will | ||
1419 | * keep the event running. | ||
1420 | */ | ||
1421 | void perf_event_task_sched_in(struct task_struct *task) | ||
1422 | { | ||
1423 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1424 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1425 | 2103 | ||
1426 | if (likely(!ctx)) | 2104 | static void perf_event_context_sched_in(struct perf_event_context *ctx, |
1427 | return; | 2105 | struct task_struct *task) |
2106 | { | ||
2107 | struct perf_cpu_context *cpuctx; | ||
1428 | 2108 | ||
2109 | cpuctx = __get_cpu_context(ctx); | ||
1429 | if (cpuctx->task_ctx == ctx) | 2110 | if (cpuctx->task_ctx == ctx) |
1430 | return; | 2111 | return; |
1431 | 2112 | ||
1432 | perf_disable(); | 2113 | perf_pmu_disable(ctx->pmu); |
1433 | |||
1434 | /* | 2114 | /* |
1435 | * We want to keep the following priority order: | 2115 | * We want to keep the following priority order: |
1436 | * cpu pinned (that don't need to move), task pinned, | 2116 | * cpu pinned (that don't need to move), task pinned, |
@@ -1438,18 +2118,51 @@ void perf_event_task_sched_in(struct task_struct *task) | |||
1438 | */ | 2118 | */ |
1439 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2119 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1440 | 2120 | ||
1441 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED); | 2121 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); |
1442 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2122 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); |
1443 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); | 2123 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); |
1444 | 2124 | ||
1445 | cpuctx->task_ctx = ctx; | 2125 | cpuctx->task_ctx = ctx; |
1446 | 2126 | ||
1447 | perf_enable(); | 2127 | /* |
2128 | * Since these rotations are per-cpu, we need to ensure the | ||
2129 | * cpu-context we got scheduled on is actually rotating. | ||
2130 | */ | ||
2131 | perf_pmu_rotate_start(ctx->pmu); | ||
2132 | perf_pmu_enable(ctx->pmu); | ||
1448 | } | 2133 | } |
1449 | 2134 | ||
1450 | #define MAX_INTERRUPTS (~0ULL) | 2135 | /* |
2136 | * Called from scheduler to add the events of the current task | ||
2137 | * with interrupts disabled. | ||
2138 | * | ||
2139 | * We restore the event value and then enable it. | ||
2140 | * | ||
2141 | * This does not protect us against NMI, but enable() | ||
2142 | * sets the enabled bit in the control field of event _before_ | ||
2143 | * accessing the event control register. If a NMI hits, then it will | ||
2144 | * keep the event running. | ||
2145 | */ | ||
2146 | void __perf_event_task_sched_in(struct task_struct *task) | ||
2147 | { | ||
2148 | struct perf_event_context *ctx; | ||
2149 | int ctxn; | ||
1451 | 2150 | ||
1452 | static void perf_log_throttle(struct perf_event *event, int enable); | 2151 | for_each_task_context_nr(ctxn) { |
2152 | ctx = task->perf_event_ctxp[ctxn]; | ||
2153 | if (likely(!ctx)) | ||
2154 | continue; | ||
2155 | |||
2156 | perf_event_context_sched_in(ctx, task); | ||
2157 | } | ||
2158 | /* | ||
2159 | * if cgroup events exist on this CPU, then we need | ||
2160 | * to check if we have to switch in PMU state. | ||
2161 | * cgroup event are system-wide mode only | ||
2162 | */ | ||
2163 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
2164 | perf_cgroup_sched_in(task); | ||
2165 | } | ||
1453 | 2166 | ||
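__perf_event_task_sched_in() loops over the same per-task context slots and hands each one to perf_event_context_sched_in(), which preserves the ordering spelled out in the comment above: cpu pinned, task pinned, cpu flexible, task flexible, with flexible groups giving up once the hardware is full (the can_add_hw logic in the earlier hunks). A toy model of that priority fill, using an invented counter budget rather than real PMU constraints:

#include <stdio.h>

static int budget = 4;          /* pretend the PMU has four counters */

static int group_sched_in(const char *name, int need)
{
        if (need > budget)
                return -1;
        budget -= need;
        printf("scheduled %-13s (need %d, left %d)\n", name, need, budget);
        return 0;
}

int main(void)
{
        int can_add_hw = 1;

        /* pinned groups are always attempted first */
        group_sched_in("cpu pinned", 1);
        group_sched_in("task pinned", 1);

        /* flexible groups stop being added after the first failure */
        const char *flex_name[] = { "cpu flexible", "task flexible" };
        const int   flex_need[] = { 3, 1 };
        for (int i = 0; i < 2; i++) {
                if (can_add_hw && group_sched_in(flex_name[i], flex_need[i]))
                        can_add_hw = 0;
        }
        return 0;
}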
1454 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2167 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
1455 | { | 2168 | { |
@@ -1478,7 +2191,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | |||
1478 | * Reduce accuracy by one bit such that @a and @b converge | 2191 | * Reduce accuracy by one bit such that @a and @b converge |
1479 | * to a similar magnitude. | 2192 | * to a similar magnitude. |
1480 | */ | 2193 | */ |
1481 | #define REDUCE_FLS(a, b) \ | 2194 | #define REDUCE_FLS(a, b) \ |
1482 | do { \ | 2195 | do { \ |
1483 | if (a##_fls > b##_fls) { \ | 2196 | if (a##_fls > b##_fls) { \ |
1484 | a >>= 1; \ | 2197 | a >>= 1; \ |
@@ -1524,22 +2237,6 @@ do { \ | |||
1524 | return div64_u64(dividend, divisor); | 2237 | return div64_u64(dividend, divisor); |
1525 | } | 2238 | } |
1526 | 2239 | ||
1527 | static void perf_event_stop(struct perf_event *event) | ||
1528 | { | ||
1529 | if (!event->pmu->stop) | ||
1530 | return event->pmu->disable(event); | ||
1531 | |||
1532 | return event->pmu->stop(event); | ||
1533 | } | ||
1534 | |||
1535 | static int perf_event_start(struct perf_event *event) | ||
1536 | { | ||
1537 | if (!event->pmu->start) | ||
1538 | return event->pmu->enable(event); | ||
1539 | |||
1540 | return event->pmu->start(event); | ||
1541 | } | ||
1542 | |||
1543 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | 2240 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) |
1544 | { | 2241 | { |
1545 | struct hw_perf_event *hwc = &event->hw; | 2242 | struct hw_perf_event *hwc = &event->hw; |
@@ -1559,15 +2256,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | |||
1559 | hwc->sample_period = sample_period; | 2256 | hwc->sample_period = sample_period; |
1560 | 2257 | ||
1561 | if (local64_read(&hwc->period_left) > 8*sample_period) { | 2258 | if (local64_read(&hwc->period_left) > 8*sample_period) { |
1562 | perf_disable(); | 2259 | event->pmu->stop(event, PERF_EF_UPDATE); |
1563 | perf_event_stop(event); | ||
1564 | local64_set(&hwc->period_left, 0); | 2260 | local64_set(&hwc->period_left, 0); |
1565 | perf_event_start(event); | 2261 | event->pmu->start(event, PERF_EF_RELOAD); |
1566 | perf_enable(); | ||
1567 | } | 2262 | } |
1568 | } | 2263 | } |
1569 | 2264 | ||
1570 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | 2265 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) |
1571 | { | 2266 | { |
1572 | struct perf_event *event; | 2267 | struct perf_event *event; |
1573 | struct hw_perf_event *hwc; | 2268 | struct hw_perf_event *hwc; |
@@ -1579,7 +2274,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
1579 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2274 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
1580 | continue; | 2275 | continue; |
1581 | 2276 | ||
1582 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 2277 | if (!event_filter_match(event)) |
1583 | continue; | 2278 | continue; |
1584 | 2279 | ||
1585 | hwc = &event->hw; | 2280 | hwc = &event->hw; |
@@ -1592,23 +2287,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
1592 | */ | 2287 | */ |
1593 | if (interrupts == MAX_INTERRUPTS) { | 2288 | if (interrupts == MAX_INTERRUPTS) { |
1594 | perf_log_throttle(event, 1); | 2289 | perf_log_throttle(event, 1); |
1595 | perf_disable(); | 2290 | event->pmu->start(event, 0); |
1596 | event->pmu->unthrottle(event); | ||
1597 | perf_enable(); | ||
1598 | } | 2291 | } |
1599 | 2292 | ||
1600 | if (!event->attr.freq || !event->attr.sample_freq) | 2293 | if (!event->attr.freq || !event->attr.sample_freq) |
1601 | continue; | 2294 | continue; |
1602 | 2295 | ||
1603 | perf_disable(); | ||
1604 | event->pmu->read(event); | 2296 | event->pmu->read(event); |
1605 | now = local64_read(&event->count); | 2297 | now = local64_read(&event->count); |
1606 | delta = now - hwc->freq_count_stamp; | 2298 | delta = now - hwc->freq_count_stamp; |
1607 | hwc->freq_count_stamp = now; | 2299 | hwc->freq_count_stamp = now; |
1608 | 2300 | ||
1609 | if (delta > 0) | 2301 | if (delta > 0) |
1610 | perf_adjust_period(event, TICK_NSEC, delta); | 2302 | perf_adjust_period(event, period, delta); |
1611 | perf_enable(); | ||
1612 | } | 2303 | } |
1613 | raw_spin_unlock(&ctx->lock); | 2304 | raw_spin_unlock(&ctx->lock); |
1614 | } | 2305 | } |
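perf_ctx_adjust_freq() now receives the elapsed period explicitly; per event it reads the count delta since the last stamp and lets perf_adjust_period() pick a new sample_period so that a freq-based event fires roughly attr.sample_freq times per second. The REDUCE_FLS() dance earlier in this hunk only keeps the 64-bit division from overflowing. A simplified version of the underlying arithmetic (overflow handling omitted, numbers invented):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/*
 * Simplified form: we saw `count` events in `nsec` nanoseconds and want
 * `freq` samples per second, so sample once every
 * count * NSEC_PER_SEC / (freq * nsec) events.
 */
static uint64_t calc_period(uint64_t count, uint64_t nsec, uint64_t freq)
{
        return (count * NSEC_PER_SEC) / (freq * nsec);
}

int main(void)
{
        /* ~2e9 events/s observed over a 4 ms tick, target 1000 samples/s */
        uint64_t delta = 8000000, tick_ns = 4000000, freq = 1000;

        printf("new sample_period = %llu events\n",
               (unsigned long long)calc_period(delta, tick_ns, freq));
        return 0;
}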
@@ -1620,38 +2311,48 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
1620 | { | 2311 | { |
1621 | raw_spin_lock(&ctx->lock); | 2312 | raw_spin_lock(&ctx->lock); |
1622 | 2313 | ||
1623 | /* Rotate the first entry last of non-pinned groups */ | 2314 | /* |
1624 | list_rotate_left(&ctx->flexible_groups); | 2315 | * Rotate the first entry last of non-pinned groups. Rotation might be |
2316 | * disabled by the inheritance code. | ||
2317 | */ | ||
2318 | if (!ctx->rotate_disable) | ||
2319 | list_rotate_left(&ctx->flexible_groups); | ||
1625 | 2320 | ||
1626 | raw_spin_unlock(&ctx->lock); | 2321 | raw_spin_unlock(&ctx->lock); |
1627 | } | 2322 | } |
1628 | 2323 | ||
1629 | void perf_event_task_tick(struct task_struct *curr) | 2324 | /* |
2325 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
2326 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
2327 | * disabled, while rotate_context is called from IRQ context. | ||
2328 | */ | ||
2329 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) | ||
1630 | { | 2330 | { |
1631 | struct perf_cpu_context *cpuctx; | 2331 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; |
1632 | struct perf_event_context *ctx; | 2332 | struct perf_event_context *ctx = NULL; |
1633 | int rotate = 0; | 2333 | int rotate = 0, remove = 1; |
1634 | |||
1635 | if (!atomic_read(&nr_events)) | ||
1636 | return; | ||
1637 | 2334 | ||
1638 | cpuctx = &__get_cpu_var(perf_cpu_context); | 2335 | if (cpuctx->ctx.nr_events) { |
1639 | if (cpuctx->ctx.nr_events && | 2336 | remove = 0; |
1640 | cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 2337 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
1641 | rotate = 1; | 2338 | rotate = 1; |
2339 | } | ||
1642 | 2340 | ||
1643 | ctx = curr->perf_event_ctxp; | 2341 | ctx = cpuctx->task_ctx; |
1644 | if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) | 2342 | if (ctx && ctx->nr_events) { |
1645 | rotate = 1; | 2343 | remove = 0; |
2344 | if (ctx->nr_events != ctx->nr_active) | ||
2345 | rotate = 1; | ||
2346 | } | ||
1646 | 2347 | ||
1647 | perf_ctx_adjust_freq(&cpuctx->ctx); | 2348 | perf_pmu_disable(cpuctx->ctx.pmu); |
2349 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | ||
1648 | if (ctx) | 2350 | if (ctx) |
1649 | perf_ctx_adjust_freq(ctx); | 2351 | perf_ctx_adjust_freq(ctx, interval); |
1650 | 2352 | ||
1651 | if (!rotate) | 2353 | if (!rotate) |
1652 | return; | 2354 | goto done; |
1653 | 2355 | ||
1654 | perf_disable(); | ||
1655 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2356 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1656 | if (ctx) | 2357 | if (ctx) |
1657 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); | 2358 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); |
@@ -1660,10 +2361,29 @@ void perf_event_task_tick(struct task_struct *curr) | |||
1660 | if (ctx) | 2361 | if (ctx) |
1661 | rotate_ctx(ctx); | 2362 | rotate_ctx(ctx); |
1662 | 2363 | ||
1663 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2364 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); |
1664 | if (ctx) | 2365 | if (ctx) |
1665 | task_ctx_sched_in(curr, EVENT_FLEXIBLE); | 2366 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
1666 | perf_enable(); | 2367 | |
2368 | done: | ||
2369 | if (remove) | ||
2370 | list_del_init(&cpuctx->rotation_list); | ||
2371 | |||
2372 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
2373 | } | ||
2374 | |||
2375 | void perf_event_task_tick(void) | ||
2376 | { | ||
2377 | struct list_head *head = &__get_cpu_var(rotation_list); | ||
2378 | struct perf_cpu_context *cpuctx, *tmp; | ||
2379 | |||
2380 | WARN_ON(!irqs_disabled()); | ||
2381 | |||
2382 | list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { | ||
2383 | if (cpuctx->jiffies_interval == 1 || | ||
2384 | !(jiffies % cpuctx->jiffies_interval)) | ||
2385 | perf_rotate_context(cpuctx); | ||
2386 | } | ||
1667 | } | 2387 | } |
1668 | 2388 | ||
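Rotation moves from perf_event_task_tick(curr) into the per-CPU-context perf_rotate_context(): flexible groups are rotated with list_rotate_left() so groups that missed a counter move toward the front for the next round, idle contexts drop off the rotation_list, and a context is only rotated every cpuctx->jiffies_interval ticks. A standalone sketch of the rotate-every-N-ticks round robin (an array instead of a list_head, values invented):

#include <stdio.h>

#define NGROUPS 3

/* Rotate the first flexible group to the tail, like list_rotate_left(). */
static void rotate(int g[NGROUPS])
{
        int first = g[0];

        for (int i = 0; i < NGROUPS - 1; i++)
                g[i] = g[i + 1];
        g[NGROUPS - 1] = first;
}

int main(void)
{
        int groups[NGROUPS] = { 1, 2, 3 };
        int interval = 4;                       /* like cpuctx->jiffies_interval */

        for (unsigned long tick = 1; tick <= 12; tick++) {
                if (interval == 1 || !(tick % interval)) {
                        rotate(groups);
                        printf("tick %2lu: head group is now %d\n",
                               tick, groups[0]);
                }
        }
        return 0;
}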
1669 | static int event_enable_on_exec(struct perf_event *event, | 2389 | static int event_enable_on_exec(struct perf_event *event, |
@@ -1685,20 +2405,26 @@ static int event_enable_on_exec(struct perf_event *event, | |||
1685 | * Enable all of a task's events that have been marked enable-on-exec. | 2405 | * Enable all of a task's events that have been marked enable-on-exec. |
1686 | * This expects task == current. | 2406 | * This expects task == current. |
1687 | */ | 2407 | */ |
1688 | static void perf_event_enable_on_exec(struct task_struct *task) | 2408 | static void perf_event_enable_on_exec(struct perf_event_context *ctx) |
1689 | { | 2409 | { |
1690 | struct perf_event_context *ctx; | ||
1691 | struct perf_event *event; | 2410 | struct perf_event *event; |
1692 | unsigned long flags; | 2411 | unsigned long flags; |
1693 | int enabled = 0; | 2412 | int enabled = 0; |
1694 | int ret; | 2413 | int ret; |
1695 | 2414 | ||
1696 | local_irq_save(flags); | 2415 | local_irq_save(flags); |
1697 | ctx = task->perf_event_ctxp; | ||
1698 | if (!ctx || !ctx->nr_events) | 2416 | if (!ctx || !ctx->nr_events) |
1699 | goto out; | 2417 | goto out; |
1700 | 2418 | ||
1701 | __perf_event_task_sched_out(ctx); | 2419 | /* |
2420 | * We must ctxsw out cgroup events to avoid conflict | ||
2421 | * when invoking perf_task_event_sched_in() later on | ||
2422 | * in this function. Otherwise we end up trying to | ||
2423 | * ctxswin cgroup events which are already scheduled | ||
2424 | * in. | ||
2425 | */ | ||
2426 | perf_cgroup_sched_out(current); | ||
2427 | task_ctx_sched_out(ctx, EVENT_ALL); | ||
1702 | 2428 | ||
1703 | raw_spin_lock(&ctx->lock); | 2429 | raw_spin_lock(&ctx->lock); |
1704 | 2430 | ||
@@ -1722,8 +2448,11 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
1722 | 2448 | ||
1723 | raw_spin_unlock(&ctx->lock); | 2449 | raw_spin_unlock(&ctx->lock); |
1724 | 2450 | ||
1725 | perf_event_task_sched_in(task); | 2451 | /* |
1726 | out: | 2452 | * Also calls ctxswin for cgroup events, if any: |
2453 | */ | ||
2454 | perf_event_context_sched_in(ctx, ctx->task); | ||
2455 | out: | ||
1727 | local_irq_restore(flags); | 2456 | local_irq_restore(flags); |
1728 | } | 2457 | } |
1729 | 2458 | ||
@@ -1732,9 +2461,9 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
1732 | */ | 2461 | */ |
1733 | static void __perf_event_read(void *info) | 2462 | static void __perf_event_read(void *info) |
1734 | { | 2463 | { |
1735 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1736 | struct perf_event *event = info; | 2464 | struct perf_event *event = info; |
1737 | struct perf_event_context *ctx = event->ctx; | 2465 | struct perf_event_context *ctx = event->ctx; |
2466 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
1738 | 2467 | ||
1739 | /* | 2468 | /* |
1740 | * If this is a task context, we need to check whether it is | 2469 | * If this is a task context, we need to check whether it is |
@@ -1747,11 +2476,14 @@ static void __perf_event_read(void *info) | |||
1747 | return; | 2476 | return; |
1748 | 2477 | ||
1749 | raw_spin_lock(&ctx->lock); | 2478 | raw_spin_lock(&ctx->lock); |
1750 | update_context_time(ctx); | 2479 | if (ctx->is_active) { |
2480 | update_context_time(ctx); | ||
2481 | update_cgrp_time_from_event(event); | ||
2482 | } | ||
1751 | update_event_times(event); | 2483 | update_event_times(event); |
2484 | if (event->state == PERF_EVENT_STATE_ACTIVE) | ||
2485 | event->pmu->read(event); | ||
1752 | raw_spin_unlock(&ctx->lock); | 2486 | raw_spin_unlock(&ctx->lock); |
1753 | |||
1754 | event->pmu->read(event); | ||
1755 | } | 2487 | } |
1756 | 2488 | ||
1757 | static inline u64 perf_event_count(struct perf_event *event) | 2489 | static inline u64 perf_event_count(struct perf_event *event) |
@@ -1773,7 +2505,15 @@ static u64 perf_event_read(struct perf_event *event) | |||
1773 | unsigned long flags; | 2505 | unsigned long flags; |
1774 | 2506 | ||
1775 | raw_spin_lock_irqsave(&ctx->lock, flags); | 2507 | raw_spin_lock_irqsave(&ctx->lock, flags); |
1776 | update_context_time(ctx); | 2508 | /* |
2509 | * may read while context is not active | ||
2510 | * (e.g., thread is blocked), in that case | ||
2511 | * we cannot update context time | ||
2512 | */ | ||
2513 | if (ctx->is_active) { | ||
2514 | update_context_time(ctx); | ||
2515 | update_cgrp_time_from_event(event); | ||
2516 | } | ||
1777 | update_event_times(event); | 2517 | update_event_times(event); |
1778 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2518 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1779 | } | 2519 | } |
@@ -1782,11 +2522,218 @@ static u64 perf_event_read(struct perf_event *event) | |||
1782 | } | 2522 | } |
1783 | 2523 | ||
1784 | /* | 2524 | /* |
1785 | * Initialize the perf_event context in a task_struct: | 2525 | * Callchain support |
1786 | */ | 2526 | */ |
2527 | |||
2528 | struct callchain_cpus_entries { | ||
2529 | struct rcu_head rcu_head; | ||
2530 | struct perf_callchain_entry *cpu_entries[0]; | ||
2531 | }; | ||
2532 | |||
2533 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
2534 | static atomic_t nr_callchain_events; | ||
2535 | static DEFINE_MUTEX(callchain_mutex); | ||
2536 | struct callchain_cpus_entries *callchain_cpus_entries; | ||
2537 | |||
2538 | |||
2539 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
2540 | struct pt_regs *regs) | ||
2541 | { | ||
2542 | } | ||
2543 | |||
2544 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
2545 | struct pt_regs *regs) | ||
2546 | { | ||
2547 | } | ||
2548 | |||
2549 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
2550 | { | ||
2551 | struct callchain_cpus_entries *entries; | ||
2552 | int cpu; | ||
2553 | |||
2554 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
2555 | |||
2556 | for_each_possible_cpu(cpu) | ||
2557 | kfree(entries->cpu_entries[cpu]); | ||
2558 | |||
2559 | kfree(entries); | ||
2560 | } | ||
2561 | |||
2562 | static void release_callchain_buffers(void) | ||
2563 | { | ||
2564 | struct callchain_cpus_entries *entries; | ||
2565 | |||
2566 | entries = callchain_cpus_entries; | ||
2567 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
2568 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
2569 | } | ||
2570 | |||
2571 | static int alloc_callchain_buffers(void) | ||
2572 | { | ||
2573 | int cpu; | ||
2574 | int size; | ||
2575 | struct callchain_cpus_entries *entries; | ||
2576 | |||
2577 | /* | ||
2578 | * We can't use the percpu allocation API for data that can be | ||
2579 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
2580 | * until that gets sorted out. | ||
2581 | */ | ||
2582 | size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); | ||
2583 | |||
2584 | entries = kzalloc(size, GFP_KERNEL); | ||
2585 | if (!entries) | ||
2586 | return -ENOMEM; | ||
2587 | |||
2588 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
2589 | |||
2590 | for_each_possible_cpu(cpu) { | ||
2591 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
2592 | cpu_to_node(cpu)); | ||
2593 | if (!entries->cpu_entries[cpu]) | ||
2594 | goto fail; | ||
2595 | } | ||
2596 | |||
2597 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
2598 | |||
2599 | return 0; | ||
2600 | |||
2601 | fail: | ||
2602 | for_each_possible_cpu(cpu) | ||
2603 | kfree(entries->cpu_entries[cpu]); | ||
2604 | kfree(entries); | ||
2605 | |||
2606 | return -ENOMEM; | ||
2607 | } | ||
2608 | |||
2609 | static int get_callchain_buffers(void) | ||
2610 | { | ||
2611 | int err = 0; | ||
2612 | int count; | ||
2613 | |||
2614 | mutex_lock(&callchain_mutex); | ||
2615 | |||
2616 | count = atomic_inc_return(&nr_callchain_events); | ||
2617 | if (WARN_ON_ONCE(count < 1)) { | ||
2618 | err = -EINVAL; | ||
2619 | goto exit; | ||
2620 | } | ||
2621 | |||
2622 | if (count > 1) { | ||
2623 | /* If the allocation failed, give up */ | ||
2624 | if (!callchain_cpus_entries) | ||
2625 | err = -ENOMEM; | ||
2626 | goto exit; | ||
2627 | } | ||
2628 | |||
2629 | err = alloc_callchain_buffers(); | ||
2630 | if (err) | ||
2631 | release_callchain_buffers(); | ||
2632 | exit: | ||
2633 | mutex_unlock(&callchain_mutex); | ||
2634 | |||
2635 | return err; | ||
2636 | } | ||
2637 | |||
2638 | static void put_callchain_buffers(void) | ||
2639 | { | ||
2640 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
2641 | release_callchain_buffers(); | ||
2642 | mutex_unlock(&callchain_mutex); | ||
2643 | } | ||
2644 | } | ||
2645 | |||
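get_callchain_buffers()/put_callchain_buffers() keep one shared set of per-CPU callchain buffers alive for as long as any callchain-sampling event exists: the first user allocates under callchain_mutex, later users only bump nr_callchain_events, and the last user tears the buffers down via atomic_dec_and_mutex_lock() plus RCU. A condensed userspace sketch of that refcounted lazy allocation (pthread mutex in place of callchain_mutex; RCU and the exact error paths are elided):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_users;            /* like nr_callchain_events */
static void *buffers;           /* shared resource owned by the first user */

static int get_buffers(void)
{
        int err = 0;

        pthread_mutex_lock(&lock);
        if (++nr_users == 1) {
                buffers = malloc(4096);         /* first user allocates */
                if (!buffers) {
                        nr_users--;
                        err = -1;
                }
        } else if (!buffers) {
                err = -1;                       /* earlier allocation failed */
        }
        pthread_mutex_unlock(&lock);
        return err;
}

static void put_buffers(void)
{
        pthread_mutex_lock(&lock);
        if (--nr_users == 0) {                  /* last user frees */
                free(buffers);
                buffers = NULL;
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        if (get_buffers() == 0 && get_buffers() == 0) {
                put_buffers();
                put_buffers();
        }
        printf("users left: %d\n", nr_users);
        return 0;
}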
2646 | static int get_recursion_context(int *recursion) | ||
2647 | { | ||
2648 | int rctx; | ||
2649 | |||
2650 | if (in_nmi()) | ||
2651 | rctx = 3; | ||
2652 | else if (in_irq()) | ||
2653 | rctx = 2; | ||
2654 | else if (in_softirq()) | ||
2655 | rctx = 1; | ||
2656 | else | ||
2657 | rctx = 0; | ||
2658 | |||
2659 | if (recursion[rctx]) | ||
2660 | return -1; | ||
2661 | |||
2662 | recursion[rctx]++; | ||
2663 | barrier(); | ||
2664 | |||
2665 | return rctx; | ||
2666 | } | ||
2667 | |||
2668 | static inline void put_recursion_context(int *recursion, int rctx) | ||
2669 | { | ||
2670 | barrier(); | ||
2671 | recursion[rctx]--; | ||
2672 | } | ||
2673 | |||
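The callchain storage is split into PERF_NR_CONTEXTS slots, one per execution context, and get_recursion_context() classifies the current context with in_nmi()/in_irq()/in_softirq() before claiming the matching slot, refusing re-entry into a slot that is already in use; that is what lets an NMI interrupt an IRQ-level callchain without corrupting it. A standalone version of the guard, with the context passed in explicitly since userspace has no in_irq():

#include <stdio.h>

enum { CTX_TASK, CTX_SOFTIRQ, CTX_HARDIRQ, CTX_NMI, NR_CTX };

static int recursion[NR_CTX];

/* Claim the buffer slot for this context; -1 means we are already
 * inside it (re-entry) and the caller must bail out. */
static int get_recursion_context(int ctx)
{
        if (recursion[ctx])
                return -1;
        recursion[ctx]++;
        return ctx;
}

static void put_recursion_context(int rctx)
{
        recursion[rctx]--;
}

int main(void)
{
        int a = get_recursion_context(CTX_HARDIRQ);     /* ok: hardirq slot   */
        int b = get_recursion_context(CTX_NMI);         /* ok: NMI has its own */
        int c = get_recursion_context(CTX_HARDIRQ);     /* re-entry: rejected  */

        printf("hardirq=%d nmi=%d re-entry=%d\n", a, b, c);
        put_recursion_context(b);
        put_recursion_context(a);
        return 0;
}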
2674 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
2675 | { | ||
2676 | int cpu; | ||
2677 | struct callchain_cpus_entries *entries; | ||
2678 | |||
2679 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
2680 | if (*rctx == -1) | ||
2681 | return NULL; | ||
2682 | |||
2683 | entries = rcu_dereference(callchain_cpus_entries); | ||
2684 | if (!entries) | ||
2685 | return NULL; | ||
2686 | |||
2687 | cpu = smp_processor_id(); | ||
2688 | |||
2689 | return &entries->cpu_entries[cpu][*rctx]; | ||
2690 | } | ||
2691 | |||
1787 | static void | 2692 | static void |
1788 | __perf_event_init_context(struct perf_event_context *ctx, | 2693 | put_callchain_entry(int rctx) |
1789 | struct task_struct *task) | 2694 | { |
2695 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
2696 | } | ||
2697 | |||
2698 | static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
2699 | { | ||
2700 | int rctx; | ||
2701 | struct perf_callchain_entry *entry; | ||
2702 | |||
2703 | |||
2704 | entry = get_callchain_entry(&rctx); | ||
2705 | if (rctx == -1) | ||
2706 | return NULL; | ||
2707 | |||
2708 | if (!entry) | ||
2709 | goto exit_put; | ||
2710 | |||
2711 | entry->nr = 0; | ||
2712 | |||
2713 | if (!user_mode(regs)) { | ||
2714 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
2715 | perf_callchain_kernel(entry, regs); | ||
2716 | if (current->mm) | ||
2717 | regs = task_pt_regs(current); | ||
2718 | else | ||
2719 | regs = NULL; | ||
2720 | } | ||
2721 | |||
2722 | if (regs) { | ||
2723 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
2724 | perf_callchain_user(entry, regs); | ||
2725 | } | ||
2726 | |||
2727 | exit_put: | ||
2728 | put_callchain_entry(rctx); | ||
2729 | |||
2730 | return entry; | ||
2731 | } | ||
2732 | |||
2733 | /* | ||
2734 | * Initialize the perf_event context in a task_struct: | ||
2735 | */ | ||
2736 | static void __perf_event_init_context(struct perf_event_context *ctx) | ||
1790 | { | 2737 | { |
1791 | raw_spin_lock_init(&ctx->lock); | 2738 | raw_spin_lock_init(&ctx->lock); |
1792 | mutex_init(&ctx->mutex); | 2739 | mutex_init(&ctx->mutex); |
@@ -1794,25 +2741,73 @@ __perf_event_init_context(struct perf_event_context *ctx, | |||
1794 | INIT_LIST_HEAD(&ctx->flexible_groups); | 2741 | INIT_LIST_HEAD(&ctx->flexible_groups); |
1795 | INIT_LIST_HEAD(&ctx->event_list); | 2742 | INIT_LIST_HEAD(&ctx->event_list); |
1796 | atomic_set(&ctx->refcount, 1); | 2743 | atomic_set(&ctx->refcount, 1); |
1797 | ctx->task = task; | ||
1798 | } | 2744 | } |
1799 | 2745 | ||
1800 | static struct perf_event_context *find_get_context(pid_t pid, int cpu) | 2746 | static struct perf_event_context * |
2747 | alloc_perf_context(struct pmu *pmu, struct task_struct *task) | ||
1801 | { | 2748 | { |
1802 | struct perf_event_context *ctx; | 2749 | struct perf_event_context *ctx; |
1803 | struct perf_cpu_context *cpuctx; | 2750 | |
2751 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); | ||
2752 | if (!ctx) | ||
2753 | return NULL; | ||
2754 | |||
2755 | __perf_event_init_context(ctx); | ||
2756 | if (task) { | ||
2757 | ctx->task = task; | ||
2758 | get_task_struct(task); | ||
2759 | } | ||
2760 | ctx->pmu = pmu; | ||
2761 | |||
2762 | return ctx; | ||
2763 | } | ||
2764 | |||
2765 | static struct task_struct * | ||
2766 | find_lively_task_by_vpid(pid_t vpid) | ||
2767 | { | ||
1804 | struct task_struct *task; | 2768 | struct task_struct *task; |
1805 | unsigned long flags; | ||
1806 | int err; | 2769 | int err; |
1807 | 2770 | ||
1808 | if (pid == -1 && cpu != -1) { | 2771 | rcu_read_lock(); |
2772 | if (!vpid) | ||
2773 | task = current; | ||
2774 | else | ||
2775 | task = find_task_by_vpid(vpid); | ||
2776 | if (task) | ||
2777 | get_task_struct(task); | ||
2778 | rcu_read_unlock(); | ||
2779 | |||
2780 | if (!task) | ||
2781 | return ERR_PTR(-ESRCH); | ||
2782 | |||
2783 | /* Reuse ptrace permission checks for now. */ | ||
2784 | err = -EACCES; | ||
2785 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | ||
2786 | goto errout; | ||
2787 | |||
2788 | return task; | ||
2789 | errout: | ||
2790 | put_task_struct(task); | ||
2791 | return ERR_PTR(err); | ||
2792 | |||
2793 | } | ||
2794 | |||
2795 | /* | ||
2796 | * Returns a matching context with refcount and pincount. | ||
2797 | */ | ||
2798 | static struct perf_event_context * | ||
2799 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | ||
2800 | { | ||
2801 | struct perf_event_context *ctx; | ||
2802 | struct perf_cpu_context *cpuctx; | ||
2803 | unsigned long flags; | ||
2804 | int ctxn, err; | ||
2805 | |||
2806 | if (!task) { | ||
1809 | /* Must be root to operate on a CPU event: */ | 2807 | /* Must be root to operate on a CPU event: */ |
1810 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | 2808 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) |
1811 | return ERR_PTR(-EACCES); | 2809 | return ERR_PTR(-EACCES); |
1812 | 2810 | ||
1813 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
1814 | return ERR_PTR(-EINVAL); | ||
1815 | |||
1816 | /* | 2811 | /* |
1817 | * We could be clever and allow to attach a event to an | 2812 | * We could be clever and allow to attach a event to an |
1818 | * offline CPU and activate it when the CPU comes up, but | 2813 | * offline CPU and activate it when the CPU comes up, but |
@@ -1821,67 +2816,64 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) | |||
1821 | if (!cpu_online(cpu)) | 2816 | if (!cpu_online(cpu)) |
1822 | return ERR_PTR(-ENODEV); | 2817 | return ERR_PTR(-ENODEV); |
1823 | 2818 | ||
1824 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 2819 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
1825 | ctx = &cpuctx->ctx; | 2820 | ctx = &cpuctx->ctx; |
1826 | get_ctx(ctx); | 2821 | get_ctx(ctx); |
2822 | ++ctx->pin_count; | ||
1827 | 2823 | ||
1828 | return ctx; | 2824 | return ctx; |
1829 | } | 2825 | } |
1830 | 2826 | ||
1831 | rcu_read_lock(); | 2827 | err = -EINVAL; |
1832 | if (!pid) | 2828 | ctxn = pmu->task_ctx_nr; |
1833 | task = current; | 2829 | if (ctxn < 0) |
1834 | else | ||
1835 | task = find_task_by_vpid(pid); | ||
1836 | if (task) | ||
1837 | get_task_struct(task); | ||
1838 | rcu_read_unlock(); | ||
1839 | |||
1840 | if (!task) | ||
1841 | return ERR_PTR(-ESRCH); | ||
1842 | |||
1843 | /* | ||
1844 | * Can't attach events to a dying task. | ||
1845 | */ | ||
1846 | err = -ESRCH; | ||
1847 | if (task->flags & PF_EXITING) | ||
1848 | goto errout; | ||
1849 | |||
1850 | /* Reuse ptrace permission checks for now. */ | ||
1851 | err = -EACCES; | ||
1852 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | ||
1853 | goto errout; | 2830 | goto errout; |
1854 | 2831 | ||
1855 | retry: | 2832 | retry: |
1856 | ctx = perf_lock_task_context(task, &flags); | 2833 | ctx = perf_lock_task_context(task, ctxn, &flags); |
1857 | if (ctx) { | 2834 | if (ctx) { |
1858 | unclone_ctx(ctx); | 2835 | unclone_ctx(ctx); |
2836 | ++ctx->pin_count; | ||
1859 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2837 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1860 | } | 2838 | } |
1861 | 2839 | ||
1862 | if (!ctx) { | 2840 | if (!ctx) { |
1863 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); | 2841 | ctx = alloc_perf_context(pmu, task); |
1864 | err = -ENOMEM; | 2842 | err = -ENOMEM; |
1865 | if (!ctx) | 2843 | if (!ctx) |
1866 | goto errout; | 2844 | goto errout; |
1867 | __perf_event_init_context(ctx, task); | 2845 | |
1868 | get_ctx(ctx); | 2846 | get_ctx(ctx); |
1869 | if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { | 2847 | |
1870 | /* | 2848 | err = 0; |
1871 | * We raced with some other task; use | 2849 | mutex_lock(&task->perf_event_mutex); |
1872 | * the context they set. | 2850 | /* |
1873 | */ | 2851 | * If it has already passed perf_event_exit_task(). |
2852 | * we must see PF_EXITING, it takes this mutex too. | ||
2853 | */ | ||
2854 | if (task->flags & PF_EXITING) | ||
2855 | err = -ESRCH; | ||
2856 | else if (task->perf_event_ctxp[ctxn]) | ||
2857 | err = -EAGAIN; | ||
2858 | else { | ||
2859 | ++ctx->pin_count; | ||
2860 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); | ||
2861 | } | ||
2862 | mutex_unlock(&task->perf_event_mutex); | ||
2863 | |||
2864 | if (unlikely(err)) { | ||
2865 | put_task_struct(task); | ||
1874 | kfree(ctx); | 2866 | kfree(ctx); |
1875 | goto retry; | 2867 | |
2868 | if (err == -EAGAIN) | ||
2869 | goto retry; | ||
2870 | goto errout; | ||
1876 | } | 2871 | } |
1877 | get_task_struct(task); | ||
1878 | } | 2872 | } |
1879 | 2873 | ||
1880 | put_task_struct(task); | ||
1881 | return ctx; | 2874 | return ctx; |
1882 | 2875 | ||
1883 | errout: | 2876 | errout: |
1884 | put_task_struct(task); | ||
1885 | return ERR_PTR(err); | 2877 | return ERR_PTR(err); |
1886 | } | 2878 | } |
1887 | 2879 | ||
@@ -1898,21 +2890,27 @@ static void free_event_rcu(struct rcu_head *head) | |||
1898 | kfree(event); | 2890 | kfree(event); |
1899 | } | 2891 | } |
1900 | 2892 | ||
1901 | static void perf_pending_sync(struct perf_event *event); | ||
1902 | static void perf_buffer_put(struct perf_buffer *buffer); | 2893 | static void perf_buffer_put(struct perf_buffer *buffer); |
1903 | 2894 | ||
1904 | static void free_event(struct perf_event *event) | 2895 | static void free_event(struct perf_event *event) |
1905 | { | 2896 | { |
1906 | perf_pending_sync(event); | 2897 | irq_work_sync(&event->pending); |
1907 | 2898 | ||
1908 | if (!event->parent) { | 2899 | if (!event->parent) { |
1909 | atomic_dec(&nr_events); | 2900 | if (event->attach_state & PERF_ATTACH_TASK) |
2901 | jump_label_dec(&perf_sched_events); | ||
1910 | if (event->attr.mmap || event->attr.mmap_data) | 2902 | if (event->attr.mmap || event->attr.mmap_data) |
1911 | atomic_dec(&nr_mmap_events); | 2903 | atomic_dec(&nr_mmap_events); |
1912 | if (event->attr.comm) | 2904 | if (event->attr.comm) |
1913 | atomic_dec(&nr_comm_events); | 2905 | atomic_dec(&nr_comm_events); |
1914 | if (event->attr.task) | 2906 | if (event->attr.task) |
1915 | atomic_dec(&nr_task_events); | 2907 | atomic_dec(&nr_task_events); |
2908 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | ||
2909 | put_callchain_buffers(); | ||
2910 | if (is_cgroup_event(event)) { | ||
2911 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | ||
2912 | jump_label_dec(&perf_sched_events); | ||
2913 | } | ||
1916 | } | 2914 | } |
1917 | 2915 | ||
1918 | if (event->buffer) { | 2916 | if (event->buffer) { |
@@ -1920,10 +2918,15 @@ static void free_event(struct perf_event *event) | |||
1920 | event->buffer = NULL; | 2918 | event->buffer = NULL; |
1921 | } | 2919 | } |
1922 | 2920 | ||
2921 | if (is_cgroup_event(event)) | ||
2922 | perf_detach_cgroup(event); | ||
2923 | |||
1923 | if (event->destroy) | 2924 | if (event->destroy) |
1924 | event->destroy(event); | 2925 | event->destroy(event); |
1925 | 2926 | ||
1926 | put_ctx(event->ctx); | 2927 | if (event->ctx) |
2928 | put_ctx(event->ctx); | ||
2929 | |||
1927 | call_rcu(&event->rcu_head, free_event_rcu); | 2930 | call_rcu(&event->rcu_head, free_event_rcu); |
1928 | } | 2931 | } |
1929 | 2932 | ||
@@ -1957,11 +2960,6 @@ int perf_event_release_kernel(struct perf_event *event) | |||
1957 | raw_spin_unlock_irq(&ctx->lock); | 2960 | raw_spin_unlock_irq(&ctx->lock); |
1958 | mutex_unlock(&ctx->mutex); | 2961 | mutex_unlock(&ctx->mutex); |
1959 | 2962 | ||
1960 | mutex_lock(&event->owner->perf_event_mutex); | ||
1961 | list_del_init(&event->owner_entry); | ||
1962 | mutex_unlock(&event->owner->perf_event_mutex); | ||
1963 | put_task_struct(event->owner); | ||
1964 | |||
1965 | free_event(event); | 2963 | free_event(event); |
1966 | 2964 | ||
1967 | return 0; | 2965 | return 0; |
@@ -1974,35 +2972,44 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); | |||
1974 | static int perf_release(struct inode *inode, struct file *file) | 2972 | static int perf_release(struct inode *inode, struct file *file) |
1975 | { | 2973 | { |
1976 | struct perf_event *event = file->private_data; | 2974 | struct perf_event *event = file->private_data; |
2975 | struct task_struct *owner; | ||
1977 | 2976 | ||
1978 | file->private_data = NULL; | 2977 | file->private_data = NULL; |
1979 | 2978 | ||
1980 | return perf_event_release_kernel(event); | 2979 | rcu_read_lock(); |
1981 | } | 2980 | owner = ACCESS_ONCE(event->owner); |
1982 | 2981 | /* | |
1983 | static int perf_event_read_size(struct perf_event *event) | 2982 | * Matches the smp_wmb() in perf_event_exit_task(). If we observe |
1984 | { | 2983 | * !owner it means the list deletion is complete and we can indeed |
1985 | int entry = sizeof(u64); /* value */ | 2984 | * free this event, otherwise we need to serialize on |
1986 | int size = 0; | 2985 | * owner->perf_event_mutex. |
1987 | int nr = 1; | 2986 | */ |
1988 | 2987 | smp_read_barrier_depends(); | |
1989 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | 2988 | if (owner) { |
1990 | size += sizeof(u64); | 2989 | /* |
1991 | 2990 | * Since delayed_put_task_struct() also drops the last | |
1992 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | 2991 | * task reference we can safely take a new reference |
1993 | size += sizeof(u64); | 2992 | * while holding the rcu_read_lock(). |
1994 | 2993 | */ | |
1995 | if (event->attr.read_format & PERF_FORMAT_ID) | 2994 | get_task_struct(owner); |
1996 | entry += sizeof(u64); | ||
1997 | |||
1998 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | ||
1999 | nr += event->group_leader->nr_siblings; | ||
2000 | size += sizeof(u64); | ||
2001 | } | 2995 | } |
2996 | rcu_read_unlock(); | ||
2002 | 2997 | ||
2003 | size += entry * nr; | 2998 | if (owner) { |
2999 | mutex_lock(&owner->perf_event_mutex); | ||
3000 | /* | ||
3001 | * We have to re-check the event->owner field, if it is cleared | ||
3002 | * we raced with perf_event_exit_task(), acquiring the mutex | ||
3003 | * ensured they're done, and we can proceed with freeing the | ||
3004 | * event. | ||
3005 | */ | ||
3006 | if (event->owner) | ||
3007 | list_del_init(&event->owner_entry); | ||
3008 | mutex_unlock(&owner->perf_event_mutex); | ||
3009 | put_task_struct(owner); | ||
3010 | } | ||
2004 | 3011 | ||
2005 | return size; | 3012 | return perf_event_release_kernel(event); |
2006 | } | 3013 | } |
2007 | 3014 | ||
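perf_release() now detaches the file's event from its owner before freeing it: it peeks at event->owner without the lock, takes a task reference while the RCU read side guarantees the task is still around, and then re-checks event->owner under owner->perf_event_mutex, because perf_event_exit_task() clears that field under the same mutex. A loose analogue of the peek-then-revalidate-under-lock pattern (mutex only; RCU, the barriers and the task reference are elided):

#include <pthread.h>
#include <stdio.h>

struct task  { pthread_mutex_t lock; int nr_owned; };
struct event { struct task *owner; };

static void detach_from_owner(struct event *ev)
{
        struct task *owner = ev->owner;         /* lock-free peek, like ACCESS_ONCE() */

        if (!owner)
                return;                         /* exit path already unlinked us */

        pthread_mutex_lock(&owner->lock);
        /* Revalidate: the owner may have cleared the field while we raced. */
        if (ev->owner) {
                owner->nr_owned--;
                ev->owner = NULL;
        }
        pthread_mutex_unlock(&owner->lock);
}

int main(void)
{
        struct task t = { .lock = PTHREAD_MUTEX_INITIALIZER, .nr_owned = 1 };
        struct event ev = { .owner = &t };

        detach_from_owner(&ev);
        printf("owned events left: %d\n", t.nr_owned);
        return 0;
}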
2008 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | 3015 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
@@ -2119,7 +3126,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) | |||
2119 | if (event->state == PERF_EVENT_STATE_ERROR) | 3126 | if (event->state == PERF_EVENT_STATE_ERROR) |
2120 | return 0; | 3127 | return 0; |
2121 | 3128 | ||
2122 | if (count < perf_event_read_size(event)) | 3129 | if (count < event->read_size) |
2123 | return -ENOSPC; | 3130 | return -ENOSPC; |
2124 | 3131 | ||
2125 | WARN_ON_ONCE(event->ctx->parent_ctx); | 3132 | WARN_ON_ONCE(event->ctx->parent_ctx); |
@@ -2205,7 +3212,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) | |||
2205 | int ret = 0; | 3212 | int ret = 0; |
2206 | u64 value; | 3213 | u64 value; |
2207 | 3214 | ||
2208 | if (!event->attr.sample_period) | 3215 | if (!is_sampling_event(event)) |
2209 | return -EINVAL; | 3216 | return -EINVAL; |
2210 | 3217 | ||
2211 | if (copy_from_user(&value, arg, sizeof(value))) | 3218 | if (copy_from_user(&value, arg, sizeof(value))) |
@@ -2342,6 +3349,9 @@ int perf_event_task_disable(void) | |||
2342 | 3349 | ||
2343 | static int perf_event_index(struct perf_event *event) | 3350 | static int perf_event_index(struct perf_event *event) |
2344 | { | 3351 | { |
3352 | if (event->hw.state & PERF_HES_STOPPED) | ||
3353 | return 0; | ||
3354 | |||
2345 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 3355 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
2346 | return 0; | 3356 | return 0; |
2347 | 3357 | ||
@@ -2845,16 +3855,7 @@ void perf_event_wakeup(struct perf_event *event) | |||
2845 | } | 3855 | } |
2846 | } | 3856 | } |
2847 | 3857 | ||
2848 | /* | 3858 | static void perf_pending_event(struct irq_work *entry) |
2849 | * Pending wakeups | ||
2850 | * | ||
2851 | * Handle the case where we need to wakeup up from NMI (or rq->lock) context. | ||
2852 | * | ||
2853 | * The NMI bit means we cannot possibly take locks. Therefore, maintain a | ||
2854 | * single linked list and use cmpxchg() to add entries lockless. | ||
2855 | */ | ||
2856 | |||
2857 | static void perf_pending_event(struct perf_pending_entry *entry) | ||
2858 | { | 3859 | { |
2859 | struct perf_event *event = container_of(entry, | 3860 | struct perf_event *event = container_of(entry, |
2860 | struct perf_event, pending); | 3861 | struct perf_event, pending); |
@@ -2870,99 +3871,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) | |||
2870 | } | 3871 | } |
2871 | } | 3872 | } |
2872 | 3873 | ||
2873 | #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) | ||
2874 | |||
2875 | static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { | ||
2876 | PENDING_TAIL, | ||
2877 | }; | ||
2878 | |||
2879 | static void perf_pending_queue(struct perf_pending_entry *entry, | ||
2880 | void (*func)(struct perf_pending_entry *)) | ||
2881 | { | ||
2882 | struct perf_pending_entry **head; | ||
2883 | |||
2884 | if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) | ||
2885 | return; | ||
2886 | |||
2887 | entry->func = func; | ||
2888 | |||
2889 | head = &get_cpu_var(perf_pending_head); | ||
2890 | |||
2891 | do { | ||
2892 | entry->next = *head; | ||
2893 | } while (cmpxchg(head, entry->next, entry) != entry->next); | ||
2894 | |||
2895 | set_perf_event_pending(); | ||
2896 | |||
2897 | put_cpu_var(perf_pending_head); | ||
2898 | } | ||
2899 | |||
2900 | static int __perf_pending_run(void) | ||
2901 | { | ||
2902 | struct perf_pending_entry *list; | ||
2903 | int nr = 0; | ||
2904 | |||
2905 | list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); | ||
2906 | while (list != PENDING_TAIL) { | ||
2907 | void (*func)(struct perf_pending_entry *); | ||
2908 | struct perf_pending_entry *entry = list; | ||
2909 | |||
2910 | list = list->next; | ||
2911 | |||
2912 | func = entry->func; | ||
2913 | entry->next = NULL; | ||
2914 | /* | ||
2915 | * Ensure we observe the unqueue before we issue the wakeup, | ||
2916 | * so that we won't be waiting forever. | ||
2917 | * -- see perf_not_pending(). | ||
2918 | */ | ||
2919 | smp_wmb(); | ||
2920 | |||
2921 | func(entry); | ||
2922 | nr++; | ||
2923 | } | ||
2924 | |||
2925 | return nr; | ||
2926 | } | ||
2927 | |||
2928 | static inline int perf_not_pending(struct perf_event *event) | ||
2929 | { | ||
2930 | /* | ||
2931 | * If we flush on whatever cpu we run, there is a chance we don't | ||
2932 | * need to wait. | ||
2933 | */ | ||
2934 | get_cpu(); | ||
2935 | __perf_pending_run(); | ||
2936 | put_cpu(); | ||
2937 | |||
2938 | /* | ||
2939 | * Ensure we see the proper queue state before going to sleep | ||
2940 | * so that we do not miss the wakeup. -- see perf_pending_handle() | ||
2941 | */ | ||
2942 | smp_rmb(); | ||
2943 | return event->pending.next == NULL; | ||
2944 | } | ||
2945 | |||
2946 | static void perf_pending_sync(struct perf_event *event) | ||
2947 | { | ||
2948 | wait_event(event->waitq, perf_not_pending(event)); | ||
2949 | } | ||
2950 | |||
2951 | void perf_event_do_pending(void) | ||
2952 | { | ||
2953 | __perf_pending_run(); | ||
2954 | } | ||
2955 | |||
2956 | /* | ||
2957 | * Callchain support -- arch specific | ||
2958 | */ | ||
2959 | |||
2960 | __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
2961 | { | ||
2962 | return NULL; | ||
2963 | } | ||
2964 | |||
2965 | |||
2966 | /* | 3874 | /* |
2967 | * We assume there is only KVM supporting the callbacks. | 3875 | * We assume there is only KVM supporting the callbacks. |
2968 | * Later on, we might change it to a list if there is | 3876 | * Later on, we might change it to a list if there is |
@@ -3012,8 +3920,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) | |||
3012 | 3920 | ||
3013 | if (handle->nmi) { | 3921 | if (handle->nmi) { |
3014 | handle->event->pending_wakeup = 1; | 3922 | handle->event->pending_wakeup = 1; |
3015 | perf_pending_queue(&handle->event->pending, | 3923 | irq_work_queue(&handle->event->pending); |
3016 | perf_pending_event); | ||
3017 | } else | 3924 | } else |
3018 | perf_event_wakeup(handle->event); | 3925 | perf_event_wakeup(handle->event); |
3019 | } | 3926 | } |
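The wakeup path above now queues event->pending through the generic irq_work layer instead of the hand-rolled NMI-safe list removed in the previous hunk (perf_pending_queue() and friends), and free_event() waits with irq_work_sync(). The core of what the removed code did is the classic lock-free push onto a singly linked list; a standalone C11 rendition for reference (the PENDING_TAIL double-enqueue guard is omitted):

#include <stdatomic.h>
#include <stdio.h>

struct entry {
        struct entry *next;
        int id;
};

static _Atomic(struct entry *) head;    /* like the per-cpu perf_pending_head */

/* NMI-safe push: retry the compare-and-swap until we link ourselves in. */
static void push(struct entry *e)
{
        struct entry *old = atomic_load(&head);

        do {
                e->next = old;
        } while (!atomic_compare_exchange_weak(&head, &old, e));
}

int main(void)
{
        struct entry a = { .id = 1 }, b = { .id = 2 };

        push(&a);
        push(&b);
        for (struct entry *e = atomic_load(&head); e; e = e->next)
                printf("pending entry %d\n", e->id);
        return 0;
}

Moving to irq_work lets several subsystems share one such queue and its flush hook rather than each keeping a private cmpxchg list.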
@@ -3069,7 +3976,7 @@ again: | |||
3069 | if (handle->wakeup != local_read(&buffer->wakeup)) | 3976 | if (handle->wakeup != local_read(&buffer->wakeup)) |
3070 | perf_output_wakeup(handle); | 3977 | perf_output_wakeup(handle); |
3071 | 3978 | ||
3072 | out: | 3979 | out: |
3073 | preempt_enable(); | 3980 | preempt_enable(); |
3074 | } | 3981 | } |
3075 | 3982 | ||
@@ -3096,6 +4003,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, | |||
3096 | } while (len); | 4003 | } while (len); |
3097 | } | 4004 | } |
3098 | 4005 | ||
4006 | static void __perf_event_header__init_id(struct perf_event_header *header, | ||
4007 | struct perf_sample_data *data, | ||
4008 | struct perf_event *event) | ||
4009 | { | ||
4010 | u64 sample_type = event->attr.sample_type; | ||
4011 | |||
4012 | data->type = sample_type; | ||
4013 | header->size += event->id_header_size; | ||
4014 | |||
4015 | if (sample_type & PERF_SAMPLE_TID) { | ||
4016 | /* namespace issues */ | ||
4017 | data->tid_entry.pid = perf_event_pid(event, current); | ||
4018 | data->tid_entry.tid = perf_event_tid(event, current); | ||
4019 | } | ||
4020 | |||
4021 | if (sample_type & PERF_SAMPLE_TIME) | ||
4022 | data->time = perf_clock(); | ||
4023 | |||
4024 | if (sample_type & PERF_SAMPLE_ID) | ||
4025 | data->id = primary_event_id(event); | ||
4026 | |||
4027 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
4028 | data->stream_id = event->id; | ||
4029 | |||
4030 | if (sample_type & PERF_SAMPLE_CPU) { | ||
4031 | data->cpu_entry.cpu = raw_smp_processor_id(); | ||
4032 | data->cpu_entry.reserved = 0; | ||
4033 | } | ||
4034 | } | ||
4035 | |||
4036 | static void perf_event_header__init_id(struct perf_event_header *header, | ||
4037 | struct perf_sample_data *data, | ||
4038 | struct perf_event *event) | ||
4039 | { | ||
4040 | if (event->attr.sample_id_all) | ||
4041 | __perf_event_header__init_id(header, data, event); | ||
4042 | } | ||
4043 | |||
4044 | static void __perf_event__output_id_sample(struct perf_output_handle *handle, | ||
4045 | struct perf_sample_data *data) | ||
4046 | { | ||
4047 | u64 sample_type = data->type; | ||
4048 | |||
4049 | if (sample_type & PERF_SAMPLE_TID) | ||
4050 | perf_output_put(handle, data->tid_entry); | ||
4051 | |||
4052 | if (sample_type & PERF_SAMPLE_TIME) | ||
4053 | perf_output_put(handle, data->time); | ||
4054 | |||
4055 | if (sample_type & PERF_SAMPLE_ID) | ||
4056 | perf_output_put(handle, data->id); | ||
4057 | |||
4058 | if (sample_type & PERF_SAMPLE_STREAM_ID) | ||
4059 | perf_output_put(handle, data->stream_id); | ||
4060 | |||
4061 | if (sample_type & PERF_SAMPLE_CPU) | ||
4062 | perf_output_put(handle, data->cpu_entry); | ||
4063 | } | ||
4064 | |||
4065 | static void perf_event__output_id_sample(struct perf_event *event, | ||
4066 | struct perf_output_handle *handle, | ||
4067 | struct perf_sample_data *sample) | ||
4068 | { | ||
4069 | if (event->attr.sample_id_all) | ||
4070 | __perf_event__output_id_sample(handle, sample); | ||
4071 | } | ||
4072 | |||
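perf_event_header__init_id() and perf_event__output_id_sample() let non-sample records carry the same trailing identifier fields as samples whenever attr.sample_id_all is set; the sample_type bitmask decides which fields are emitted and in what order, and header->size is grown by the matching amount before the record is written. A condensed sketch of such a bitmask-driven trailer (field names simplified, STREAM_ID dropped for brevity):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SAMPLE_TID   (1u << 0)
#define SAMPLE_TIME  (1u << 1)
#define SAMPLE_ID    (1u << 2)
#define SAMPLE_CPU   (1u << 3)

/* Append the optional identifier fields selected in `type`, in a fixed
 * order, and report how many bytes that added to the record. */
static size_t write_id_trailer(uint8_t *buf, uint32_t type,
                               uint64_t tid, uint64_t time,
                               uint64_t id, uint64_t cpu)
{
        size_t off = 0;
        const struct { uint32_t bit; uint64_t val; } fields[] = {
                { SAMPLE_TID, tid }, { SAMPLE_TIME, time },
                { SAMPLE_ID, id },   { SAMPLE_CPU, cpu },
        };

        for (size_t i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
                if (type & fields[i].bit) {
                        memcpy(buf + off, &fields[i].val, sizeof(uint64_t));
                        off += sizeof(uint64_t);
                }
        }
        return off;     /* caller adds this to header.size before writing */
}

int main(void)
{
        uint8_t buf[64];
        size_t n = write_id_trailer(buf, SAMPLE_TID | SAMPLE_CPU, 1234, 0, 0, 3);

        printf("trailer uses %zu bytes\n", n);
        return 0;
}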
3099 | int perf_output_begin(struct perf_output_handle *handle, | 4073 | int perf_output_begin(struct perf_output_handle *handle, |
3100 | struct perf_event *event, unsigned int size, | 4074 | struct perf_event *event, unsigned int size, |
3101 | int nmi, int sample) | 4075 | int nmi, int sample) |
@@ -3103,6 +4077,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3103 | struct perf_buffer *buffer; | 4077 | struct perf_buffer *buffer; |
3104 | unsigned long tail, offset, head; | 4078 | unsigned long tail, offset, head; |
3105 | int have_lost; | 4079 | int have_lost; |
4080 | struct perf_sample_data sample_data; | ||
3106 | struct { | 4081 | struct { |
3107 | struct perf_event_header header; | 4082 | struct perf_event_header header; |
3108 | u64 id; | 4083 | u64 id; |
@@ -3129,8 +4104,12 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3129 | goto out; | 4104 | goto out; |
3130 | 4105 | ||
3131 | have_lost = local_read(&buffer->lost); | 4106 | have_lost = local_read(&buffer->lost); |
3132 | if (have_lost) | 4107 | if (have_lost) { |
3133 | size += sizeof(lost_event); | 4108 | lost_event.header.size = sizeof(lost_event); |
4109 | perf_event_header__init_id(&lost_event.header, &sample_data, | ||
4110 | event); | ||
4111 | size += lost_event.header.size; | ||
4112 | } | ||
3134 | 4113 | ||
3135 | perf_output_get_handle(handle); | 4114 | perf_output_get_handle(handle); |
3136 | 4115 | ||
@@ -3161,11 +4140,11 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3161 | if (have_lost) { | 4140 | if (have_lost) { |
3162 | lost_event.header.type = PERF_RECORD_LOST; | 4141 | lost_event.header.type = PERF_RECORD_LOST; |
3163 | lost_event.header.misc = 0; | 4142 | lost_event.header.misc = 0; |
3164 | lost_event.header.size = sizeof(lost_event); | ||
3165 | lost_event.id = event->id; | 4143 | lost_event.id = event->id; |
3166 | lost_event.lost = local_xchg(&buffer->lost, 0); | 4144 | lost_event.lost = local_xchg(&buffer->lost, 0); |
3167 | 4145 | ||
3168 | perf_output_put(handle, lost_event); | 4146 | perf_output_put(handle, lost_event); |
4147 | perf_event__output_id_sample(event, handle, &sample_data); | ||
3169 | } | 4148 | } |
3170 | 4149 | ||
3171 | return 0; | 4150 | return 0; |
@@ -3198,30 +4177,9 @@ void perf_output_end(struct perf_output_handle *handle) | |||
3198 | rcu_read_unlock(); | 4177 | rcu_read_unlock(); |
3199 | } | 4178 | } |
3200 | 4179 | ||
3201 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | ||
3202 | { | ||
3203 | /* | ||
3204 | * only top level events have the pid namespace they were created in | ||
3205 | */ | ||
3206 | if (event->parent) | ||
3207 | event = event->parent; | ||
3208 | |||
3209 | return task_tgid_nr_ns(p, event->ns); | ||
3210 | } | ||
3211 | |||
3212 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) | ||
3213 | { | ||
3214 | /* | ||
3215 | * only top level events have the pid namespace they were created in | ||
3216 | */ | ||
3217 | if (event->parent) | ||
3218 | event = event->parent; | ||
3219 | |||
3220 | return task_pid_nr_ns(p, event->ns); | ||
3221 | } | ||
3222 | |||
3223 | static void perf_output_read_one(struct perf_output_handle *handle, | 4180 | static void perf_output_read_one(struct perf_output_handle *handle, |
3224 | struct perf_event *event) | 4181 | struct perf_event *event, |
4182 | u64 enabled, u64 running) | ||
3225 | { | 4183 | { |
3226 | u64 read_format = event->attr.read_format; | 4184 | u64 read_format = event->attr.read_format; |
3227 | u64 values[4]; | 4185 | u64 values[4]; |
@@ -3229,11 +4187,11 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
3229 | 4187 | ||
3230 | values[n++] = perf_event_count(event); | 4188 | values[n++] = perf_event_count(event); |
3231 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | 4189 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { |
3232 | values[n++] = event->total_time_enabled + | 4190 | values[n++] = enabled + |
3233 | atomic64_read(&event->child_total_time_enabled); | 4191 | atomic64_read(&event->child_total_time_enabled); |
3234 | } | 4192 | } |
3235 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { | 4193 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { |
3236 | values[n++] = event->total_time_running + | 4194 | values[n++] = running + |
3237 | atomic64_read(&event->child_total_time_running); | 4195 | atomic64_read(&event->child_total_time_running); |
3238 | } | 4196 | } |
3239 | if (read_format & PERF_FORMAT_ID) | 4197 | if (read_format & PERF_FORMAT_ID) |
@@ -3246,7 +4204,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
3246 | * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. | 4204 | * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. |
3247 | */ | 4205 | */ |
3248 | static void perf_output_read_group(struct perf_output_handle *handle, | 4206 | static void perf_output_read_group(struct perf_output_handle *handle, |
3249 | struct perf_event *event) | 4207 | struct perf_event *event, |
4208 | u64 enabled, u64 running) | ||
3250 | { | 4209 | { |
3251 | struct perf_event *leader = event->group_leader, *sub; | 4210 | struct perf_event *leader = event->group_leader, *sub; |
3252 | u64 read_format = event->attr.read_format; | 4211 | u64 read_format = event->attr.read_format; |
@@ -3256,10 +4215,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
3256 | values[n++] = 1 + leader->nr_siblings; | 4215 | values[n++] = 1 + leader->nr_siblings; |
3257 | 4216 | ||
3258 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | 4217 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) |
3259 | values[n++] = leader->total_time_enabled; | 4218 | values[n++] = enabled; |
3260 | 4219 | ||
3261 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | 4220 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) |
3262 | values[n++] = leader->total_time_running; | 4221 | values[n++] = running; |
3263 | 4222 | ||
3264 | if (leader != event) | 4223 | if (leader != event) |
3265 | leader->pmu->read(leader); | 4224 | leader->pmu->read(leader); |
@@ -3284,13 +4243,35 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
3284 | } | 4243 | } |
3285 | } | 4244 | } |
3286 | 4245 | ||
4246 | #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ | ||
4247 | PERF_FORMAT_TOTAL_TIME_RUNNING) | ||
4248 | |||
3287 | static void perf_output_read(struct perf_output_handle *handle, | 4249 | static void perf_output_read(struct perf_output_handle *handle, |
3288 | struct perf_event *event) | 4250 | struct perf_event *event) |
3289 | { | 4251 | { |
4252 | u64 enabled = 0, running = 0, now, ctx_time; | ||
4253 | u64 read_format = event->attr.read_format; | ||
4254 | |||
4255 | /* | ||
4256 | * compute total_time_enabled, total_time_running | ||
4257 | * based on snapshot values taken when the event | ||
4258 | * was last scheduled in. | ||
4259 | * | ||
4260 | * we cannot simply call update_context_time() | ||
4261 | * because of locking issues, as we are called in | ||
4262 | * NMI context | ||
4263 | */ | ||
4264 | if (read_format & PERF_FORMAT_TOTAL_TIMES) { | ||
4265 | now = perf_clock(); | ||
4266 | ctx_time = event->shadow_ctx_time + now; | ||
4267 | enabled = ctx_time - event->tstamp_enabled; | ||
4268 | running = ctx_time - event->tstamp_running; | ||
4269 | } | ||
4270 | |||
3290 | if (event->attr.read_format & PERF_FORMAT_GROUP) | 4271 | if (event->attr.read_format & PERF_FORMAT_GROUP) |
3291 | perf_output_read_group(handle, event); | 4272 | perf_output_read_group(handle, event, enabled, running); |
3292 | else | 4273 | else |
3293 | perf_output_read_one(handle, event); | 4274 | perf_output_read_one(handle, event, enabled, running); |
3294 | } | 4275 | } |
3295 | 4276 | ||
3296 | void perf_output_sample(struct perf_output_handle *handle, | 4277 | void perf_output_sample(struct perf_output_handle *handle, |
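
perf_output_read() now reconstructs total_time_enabled/total_time_running from event->shadow_ctx_time because it can be reached from NMI context, where update_context_time() and ctx->lock are off limits. For reference, when PERF_FORMAT_GROUP is clear the values written by perf_output_read_one() form the following record body (an illustrative userspace view; struct name assumed, fields present only when the matching read_format bit is set):

    /* Illustrative body written by perf_output_read_one() (name assumed). */
    struct read_format_one {
            __u64   value;          /* perf_event_count()              */
            __u64   time_enabled;   /* PERF_FORMAT_TOTAL_TIME_ENABLED  */
            __u64   time_running;   /* PERF_FORMAT_TOTAL_TIME_RUNNING  */
            __u64   id;             /* PERF_FORMAT_ID                  */
    };
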
@@ -3370,61 +4351,16 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
3370 | { | 4351 | { |
3371 | u64 sample_type = event->attr.sample_type; | 4352 | u64 sample_type = event->attr.sample_type; |
3372 | 4353 | ||
3373 | data->type = sample_type; | ||
3374 | |||
3375 | header->type = PERF_RECORD_SAMPLE; | 4354 | header->type = PERF_RECORD_SAMPLE; |
3376 | header->size = sizeof(*header); | 4355 | header->size = sizeof(*header) + event->header_size; |
3377 | 4356 | ||
3378 | header->misc = 0; | 4357 | header->misc = 0; |
3379 | header->misc |= perf_misc_flags(regs); | 4358 | header->misc |= perf_misc_flags(regs); |
3380 | 4359 | ||
3381 | if (sample_type & PERF_SAMPLE_IP) { | 4360 | __perf_event_header__init_id(header, data, event); |
3382 | data->ip = perf_instruction_pointer(regs); | ||
3383 | |||
3384 | header->size += sizeof(data->ip); | ||
3385 | } | ||
3386 | |||
3387 | if (sample_type & PERF_SAMPLE_TID) { | ||
3388 | /* namespace issues */ | ||
3389 | data->tid_entry.pid = perf_event_pid(event, current); | ||
3390 | data->tid_entry.tid = perf_event_tid(event, current); | ||
3391 | |||
3392 | header->size += sizeof(data->tid_entry); | ||
3393 | } | ||
3394 | |||
3395 | if (sample_type & PERF_SAMPLE_TIME) { | ||
3396 | data->time = perf_clock(); | ||
3397 | |||
3398 | header->size += sizeof(data->time); | ||
3399 | } | ||
3400 | |||
3401 | if (sample_type & PERF_SAMPLE_ADDR) | ||
3402 | header->size += sizeof(data->addr); | ||
3403 | |||
3404 | if (sample_type & PERF_SAMPLE_ID) { | ||
3405 | data->id = primary_event_id(event); | ||
3406 | |||
3407 | header->size += sizeof(data->id); | ||
3408 | } | ||
3409 | |||
3410 | if (sample_type & PERF_SAMPLE_STREAM_ID) { | ||
3411 | data->stream_id = event->id; | ||
3412 | |||
3413 | header->size += sizeof(data->stream_id); | ||
3414 | } | ||
3415 | |||
3416 | if (sample_type & PERF_SAMPLE_CPU) { | ||
3417 | data->cpu_entry.cpu = raw_smp_processor_id(); | ||
3418 | data->cpu_entry.reserved = 0; | ||
3419 | 4361 | ||
3420 | header->size += sizeof(data->cpu_entry); | 4362 | if (sample_type & PERF_SAMPLE_IP) |
3421 | } | 4363 | data->ip = perf_instruction_pointer(regs); |
3422 | |||
3423 | if (sample_type & PERF_SAMPLE_PERIOD) | ||
3424 | header->size += sizeof(data->period); | ||
3425 | |||
3426 | if (sample_type & PERF_SAMPLE_READ) | ||
3427 | header->size += perf_event_read_size(event); | ||
3428 | 4364 | ||
3429 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { | 4365 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { |
3430 | int size = 1; | 4366 | int size = 1; |
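
perf_prepare_sample() no longer walks the fixed-size sample_type bits to grow header->size; that part is assumed to be precomputed per event and cached in event->header_size (with the ID fields in event->id_header_size), leaving only the variable-length callchain and raw data to be sized here. A sketch of what such a precomputation could look like (helper name and placement are assumptions, not part of this hunk):

    /* Sketch of a per-event precomputation of the fixed sample size. */
    static void perf_event__header_size(struct perf_event *event)
    {
            u64 sample_type = event->attr.sample_type;
            u16 size = 0;

            if (sample_type & PERF_SAMPLE_IP)
                    size += sizeof(u64);            /* data->ip     */
            if (sample_type & PERF_SAMPLE_ADDR)
                    size += sizeof(u64);            /* data->addr   */
            if (sample_type & PERF_SAMPLE_PERIOD)
                    size += sizeof(u64);            /* data->period */
            if (sample_type & PERF_SAMPLE_READ)
                    size += event->read_size;       /* read body    */

            event->header_size = size;
    }
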
@@ -3457,14 +4393,20 @@ static void perf_event_output(struct perf_event *event, int nmi, | |||
3457 | struct perf_output_handle handle; | 4393 | struct perf_output_handle handle; |
3458 | struct perf_event_header header; | 4394 | struct perf_event_header header; |
3459 | 4395 | ||
4396 | /* protect the callchain buffers */ | ||
4397 | rcu_read_lock(); | ||
4398 | |||
3460 | perf_prepare_sample(&header, data, event, regs); | 4399 | perf_prepare_sample(&header, data, event, regs); |
3461 | 4400 | ||
3462 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) | 4401 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) |
3463 | return; | 4402 | goto exit; |
3464 | 4403 | ||
3465 | perf_output_sample(&handle, &header, data, event); | 4404 | perf_output_sample(&handle, &header, data, event); |
3466 | 4405 | ||
3467 | perf_output_end(&handle); | 4406 | perf_output_end(&handle); |
4407 | |||
4408 | exit: | ||
4409 | rcu_read_unlock(); | ||
3468 | } | 4410 | } |
3469 | 4411 | ||
3470 | /* | 4412 | /* |
@@ -3483,23 +4425,26 @@ perf_event_read_event(struct perf_event *event, | |||
3483 | struct task_struct *task) | 4425 | struct task_struct *task) |
3484 | { | 4426 | { |
3485 | struct perf_output_handle handle; | 4427 | struct perf_output_handle handle; |
4428 | struct perf_sample_data sample; | ||
3486 | struct perf_read_event read_event = { | 4429 | struct perf_read_event read_event = { |
3487 | .header = { | 4430 | .header = { |
3488 | .type = PERF_RECORD_READ, | 4431 | .type = PERF_RECORD_READ, |
3489 | .misc = 0, | 4432 | .misc = 0, |
3490 | .size = sizeof(read_event) + perf_event_read_size(event), | 4433 | .size = sizeof(read_event) + event->read_size, |
3491 | }, | 4434 | }, |
3492 | .pid = perf_event_pid(event, task), | 4435 | .pid = perf_event_pid(event, task), |
3493 | .tid = perf_event_tid(event, task), | 4436 | .tid = perf_event_tid(event, task), |
3494 | }; | 4437 | }; |
3495 | int ret; | 4438 | int ret; |
3496 | 4439 | ||
4440 | perf_event_header__init_id(&read_event.header, &sample, event); | ||
3497 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); | 4441 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); |
3498 | if (ret) | 4442 | if (ret) |
3499 | return; | 4443 | return; |
3500 | 4444 | ||
3501 | perf_output_put(&handle, read_event); | 4445 | perf_output_put(&handle, read_event); |
3502 | perf_output_read(&handle, event); | 4446 | perf_output_read(&handle, event); |
4447 | perf_event__output_id_sample(event, &handle, &sample); | ||
3503 | 4448 | ||
3504 | perf_output_end(&handle); | 4449 | perf_output_end(&handle); |
3505 | } | 4450 | } |
@@ -3529,14 +4474,16 @@ static void perf_event_task_output(struct perf_event *event, | |||
3529 | struct perf_task_event *task_event) | 4474 | struct perf_task_event *task_event) |
3530 | { | 4475 | { |
3531 | struct perf_output_handle handle; | 4476 | struct perf_output_handle handle; |
4477 | struct perf_sample_data sample; | ||
3532 | struct task_struct *task = task_event->task; | 4478 | struct task_struct *task = task_event->task; |
3533 | int size, ret; | 4479 | int ret, size = task_event->event_id.header.size; |
3534 | 4480 | ||
3535 | size = task_event->event_id.header.size; | 4481 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
3536 | ret = perf_output_begin(&handle, event, size, 0, 0); | ||
3537 | 4482 | ||
4483 | ret = perf_output_begin(&handle, event, | ||
4484 | task_event->event_id.header.size, 0, 0); | ||
3538 | if (ret) | 4485 | if (ret) |
3539 | return; | 4486 | goto out; |
3540 | 4487 | ||
3541 | task_event->event_id.pid = perf_event_pid(event, task); | 4488 | task_event->event_id.pid = perf_event_pid(event, task); |
3542 | task_event->event_id.ppid = perf_event_pid(event, current); | 4489 | task_event->event_id.ppid = perf_event_pid(event, current); |
@@ -3546,7 +4493,11 @@ static void perf_event_task_output(struct perf_event *event, | |||
3546 | 4493 | ||
3547 | perf_output_put(&handle, task_event->event_id); | 4494 | perf_output_put(&handle, task_event->event_id); |
3548 | 4495 | ||
4496 | perf_event__output_id_sample(event, &handle, &sample); | ||
4497 | |||
3549 | perf_output_end(&handle); | 4498 | perf_output_end(&handle); |
4499 | out: | ||
4500 | task_event->event_id.header.size = size; | ||
3550 | } | 4501 | } |
3551 | 4502 | ||
3552 | static int perf_event_task_match(struct perf_event *event) | 4503 | static int perf_event_task_match(struct perf_event *event) |
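
Note the save/restore of event_id.header.size around the body: the same perf_task_event is handed to every matching event, and perf_event_header__init_id() grows header.size by that event's id_header_size, so the base size has to be put back before the next event is visited. The comm and mmap emitters below use the same idiom; condensed into a sketch (struct shared_record and output_shared() are hypothetical):

    /* Sketch of the reuse idiom for a record shared across events. */
    struct shared_record {
            struct perf_event_header        header;
            u64                             payload;
    };

    static void output_shared(struct perf_event *event,
                              struct shared_record *rec)
    {
            struct perf_output_handle handle;
            struct perf_sample_data sample;
            int size = rec->header.size;            /* remember base size */

            perf_event_header__init_id(&rec->header, &sample, event);

            if (perf_output_begin(&handle, event, rec->header.size, 0, 0))
                    goto out;

            perf_output_put(&handle, *rec);
            perf_event__output_id_sample(event, &handle, &sample);
            perf_output_end(&handle);
    out:
            rec->header.size = size;                /* undo per-event growth */
    }
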
@@ -3554,7 +4505,7 @@ static int perf_event_task_match(struct perf_event *event) | |||
3554 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4505 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3555 | return 0; | 4506 | return 0; |
3556 | 4507 | ||
3557 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4508 | if (!event_filter_match(event)) |
3558 | return 0; | 4509 | return 0; |
3559 | 4510 | ||
3560 | if (event->attr.comm || event->attr.mmap || | 4511 | if (event->attr.comm || event->attr.mmap || |
@@ -3578,16 +4529,29 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, | |||
3578 | static void perf_event_task_event(struct perf_task_event *task_event) | 4529 | static void perf_event_task_event(struct perf_task_event *task_event) |
3579 | { | 4530 | { |
3580 | struct perf_cpu_context *cpuctx; | 4531 | struct perf_cpu_context *cpuctx; |
3581 | struct perf_event_context *ctx = task_event->task_ctx; | 4532 | struct perf_event_context *ctx; |
4533 | struct pmu *pmu; | ||
4534 | int ctxn; | ||
3582 | 4535 | ||
3583 | rcu_read_lock(); | 4536 | rcu_read_lock(); |
3584 | cpuctx = &get_cpu_var(perf_cpu_context); | 4537 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3585 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 4538 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3586 | if (!ctx) | 4539 | if (cpuctx->active_pmu != pmu) |
3587 | ctx = rcu_dereference(current->perf_event_ctxp); | 4540 | goto next; |
3588 | if (ctx) | 4541 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
3589 | perf_event_task_ctx(ctx, task_event); | 4542 | |
3590 | put_cpu_var(perf_cpu_context); | 4543 | ctx = task_event->task_ctx; |
4544 | if (!ctx) { | ||
4545 | ctxn = pmu->task_ctx_nr; | ||
4546 | if (ctxn < 0) | ||
4547 | goto next; | ||
4548 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
4549 | } | ||
4550 | if (ctx) | ||
4551 | perf_event_task_ctx(ctx, task_event); | ||
4552 | next: | ||
4553 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4554 | } | ||
3591 | rcu_read_unlock(); | 4555 | rcu_read_unlock(); |
3592 | } | 4556 | } |
3593 | 4557 | ||
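
With per-PMU contexts there is no single perf_cpu_context left to poke; side-band events now walk the global pmus list under RCU, visit each PMU's per-cpu context, and then the current task's context for that PMU class, selected by pmu->task_ctx_nr. The comm and mmap paths below repeat the same walk; its skeleton, with visit() standing in for the per-context matcher (perf_event_task_ctx() here) and local declarations omitted:

    /* Skeleton of the per-PMU context walk used by side-band events. */
    rcu_read_lock();
    list_for_each_entry_rcu(pmu, &pmus, entry) {
            cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
            if (cpuctx->active_pmu != pmu)  /* skip aliased contexts */
                    goto next;
            visit(&cpuctx->ctx);            /* per-cpu context */

            ctxn = pmu->task_ctx_nr;
            if (ctxn < 0)                   /* PMU has no task context */
                    goto next;
            ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
            if (ctx)
                    visit(ctx);             /* per-task context */
    next:
            put_cpu_ptr(pmu->pmu_cpu_context);
    }
    rcu_read_unlock();
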
@@ -3648,11 +4612,16 @@ static void perf_event_comm_output(struct perf_event *event, | |||
3648 | struct perf_comm_event *comm_event) | 4612 | struct perf_comm_event *comm_event) |
3649 | { | 4613 | { |
3650 | struct perf_output_handle handle; | 4614 | struct perf_output_handle handle; |
4615 | struct perf_sample_data sample; | ||
3651 | int size = comm_event->event_id.header.size; | 4616 | int size = comm_event->event_id.header.size; |
3652 | int ret = perf_output_begin(&handle, event, size, 0, 0); | 4617 | int ret; |
4618 | |||
4619 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | ||
4620 | ret = perf_output_begin(&handle, event, | ||
4621 | comm_event->event_id.header.size, 0, 0); | ||
3653 | 4622 | ||
3654 | if (ret) | 4623 | if (ret) |
3655 | return; | 4624 | goto out; |
3656 | 4625 | ||
3657 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); | 4626 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); |
3658 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); | 4627 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); |
@@ -3660,7 +4629,12 @@ static void perf_event_comm_output(struct perf_event *event, | |||
3660 | perf_output_put(&handle, comm_event->event_id); | 4629 | perf_output_put(&handle, comm_event->event_id); |
3661 | perf_output_copy(&handle, comm_event->comm, | 4630 | perf_output_copy(&handle, comm_event->comm, |
3662 | comm_event->comm_size); | 4631 | comm_event->comm_size); |
4632 | |||
4633 | perf_event__output_id_sample(event, &handle, &sample); | ||
4634 | |||
3663 | perf_output_end(&handle); | 4635 | perf_output_end(&handle); |
4636 | out: | ||
4637 | comm_event->event_id.header.size = size; | ||
3664 | } | 4638 | } |
3665 | 4639 | ||
3666 | static int perf_event_comm_match(struct perf_event *event) | 4640 | static int perf_event_comm_match(struct perf_event *event) |
@@ -3668,7 +4642,7 @@ static int perf_event_comm_match(struct perf_event *event) | |||
3668 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4642 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3669 | return 0; | 4643 | return 0; |
3670 | 4644 | ||
3671 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4645 | if (!event_filter_match(event)) |
3672 | return 0; | 4646 | return 0; |
3673 | 4647 | ||
3674 | if (event->attr.comm) | 4648 | if (event->attr.comm) |
@@ -3692,8 +4666,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3692 | { | 4666 | { |
3693 | struct perf_cpu_context *cpuctx; | 4667 | struct perf_cpu_context *cpuctx; |
3694 | struct perf_event_context *ctx; | 4668 | struct perf_event_context *ctx; |
3695 | unsigned int size; | ||
3696 | char comm[TASK_COMM_LEN]; | 4669 | char comm[TASK_COMM_LEN]; |
4670 | unsigned int size; | ||
4671 | struct pmu *pmu; | ||
4672 | int ctxn; | ||
3697 | 4673 | ||
3698 | memset(comm, 0, sizeof(comm)); | 4674 | memset(comm, 0, sizeof(comm)); |
3699 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); | 4675 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); |
@@ -3703,23 +4679,39 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3703 | comm_event->comm_size = size; | 4679 | comm_event->comm_size = size; |
3704 | 4680 | ||
3705 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 4681 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
3706 | |||
3707 | rcu_read_lock(); | 4682 | rcu_read_lock(); |
3708 | cpuctx = &get_cpu_var(perf_cpu_context); | 4683 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3709 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | 4684 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3710 | ctx = rcu_dereference(current->perf_event_ctxp); | 4685 | if (cpuctx->active_pmu != pmu) |
3711 | if (ctx) | 4686 | goto next; |
3712 | perf_event_comm_ctx(ctx, comm_event); | 4687 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
3713 | put_cpu_var(perf_cpu_context); | 4688 | |
4689 | ctxn = pmu->task_ctx_nr; | ||
4690 | if (ctxn < 0) | ||
4691 | goto next; | ||
4692 | |||
4693 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
4694 | if (ctx) | ||
4695 | perf_event_comm_ctx(ctx, comm_event); | ||
4696 | next: | ||
4697 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4698 | } | ||
3714 | rcu_read_unlock(); | 4699 | rcu_read_unlock(); |
3715 | } | 4700 | } |
3716 | 4701 | ||
3717 | void perf_event_comm(struct task_struct *task) | 4702 | void perf_event_comm(struct task_struct *task) |
3718 | { | 4703 | { |
3719 | struct perf_comm_event comm_event; | 4704 | struct perf_comm_event comm_event; |
4705 | struct perf_event_context *ctx; | ||
4706 | int ctxn; | ||
4707 | |||
4708 | for_each_task_context_nr(ctxn) { | ||
4709 | ctx = task->perf_event_ctxp[ctxn]; | ||
4710 | if (!ctx) | ||
4711 | continue; | ||
3720 | 4712 | ||
3721 | if (task->perf_event_ctxp) | 4713 | perf_event_enable_on_exec(ctx); |
3722 | perf_event_enable_on_exec(task); | 4714 | } |
3723 | 4715 | ||
3724 | if (!atomic_read(&nr_comm_events)) | 4716 | if (!atomic_read(&nr_comm_events)) |
3725 | return; | 4717 | return; |
@@ -3767,11 +4759,15 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
3767 | struct perf_mmap_event *mmap_event) | 4759 | struct perf_mmap_event *mmap_event) |
3768 | { | 4760 | { |
3769 | struct perf_output_handle handle; | 4761 | struct perf_output_handle handle; |
4762 | struct perf_sample_data sample; | ||
3770 | int size = mmap_event->event_id.header.size; | 4763 | int size = mmap_event->event_id.header.size; |
3771 | int ret = perf_output_begin(&handle, event, size, 0, 0); | 4764 | int ret; |
3772 | 4765 | ||
4766 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | ||
4767 | ret = perf_output_begin(&handle, event, | ||
4768 | mmap_event->event_id.header.size, 0, 0); | ||
3773 | if (ret) | 4769 | if (ret) |
3774 | return; | 4770 | goto out; |
3775 | 4771 | ||
3776 | mmap_event->event_id.pid = perf_event_pid(event, current); | 4772 | mmap_event->event_id.pid = perf_event_pid(event, current); |
3777 | mmap_event->event_id.tid = perf_event_tid(event, current); | 4773 | mmap_event->event_id.tid = perf_event_tid(event, current); |
@@ -3779,7 +4775,12 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
3779 | perf_output_put(&handle, mmap_event->event_id); | 4775 | perf_output_put(&handle, mmap_event->event_id); |
3780 | perf_output_copy(&handle, mmap_event->file_name, | 4776 | perf_output_copy(&handle, mmap_event->file_name, |
3781 | mmap_event->file_size); | 4777 | mmap_event->file_size); |
4778 | |||
4779 | perf_event__output_id_sample(event, &handle, &sample); | ||
4780 | |||
3782 | perf_output_end(&handle); | 4781 | perf_output_end(&handle); |
4782 | out: | ||
4783 | mmap_event->event_id.header.size = size; | ||
3783 | } | 4784 | } |
3784 | 4785 | ||
3785 | static int perf_event_mmap_match(struct perf_event *event, | 4786 | static int perf_event_mmap_match(struct perf_event *event, |
@@ -3789,7 +4790,7 @@ static int perf_event_mmap_match(struct perf_event *event, | |||
3789 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 4790 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3790 | return 0; | 4791 | return 0; |
3791 | 4792 | ||
3792 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4793 | if (!event_filter_match(event)) |
3793 | return 0; | 4794 | return 0; |
3794 | 4795 | ||
3795 | if ((!executable && event->attr.mmap_data) || | 4796 | if ((!executable && event->attr.mmap_data) || |
@@ -3821,6 +4822,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
3821 | char tmp[16]; | 4822 | char tmp[16]; |
3822 | char *buf = NULL; | 4823 | char *buf = NULL; |
3823 | const char *name; | 4824 | const char *name; |
4825 | struct pmu *pmu; | ||
4826 | int ctxn; | ||
3824 | 4827 | ||
3825 | memset(tmp, 0, sizeof(tmp)); | 4828 | memset(tmp, 0, sizeof(tmp)); |
3826 | 4829 | ||
@@ -3873,12 +4876,25 @@ got_name: | |||
3873 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 4876 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
3874 | 4877 | ||
3875 | rcu_read_lock(); | 4878 | rcu_read_lock(); |
3876 | cpuctx = &get_cpu_var(perf_cpu_context); | 4879 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3877 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); | 4880 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3878 | ctx = rcu_dereference(current->perf_event_ctxp); | 4881 | if (cpuctx->active_pmu != pmu) |
3879 | if (ctx) | 4882 | goto next; |
3880 | perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); | 4883 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, |
3881 | put_cpu_var(perf_cpu_context); | 4884 | vma->vm_flags & VM_EXEC); |
4885 | |||
4886 | ctxn = pmu->task_ctx_nr; | ||
4887 | if (ctxn < 0) | ||
4888 | goto next; | ||
4889 | |||
4890 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
4891 | if (ctx) { | ||
4892 | perf_event_mmap_ctx(ctx, mmap_event, | ||
4893 | vma->vm_flags & VM_EXEC); | ||
4894 | } | ||
4895 | next: | ||
4896 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4897 | } | ||
3882 | rcu_read_unlock(); | 4898 | rcu_read_unlock(); |
3883 | 4899 | ||
3884 | kfree(buf); | 4900 | kfree(buf); |
@@ -3919,6 +4935,7 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
3919 | static void perf_log_throttle(struct perf_event *event, int enable) | 4935 | static void perf_log_throttle(struct perf_event *event, int enable) |
3920 | { | 4936 | { |
3921 | struct perf_output_handle handle; | 4937 | struct perf_output_handle handle; |
4938 | struct perf_sample_data sample; | ||
3922 | int ret; | 4939 | int ret; |
3923 | 4940 | ||
3924 | struct { | 4941 | struct { |
@@ -3940,11 +4957,15 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
3940 | if (enable) | 4957 | if (enable) |
3941 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; | 4958 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; |
3942 | 4959 | ||
3943 | ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); | 4960 | perf_event_header__init_id(&throttle_event.header, &sample, event); |
4961 | |||
4962 | ret = perf_output_begin(&handle, event, | ||
4963 | throttle_event.header.size, 1, 0); | ||
3944 | if (ret) | 4964 | if (ret) |
3945 | return; | 4965 | return; |
3946 | 4966 | ||
3947 | perf_output_put(&handle, throttle_event); | 4967 | perf_output_put(&handle, throttle_event); |
4968 | perf_event__output_id_sample(event, &handle, &sample); | ||
3948 | perf_output_end(&handle); | 4969 | perf_output_end(&handle); |
3949 | } | 4970 | } |
3950 | 4971 | ||
@@ -3960,28 +4981,21 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
3960 | struct hw_perf_event *hwc = &event->hw; | 4981 | struct hw_perf_event *hwc = &event->hw; |
3961 | int ret = 0; | 4982 | int ret = 0; |
3962 | 4983 | ||
3963 | throttle = (throttle && event->pmu->unthrottle != NULL); | 4984 | /* |
4985 | * Non-sampling counters might still use the PMI to fold short | ||
4986 | * hardware counters, ignore those. | ||
4987 | */ | ||
4988 | if (unlikely(!is_sampling_event(event))) | ||
4989 | return 0; | ||
3964 | 4990 | ||
3965 | if (!throttle) { | 4991 | if (unlikely(hwc->interrupts >= max_samples_per_tick)) { |
3966 | hwc->interrupts++; | 4992 | if (throttle) { |
3967 | } else { | 4993 | hwc->interrupts = MAX_INTERRUPTS; |
3968 | if (hwc->interrupts != MAX_INTERRUPTS) { | 4994 | perf_log_throttle(event, 0); |
3969 | hwc->interrupts++; | ||
3970 | if (HZ * hwc->interrupts > | ||
3971 | (u64)sysctl_perf_event_sample_rate) { | ||
3972 | hwc->interrupts = MAX_INTERRUPTS; | ||
3973 | perf_log_throttle(event, 0); | ||
3974 | ret = 1; | ||
3975 | } | ||
3976 | } else { | ||
3977 | /* | ||
3978 | * Keep re-disabling events even though on the previous | ||
3979 | * pass we disabled it - just in case we raced with a | ||
3980 | * sched-in and the event got enabled again: | ||
3981 | */ | ||
3982 | ret = 1; | 4995 | ret = 1; |
3983 | } | 4996 | } |
3984 | } | 4997 | } else |
4998 | hwc->interrupts++; | ||
3985 | 4999 | ||
3986 | if (event->attr.freq) { | 5000 | if (event->attr.freq) { |
3987 | u64 now = perf_clock(); | 5001 | u64 now = perf_clock(); |
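
The throttling rework drops the HZ * hwc->interrupts comparison against sysctl_perf_event_sample_rate: non-sampling events bail out early, and a sampling event is throttled once it has taken max_samples_per_tick interrupts in the current tick. The budget is presumably derived from the sample-rate sysctl along these lines (an assumption; the definition is not part of this hunk):

    /* Assumed relation between the sysctl and the per-tick budget. */
    #define DEFAULT_MAX_SAMPLE_RATE 100000
    static int max_samples_per_tick __read_mostly =
            DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
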
@@ -4004,8 +5018,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
4004 | event->pending_kill = POLL_HUP; | 5018 | event->pending_kill = POLL_HUP; |
4005 | if (nmi) { | 5019 | if (nmi) { |
4006 | event->pending_disable = 1; | 5020 | event->pending_disable = 1; |
4007 | perf_pending_queue(&event->pending, | 5021 | irq_work_queue(&event->pending); |
4008 | perf_pending_event); | ||
4009 | } else | 5022 | } else |
4010 | perf_event_disable(event); | 5023 | perf_event_disable(event); |
4011 | } | 5024 | } |
@@ -4015,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
4015 | else | 5028 | else |
4016 | perf_event_output(event, nmi, data, regs); | 5029 | perf_event_output(event, nmi, data, regs); |
4017 | 5030 | ||
5031 | if (event->fasync && event->pending_kill) { | ||
5032 | if (nmi) { | ||
5033 | event->pending_wakeup = 1; | ||
5034 | irq_work_queue(&event->pending); | ||
5035 | } else | ||
5036 | perf_event_wakeup(event); | ||
5037 | } | ||
5038 | |||
4018 | return ret; | 5039 | return ret; |
4019 | } | 5040 | } |
4020 | 5041 | ||
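
Work that must not run in NMI context (disabling the event, waking pollers) is now deferred through the generic irq_work facility rather than the old perf-private pending queue: irq_work_queue(&event->pending) arranges for event->pending's callback to run from a safe context soon after the NMI returns. A minimal, self-contained usage sketch of that API (the callback and its action are illustrative, not perf code):

    #include <linux/kernel.h>
    #include <linux/irq_work.h>

    /* Illustrative irq_work usage; not perf code. */
    static void my_deferred_work(struct irq_work *entry)
    {
            /* Runs in hard-IRQ context shortly after the NMI returns. */
            pr_info("deferred from NMI\n");
    }

    static struct irq_work my_work;

    static void my_setup(void)
    {
            init_irq_work(&my_work, my_deferred_work);
    }

    /* Callable from NMI context: */
    static void my_poke(void)
    {
            irq_work_queue(&my_work);
    }
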
@@ -4029,6 +5050,17 @@ int perf_event_overflow(struct perf_event *event, int nmi, | |||
4029 | * Generic software event infrastructure | 5050 | * Generic software event infrastructure |
4030 | */ | 5051 | */ |
4031 | 5052 | ||
5053 | struct swevent_htable { | ||
5054 | struct swevent_hlist *swevent_hlist; | ||
5055 | struct mutex hlist_mutex; | ||
5056 | int hlist_refcount; | ||
5057 | |||
5058 | /* Recursion avoidance in each context */ | ||
5059 | int recursion[PERF_NR_CONTEXTS]; | ||
5060 | }; | ||
5061 | |||
5062 | static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | ||
5063 | |||
4032 | /* | 5064 | /* |
4033 | * We directly increment event->count and keep a second value in | 5065 | * We directly increment event->count and keep a second value in |
4034 | * event->hw.period_left to count intervals. This period event | 5066 | * event->hw.period_left to count intervals. This period event |
@@ -4086,7 +5118,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
4086 | } | 5118 | } |
4087 | } | 5119 | } |
4088 | 5120 | ||
4089 | static void perf_swevent_add(struct perf_event *event, u64 nr, | 5121 | static void perf_swevent_event(struct perf_event *event, u64 nr, |
4090 | int nmi, struct perf_sample_data *data, | 5122 | int nmi, struct perf_sample_data *data, |
4091 | struct pt_regs *regs) | 5123 | struct pt_regs *regs) |
4092 | { | 5124 | { |
@@ -4097,7 +5129,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
4097 | if (!regs) | 5129 | if (!regs) |
4098 | return; | 5130 | return; |
4099 | 5131 | ||
4100 | if (!hwc->sample_period) | 5132 | if (!is_sampling_event(event)) |
4101 | return; | 5133 | return; |
4102 | 5134 | ||
4103 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 5135 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
@@ -4112,6 +5144,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
4112 | static int perf_exclude_event(struct perf_event *event, | 5144 | static int perf_exclude_event(struct perf_event *event, |
4113 | struct pt_regs *regs) | 5145 | struct pt_regs *regs) |
4114 | { | 5146 | { |
5147 | if (event->hw.state & PERF_HES_STOPPED) | ||
5148 | return 1; | ||
5149 | |||
4115 | if (regs) { | 5150 | if (regs) { |
4116 | if (event->attr.exclude_user && user_mode(regs)) | 5151 | if (event->attr.exclude_user && user_mode(regs)) |
4117 | return 1; | 5152 | return 1; |
@@ -4158,11 +5193,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) | |||
4158 | 5193 | ||
4159 | /* For the read side: events when they trigger */ | 5194 | /* For the read side: events when they trigger */ |
4160 | static inline struct hlist_head * | 5195 | static inline struct hlist_head * |
4161 | find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | 5196 | find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) |
4162 | { | 5197 | { |
4163 | struct swevent_hlist *hlist; | 5198 | struct swevent_hlist *hlist; |
4164 | 5199 | ||
4165 | hlist = rcu_dereference(ctx->swevent_hlist); | 5200 | hlist = rcu_dereference(swhash->swevent_hlist); |
4166 | if (!hlist) | 5201 | if (!hlist) |
4167 | return NULL; | 5202 | return NULL; |
4168 | 5203 | ||
@@ -4171,7 +5206,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | |||
4171 | 5206 | ||
4172 | /* For the event head insertion and removal in the hlist */ | 5207 | /* For the event head insertion and removal in the hlist */ |
4173 | static inline struct hlist_head * | 5208 | static inline struct hlist_head * |
4174 | find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | 5209 | find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) |
4175 | { | 5210 | { |
4176 | struct swevent_hlist *hlist; | 5211 | struct swevent_hlist *hlist; |
4177 | u32 event_id = event->attr.config; | 5212 | u32 event_id = event->attr.config; |
@@ -4182,7 +5217,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | |||
4182 | * and release. Which makes the protected version suitable here. | 5217 | * and release. Which makes the protected version suitable here. |
4183 | * The context lock guarantees that. | 5218 | * The context lock guarantees that. |
4184 | */ | 5219 | */ |
4185 | hlist = rcu_dereference_protected(ctx->swevent_hlist, | 5220 | hlist = rcu_dereference_protected(swhash->swevent_hlist, |
4186 | lockdep_is_held(&event->ctx->lock)); | 5221 | lockdep_is_held(&event->ctx->lock)); |
4187 | if (!hlist) | 5222 | if (!hlist) |
4188 | return NULL; | 5223 | return NULL; |
@@ -4195,23 +5230,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
4195 | struct perf_sample_data *data, | 5230 | struct perf_sample_data *data, |
4196 | struct pt_regs *regs) | 5231 | struct pt_regs *regs) |
4197 | { | 5232 | { |
4198 | struct perf_cpu_context *cpuctx; | 5233 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4199 | struct perf_event *event; | 5234 | struct perf_event *event; |
4200 | struct hlist_node *node; | 5235 | struct hlist_node *node; |
4201 | struct hlist_head *head; | 5236 | struct hlist_head *head; |
4202 | 5237 | ||
4203 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
4204 | |||
4205 | rcu_read_lock(); | 5238 | rcu_read_lock(); |
4206 | 5239 | head = find_swevent_head_rcu(swhash, type, event_id); | |
4207 | head = find_swevent_head_rcu(cpuctx, type, event_id); | ||
4208 | |||
4209 | if (!head) | 5240 | if (!head) |
4210 | goto end; | 5241 | goto end; |
4211 | 5242 | ||
4212 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5243 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4213 | if (perf_swevent_match(event, type, event_id, data, regs)) | 5244 | if (perf_swevent_match(event, type, event_id, data, regs)) |
4214 | perf_swevent_add(event, nr, nmi, data, regs); | 5245 | perf_swevent_event(event, nr, nmi, data, regs); |
4215 | } | 5246 | } |
4216 | end: | 5247 | end: |
4217 | rcu_read_unlock(); | 5248 | rcu_read_unlock(); |
@@ -4219,33 +5250,17 @@ end: | |||
4219 | 5250 | ||
4220 | int perf_swevent_get_recursion_context(void) | 5251 | int perf_swevent_get_recursion_context(void) |
4221 | { | 5252 | { |
4222 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 5253 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4223 | int rctx; | ||
4224 | |||
4225 | if (in_nmi()) | ||
4226 | rctx = 3; | ||
4227 | else if (in_irq()) | ||
4228 | rctx = 2; | ||
4229 | else if (in_softirq()) | ||
4230 | rctx = 1; | ||
4231 | else | ||
4232 | rctx = 0; | ||
4233 | |||
4234 | if (cpuctx->recursion[rctx]) | ||
4235 | return -1; | ||
4236 | |||
4237 | cpuctx->recursion[rctx]++; | ||
4238 | barrier(); | ||
4239 | 5254 | ||
4240 | return rctx; | 5255 | return get_recursion_context(swhash->recursion); |
4241 | } | 5256 | } |
4242 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 5257 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
4243 | 5258 | ||
4244 | void inline perf_swevent_put_recursion_context(int rctx) | 5259 | inline void perf_swevent_put_recursion_context(int rctx) |
4245 | { | 5260 | { |
4246 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 5261 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4247 | barrier(); | 5262 | |
4248 | cpuctx->recursion[rctx]--; | 5263 | put_recursion_context(swhash->recursion, rctx); |
4249 | } | 5264 | } |
4250 | 5265 | ||
4251 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, | 5266 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, |
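
The open-coded in_nmi()/in_irq()/in_softirq() recursion guard moves behind get_recursion_context()/put_recursion_context(), which operate on the per-cpu recursion[PERF_NR_CONTEXTS] array in swevent_htable. A sketch that is behaviourally equivalent to the removed code (the real helpers live elsewhere in this series and may be implemented differently):

    /* Sketch, equivalent to the removed open-coded guard. */
    static inline int get_recursion_context(int *recursion)
    {
            int rctx;

            if (in_nmi())
                    rctx = 3;
            else if (in_irq())
                    rctx = 2;
            else if (in_softirq())
                    rctx = 1;
            else
                    rctx = 0;

            if (recursion[rctx])
                    return -1;      /* already inside this context */

            recursion[rctx]++;
            barrier();
            return rctx;
    }

    static inline void put_recursion_context(int *recursion, int rctx)
    {
            barrier();
            recursion[rctx]--;
    }
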
@@ -4271,20 +5286,20 @@ static void perf_swevent_read(struct perf_event *event) | |||
4271 | { | 5286 | { |
4272 | } | 5287 | } |
4273 | 5288 | ||
4274 | static int perf_swevent_enable(struct perf_event *event) | 5289 | static int perf_swevent_add(struct perf_event *event, int flags) |
4275 | { | 5290 | { |
5291 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | ||
4276 | struct hw_perf_event *hwc = &event->hw; | 5292 | struct hw_perf_event *hwc = &event->hw; |
4277 | struct perf_cpu_context *cpuctx; | ||
4278 | struct hlist_head *head; | 5293 | struct hlist_head *head; |
4279 | 5294 | ||
4280 | cpuctx = &__get_cpu_var(perf_cpu_context); | 5295 | if (is_sampling_event(event)) { |
4281 | |||
4282 | if (hwc->sample_period) { | ||
4283 | hwc->last_period = hwc->sample_period; | 5296 | hwc->last_period = hwc->sample_period; |
4284 | perf_swevent_set_period(event); | 5297 | perf_swevent_set_period(event); |
4285 | } | 5298 | } |
4286 | 5299 | ||
4287 | head = find_swevent_head(cpuctx, event); | 5300 | hwc->state = !(flags & PERF_EF_START); |
5301 | |||
5302 | head = find_swevent_head(swhash, event); | ||
4288 | if (WARN_ON_ONCE(!head)) | 5303 | if (WARN_ON_ONCE(!head)) |
4289 | return -EINVAL; | 5304 | return -EINVAL; |
4290 | 5305 | ||
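
perf_swevent_enable()/disable() are recast as the new struct pmu callbacks: add()/del() attach and detach the event on this CPU, start()/stop() only toggle PERF_HES_STOPPED in hw.state, and PERF_EF_START tells add() whether to start counting immediately; the overflow and match paths above check the STOPPED bit instead of tearing the event down. A condensed sketch of that contract for a minimal software-style PMU (sketch_* names are placeholders):

    /* Sketch of the pmu state machine for a minimal software-style PMU. */
    static int sketch_add(struct perf_event *event, int flags)
    {
            /* attach to per-cpu structures here ... */
            event->hw.state = (flags & PERF_EF_START) ? 0 : PERF_HES_STOPPED;
            return 0;
    }

    static void sketch_del(struct perf_event *event, int flags)
    {
            /* detach from per-cpu structures here ... */
    }

    static void sketch_start(struct perf_event *event, int flags)
    {
            event->hw.state = 0;                    /* counting */
    }

    static void sketch_stop(struct perf_event *event, int flags)
    {
            event->hw.state = PERF_HES_STOPPED;     /* ignored by event paths */
    }
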
@@ -4293,233 +5308,50 @@ static int perf_swevent_enable(struct perf_event *event) | |||
4293 | return 0; | 5308 | return 0; |
4294 | } | 5309 | } |
4295 | 5310 | ||
4296 | static void perf_swevent_disable(struct perf_event *event) | 5311 | static void perf_swevent_del(struct perf_event *event, int flags) |
4297 | { | 5312 | { |
4298 | hlist_del_rcu(&event->hlist_entry); | 5313 | hlist_del_rcu(&event->hlist_entry); |
4299 | } | 5314 | } |
4300 | 5315 | ||
4301 | static void perf_swevent_void(struct perf_event *event) | 5316 | static void perf_swevent_start(struct perf_event *event, int flags) |
4302 | { | 5317 | { |
5318 | event->hw.state = 0; | ||
4303 | } | 5319 | } |
4304 | 5320 | ||
4305 | static int perf_swevent_int(struct perf_event *event) | 5321 | static void perf_swevent_stop(struct perf_event *event, int flags) |
4306 | { | 5322 | { |
4307 | return 0; | 5323 | event->hw.state = PERF_HES_STOPPED; |
4308 | } | 5324 | } |
4309 | 5325 | ||
4310 | static const struct pmu perf_ops_generic = { | ||
4311 | .enable = perf_swevent_enable, | ||
4312 | .disable = perf_swevent_disable, | ||
4313 | .start = perf_swevent_int, | ||
4314 | .stop = perf_swevent_void, | ||
4315 | .read = perf_swevent_read, | ||
4316 | .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ | ||
4317 | }; | ||
4318 | |||
4319 | /* | ||
4320 | * hrtimer based swevent callback | ||
4321 | */ | ||
4322 | |||
4323 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | ||
4324 | { | ||
4325 | enum hrtimer_restart ret = HRTIMER_RESTART; | ||
4326 | struct perf_sample_data data; | ||
4327 | struct pt_regs *regs; | ||
4328 | struct perf_event *event; | ||
4329 | u64 period; | ||
4330 | |||
4331 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | ||
4332 | event->pmu->read(event); | ||
4333 | |||
4334 | perf_sample_data_init(&data, 0); | ||
4335 | data.period = event->hw.last_period; | ||
4336 | regs = get_irq_regs(); | ||
4337 | |||
4338 | if (regs && !perf_exclude_event(event, regs)) { | ||
4339 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
4340 | if (perf_event_overflow(event, 0, &data, regs)) | ||
4341 | ret = HRTIMER_NORESTART; | ||
4342 | } | ||
4343 | |||
4344 | period = max_t(u64, 10000, event->hw.sample_period); | ||
4345 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
4346 | |||
4347 | return ret; | ||
4348 | } | ||
4349 | |||
4350 | static void perf_swevent_start_hrtimer(struct perf_event *event) | ||
4351 | { | ||
4352 | struct hw_perf_event *hwc = &event->hw; | ||
4353 | |||
4354 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
4355 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
4356 | if (hwc->sample_period) { | ||
4357 | u64 period; | ||
4358 | |||
4359 | if (hwc->remaining) { | ||
4360 | if (hwc->remaining < 0) | ||
4361 | period = 10000; | ||
4362 | else | ||
4363 | period = hwc->remaining; | ||
4364 | hwc->remaining = 0; | ||
4365 | } else { | ||
4366 | period = max_t(u64, 10000, hwc->sample_period); | ||
4367 | } | ||
4368 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
4369 | ns_to_ktime(period), 0, | ||
4370 | HRTIMER_MODE_REL, 0); | ||
4371 | } | ||
4372 | } | ||
4373 | |||
4374 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | ||
4375 | { | ||
4376 | struct hw_perf_event *hwc = &event->hw; | ||
4377 | |||
4378 | if (hwc->sample_period) { | ||
4379 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
4380 | hwc->remaining = ktime_to_ns(remaining); | ||
4381 | |||
4382 | hrtimer_cancel(&hwc->hrtimer); | ||
4383 | } | ||
4384 | } | ||
4385 | |||
4386 | /* | ||
4387 | * Software event: cpu wall time clock | ||
4388 | */ | ||
4389 | |||
4390 | static void cpu_clock_perf_event_update(struct perf_event *event) | ||
4391 | { | ||
4392 | int cpu = raw_smp_processor_id(); | ||
4393 | s64 prev; | ||
4394 | u64 now; | ||
4395 | |||
4396 | now = cpu_clock(cpu); | ||
4397 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4398 | local64_add(now - prev, &event->count); | ||
4399 | } | ||
4400 | |||
4401 | static int cpu_clock_perf_event_enable(struct perf_event *event) | ||
4402 | { | ||
4403 | struct hw_perf_event *hwc = &event->hw; | ||
4404 | int cpu = raw_smp_processor_id(); | ||
4405 | |||
4406 | local64_set(&hwc->prev_count, cpu_clock(cpu)); | ||
4407 | perf_swevent_start_hrtimer(event); | ||
4408 | |||
4409 | return 0; | ||
4410 | } | ||
4411 | |||
4412 | static void cpu_clock_perf_event_disable(struct perf_event *event) | ||
4413 | { | ||
4414 | perf_swevent_cancel_hrtimer(event); | ||
4415 | cpu_clock_perf_event_update(event); | ||
4416 | } | ||
4417 | |||
4418 | static void cpu_clock_perf_event_read(struct perf_event *event) | ||
4419 | { | ||
4420 | cpu_clock_perf_event_update(event); | ||
4421 | } | ||
4422 | |||
4423 | static const struct pmu perf_ops_cpu_clock = { | ||
4424 | .enable = cpu_clock_perf_event_enable, | ||
4425 | .disable = cpu_clock_perf_event_disable, | ||
4426 | .read = cpu_clock_perf_event_read, | ||
4427 | }; | ||
4428 | |||
4429 | /* | ||
4430 | * Software event: task time clock | ||
4431 | */ | ||
4432 | |||
4433 | static void task_clock_perf_event_update(struct perf_event *event, u64 now) | ||
4434 | { | ||
4435 | u64 prev; | ||
4436 | s64 delta; | ||
4437 | |||
4438 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4439 | delta = now - prev; | ||
4440 | local64_add(delta, &event->count); | ||
4441 | } | ||
4442 | |||
4443 | static int task_clock_perf_event_enable(struct perf_event *event) | ||
4444 | { | ||
4445 | struct hw_perf_event *hwc = &event->hw; | ||
4446 | u64 now; | ||
4447 | |||
4448 | now = event->ctx->time; | ||
4449 | |||
4450 | local64_set(&hwc->prev_count, now); | ||
4451 | |||
4452 | perf_swevent_start_hrtimer(event); | ||
4453 | |||
4454 | return 0; | ||
4455 | } | ||
4456 | |||
4457 | static void task_clock_perf_event_disable(struct perf_event *event) | ||
4458 | { | ||
4459 | perf_swevent_cancel_hrtimer(event); | ||
4460 | task_clock_perf_event_update(event, event->ctx->time); | ||
4461 | |||
4462 | } | ||
4463 | |||
4464 | static void task_clock_perf_event_read(struct perf_event *event) | ||
4465 | { | ||
4466 | u64 time; | ||
4467 | |||
4468 | if (!in_nmi()) { | ||
4469 | update_context_time(event->ctx); | ||
4470 | time = event->ctx->time; | ||
4471 | } else { | ||
4472 | u64 now = perf_clock(); | ||
4473 | u64 delta = now - event->ctx->timestamp; | ||
4474 | time = event->ctx->time + delta; | ||
4475 | } | ||
4476 | |||
4477 | task_clock_perf_event_update(event, time); | ||
4478 | } | ||
4479 | |||
4480 | static const struct pmu perf_ops_task_clock = { | ||
4481 | .enable = task_clock_perf_event_enable, | ||
4482 | .disable = task_clock_perf_event_disable, | ||
4483 | .read = task_clock_perf_event_read, | ||
4484 | }; | ||
4485 | |||
4486 | /* Deref the hlist from the update side */ | 5326 | /* Deref the hlist from the update side */ |
4487 | static inline struct swevent_hlist * | 5327 | static inline struct swevent_hlist * |
4488 | swevent_hlist_deref(struct perf_cpu_context *cpuctx) | 5328 | swevent_hlist_deref(struct swevent_htable *swhash) |
4489 | { | ||
4490 | return rcu_dereference_protected(cpuctx->swevent_hlist, | ||
4491 | lockdep_is_held(&cpuctx->hlist_mutex)); | ||
4492 | } | ||
4493 | |||
4494 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | ||
4495 | { | 5329 | { |
4496 | struct swevent_hlist *hlist; | 5330 | return rcu_dereference_protected(swhash->swevent_hlist, |
4497 | 5331 | lockdep_is_held(&swhash->hlist_mutex)); | |
4498 | hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); | ||
4499 | kfree(hlist); | ||
4500 | } | 5332 | } |
4501 | 5333 | ||
4502 | static void swevent_hlist_release(struct perf_cpu_context *cpuctx) | 5334 | static void swevent_hlist_release(struct swevent_htable *swhash) |
4503 | { | 5335 | { |
4504 | struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); | 5336 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); |
4505 | 5337 | ||
4506 | if (!hlist) | 5338 | if (!hlist) |
4507 | return; | 5339 | return; |
4508 | 5340 | ||
4509 | rcu_assign_pointer(cpuctx->swevent_hlist, NULL); | 5341 | rcu_assign_pointer(swhash->swevent_hlist, NULL); |
4510 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); | 5342 | kfree_rcu(hlist, rcu_head); |
4511 | } | 5343 | } |
4512 | 5344 | ||
4513 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) | 5345 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) |
4514 | { | 5346 | { |
4515 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 5347 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
4516 | 5348 | ||
4517 | mutex_lock(&cpuctx->hlist_mutex); | 5349 | mutex_lock(&swhash->hlist_mutex); |
4518 | 5350 | ||
4519 | if (!--cpuctx->hlist_refcount) | 5351 | if (!--swhash->hlist_refcount) |
4520 | swevent_hlist_release(cpuctx); | 5352 | swevent_hlist_release(swhash); |
4521 | 5353 | ||
4522 | mutex_unlock(&cpuctx->hlist_mutex); | 5354 | mutex_unlock(&swhash->hlist_mutex); |
4523 | } | 5355 | } |
4524 | 5356 | ||
4525 | static void swevent_hlist_put(struct perf_event *event) | 5357 | static void swevent_hlist_put(struct perf_event *event) |
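
swevent_hlist_release() replaces the hand-rolled call_rcu() callback with kfree_rcu(), which frees the object after a grace period; the second argument names the struct's embedded rcu_head. An illustrative, self-contained usage (struct foo is not from the source):

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    /* Illustrative kfree_rcu() usage; struct foo is not from the source. */
    struct foo {
            int             data;
            struct rcu_head rcu_head;
    };

    static void release_foo(struct foo __rcu **slot)
    {
            struct foo *old = rcu_dereference_protected(*slot, 1);

            rcu_assign_pointer(*slot, NULL);
            if (old)
                    kfree_rcu(old, rcu_head);   /* freed after a grace period */
    }
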
@@ -4537,12 +5369,12 @@ static void swevent_hlist_put(struct perf_event *event) | |||
4537 | 5369 | ||
4538 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | 5370 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) |
4539 | { | 5371 | { |
4540 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 5372 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
4541 | int err = 0; | 5373 | int err = 0; |
4542 | 5374 | ||
4543 | mutex_lock(&cpuctx->hlist_mutex); | 5375 | mutex_lock(&swhash->hlist_mutex); |
4544 | 5376 | ||
4545 | if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { | 5377 | if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { |
4546 | struct swevent_hlist *hlist; | 5378 | struct swevent_hlist *hlist; |
4547 | 5379 | ||
4548 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 5380 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); |
@@ -4550,11 +5382,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | |||
4550 | err = -ENOMEM; | 5382 | err = -ENOMEM; |
4551 | goto exit; | 5383 | goto exit; |
4552 | } | 5384 | } |
4553 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 5385 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
4554 | } | 5386 | } |
4555 | cpuctx->hlist_refcount++; | 5387 | swhash->hlist_refcount++; |
4556 | exit: | 5388 | exit: |
4557 | mutex_unlock(&cpuctx->hlist_mutex); | 5389 | mutex_unlock(&swhash->hlist_mutex); |
4558 | 5390 | ||
4559 | return err; | 5391 | return err; |
4560 | } | 5392 | } |
@@ -4578,7 +5410,7 @@ static int swevent_hlist_get(struct perf_event *event) | |||
4578 | put_online_cpus(); | 5410 | put_online_cpus(); |
4579 | 5411 | ||
4580 | return 0; | 5412 | return 0; |
4581 | fail: | 5413 | fail: |
4582 | for_each_possible_cpu(cpu) { | 5414 | for_each_possible_cpu(cpu) { |
4583 | if (cpu == failed_cpu) | 5415 | if (cpu == failed_cpu) |
4584 | break; | 5416 | break; |
@@ -4589,17 +5421,64 @@ static int swevent_hlist_get(struct perf_event *event) | |||
4589 | return err; | 5421 | return err; |
4590 | } | 5422 | } |
4591 | 5423 | ||
4592 | #ifdef CONFIG_EVENT_TRACING | 5424 | struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
5425 | |||
5426 | static void sw_perf_event_destroy(struct perf_event *event) | ||
5427 | { | ||
5428 | u64 event_id = event->attr.config; | ||
5429 | |||
5430 | WARN_ON(event->parent); | ||
5431 | |||
5432 | jump_label_dec(&perf_swevent_enabled[event_id]); | ||
5433 | swevent_hlist_put(event); | ||
5434 | } | ||
5435 | |||
5436 | static int perf_swevent_init(struct perf_event *event) | ||
5437 | { | ||
5438 | int event_id = event->attr.config; | ||
5439 | |||
5440 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
5441 | return -ENOENT; | ||
5442 | |||
5443 | switch (event_id) { | ||
5444 | case PERF_COUNT_SW_CPU_CLOCK: | ||
5445 | case PERF_COUNT_SW_TASK_CLOCK: | ||
5446 | return -ENOENT; | ||
5447 | |||
5448 | default: | ||
5449 | break; | ||
5450 | } | ||
5451 | |||
5452 | if (event_id >= PERF_COUNT_SW_MAX) | ||
5453 | return -ENOENT; | ||
5454 | |||
5455 | if (!event->parent) { | ||
5456 | int err; | ||
5457 | |||
5458 | err = swevent_hlist_get(event); | ||
5459 | if (err) | ||
5460 | return err; | ||
5461 | |||
5462 | jump_label_inc(&perf_swevent_enabled[event_id]); | ||
5463 | event->destroy = sw_perf_event_destroy; | ||
5464 | } | ||
4593 | 5465 | ||
4594 | static const struct pmu perf_ops_tracepoint = { | 5466 | return 0; |
4595 | .enable = perf_trace_enable, | 5467 | } |
4596 | .disable = perf_trace_disable, | 5468 | |
4597 | .start = perf_swevent_int, | 5469 | static struct pmu perf_swevent = { |
4598 | .stop = perf_swevent_void, | 5470 | .task_ctx_nr = perf_sw_context, |
5471 | |||
5472 | .event_init = perf_swevent_init, | ||
5473 | .add = perf_swevent_add, | ||
5474 | .del = perf_swevent_del, | ||
5475 | .start = perf_swevent_start, | ||
5476 | .stop = perf_swevent_stop, | ||
4599 | .read = perf_swevent_read, | 5477 | .read = perf_swevent_read, |
4600 | .unthrottle = perf_swevent_void, | ||
4601 | }; | 5478 | }; |
4602 | 5479 | ||
5480 | #ifdef CONFIG_EVENT_TRACING | ||
5481 | |||
4603 | static int perf_tp_filter_match(struct perf_event *event, | 5482 | static int perf_tp_filter_match(struct perf_event *event, |
4604 | struct perf_sample_data *data) | 5483 | struct perf_sample_data *data) |
4605 | { | 5484 | { |
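
Software events are now gated by jump labels: perf_swevent_init() does jump_label_inc() on perf_swevent_enabled[event_id] and sw_perf_event_destroy() undoes it, so the static callsites in hot paths can be patched out while no event of that id exists. A callsite is expected to look roughly like the sketch below (names follow this era's jump-label API; the actual inline lives in the header, not in this hunk):

    /* Sketch of a jump-label-gated software event callsite. */
    static __always_inline void
    sketch_sw_event(u32 event_id, u64 nr, int nmi,
                    struct pt_regs *regs, u64 addr)
    {
            if (static_branch(&perf_swevent_enabled[event_id]))
                    __perf_sw_event(event_id, nr, nmi, regs, addr);
    }
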
@@ -4614,6 +5493,8 @@ static int perf_tp_event_match(struct perf_event *event, | |||
4614 | struct perf_sample_data *data, | 5493 | struct perf_sample_data *data, |
4615 | struct pt_regs *regs) | 5494 | struct pt_regs *regs) |
4616 | { | 5495 | { |
5496 | if (event->hw.state & PERF_HES_STOPPED) | ||
5497 | return 0; | ||
4617 | /* | 5498 | /* |
4618 | * All tracepoints are from kernel-space. | 5499 | * All tracepoints are from kernel-space. |
4619 | */ | 5500 | */ |
@@ -4643,7 +5524,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
4643 | 5524 | ||
4644 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5525 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4645 | if (perf_tp_event_match(event, &data, regs)) | 5526 | if (perf_tp_event_match(event, &data, regs)) |
4646 | perf_swevent_add(event, count, 1, &data, regs); | 5527 | perf_swevent_event(event, count, 1, &data, regs); |
4647 | } | 5528 | } |
4648 | 5529 | ||
4649 | perf_swevent_put_recursion_context(rctx); | 5530 | perf_swevent_put_recursion_context(rctx); |
@@ -4655,26 +5536,36 @@ static void tp_perf_event_destroy(struct perf_event *event) | |||
4655 | perf_trace_destroy(event); | 5536 | perf_trace_destroy(event); |
4656 | } | 5537 | } |
4657 | 5538 | ||
4658 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 5539 | static int perf_tp_event_init(struct perf_event *event) |
4659 | { | 5540 | { |
4660 | int err; | 5541 | int err; |
4661 | 5542 | ||
4662 | /* | 5543 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
4663 | * Raw tracepoint data is a severe data leak, only allow root to | 5544 | return -ENOENT; |
4664 | * have these. | ||
4665 | */ | ||
4666 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && | ||
4667 | perf_paranoid_tracepoint_raw() && | ||
4668 | !capable(CAP_SYS_ADMIN)) | ||
4669 | return ERR_PTR(-EPERM); | ||
4670 | 5545 | ||
4671 | err = perf_trace_init(event); | 5546 | err = perf_trace_init(event); |
4672 | if (err) | 5547 | if (err) |
4673 | return NULL; | 5548 | return err; |
4674 | 5549 | ||
4675 | event->destroy = tp_perf_event_destroy; | 5550 | event->destroy = tp_perf_event_destroy; |
4676 | 5551 | ||
4677 | return &perf_ops_tracepoint; | 5552 | return 0; |
5553 | } | ||
5554 | |||
5555 | static struct pmu perf_tracepoint = { | ||
5556 | .task_ctx_nr = perf_sw_context, | ||
5557 | |||
5558 | .event_init = perf_tp_event_init, | ||
5559 | .add = perf_trace_add, | ||
5560 | .del = perf_trace_del, | ||
5561 | .start = perf_swevent_start, | ||
5562 | .stop = perf_swevent_stop, | ||
5563 | .read = perf_swevent_read, | ||
5564 | }; | ||
5565 | |||
5566 | static inline void perf_tp_register(void) | ||
5567 | { | ||
5568 | perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); | ||
4678 | } | 5569 | } |
4679 | 5570 | ||
4680 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 5571 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4702,9 +5593,8 @@ static void perf_event_free_filter(struct perf_event *event) | |||
4702 | 5593 | ||
4703 | #else | 5594 | #else |
4704 | 5595 | ||
4705 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 5596 | static inline void perf_tp_register(void) |
4706 | { | 5597 | { |
4707 | return NULL; | ||
4708 | } | 5598 | } |
4709 | 5599 | ||
4710 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 5600 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4719,105 +5609,535 @@ static void perf_event_free_filter(struct perf_event *event) | |||
4719 | #endif /* CONFIG_EVENT_TRACING */ | 5609 | #endif /* CONFIG_EVENT_TRACING */ |
4720 | 5610 | ||
4721 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 5611 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
4722 | static void bp_perf_event_destroy(struct perf_event *event) | 5612 | void perf_bp_event(struct perf_event *bp, void *data) |
4723 | { | 5613 | { |
4724 | release_bp_slot(event); | 5614 | struct perf_sample_data sample; |
5615 | struct pt_regs *regs = data; | ||
5616 | |||
5617 | perf_sample_data_init(&sample, bp->attr.bp_addr); | ||
5618 | |||
5619 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | ||
5620 | perf_swevent_event(bp, 1, 1, &sample, regs); | ||
4725 | } | 5621 | } |
5622 | #endif | ||
5623 | |||
5624 | /* | ||
5625 | * hrtimer based swevent callback | ||
5626 | */ | ||
4726 | 5627 | ||
4727 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 5628 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) |
4728 | { | 5629 | { |
4729 | int err; | 5630 | enum hrtimer_restart ret = HRTIMER_RESTART; |
5631 | struct perf_sample_data data; | ||
5632 | struct pt_regs *regs; | ||
5633 | struct perf_event *event; | ||
5634 | u64 period; | ||
4730 | 5635 | ||
4731 | err = register_perf_hw_breakpoint(bp); | 5636 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
4732 | if (err) | 5637 | |
4733 | return ERR_PTR(err); | 5638 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
5639 | return HRTIMER_NORESTART; | ||
5640 | |||
5641 | event->pmu->read(event); | ||
5642 | |||
5643 | perf_sample_data_init(&data, 0); | ||
5644 | data.period = event->hw.last_period; | ||
5645 | regs = get_irq_regs(); | ||
5646 | |||
5647 | if (regs && !perf_exclude_event(event, regs)) { | ||
5648 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
5649 | if (perf_event_overflow(event, 0, &data, regs)) | ||
5650 | ret = HRTIMER_NORESTART; | ||
5651 | } | ||
4734 | 5652 | ||
4735 | bp->destroy = bp_perf_event_destroy; | 5653 | period = max_t(u64, 10000, event->hw.sample_period); |
5654 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
4736 | 5655 | ||
4737 | return &perf_ops_bp; | 5656 | return ret; |
4738 | } | 5657 | } |
4739 | 5658 | ||
4740 | void perf_bp_event(struct perf_event *bp, void *data) | 5659 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
4741 | { | 5660 | { |
4742 | struct perf_sample_data sample; | 5661 | struct hw_perf_event *hwc = &event->hw; |
4743 | struct pt_regs *regs = data; | 5662 | s64 period; |
4744 | 5663 | ||
4745 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 5664 | if (!is_sampling_event(event)) |
5665 | return; | ||
4746 | 5666 | ||
4747 | if (!perf_exclude_event(bp, regs)) | 5667 | period = local64_read(&hwc->period_left); |
4748 | perf_swevent_add(bp, 1, 1, &sample, regs); | 5668 | if (period) { |
5669 | if (period < 0) | ||
5670 | period = 10000; | ||
5671 | |||
5672 | local64_set(&hwc->period_left, 0); | ||
5673 | } else { | ||
5674 | period = max_t(u64, 10000, hwc->sample_period); | ||
5675 | } | ||
5676 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
5677 | ns_to_ktime(period), 0, | ||
5678 | HRTIMER_MODE_REL_PINNED, 0); | ||
4749 | } | 5679 | } |
4750 | #else | 5680 | |
4751 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 5681 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
5682 | { | ||
5683 | struct hw_perf_event *hwc = &event->hw; | ||
5684 | |||
5685 | if (is_sampling_event(event)) { | ||
5686 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
5687 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); | ||
5688 | |||
5689 | hrtimer_cancel(&hwc->hrtimer); | ||
5690 | } | ||
5691 | } | ||
5692 | |||
5693 | static void perf_swevent_init_hrtimer(struct perf_event *event) | ||
5694 | { | ||
5695 | struct hw_perf_event *hwc = &event->hw; | ||
5696 | |||
5697 | if (!is_sampling_event(event)) | ||
5698 | return; | ||
5699 | |||
5700 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
5701 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
5702 | |||
5703 | /* | ||
5704 | * Since hrtimers have a fixed rate, we can do a static freq->period | ||
5705 | * mapping and avoid the whole period adjust feedback stuff. | ||
5706 | */ | ||
5707 | if (event->attr.freq) { | ||
5708 | long freq = event->attr.sample_freq; | ||
5709 | |||
5710 | event->attr.sample_period = NSEC_PER_SEC / freq; | ||
5711 | hwc->sample_period = event->attr.sample_period; | ||
5712 | local64_set(&hwc->period_left, hwc->sample_period); | ||
5713 | event->attr.freq = 0; | ||
5714 | } | ||
5715 | } | ||
5716 | |||
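perf_swevent_init_hrtimer() above turns a requested sampling frequency into a fixed period once, at init time (sample_period = NSEC_PER_SEC / sample_freq), and the arming paths never program the timer with less than 10000 ns. A standalone sketch of that arithmetic follows; the demo_ name is illustrative and the snippet is plain userspace C, not kernel code.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/*
 * Standalone model of the freq -> period mapping done in
 * perf_swevent_init_hrtimer(), combined with the 10000ns floor that
 * perf_swevent_start_hrtimer()/perf_swevent_hrtimer() apply when the
 * timer is armed.
 */
static uint64_t demo_hrtimer_period(long sample_freq)
{
	uint64_t period = NSEC_PER_SEC / (uint64_t)sample_freq;

	if (period < 10000)
		period = 10000;
	return period;
}

int main(void)
{
	/* 4000 Hz -> 250000 ns; 1 MHz would be clamped to 10000 ns. */
	printf("%llu\n", (unsigned long long)demo_hrtimer_period(4000));
	printf("%llu\n", (unsigned long long)demo_hrtimer_period(1000000));
	return 0;
}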
5717 | /* | ||
5718 | * Software event: cpu wall time clock | ||
5719 | */ | ||
5720 | |||
5721 | static void cpu_clock_event_update(struct perf_event *event) | ||
5722 | { | ||
5723 | s64 prev; | ||
5724 | u64 now; | ||
5725 | |||
5726 | now = local_clock(); | ||
5727 | prev = local64_xchg(&event->hw.prev_count, now); | ||
5728 | local64_add(now - prev, &event->count); | ||
5729 | } | ||
5730 | |||
5731 | static void cpu_clock_event_start(struct perf_event *event, int flags) | ||
5732 | { | ||
5733 | local64_set(&event->hw.prev_count, local_clock()); | ||
5734 | perf_swevent_start_hrtimer(event); | ||
5735 | } | ||
5736 | |||
5737 | static void cpu_clock_event_stop(struct perf_event *event, int flags) | ||
5738 | { | ||
5739 | perf_swevent_cancel_hrtimer(event); | ||
5740 | cpu_clock_event_update(event); | ||
5741 | } | ||
5742 | |||
5743 | static int cpu_clock_event_add(struct perf_event *event, int flags) | ||
5744 | { | ||
5745 | if (flags & PERF_EF_START) | ||
5746 | cpu_clock_event_start(event, flags); | ||
5747 | |||
5748 | return 0; | ||
5749 | } | ||
5750 | |||
5751 | static void cpu_clock_event_del(struct perf_event *event, int flags) | ||
5752 | { | ||
5753 | cpu_clock_event_stop(event, flags); | ||
5754 | } | ||
5755 | |||
5756 | static void cpu_clock_event_read(struct perf_event *event) | ||
5757 | { | ||
5758 | cpu_clock_event_update(event); | ||
5759 | } | ||
5760 | |||
5761 | static int cpu_clock_event_init(struct perf_event *event) | ||
5762 | { | ||
5763 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
5764 | return -ENOENT; | ||
5765 | |||
5766 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | ||
5767 | return -ENOENT; | ||
5768 | |||
5769 | perf_swevent_init_hrtimer(event); | ||
5770 | |||
5771 | return 0; | ||
5772 | } | ||
5773 | |||
5774 | static struct pmu perf_cpu_clock = { | ||
5775 | .task_ctx_nr = perf_sw_context, | ||
5776 | |||
5777 | .event_init = cpu_clock_event_init, | ||
5778 | .add = cpu_clock_event_add, | ||
5779 | .del = cpu_clock_event_del, | ||
5780 | .start = cpu_clock_event_start, | ||
5781 | .stop = cpu_clock_event_stop, | ||
5782 | .read = cpu_clock_event_read, | ||
5783 | }; | ||
5784 | |||
5785 | /* | ||
5786 | * Software event: task time clock | ||
5787 | */ | ||
5788 | |||
5789 | static void task_clock_event_update(struct perf_event *event, u64 now) | ||
5790 | { | ||
5791 | u64 prev; | ||
5792 | s64 delta; | ||
5793 | |||
5794 | prev = local64_xchg(&event->hw.prev_count, now); | ||
5795 | delta = now - prev; | ||
5796 | local64_add(delta, &event->count); | ||
5797 | } | ||
5798 | |||
5799 | static void task_clock_event_start(struct perf_event *event, int flags) | ||
5800 | { | ||
5801 | local64_set(&event->hw.prev_count, event->ctx->time); | ||
5802 | perf_swevent_start_hrtimer(event); | ||
5803 | } | ||
5804 | |||
5805 | static void task_clock_event_stop(struct perf_event *event, int flags) | ||
5806 | { | ||
5807 | perf_swevent_cancel_hrtimer(event); | ||
5808 | task_clock_event_update(event, event->ctx->time); | ||
5809 | } | ||
5810 | |||
5811 | static int task_clock_event_add(struct perf_event *event, int flags) | ||
5812 | { | ||
5813 | if (flags & PERF_EF_START) | ||
5814 | task_clock_event_start(event, flags); | ||
5815 | |||
5816 | return 0; | ||
5817 | } | ||
5818 | |||
5819 | static void task_clock_event_del(struct perf_event *event, int flags) | ||
5820 | { | ||
5821 | task_clock_event_stop(event, PERF_EF_UPDATE); | ||
5822 | } | ||
5823 | |||
5824 | static void task_clock_event_read(struct perf_event *event) | ||
5825 | { | ||
5826 | u64 now = perf_clock(); | ||
5827 | u64 delta = now - event->ctx->timestamp; | ||
5828 | u64 time = event->ctx->time + delta; | ||
5829 | |||
5830 | task_clock_event_update(event, time); | ||
5831 | } | ||
5832 | |||
5833 | static int task_clock_event_init(struct perf_event *event) | ||
5834 | { | ||
5835 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
5836 | return -ENOENT; | ||
5837 | |||
5838 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | ||
5839 | return -ENOENT; | ||
5840 | |||
5841 | perf_swevent_init_hrtimer(event); | ||
5842 | |||
5843 | return 0; | ||
5844 | } | ||
5845 | |||
5846 | static struct pmu perf_task_clock = { | ||
5847 | .task_ctx_nr = perf_sw_context, | ||
5848 | |||
5849 | .event_init = task_clock_event_init, | ||
5850 | .add = task_clock_event_add, | ||
5851 | .del = task_clock_event_del, | ||
5852 | .start = task_clock_event_start, | ||
5853 | .stop = task_clock_event_stop, | ||
5854 | .read = task_clock_event_read, | ||
5855 | }; | ||
5856 | |||
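The cpu-clock and task-clock events above share one accounting pattern: hw.prev_count holds the timestamp of the last update, and each update atomically swaps in the new timestamp (local64_xchg) and adds the elapsed delta to event->count. A plain, non-atomic userspace model of that pattern, with illustrative demo_ names:

#include <stdint.h>
#include <stdio.h>

/*
 * Userspace model of the prev_count/xchg accounting shared by the
 * cpu-clock and task-clock events: each update swaps in the new
 * timestamp and adds the elapsed delta to the running count. The
 * kernel uses local64_* atomics; plain u64 fields stand in here.
 */
struct demo_clock_event {
	uint64_t prev_count;
	uint64_t count;
};

static void demo_clock_update(struct demo_clock_event *e, uint64_t now)
{
	uint64_t prev = e->prev_count;	/* local64_xchg() in the kernel */

	e->prev_count = now;
	e->count += now - prev;
}

int main(void)
{
	struct demo_clock_event e = { .prev_count = 1000, .count = 0 };

	demo_clock_update(&e, 1250);	/* +250 ns */
	demo_clock_update(&e, 2000);	/* +750 ns */
	printf("%llu\n", (unsigned long long)e.count);	/* prints 1000 */
	return 0;
}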
5857 | static void perf_pmu_nop_void(struct pmu *pmu) | ||
5858 | { | ||
5859 | } | ||
5860 | |||
5861 | static int perf_pmu_nop_int(struct pmu *pmu) | ||
5862 | { | ||
5863 | return 0; | ||
5864 | } | ||
5865 | |||
5866 | static void perf_pmu_start_txn(struct pmu *pmu) | ||
5867 | { | ||
5868 | perf_pmu_disable(pmu); | ||
5869 | } | ||
5870 | |||
5871 | static int perf_pmu_commit_txn(struct pmu *pmu) | ||
5872 | { | ||
5873 | perf_pmu_enable(pmu); | ||
5874 | return 0; | ||
5875 | } | ||
5876 | |||
5877 | static void perf_pmu_cancel_txn(struct pmu *pmu) | ||
5878 | { | ||
5879 | perf_pmu_enable(pmu); | ||
5880 | } | ||
5881 | |||
5882 | /* | ||
5883 | * Ensures all contexts with the same task_ctx_nr have the same | ||
5884 | * pmu_cpu_context too. | ||
5885 | */ | ||
5886 | static void *find_pmu_context(int ctxn) | ||
4752 | { | 5887 | { |
5888 | struct pmu *pmu; | ||
5889 | |||
5890 | if (ctxn < 0) | ||
5891 | return NULL; | ||
5892 | |||
5893 | list_for_each_entry(pmu, &pmus, entry) { | ||
5894 | if (pmu->task_ctx_nr == ctxn) | ||
5895 | return pmu->pmu_cpu_context; | ||
5896 | } | ||
5897 | |||
4753 | return NULL; | 5898 | return NULL; |
4754 | } | 5899 | } |
4755 | 5900 | ||
4756 | void perf_bp_event(struct perf_event *bp, void *regs) | 5901 | static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) |
4757 | { | 5902 | { |
5903 | int cpu; | ||
5904 | |||
5905 | for_each_possible_cpu(cpu) { | ||
5906 | struct perf_cpu_context *cpuctx; | ||
5907 | |||
5908 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
5909 | |||
5910 | if (cpuctx->active_pmu == old_pmu) | ||
5911 | cpuctx->active_pmu = pmu; | ||
5912 | } | ||
4758 | } | 5913 | } |
4759 | #endif | ||
4760 | 5914 | ||
4761 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 5915 | static void free_pmu_context(struct pmu *pmu) |
5916 | { | ||
5917 | struct pmu *i; | ||
4762 | 5918 | ||
4763 | static void sw_perf_event_destroy(struct perf_event *event) | 5919 | mutex_lock(&pmus_lock); |
5920 | /* | ||
5921 | * Like a real lame refcount. | ||
5922 | */ | ||
5923 | list_for_each_entry(i, &pmus, entry) { | ||
5924 | if (i->pmu_cpu_context == pmu->pmu_cpu_context) { | ||
5925 | update_pmu_context(i, pmu); | ||
5926 | goto out; | ||
5927 | } | ||
5928 | } | ||
5929 | |||
5930 | free_percpu(pmu->pmu_cpu_context); | ||
5931 | out: | ||
5932 | mutex_unlock(&pmus_lock); | ||
5933 | } | ||
5934 | static struct idr pmu_idr; | ||
5935 | |||
5936 | static ssize_t | ||
5937 | type_show(struct device *dev, struct device_attribute *attr, char *page) | ||
4764 | { | 5938 | { |
4765 | u64 event_id = event->attr.config; | 5939 | struct pmu *pmu = dev_get_drvdata(dev); |
4766 | 5940 | ||
4767 | WARN_ON(event->parent); | 5941 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); |
5942 | } | ||
4768 | 5943 | ||
4769 | atomic_dec(&perf_swevent_enabled[event_id]); | 5944 | static struct device_attribute pmu_dev_attrs[] = { |
4770 | swevent_hlist_put(event); | 5945 | __ATTR_RO(type), |
5946 | __ATTR_NULL, | ||
5947 | }; | ||
5948 | |||
5949 | static int pmu_bus_running; | ||
5950 | static struct bus_type pmu_bus = { | ||
5951 | .name = "event_source", | ||
5952 | .dev_attrs = pmu_dev_attrs, | ||
5953 | }; | ||
5954 | |||
5955 | static void pmu_dev_release(struct device *dev) | ||
5956 | { | ||
5957 | kfree(dev); | ||
4771 | } | 5958 | } |
4772 | 5959 | ||
4773 | static const struct pmu *sw_perf_event_init(struct perf_event *event) | 5960 | static int pmu_dev_alloc(struct pmu *pmu) |
4774 | { | 5961 | { |
4775 | const struct pmu *pmu = NULL; | 5962 | int ret = -ENOMEM; |
4776 | u64 event_id = event->attr.config; | 5963 | |
5964 | pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); | ||
5965 | if (!pmu->dev) | ||
5966 | goto out; | ||
5967 | |||
5968 | device_initialize(pmu->dev); | ||
5969 | ret = dev_set_name(pmu->dev, "%s", pmu->name); | ||
5970 | if (ret) | ||
5971 | goto free_dev; | ||
5972 | |||
5973 | dev_set_drvdata(pmu->dev, pmu); | ||
5974 | pmu->dev->bus = &pmu_bus; | ||
5975 | pmu->dev->release = pmu_dev_release; | ||
5976 | ret = device_add(pmu->dev); | ||
5977 | if (ret) | ||
5978 | goto free_dev; | ||
5979 | |||
5980 | out: | ||
5981 | return ret; | ||
5982 | |||
5983 | free_dev: | ||
5984 | put_device(pmu->dev); | ||
5985 | goto out; | ||
5986 | } | ||
5987 | |||
5988 | static struct lock_class_key cpuctx_mutex; | ||
5989 | |||
5990 | int perf_pmu_register(struct pmu *pmu, char *name, int type) | ||
5991 | { | ||
5992 | int cpu, ret; | ||
5993 | |||
5994 | mutex_lock(&pmus_lock); | ||
5995 | ret = -ENOMEM; | ||
5996 | pmu->pmu_disable_count = alloc_percpu(int); | ||
5997 | if (!pmu->pmu_disable_count) | ||
5998 | goto unlock; | ||
5999 | |||
6000 | pmu->type = -1; | ||
6001 | if (!name) | ||
6002 | goto skip_type; | ||
6003 | pmu->name = name; | ||
6004 | |||
6005 | if (type < 0) { | ||
6006 | int err = idr_pre_get(&pmu_idr, GFP_KERNEL); | ||
6007 | if (!err) | ||
6008 | goto free_pdc; | ||
6009 | |||
6010 | err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); | ||
6011 | if (err) { | ||
6012 | ret = err; | ||
6013 | goto free_pdc; | ||
6014 | } | ||
6015 | } | ||
6016 | pmu->type = type; | ||
6017 | |||
6018 | if (pmu_bus_running) { | ||
6019 | ret = pmu_dev_alloc(pmu); | ||
6020 | if (ret) | ||
6021 | goto free_idr; | ||
6022 | } | ||
6023 | |||
6024 | skip_type: | ||
6025 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); | ||
6026 | if (pmu->pmu_cpu_context) | ||
6027 | goto got_cpu_context; | ||
6028 | |||
6029 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | ||
6030 | if (!pmu->pmu_cpu_context) | ||
6031 | goto free_dev; | ||
6032 | |||
6033 | for_each_possible_cpu(cpu) { | ||
6034 | struct perf_cpu_context *cpuctx; | ||
6035 | |||
6036 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
6037 | __perf_event_init_context(&cpuctx->ctx); | ||
6038 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); | ||
6039 | cpuctx->ctx.type = cpu_context; | ||
6040 | cpuctx->ctx.pmu = pmu; | ||
6041 | cpuctx->jiffies_interval = 1; | ||
6042 | INIT_LIST_HEAD(&cpuctx->rotation_list); | ||
6043 | cpuctx->active_pmu = pmu; | ||
6044 | } | ||
6045 | |||
6046 | got_cpu_context: | ||
6047 | if (!pmu->start_txn) { | ||
6048 | if (pmu->pmu_enable) { | ||
6049 | /* | ||
6050 | * If we have pmu_enable/pmu_disable calls, install | ||
6051 | * transaction stubs that use that to try and batch | ||
6052 | * hardware accesses. | ||
6053 | */ | ||
6054 | pmu->start_txn = perf_pmu_start_txn; | ||
6055 | pmu->commit_txn = perf_pmu_commit_txn; | ||
6056 | pmu->cancel_txn = perf_pmu_cancel_txn; | ||
6057 | } else { | ||
6058 | pmu->start_txn = perf_pmu_nop_void; | ||
6059 | pmu->commit_txn = perf_pmu_nop_int; | ||
6060 | pmu->cancel_txn = perf_pmu_nop_void; | ||
6061 | } | ||
6062 | } | ||
6063 | |||
6064 | if (!pmu->pmu_enable) { | ||
6065 | pmu->pmu_enable = perf_pmu_nop_void; | ||
6066 | pmu->pmu_disable = perf_pmu_nop_void; | ||
6067 | } | ||
6068 | |||
6069 | list_add_rcu(&pmu->entry, &pmus); | ||
6070 | ret = 0; | ||
6071 | unlock: | ||
6072 | mutex_unlock(&pmus_lock); | ||
6073 | |||
6074 | return ret; | ||
6075 | |||
6076 | free_dev: | ||
6077 | device_del(pmu->dev); | ||
6078 | put_device(pmu->dev); | ||
6079 | |||
6080 | free_idr: | ||
6081 | if (pmu->type >= PERF_TYPE_MAX) | ||
6082 | idr_remove(&pmu_idr, pmu->type); | ||
6083 | |||
6084 | free_pdc: | ||
6085 | free_percpu(pmu->pmu_disable_count); | ||
6086 | goto unlock; | ||
6087 | } | ||
6088 | |||
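perf_pmu_register() above takes a name and a type; a negative type asks pmu_idr for a dynamically allocated one (at or above PERF_TYPE_MAX), and when the event_source bus is up a sysfs device exposing that type is added. The sketch below shows roughly what a minimal caller built from the callbacks in this file could look like; the demo_* names are made up, the callbacks count nothing, and whether these symbols are exported to modules in this tree is not shown here, so the module wrapping is purely illustrative.

#include <linux/module.h>
#include <linux/perf_event.h>

/*
 * Illustrative minimal PMU, modelled on the perf_tracepoint and
 * perf_cpu_clock definitions in this file. The counter callbacks are
 * intentionally empty; a real PMU would start/stop and read hardware
 * or software state here.
 */
static struct pmu demo_pmu;

static int demo_event_init(struct perf_event *event)
{
	/* Only claim events opened with our dynamically allocated type. */
	if (event->attr.type != demo_pmu.type)
		return -ENOENT;
	return 0;
}

static void demo_start(struct perf_event *event, int flags) { }
static void demo_stop(struct perf_event *event, int flags) { }
static void demo_read(struct perf_event *event) { }

static int demo_add(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_START)
		demo_start(event, flags);
	return 0;
}

static void demo_del(struct perf_event *event, int flags)
{
	demo_stop(event, flags);
}

static struct pmu demo_pmu = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= demo_event_init,
	.add		= demo_add,
	.del		= demo_del,
	.start		= demo_start,
	.stop		= demo_stop,
	.read		= demo_read,
};

static int __init demo_pmu_init(void)
{
	/* type == -1 requests a dynamic type from pmu_idr. */
	return perf_pmu_register(&demo_pmu, "demo", -1);
}

static void __exit demo_pmu_exit(void)
{
	perf_pmu_unregister(&demo_pmu);
}

module_init(demo_pmu_init);
module_exit(demo_pmu_exit);
MODULE_LICENSE("GPL");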
6089 | void perf_pmu_unregister(struct pmu *pmu) | ||
6090 | { | ||
6091 | mutex_lock(&pmus_lock); | ||
6092 | list_del_rcu(&pmu->entry); | ||
6093 | mutex_unlock(&pmus_lock); | ||
4777 | 6094 | ||
4778 | /* | 6095 | /* |
4779 | * Software events (currently) can't in general distinguish | 6096 | * We dereference the pmu list under both SRCU and regular RCU, so |
4780 | * between user, kernel and hypervisor events. | 6097 | * synchronize against both of those. |
4781 | * However, context switches and cpu migrations are considered | ||
4782 | * to be kernel events, and page faults are never hypervisor | ||
4783 | * events. | ||
4784 | */ | 6098 | */ |
4785 | switch (event_id) { | 6099 | synchronize_srcu(&pmus_srcu); |
4786 | case PERF_COUNT_SW_CPU_CLOCK: | 6100 | synchronize_rcu(); |
4787 | pmu = &perf_ops_cpu_clock; | ||
4788 | 6101 | ||
4789 | break; | 6102 | free_percpu(pmu->pmu_disable_count); |
4790 | case PERF_COUNT_SW_TASK_CLOCK: | 6103 | if (pmu->type >= PERF_TYPE_MAX) |
4791 | /* | 6104 | idr_remove(&pmu_idr, pmu->type); |
4792 | * If the user instantiates this as a per-cpu event, | 6105 | device_del(pmu->dev); |
4793 | * use the cpu_clock event instead. | 6106 | put_device(pmu->dev); |
4794 | */ | 6107 | free_pmu_context(pmu); |
4795 | if (event->ctx->task) | 6108 | } |
4796 | pmu = &perf_ops_task_clock; | ||
4797 | else | ||
4798 | pmu = &perf_ops_cpu_clock; | ||
4799 | 6109 | ||
4800 | break; | 6110 | struct pmu *perf_init_event(struct perf_event *event) |
4801 | case PERF_COUNT_SW_PAGE_FAULTS: | 6111 | { |
4802 | case PERF_COUNT_SW_PAGE_FAULTS_MIN: | 6112 | struct pmu *pmu = NULL; |
4803 | case PERF_COUNT_SW_PAGE_FAULTS_MAJ: | 6113 | int idx; |
4804 | case PERF_COUNT_SW_CONTEXT_SWITCHES: | 6114 | int ret; |
4805 | case PERF_COUNT_SW_CPU_MIGRATIONS: | 6115 | |
4806 | case PERF_COUNT_SW_ALIGNMENT_FAULTS: | 6116 | idx = srcu_read_lock(&pmus_srcu); |
4807 | case PERF_COUNT_SW_EMULATION_FAULTS: | 6117 | |
4808 | if (!event->parent) { | 6118 | rcu_read_lock(); |
4809 | int err; | 6119 | pmu = idr_find(&pmu_idr, event->attr.type); |
4810 | 6120 | rcu_read_unlock(); | |
4811 | err = swevent_hlist_get(event); | 6121 | if (pmu) { |
4812 | if (err) | 6122 | ret = pmu->event_init(event); |
4813 | return ERR_PTR(err); | 6123 | if (ret) |
6124 | pmu = ERR_PTR(ret); | ||
6125 | goto unlock; | ||
6126 | } | ||
4814 | 6127 | ||
4815 | atomic_inc(&perf_swevent_enabled[event_id]); | 6128 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
4816 | event->destroy = sw_perf_event_destroy; | 6129 | ret = pmu->event_init(event); |
6130 | if (!ret) | ||
6131 | goto unlock; | ||
6132 | |||
6133 | if (ret != -ENOENT) { | ||
6134 | pmu = ERR_PTR(ret); | ||
6135 | goto unlock; | ||
4817 | } | 6136 | } |
4818 | pmu = &perf_ops_generic; | ||
4819 | break; | ||
4820 | } | 6137 | } |
6138 | pmu = ERR_PTR(-ENOENT); | ||
6139 | unlock: | ||
6140 | srcu_read_unlock(&pmus_srcu, idx); | ||
4821 | 6141 | ||
4822 | return pmu; | 6142 | return pmu; |
4823 | } | 6143 | } |
@@ -4826,20 +6146,23 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) | |||
4826 | * Allocate and initialize an event structure | 6146 | * Allocate and initialize an event structure |
4827 | */ | 6147 | */ |
4828 | static struct perf_event * | 6148 | static struct perf_event * |
4829 | perf_event_alloc(struct perf_event_attr *attr, | 6149 | perf_event_alloc(struct perf_event_attr *attr, int cpu, |
4830 | int cpu, | 6150 | struct task_struct *task, |
4831 | struct perf_event_context *ctx, | 6151 | struct perf_event *group_leader, |
4832 | struct perf_event *group_leader, | 6152 | struct perf_event *parent_event, |
4833 | struct perf_event *parent_event, | 6153 | perf_overflow_handler_t overflow_handler) |
4834 | perf_overflow_handler_t overflow_handler, | 6154 | { |
4835 | gfp_t gfpflags) | 6155 | struct pmu *pmu; |
4836 | { | ||
4837 | const struct pmu *pmu; | ||
4838 | struct perf_event *event; | 6156 | struct perf_event *event; |
4839 | struct hw_perf_event *hwc; | 6157 | struct hw_perf_event *hwc; |
4840 | long err; | 6158 | long err; |
4841 | 6159 | ||
4842 | event = kzalloc(sizeof(*event), gfpflags); | 6160 | if ((unsigned)cpu >= nr_cpu_ids) { |
6161 | if (!task || cpu != -1) | ||
6162 | return ERR_PTR(-EINVAL); | ||
6163 | } | ||
6164 | |||
6165 | event = kzalloc(sizeof(*event), GFP_KERNEL); | ||
4843 | if (!event) | 6166 | if (!event) |
4844 | return ERR_PTR(-ENOMEM); | 6167 | return ERR_PTR(-ENOMEM); |
4845 | 6168 | ||
@@ -4857,6 +6180,7 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4857 | INIT_LIST_HEAD(&event->event_entry); | 6180 | INIT_LIST_HEAD(&event->event_entry); |
4858 | INIT_LIST_HEAD(&event->sibling_list); | 6181 | INIT_LIST_HEAD(&event->sibling_list); |
4859 | init_waitqueue_head(&event->waitq); | 6182 | init_waitqueue_head(&event->waitq); |
6183 | init_irq_work(&event->pending, perf_pending_event); | ||
4860 | 6184 | ||
4861 | mutex_init(&event->mmap_mutex); | 6185 | mutex_init(&event->mmap_mutex); |
4862 | 6186 | ||
@@ -4864,7 +6188,6 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4864 | event->attr = *attr; | 6188 | event->attr = *attr; |
4865 | event->group_leader = group_leader; | 6189 | event->group_leader = group_leader; |
4866 | event->pmu = NULL; | 6190 | event->pmu = NULL; |
4867 | event->ctx = ctx; | ||
4868 | event->oncpu = -1; | 6191 | event->oncpu = -1; |
4869 | 6192 | ||
4870 | event->parent = parent_event; | 6193 | event->parent = parent_event; |
@@ -4874,9 +6197,20 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4874 | 6197 | ||
4875 | event->state = PERF_EVENT_STATE_INACTIVE; | 6198 | event->state = PERF_EVENT_STATE_INACTIVE; |
4876 | 6199 | ||
6200 | if (task) { | ||
6201 | event->attach_state = PERF_ATTACH_TASK; | ||
6202 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
6203 | /* | ||
6204 | * hw_breakpoint is a bit difficult here.. | ||
6205 | */ | ||
6206 | if (attr->type == PERF_TYPE_BREAKPOINT) | ||
6207 | event->hw.bp_target = task; | ||
6208 | #endif | ||
6209 | } | ||
6210 | |||
4877 | if (!overflow_handler && parent_event) | 6211 | if (!overflow_handler && parent_event) |
4878 | overflow_handler = parent_event->overflow_handler; | 6212 | overflow_handler = parent_event->overflow_handler; |
4879 | 6213 | ||
4880 | event->overflow_handler = overflow_handler; | 6214 | event->overflow_handler = overflow_handler; |
4881 | 6215 | ||
4882 | if (attr->disabled) | 6216 | if (attr->disabled) |
@@ -4898,29 +6232,8 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4898 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 6232 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
4899 | goto done; | 6233 | goto done; |
4900 | 6234 | ||
4901 | switch (attr->type) { | 6235 | pmu = perf_init_event(event); |
4902 | case PERF_TYPE_RAW: | ||
4903 | case PERF_TYPE_HARDWARE: | ||
4904 | case PERF_TYPE_HW_CACHE: | ||
4905 | pmu = hw_perf_event_init(event); | ||
4906 | break; | ||
4907 | |||
4908 | case PERF_TYPE_SOFTWARE: | ||
4909 | pmu = sw_perf_event_init(event); | ||
4910 | break; | ||
4911 | |||
4912 | case PERF_TYPE_TRACEPOINT: | ||
4913 | pmu = tp_perf_event_init(event); | ||
4914 | break; | ||
4915 | |||
4916 | case PERF_TYPE_BREAKPOINT: | ||
4917 | pmu = bp_perf_event_init(event); | ||
4918 | break; | ||
4919 | |||
4920 | 6236 | ||
4921 | default: | ||
4922 | break; | ||
4923 | } | ||
4924 | done: | 6237 | done: |
4925 | err = 0; | 6238 | err = 0; |
4926 | if (!pmu) | 6239 | if (!pmu) |
@@ -4938,13 +6251,21 @@ done: | |||
4938 | event->pmu = pmu; | 6251 | event->pmu = pmu; |
4939 | 6252 | ||
4940 | if (!event->parent) { | 6253 | if (!event->parent) { |
4941 | atomic_inc(&nr_events); | 6254 | if (event->attach_state & PERF_ATTACH_TASK) |
6255 | jump_label_inc(&perf_sched_events); | ||
4942 | if (event->attr.mmap || event->attr.mmap_data) | 6256 | if (event->attr.mmap || event->attr.mmap_data) |
4943 | atomic_inc(&nr_mmap_events); | 6257 | atomic_inc(&nr_mmap_events); |
4944 | if (event->attr.comm) | 6258 | if (event->attr.comm) |
4945 | atomic_inc(&nr_comm_events); | 6259 | atomic_inc(&nr_comm_events); |
4946 | if (event->attr.task) | 6260 | if (event->attr.task) |
4947 | atomic_inc(&nr_task_events); | 6261 | atomic_inc(&nr_task_events); |
6262 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | ||
6263 | err = get_callchain_buffers(); | ||
6264 | if (err) { | ||
6265 | free_event(event); | ||
6266 | return ERR_PTR(err); | ||
6267 | } | ||
6268 | } | ||
4948 | } | 6269 | } |
4949 | 6270 | ||
4950 | return event; | 6271 | return event; |
@@ -5092,17 +6413,21 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5092 | struct perf_event_attr __user *, attr_uptr, | 6413 | struct perf_event_attr __user *, attr_uptr, |
5093 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) | 6414 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) |
5094 | { | 6415 | { |
5095 | struct perf_event *event, *group_leader = NULL, *output_event = NULL; | 6416 | struct perf_event *group_leader = NULL, *output_event = NULL; |
6417 | struct perf_event *event, *sibling; | ||
5096 | struct perf_event_attr attr; | 6418 | struct perf_event_attr attr; |
5097 | struct perf_event_context *ctx; | 6419 | struct perf_event_context *ctx; |
5098 | struct file *event_file = NULL; | 6420 | struct file *event_file = NULL; |
5099 | struct file *group_file = NULL; | 6421 | struct file *group_file = NULL; |
6422 | struct task_struct *task = NULL; | ||
6423 | struct pmu *pmu; | ||
5100 | int event_fd; | 6424 | int event_fd; |
6425 | int move_group = 0; | ||
5101 | int fput_needed = 0; | 6426 | int fput_needed = 0; |
5102 | int err; | 6427 | int err; |
5103 | 6428 | ||
5104 | /* for future expandability... */ | 6429 | /* for future expandability... */ |
5105 | if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) | 6430 | if (flags & ~PERF_FLAG_ALL) |
5106 | return -EINVAL; | 6431 | return -EINVAL; |
5107 | 6432 | ||
5108 | err = perf_copy_attr(attr_uptr, &attr); | 6433 | err = perf_copy_attr(attr_uptr, &attr); |
@@ -5119,24 +6444,24 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5119 | return -EINVAL; | 6444 | return -EINVAL; |
5120 | } | 6445 | } |
5121 | 6446 | ||
6447 | /* | ||
6448 | * In cgroup mode, the pid argument is used to pass the fd | ||
6449 | * opened to the cgroup directory in cgroupfs. The cpu argument | ||
6450 | * designates the cpu on which to monitor threads from that | ||
6451 | * cgroup. | ||
6452 | */ | ||
6453 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) | ||
6454 | return -EINVAL; | ||
6455 | |||
5122 | event_fd = get_unused_fd_flags(O_RDWR); | 6456 | event_fd = get_unused_fd_flags(O_RDWR); |
5123 | if (event_fd < 0) | 6457 | if (event_fd < 0) |
5124 | return event_fd; | 6458 | return event_fd; |
5125 | 6459 | ||
5126 | /* | ||
5127 | * Get the target context (task or percpu): | ||
5128 | */ | ||
5129 | ctx = find_get_context(pid, cpu); | ||
5130 | if (IS_ERR(ctx)) { | ||
5131 | err = PTR_ERR(ctx); | ||
5132 | goto err_fd; | ||
5133 | } | ||
5134 | |||
5135 | if (group_fd != -1) { | 6460 | if (group_fd != -1) { |
5136 | group_leader = perf_fget_light(group_fd, &fput_needed); | 6461 | group_leader = perf_fget_light(group_fd, &fput_needed); |
5137 | if (IS_ERR(group_leader)) { | 6462 | if (IS_ERR(group_leader)) { |
5138 | err = PTR_ERR(group_leader); | 6463 | err = PTR_ERR(group_leader); |
5139 | goto err_put_context; | 6464 | goto err_fd; |
5140 | } | 6465 | } |
5141 | group_file = group_leader->filp; | 6466 | group_file = group_leader->filp; |
5142 | if (flags & PERF_FLAG_FD_OUTPUT) | 6467 | if (flags & PERF_FLAG_FD_OUTPUT) |
@@ -5145,6 +6470,76 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5145 | group_leader = NULL; | 6470 | group_leader = NULL; |
5146 | } | 6471 | } |
5147 | 6472 | ||
6473 | if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { | ||
6474 | task = find_lively_task_by_vpid(pid); | ||
6475 | if (IS_ERR(task)) { | ||
6476 | err = PTR_ERR(task); | ||
6477 | goto err_group_fd; | ||
6478 | } | ||
6479 | } | ||
6480 | |||
6481 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); | ||
6482 | if (IS_ERR(event)) { | ||
6483 | err = PTR_ERR(event); | ||
6484 | goto err_task; | ||
6485 | } | ||
6486 | |||
6487 | if (flags & PERF_FLAG_PID_CGROUP) { | ||
6488 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | ||
6489 | if (err) | ||
6490 | goto err_alloc; | ||
6491 | /* | ||
6492 | * one more event: | ||
6493 | * - that has cgroup constraint on event->cpu | ||
6494 | * - that may need work on context switch | ||
6495 | */ | ||
6496 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | ||
6497 | jump_label_inc(&perf_sched_events); | ||
6498 | } | ||
6499 | |||
6500 | /* | ||
6501 | * Special case software events and allow them to be part of | ||
6502 | * any hardware group. | ||
6503 | */ | ||
6504 | pmu = event->pmu; | ||
6505 | |||
6506 | if (group_leader && | ||
6507 | (is_software_event(event) != is_software_event(group_leader))) { | ||
6508 | if (is_software_event(event)) { | ||
6509 | /* | ||
6510 | * If event and group_leader are not both a software | ||
6511 | * event, and event is, then group leader is not. | ||
6512 | * | ||
6513 | * Allow the addition of software events to !software | ||
6514 | * groups, this is safe because software events never | ||
6515 | * fail to schedule. | ||
6516 | */ | ||
6517 | pmu = group_leader->pmu; | ||
6518 | } else if (is_software_event(group_leader) && | ||
6519 | (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { | ||
6520 | /* | ||
6521 | * In case the group is a pure software group, and we | ||
6522 | * try to add a hardware event, move the whole group to | ||
6523 | * the hardware context. | ||
6524 | */ | ||
6525 | move_group = 1; | ||
6526 | } | ||
6527 | } | ||
6528 | |||
6529 | /* | ||
6530 | * Get the target context (task or percpu): | ||
6531 | */ | ||
6532 | ctx = find_get_context(pmu, task, cpu); | ||
6533 | if (IS_ERR(ctx)) { | ||
6534 | err = PTR_ERR(ctx); | ||
6535 | goto err_alloc; | ||
6536 | } | ||
6537 | |||
6538 | if (task) { | ||
6539 | put_task_struct(task); | ||
6540 | task = NULL; | ||
6541 | } | ||
6542 | |||
5148 | /* | 6543 | /* |
5149 | * Look up the group leader (we will attach this event to it): | 6544 | * Look up the group leader (we will attach this event to it): |
5150 | */ | 6545 | */ |
@@ -5156,53 +6551,84 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5156 | * becoming part of another group-sibling): | 6551 | * becoming part of another group-sibling): |
5157 | */ | 6552 | */ |
5158 | if (group_leader->group_leader != group_leader) | 6553 | if (group_leader->group_leader != group_leader) |
5159 | goto err_put_context; | 6554 | goto err_context; |
5160 | /* | 6555 | /* |
5161 | * Do not allow attaching to a group in a different | 6556 | * Do not allow attaching to a group in a different |
5162 | * task or CPU context: | 6557 | * task or CPU context: |
5163 | */ | 6558 | */ |
5164 | if (group_leader->ctx != ctx) | 6559 | if (move_group) { |
5165 | goto err_put_context; | 6560 | if (group_leader->ctx->type != ctx->type) |
6561 | goto err_context; | ||
6562 | } else { | ||
6563 | if (group_leader->ctx != ctx) | ||
6564 | goto err_context; | ||
6565 | } | ||
6566 | |||
5166 | /* | 6567 | /* |
5167 | * Only a group leader can be exclusive or pinned | 6568 | * Only a group leader can be exclusive or pinned |
5168 | */ | 6569 | */ |
5169 | if (attr.exclusive || attr.pinned) | 6570 | if (attr.exclusive || attr.pinned) |
5170 | goto err_put_context; | 6571 | goto err_context; |
5171 | } | ||
5172 | |||
5173 | event = perf_event_alloc(&attr, cpu, ctx, group_leader, | ||
5174 | NULL, NULL, GFP_KERNEL); | ||
5175 | if (IS_ERR(event)) { | ||
5176 | err = PTR_ERR(event); | ||
5177 | goto err_put_context; | ||
5178 | } | 6572 | } |
5179 | 6573 | ||
5180 | if (output_event) { | 6574 | if (output_event) { |
5181 | err = perf_event_set_output(event, output_event); | 6575 | err = perf_event_set_output(event, output_event); |
5182 | if (err) | 6576 | if (err) |
5183 | goto err_free_put_context; | 6577 | goto err_context; |
5184 | } | 6578 | } |
5185 | 6579 | ||
5186 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); | 6580 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); |
5187 | if (IS_ERR(event_file)) { | 6581 | if (IS_ERR(event_file)) { |
5188 | err = PTR_ERR(event_file); | 6582 | err = PTR_ERR(event_file); |
5189 | goto err_free_put_context; | 6583 | goto err_context; |
6584 | } | ||
6585 | |||
6586 | if (move_group) { | ||
6587 | struct perf_event_context *gctx = group_leader->ctx; | ||
6588 | |||
6589 | mutex_lock(&gctx->mutex); | ||
6590 | perf_remove_from_context(group_leader); | ||
6591 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
6592 | group_entry) { | ||
6593 | perf_remove_from_context(sibling); | ||
6594 | put_ctx(gctx); | ||
6595 | } | ||
6596 | mutex_unlock(&gctx->mutex); | ||
6597 | put_ctx(gctx); | ||
5190 | } | 6598 | } |
5191 | 6599 | ||
5192 | event->filp = event_file; | 6600 | event->filp = event_file; |
5193 | WARN_ON_ONCE(ctx->parent_ctx); | 6601 | WARN_ON_ONCE(ctx->parent_ctx); |
5194 | mutex_lock(&ctx->mutex); | 6602 | mutex_lock(&ctx->mutex); |
6603 | |||
6604 | if (move_group) { | ||
6605 | perf_install_in_context(ctx, group_leader, cpu); | ||
6606 | get_ctx(ctx); | ||
6607 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
6608 | group_entry) { | ||
6609 | perf_install_in_context(ctx, sibling, cpu); | ||
6610 | get_ctx(ctx); | ||
6611 | } | ||
6612 | } | ||
6613 | |||
5195 | perf_install_in_context(ctx, event, cpu); | 6614 | perf_install_in_context(ctx, event, cpu); |
5196 | ++ctx->generation; | 6615 | ++ctx->generation; |
6616 | perf_unpin_context(ctx); | ||
5197 | mutex_unlock(&ctx->mutex); | 6617 | mutex_unlock(&ctx->mutex); |
5198 | 6618 | ||
5199 | event->owner = current; | 6619 | event->owner = current; |
5200 | get_task_struct(current); | 6620 | |
5201 | mutex_lock(&current->perf_event_mutex); | 6621 | mutex_lock(&current->perf_event_mutex); |
5202 | list_add_tail(&event->owner_entry, &current->perf_event_list); | 6622 | list_add_tail(&event->owner_entry, &current->perf_event_list); |
5203 | mutex_unlock(&current->perf_event_mutex); | 6623 | mutex_unlock(&current->perf_event_mutex); |
5204 | 6624 | ||
5205 | /* | 6625 | /* |
6626 | * Precalculate sample_data sizes | ||
6627 | */ | ||
6628 | perf_event__header_size(event); | ||
6629 | perf_event__id_header_size(event); | ||
6630 | |||
6631 | /* | ||
5206 | * Drop the reference on the group_event after placing the | 6632 | * Drop the reference on the group_event after placing the |
5207 | * new event on the sibling_list. This ensures destruction | 6633 | * new event on the sibling_list. This ensures destruction |
5208 | * of the group leader will find the pointer to itself in | 6634 | * of the group leader will find the pointer to itself in |
@@ -5212,11 +6638,16 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5212 | fd_install(event_fd, event_file); | 6638 | fd_install(event_fd, event_file); |
5213 | return event_fd; | 6639 | return event_fd; |
5214 | 6640 | ||
5215 | err_free_put_context: | 6641 | err_context: |
6642 | perf_unpin_context(ctx); | ||
6643 | put_ctx(ctx); | ||
6644 | err_alloc: | ||
5216 | free_event(event); | 6645 | free_event(event); |
5217 | err_put_context: | 6646 | err_task: |
6647 | if (task) | ||
6648 | put_task_struct(task); | ||
6649 | err_group_fd: | ||
5218 | fput_light(group_file, fput_needed); | 6650 | fput_light(group_file, fput_needed); |
5219 | put_ctx(ctx); | ||
5220 | err_fd: | 6651 | err_fd: |
5221 | put_unused_fd(event_fd); | 6652 | put_unused_fd(event_fd); |
5222 | return err; | 6653 | return err; |
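The PERF_FLAG_PID_CGROUP path added above repurposes the syscall arguments: pid carries a file descriptor for a cgroup directory and cpu selects the CPU to monitor (both must be valid, as the -EINVAL check shows). A hedged userspace sketch of that calling convention follows; the cgroupfs mount point and the choice of a cpu-clock software event are assumptions, not something this hunk prescribes.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/*
 * Userspace sketch of cgroup-mode perf_event_open(): pid carries an fd
 * for the cgroup directory, cpu picks the CPU to monitor. The cgroupfs
 * path below is an assumption about how the perf_event controller is
 * mounted on a given system.
 */
int main(void)
{
	struct perf_event_attr attr;
	int cgrp_fd, ev_fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	if (cgrp_fd < 0) {
		perror("open cgroup");
		return 1;
	}

	/* pid = cgroup fd, cpu = 0, group_fd = -1, flags = cgroup mode. */
	ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
			PERF_FLAG_PID_CGROUP);
	if (ev_fd < 0) {
		perror("perf_event_open");
		close(cgrp_fd);
		return 1;
	}

	close(ev_fd);
	close(cgrp_fd);
	return 0;
}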
@@ -5227,32 +6658,31 @@ err_fd: | |||
5227 | * | 6658 | * |
5228 | * @attr: attributes of the counter to create | 6659 | * @attr: attributes of the counter to create |
5229 | * @cpu: cpu in which the counter is bound | 6660 | * @cpu: cpu in which the counter is bound |
5230 | * @pid: task to profile | 6661 | * @task: task to profile (NULL for percpu) |
5231 | */ | 6662 | */ |
5232 | struct perf_event * | 6663 | struct perf_event * |
5233 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | 6664 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, |
5234 | pid_t pid, | 6665 | struct task_struct *task, |
5235 | perf_overflow_handler_t overflow_handler) | 6666 | perf_overflow_handler_t overflow_handler) |
5236 | { | 6667 | { |
5237 | struct perf_event *event; | ||
5238 | struct perf_event_context *ctx; | 6668 | struct perf_event_context *ctx; |
6669 | struct perf_event *event; | ||
5239 | int err; | 6670 | int err; |
5240 | 6671 | ||
5241 | /* | 6672 | /* |
5242 | * Get the target context (task or percpu): | 6673 | * Get the target context (task or percpu): |
5243 | */ | 6674 | */ |
5244 | 6675 | ||
5245 | ctx = find_get_context(pid, cpu); | 6676 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); |
5246 | if (IS_ERR(ctx)) { | ||
5247 | err = PTR_ERR(ctx); | ||
5248 | goto err_exit; | ||
5249 | } | ||
5250 | |||
5251 | event = perf_event_alloc(attr, cpu, ctx, NULL, | ||
5252 | NULL, overflow_handler, GFP_KERNEL); | ||
5253 | if (IS_ERR(event)) { | 6677 | if (IS_ERR(event)) { |
5254 | err = PTR_ERR(event); | 6678 | err = PTR_ERR(event); |
5255 | goto err_put_context; | 6679 | goto err; |
6680 | } | ||
6681 | |||
6682 | ctx = find_get_context(event->pmu, task, cpu); | ||
6683 | if (IS_ERR(ctx)) { | ||
6684 | err = PTR_ERR(ctx); | ||
6685 | goto err_free; | ||
5256 | } | 6686 | } |
5257 | 6687 | ||
5258 | event->filp = NULL; | 6688 | event->filp = NULL; |
@@ -5260,122 +6690,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
5260 | mutex_lock(&ctx->mutex); | 6690 | mutex_lock(&ctx->mutex); |
5261 | perf_install_in_context(ctx, event, cpu); | 6691 | perf_install_in_context(ctx, event, cpu); |
5262 | ++ctx->generation; | 6692 | ++ctx->generation; |
6693 | perf_unpin_context(ctx); | ||
5263 | mutex_unlock(&ctx->mutex); | 6694 | mutex_unlock(&ctx->mutex); |
5264 | 6695 | ||
5265 | event->owner = current; | ||
5266 | get_task_struct(current); | ||
5267 | mutex_lock(&current->perf_event_mutex); |||
5268 | list_add_tail(&event->owner_entry, &current->perf_event_list); |||
5269 | mutex_unlock(&current->perf_event_mutex); |||
5270 | |||
5271 | return event; | 6696 | return event; |
5272 | 6697 | ||
5273 | err_put_context: | 6698 | err_free: |
5274 | put_ctx(ctx); | 6699 | free_event(event); |
5275 | err_exit: | 6700 | err: |
5276 | return ERR_PTR(err); | 6701 | return ERR_PTR(err); |
5277 | } | 6702 | } |
5278 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); | 6703 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); |
5279 | 6704 | ||
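perf_event_create_kernel_counter() now takes the target task directly (NULL for a counter bound to @cpu) and no longer adds the event to current's owner list. The sketch below shows roughly how an in-kernel caller could use the new signature; the demo_* names, the sample period, and the perf_event_release_kernel() teardown are assumptions about typical usage, and the overflow handler keeps the nmi argument that this tree's perf_overflow_handler_t still carries.

#include <linux/perf_event.h>
#include <linux/err.h>

/*
 * Illustrative in-kernel user of the reworked
 * perf_event_create_kernel_counter(): task == NULL requests a per-CPU
 * counter bound to @cpu. demo_* names are made up.
 */
static void demo_overflow(struct perf_event *event, int nmi,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	/* Runs on counter overflow; may be called from NMI context. */
}

static struct perf_event *demo_counter_start(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.config		= PERF_COUNT_SW_CPU_CLOCK,
		.size		= sizeof(struct perf_event_attr),
		.sample_period	= 1000000,	/* fire every 1ms of CPU clock */
	};
	struct perf_event *event;

	event = perf_event_create_kernel_counter(&attr, cpu, NULL /* task */,
						 demo_overflow);
	return IS_ERR(event) ? NULL : event;
}

static void demo_counter_stop(struct perf_event *event)
{
	if (event)
		perf_event_release_kernel(event);	/* assumed teardown path */
}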
5280 | /* | ||
5281 | * inherit an event from parent task to child task: |||
5282 | */ | ||
5283 | static struct perf_event * | ||
5284 | inherit_event(struct perf_event *parent_event, | ||
5285 | struct task_struct *parent, | ||
5286 | struct perf_event_context *parent_ctx, | ||
5287 | struct task_struct *child, | ||
5288 | struct perf_event *group_leader, | ||
5289 | struct perf_event_context *child_ctx) | ||
5290 | { | ||
5291 | struct perf_event *child_event; | ||
5292 | |||
5293 | /* | ||
5294 | * Instead of creating recursive hierarchies of events, | ||
5295 | * we link inherited events back to the original parent, | ||
5296 | * which has a filp for sure, which we use as the reference | ||
5297 | * count: | ||
5298 | */ | ||
5299 | if (parent_event->parent) | ||
5300 | parent_event = parent_event->parent; | ||
5301 | |||
5302 | child_event = perf_event_alloc(&parent_event->attr, | ||
5303 | parent_event->cpu, child_ctx, | ||
5304 | group_leader, parent_event, | ||
5305 | NULL, GFP_KERNEL); | ||
5306 | if (IS_ERR(child_event)) | ||
5307 | return child_event; | ||
5308 | get_ctx(child_ctx); | ||
5309 | |||
5310 | /* | ||
5311 | * Make the child state follow the state of the parent event, | ||
5312 | * not its attr.disabled bit. We hold the parent's mutex, | ||
5313 | * so we won't race with perf_event_{en, dis}able_family. | ||
5314 | */ | ||
5315 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
5316 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
5317 | else | ||
5318 | child_event->state = PERF_EVENT_STATE_OFF; | ||
5319 | |||
5320 | if (parent_event->attr.freq) { | ||
5321 | u64 sample_period = parent_event->hw.sample_period; | ||
5322 | struct hw_perf_event *hwc = &child_event->hw; | ||
5323 | |||
5324 | hwc->sample_period = sample_period; | ||
5325 | hwc->last_period = sample_period; | ||
5326 | |||
5327 | local64_set(&hwc->period_left, sample_period); | ||
5328 | } | ||
5329 | |||
5330 | child_event->overflow_handler = parent_event->overflow_handler; | ||
5331 | |||
5332 | /* | ||
5333 | * Link it up in the child's context: | ||
5334 | */ | ||
5335 | add_event_to_ctx(child_event, child_ctx); | ||
5336 | |||
5337 | /* | ||
5338 | * Get a reference to the parent filp - we will fput it | ||
5339 | * when the child event exits. This is safe to do because | ||
5340 | * we are in the parent and we know that the filp still | ||
5341 | * exists and has a nonzero count: | ||
5342 | */ | ||
5343 | atomic_long_inc(&parent_event->filp->f_count); | ||
5344 | |||
5345 | /* | ||
5346 | * Link this into the parent event's child list | ||
5347 | */ | ||
5348 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
5349 | mutex_lock(&parent_event->child_mutex); | ||
5350 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
5351 | mutex_unlock(&parent_event->child_mutex); | ||
5352 | |||
5353 | return child_event; | ||
5354 | } | ||
5355 | |||
5356 | static int inherit_group(struct perf_event *parent_event, | ||
5357 | struct task_struct *parent, | ||
5358 | struct perf_event_context *parent_ctx, | ||
5359 | struct task_struct *child, | ||
5360 | struct perf_event_context *child_ctx) | ||
5361 | { | ||
5362 | struct perf_event *leader; | ||
5363 | struct perf_event *sub; | ||
5364 | struct perf_event *child_ctr; | ||
5365 | |||
5366 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
5367 | child, NULL, child_ctx); | ||
5368 | if (IS_ERR(leader)) | ||
5369 | return PTR_ERR(leader); | ||
5370 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
5371 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
5372 | child, leader, child_ctx); | ||
5373 | if (IS_ERR(child_ctr)) | ||
5374 | return PTR_ERR(child_ctr); | ||
5375 | } | ||
5376 | return 0; | ||
5377 | } | ||
5378 | |||
5379 | static void sync_child_event(struct perf_event *child_event, | 6705 | static void sync_child_event(struct perf_event *child_event, |
5380 | struct task_struct *child) | 6706 | struct task_struct *child) |
5381 | { | 6707 | { |
@@ -5416,32 +6742,32 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
5416 | struct perf_event_context *child_ctx, | 6742 | struct perf_event_context *child_ctx, |
5417 | struct task_struct *child) | 6743 | struct task_struct *child) |
5418 | { | 6744 | { |
5419 | struct perf_event *parent_event; | 6745 | if (child_event->parent) { |
6746 | raw_spin_lock_irq(&child_ctx->lock); | ||
6747 | perf_group_detach(child_event); | ||
6748 | raw_spin_unlock_irq(&child_ctx->lock); | ||
6749 | } | ||
5420 | 6750 | ||
5421 | perf_event_remove_from_context(child_event); | 6751 | perf_remove_from_context(child_event); |
5422 | 6752 | ||
5423 | parent_event = child_event->parent; | ||
5424 | /* | 6753 | /* |
5425 | * It can happen that parent exits first, and has events | 6754 | * It can happen that the parent exits first, and has events |
5426 | * that are still around due to the child reference. These | 6755 | * that are still around due to the child reference. These |
5427 | * events need to be zapped - but otherwise linger. | 6756 | * events need to be zapped. |
5428 | */ | 6757 | */ |
5429 | if (parent_event) { | 6758 | if (child_event->parent) { |
5430 | sync_child_event(child_event, child); | 6759 | sync_child_event(child_event, child); |
5431 | free_event(child_event); | 6760 | free_event(child_event); |
5432 | } | 6761 | } |
5433 | } | 6762 | } |
5434 | 6763 | ||
5435 | /* | 6764 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
5436 | * When a child task exits, feed back event values to parent events. | ||
5437 | */ | ||
5438 | void perf_event_exit_task(struct task_struct *child) | ||
5439 | { | 6765 | { |
5440 | struct perf_event *child_event, *tmp; | 6766 | struct perf_event *child_event, *tmp; |
5441 | struct perf_event_context *child_ctx; | 6767 | struct perf_event_context *child_ctx; |
5442 | unsigned long flags; | 6768 | unsigned long flags; |
5443 | 6769 | ||
5444 | if (likely(!child->perf_event_ctxp)) { | 6770 | if (likely(!child->perf_event_ctxp[ctxn])) { |
5445 | perf_event_task(child, NULL, 0); | 6771 | perf_event_task(child, NULL, 0); |
5446 | return; | 6772 | return; |
5447 | } | 6773 | } |
@@ -5453,8 +6779,8 @@ void perf_event_exit_task(struct task_struct *child) | |||
5453 | * scheduled, so we are now safe from rescheduling changing | 6779 | * scheduled, so we are now safe from rescheduling changing |
5454 | * our context. | 6780 | * our context. |
5455 | */ | 6781 | */ |
5456 | child_ctx = child->perf_event_ctxp; | 6782 | child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); |
5457 | __perf_event_task_sched_out(child_ctx); | 6783 | task_ctx_sched_out(child_ctx, EVENT_ALL); |
5458 | 6784 | ||
5459 | /* | 6785 | /* |
5460 | * Take the context lock here so that if find_get_context is | 6786 | * Take the context lock here so that if find_get_context is |
@@ -5462,7 +6788,7 @@ void perf_event_exit_task(struct task_struct *child) | |||
5462 | * incremented the context's refcount before we do put_ctx below. | 6788 | * incremented the context's refcount before we do put_ctx below. |
5463 | */ | 6789 | */ |
5464 | raw_spin_lock(&child_ctx->lock); | 6790 | raw_spin_lock(&child_ctx->lock); |
5465 | child->perf_event_ctxp = NULL; | 6791 | child->perf_event_ctxp[ctxn] = NULL; |
5466 | /* | 6792 | /* |
5467 | * If this context is a clone; unclone it so it can't get | 6793 | * If this context is a clone; unclone it so it can't get |
5468 | * swapped to another process while we're removing all | 6794 | * swapped to another process while we're removing all |
@@ -5515,6 +6841,33 @@ again: | |||
5515 | put_ctx(child_ctx); | 6841 | put_ctx(child_ctx); |
5516 | } | 6842 | } |
5517 | 6843 | ||
6844 | /* | ||
6845 | * When a child task exits, feed back event values to parent events. | ||
6846 | */ | ||
6847 | void perf_event_exit_task(struct task_struct *child) | ||
6848 | { | ||
6849 | struct perf_event *event, *tmp; | ||
6850 | int ctxn; | ||
6851 | |||
6852 | mutex_lock(&child->perf_event_mutex); | ||
6853 | list_for_each_entry_safe(event, tmp, &child->perf_event_list, | ||
6854 | owner_entry) { | ||
6855 | list_del_init(&event->owner_entry); | ||
6856 | |||
6857 | /* | ||
6858 | * Ensure the list deletion is visible before we clear | ||
6859 | * the owner, closes a race against perf_release() where | ||
6860 | * we need to serialize on the owner->perf_event_mutex. | ||
6861 | */ | ||
6862 | smp_wmb(); | ||
6863 | event->owner = NULL; | ||
6864 | } | ||
6865 | mutex_unlock(&child->perf_event_mutex); | ||
6866 | |||
6867 | for_each_task_context_nr(ctxn) | ||
6868 | perf_event_exit_task_context(child, ctxn); | ||
6869 | } | ||
6870 | |||
5518 | static void perf_free_event(struct perf_event *event, | 6871 | static void perf_free_event(struct perf_event *event, |
5519 | struct perf_event_context *ctx) | 6872 | struct perf_event_context *ctx) |
5520 | { | 6873 | { |
@@ -5536,48 +6889,172 @@ static void perf_free_event(struct perf_event *event, | |||
5536 | 6889 | ||
5537 | /* | 6890 | /* |
5538 | * free an unexposed, unused context as created by inheritance by | 6891 | * free an unexposed, unused context as created by inheritance by |
5539 | * init_task below, used by fork() in case of fail. | 6892 | * perf_event_init_task below, used by fork() in case of fail. |
5540 | */ | 6893 | */ |
5541 | void perf_event_free_task(struct task_struct *task) | 6894 | void perf_event_free_task(struct task_struct *task) |
5542 | { | 6895 | { |
5543 | struct perf_event_context *ctx = task->perf_event_ctxp; | 6896 | struct perf_event_context *ctx; |
5544 | struct perf_event *event, *tmp; | 6897 | struct perf_event *event, *tmp; |
6898 | int ctxn; | ||
5545 | 6899 | ||
5546 | if (!ctx) | 6900 | for_each_task_context_nr(ctxn) { |
5547 | return; | 6901 | ctx = task->perf_event_ctxp[ctxn]; |
6902 | if (!ctx) | ||
6903 | continue; | ||
5548 | 6904 | ||
5549 | mutex_lock(&ctx->mutex); | 6905 | mutex_lock(&ctx->mutex); |
5550 | again: | 6906 | again: |
5551 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 6907 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, |
5552 | perf_free_event(event, ctx); | 6908 | group_entry) |
6909 | perf_free_event(event, ctx); | ||
5553 | 6910 | ||
5554 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, | 6911 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, |
5555 | group_entry) | 6912 | group_entry) |
5556 | perf_free_event(event, ctx); | 6913 | perf_free_event(event, ctx); |
5557 | 6914 | ||
5558 | if (!list_empty(&ctx->pinned_groups) || | 6915 | if (!list_empty(&ctx->pinned_groups) || |
5559 | !list_empty(&ctx->flexible_groups)) | 6916 | !list_empty(&ctx->flexible_groups)) |
5560 | goto again; | 6917 | goto again; |
5561 | 6918 | ||
5562 | mutex_unlock(&ctx->mutex); | 6919 | mutex_unlock(&ctx->mutex); |
5563 | 6920 | ||
5564 | put_ctx(ctx); | 6921 | put_ctx(ctx); |
6922 | } | ||
6923 | } | ||
6924 | |||
6925 | void perf_event_delayed_put(struct task_struct *task) | ||
6926 | { | ||
6927 | int ctxn; | ||
6928 | |||
6929 | for_each_task_context_nr(ctxn) | ||
6930 | WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); | ||
6931 | } | ||
6932 | |||
6933 | /* | ||
6934 | * inherit an event from parent task to child task: |||
6935 | */ | ||
6936 | static struct perf_event * | ||
6937 | inherit_event(struct perf_event *parent_event, | ||
6938 | struct task_struct *parent, | ||
6939 | struct perf_event_context *parent_ctx, | ||
6940 | struct task_struct *child, | ||
6941 | struct perf_event *group_leader, | ||
6942 | struct perf_event_context *child_ctx) | ||
6943 | { | ||
6944 | struct perf_event *child_event; | ||
6945 | unsigned long flags; | ||
6946 | |||
6947 | /* | ||
6948 | * Instead of creating recursive hierarchies of events, | ||
6949 | * we link inherited events back to the original parent, | ||
6950 | * which has a filp for sure, which we use as the reference | ||
6951 | * count: | ||
6952 | */ | ||
6953 | if (parent_event->parent) | ||
6954 | parent_event = parent_event->parent; | ||
6955 | |||
6956 | child_event = perf_event_alloc(&parent_event->attr, | ||
6957 | parent_event->cpu, | ||
6958 | child, | ||
6959 | group_leader, parent_event, | ||
6960 | NULL); | ||
6961 | if (IS_ERR(child_event)) | ||
6962 | return child_event; | ||
6963 | get_ctx(child_ctx); | ||
6964 | |||
6965 | /* | ||
6966 | * Make the child state follow the state of the parent event, | ||
6967 | * not its attr.disabled bit. We hold the parent's mutex, | ||
6968 | * so we won't race with perf_event_{en, dis}able_family. | ||
6969 | */ | ||
6970 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
6971 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
6972 | else | ||
6973 | child_event->state = PERF_EVENT_STATE_OFF; | ||
6974 | |||
6975 | if (parent_event->attr.freq) { | ||
6976 | u64 sample_period = parent_event->hw.sample_period; | ||
6977 | struct hw_perf_event *hwc = &child_event->hw; | ||
6978 | |||
6979 | hwc->sample_period = sample_period; | ||
6980 | hwc->last_period = sample_period; | ||
6981 | |||
6982 | local64_set(&hwc->period_left, sample_period); | ||
6983 | } | ||
6984 | |||
6985 | child_event->ctx = child_ctx; | ||
6986 | child_event->overflow_handler = parent_event->overflow_handler; | ||
6987 | |||
6988 | /* | ||
6989 | * Precalculate sample_data sizes | ||
6990 | */ | ||
6991 | perf_event__header_size(child_event); | ||
6992 | perf_event__id_header_size(child_event); | ||
6993 | |||
6994 | /* | ||
6995 | * Link it up in the child's context: | ||
6996 | */ | ||
6997 | raw_spin_lock_irqsave(&child_ctx->lock, flags); | ||
6998 | add_event_to_ctx(child_event, child_ctx); | ||
6999 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | ||
7000 | |||
7001 | /* | ||
7002 | * Get a reference to the parent filp - we will fput it | ||
7003 | * when the child event exits. This is safe to do because | ||
7004 | * we are in the parent and we know that the filp still | ||
7005 | * exists and has a nonzero count: | ||
7006 | */ | ||
7007 | atomic_long_inc(&parent_event->filp->f_count); | ||
7008 | |||
7009 | /* | ||
7010 | * Link this into the parent event's child list | ||
7011 | */ | ||
7012 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
7013 | mutex_lock(&parent_event->child_mutex); | ||
7014 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
7015 | mutex_unlock(&parent_event->child_mutex); | ||
7016 | |||
7017 | return child_event; | ||
7018 | } | ||
7019 | |||
7020 | static int inherit_group(struct perf_event *parent_event, | ||
7021 | struct task_struct *parent, | ||
7022 | struct perf_event_context *parent_ctx, | ||
7023 | struct task_struct *child, | ||
7024 | struct perf_event_context *child_ctx) | ||
7025 | { | ||
7026 | struct perf_event *leader; | ||
7027 | struct perf_event *sub; | ||
7028 | struct perf_event *child_ctr; | ||
7029 | |||
7030 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
7031 | child, NULL, child_ctx); | ||
7032 | if (IS_ERR(leader)) | ||
7033 | return PTR_ERR(leader); | ||
7034 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
7035 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
7036 | child, leader, child_ctx); | ||
7037 | if (IS_ERR(child_ctr)) | ||
7038 | return PTR_ERR(child_ctr); | ||
7039 | } | ||
7040 | return 0; | ||
5565 | } | 7041 | } |
5566 | 7042 | ||
5567 | static int | 7043 | static int |
5568 | inherit_task_group(struct perf_event *event, struct task_struct *parent, | 7044 | inherit_task_group(struct perf_event *event, struct task_struct *parent, |
5569 | struct perf_event_context *parent_ctx, | 7045 | struct perf_event_context *parent_ctx, |
5570 | struct task_struct *child, | 7046 | struct task_struct *child, int ctxn, |
5571 | int *inherited_all) | 7047 | int *inherited_all) |
5572 | { | 7048 | { |
5573 | int ret; | 7049 | int ret; |
5574 | struct perf_event_context *child_ctx = child->perf_event_ctxp; | 7050 | struct perf_event_context *child_ctx; |
5575 | 7051 | ||
5576 | if (!event->attr.inherit) { | 7052 | if (!event->attr.inherit) { |
5577 | *inherited_all = 0; | 7053 | *inherited_all = 0; |
5578 | return 0; | 7054 | return 0; |
5579 | } | 7055 | } |
5580 | 7056 | ||
7057 | child_ctx = child->perf_event_ctxp[ctxn]; | ||
5581 | if (!child_ctx) { | 7058 | if (!child_ctx) { |
5582 | /* | 7059 | /* |
5583 | * This is executed from the parent task context, so | 7060 | * This is executed from the parent task context, so |
@@ -5586,14 +7063,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
5586 | * child. | 7063 | * child. |
5587 | */ | 7064 | */ |
5588 | 7065 | ||
5589 | child_ctx = kzalloc(sizeof(struct perf_event_context), | 7066 | child_ctx = alloc_perf_context(event->pmu, child); |
5590 | GFP_KERNEL); | ||
5591 | if (!child_ctx) | 7067 | if (!child_ctx) |
5592 | return -ENOMEM; | 7068 | return -ENOMEM; |
5593 | 7069 | ||
5594 | __perf_event_init_context(child_ctx, child); | 7070 | child->perf_event_ctxp[ctxn] = child_ctx; |
5595 | child->perf_event_ctxp = child_ctx; | ||
5596 | get_task_struct(child); | ||
5597 | } | 7071 | } |
5598 | 7072 | ||
5599 | ret = inherit_group(event, parent, parent_ctx, | 7073 | ret = inherit_group(event, parent, parent_ctx, |
@@ -5605,32 +7079,27 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
5605 | return ret; | 7079 | return ret; |
5606 | } | 7080 | } |
5607 | 7081 | ||
5608 | |||
5609 | /* | 7082 | /* |
5610 | * Initialize the perf_event context in task_struct | 7083 | * Initialize the perf_event context in task_struct |
5611 | */ | 7084 | */ |
5612 | int perf_event_init_task(struct task_struct *child) | 7085 | int perf_event_init_context(struct task_struct *child, int ctxn) |
5613 | { | 7086 | { |
5614 | struct perf_event_context *child_ctx, *parent_ctx; | 7087 | struct perf_event_context *child_ctx, *parent_ctx; |
5615 | struct perf_event_context *cloned_ctx; | 7088 | struct perf_event_context *cloned_ctx; |
5616 | struct perf_event *event; | 7089 | struct perf_event *event; |
5617 | struct task_struct *parent = current; | 7090 | struct task_struct *parent = current; |
5618 | int inherited_all = 1; | 7091 | int inherited_all = 1; |
7092 | unsigned long flags; | ||
5619 | int ret = 0; | 7093 | int ret = 0; |
5620 | 7094 | ||
5621 | child->perf_event_ctxp = NULL; | 7095 | if (likely(!parent->perf_event_ctxp[ctxn])) |
5622 | |||
5623 | mutex_init(&child->perf_event_mutex); | ||
5624 | INIT_LIST_HEAD(&child->perf_event_list); | ||
5625 | |||
5626 | if (likely(!parent->perf_event_ctxp)) | ||
5627 | return 0; | 7096 | return 0; |
5628 | 7097 | ||
5629 | /* | 7098 | /* |
5630 | * If the parent's context is a clone, pin it so it won't get | 7099 | * If the parent's context is a clone, pin it so it won't get |
5631 | * swapped under us. | 7100 | * swapped under us. |
5632 | */ | 7101 | */ |
5633 | parent_ctx = perf_pin_task_context(parent); | 7102 | parent_ctx = perf_pin_task_context(parent, ctxn); |
5634 | 7103 | ||
5635 | /* | 7104 | /* |
5636 | * No need to check if parent_ctx != NULL here; since we saw | 7105 | * No need to check if parent_ctx != NULL here; since we saw |
@@ -5650,31 +7119,42 @@ int perf_event_init_task(struct task_struct *child) | |||
5650 | * the list, not manipulating it: | 7119 | * the list, not manipulating it: |
5651 | */ | 7120 | */ |
5652 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { | 7121 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { |
5653 | ret = inherit_task_group(event, parent, parent_ctx, child, | 7122 | ret = inherit_task_group(event, parent, parent_ctx, |
5654 | &inherited_all); | 7123 | child, ctxn, &inherited_all); |
5655 | if (ret) | 7124 | if (ret) |
5656 | break; | 7125 | break; |
5657 | } | 7126 | } |
5658 | 7127 | ||
7128 | /* | ||
7129 | * We can't hold ctx->lock when iterating the ->flexible_group list due | ||
7130 | * to allocations, but we need to prevent rotation because | ||
7131 | * rotate_ctx() will change the list from interrupt context. | ||
7132 | */ | ||
7133 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); | ||
7134 | parent_ctx->rotate_disable = 1; | ||
7135 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
7136 | |||
5659 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 7137 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
5660 | ret = inherit_task_group(event, parent, parent_ctx, child, | 7138 | ret = inherit_task_group(event, parent, parent_ctx, |
5661 | &inherited_all); | 7139 | child, ctxn, &inherited_all); |
5662 | if (ret) | 7140 | if (ret) |
5663 | break; | 7141 | break; |
5664 | } | 7142 | } |
5665 | 7143 | ||
5666 | child_ctx = child->perf_event_ctxp; | 7144 | raw_spin_lock_irqsave(&parent_ctx->lock, flags); |
7145 | parent_ctx->rotate_disable = 0; | ||
7146 | |||
7147 | child_ctx = child->perf_event_ctxp[ctxn]; | ||
5667 | 7148 | ||
5668 | if (child_ctx && inherited_all) { | 7149 | if (child_ctx && inherited_all) { |
5669 | /* | 7150 | /* |
5670 | * Mark the child context as a clone of the parent | 7151 | * Mark the child context as a clone of the parent |
5671 | * context, or of whatever the parent is a clone of. | 7152 | * context, or of whatever the parent is a clone of. |
5672 | * Note that if the parent is a clone, it could get | 7153 | * |
5673 | * uncloned at any point, but that doesn't matter | 7154 | * Note that if the parent is a clone, the holding of |
5674 | * because the list of events and the generation | 7155 | * parent_ctx->lock prevents it from being uncloned. |
5675 | * count can't have changed since we took the mutex. | ||
5676 | */ | 7156 | */ |
5677 | cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); | 7157 | cloned_ctx = parent_ctx->parent_ctx; |
5678 | if (cloned_ctx) { | 7158 | if (cloned_ctx) { |
5679 | child_ctx->parent_ctx = cloned_ctx; | 7159 | child_ctx->parent_ctx = cloned_ctx; |
5680 | child_ctx->parent_gen = parent_ctx->parent_gen; | 7160 | child_ctx->parent_gen = parent_ctx->parent_gen; |
@@ -5685,75 +7165,136 @@ int perf_event_init_task(struct task_struct *child) | |||
5685 | get_ctx(child_ctx->parent_ctx); | 7165 | get_ctx(child_ctx->parent_ctx); |
5686 | } | 7166 | } |
5687 | 7167 | ||
7168 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
5688 | mutex_unlock(&parent_ctx->mutex); | 7169 | mutex_unlock(&parent_ctx->mutex); |
5689 | 7170 | ||
5690 | perf_unpin_context(parent_ctx); | 7171 | perf_unpin_context(parent_ctx); |
7172 | put_ctx(parent_ctx); | ||
5691 | 7173 | ||
5692 | return ret; | 7174 | return ret; |
5693 | } | 7175 | } |
5694 | 7176 | ||
7177 | /* | ||
7178 | * Initialize the perf_event context in task_struct | ||
7179 | */ | ||
7180 | int perf_event_init_task(struct task_struct *child) | ||
7181 | { | ||
7182 | int ctxn, ret; | ||
7183 | |||
7184 | memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); | ||
7185 | mutex_init(&child->perf_event_mutex); | ||
7186 | INIT_LIST_HEAD(&child->perf_event_list); | ||
7187 | |||
7188 | for_each_task_context_nr(ctxn) { | ||
7189 | ret = perf_event_init_context(child, ctxn); | ||
7190 | if (ret) | ||
7191 | return ret; | ||
7192 | } | ||
7193 | |||
7194 | return 0; | ||
7195 | } | ||
7196 | |||
5695 | static void __init perf_event_init_all_cpus(void) | 7197 | static void __init perf_event_init_all_cpus(void) |
5696 | { | 7198 | { |
7199 | struct swevent_htable *swhash; | ||
5697 | int cpu; | 7200 | int cpu; |
5698 | struct perf_cpu_context *cpuctx; | ||
5699 | 7201 | ||
5700 | for_each_possible_cpu(cpu) { | 7202 | for_each_possible_cpu(cpu) { |
5701 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 7203 | swhash = &per_cpu(swevent_htable, cpu); |
5702 | mutex_init(&cpuctx->hlist_mutex); | 7204 | mutex_init(&swhash->hlist_mutex); |
5703 | __perf_event_init_context(&cpuctx->ctx, NULL); | 7205 | INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); |
5704 | } | 7206 | } |
5705 | } | 7207 | } |
5706 | 7208 | ||
5707 | static void __cpuinit perf_event_init_cpu(int cpu) | 7209 | static void __cpuinit perf_event_init_cpu(int cpu) |
5708 | { | 7210 | { |
5709 | struct perf_cpu_context *cpuctx; | 7211 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
5710 | 7212 | ||
5711 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 7213 | mutex_lock(&swhash->hlist_mutex); |
5712 | 7214 | if (swhash->hlist_refcount > 0) { | |
5713 | spin_lock(&perf_resource_lock); | ||
5714 | cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; | ||
5715 | spin_unlock(&perf_resource_lock); | ||
5716 | |||
5717 | mutex_lock(&cpuctx->hlist_mutex); | ||
5718 | if (cpuctx->hlist_refcount > 0) { | ||
5719 | struct swevent_hlist *hlist; | 7215 | struct swevent_hlist *hlist; |
5720 | 7216 | ||
5721 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 7217 | hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); |
5722 | WARN_ON_ONCE(!hlist); | 7218 | WARN_ON(!hlist); |
5723 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 7219 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
5724 | } | 7220 | } |
5725 | mutex_unlock(&cpuctx->hlist_mutex); | 7221 | mutex_unlock(&swhash->hlist_mutex); |
5726 | } | 7222 | } |
5727 | 7223 | ||
5728 | #ifdef CONFIG_HOTPLUG_CPU | 7224 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC |
5729 | static void __perf_event_exit_cpu(void *info) | 7225 | static void perf_pmu_rotate_stop(struct pmu *pmu) |
5730 | { | 7226 | { |
5731 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 7227 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
5732 | struct perf_event_context *ctx = &cpuctx->ctx; | 7228 | |
7229 | WARN_ON(!irqs_disabled()); | ||
7230 | |||
7231 | list_del_init(&cpuctx->rotation_list); | ||
7232 | } | ||
7233 | |||
7234 | static void __perf_event_exit_context(void *__info) | ||
7235 | { | ||
7236 | struct perf_event_context *ctx = __info; | ||
5733 | struct perf_event *event, *tmp; | 7237 | struct perf_event *event, *tmp; |
5734 | 7238 | ||
7239 | perf_pmu_rotate_stop(ctx->pmu); | ||
7240 | |||
5735 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 7241 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
5736 | __perf_event_remove_from_context(event); | 7242 | __perf_remove_from_context(event); |
5737 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | 7243 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
5738 | __perf_event_remove_from_context(event); | 7244 | __perf_remove_from_context(event); |
7245 | } | ||
7246 | |||
7247 | static void perf_event_exit_cpu_context(int cpu) | ||
7248 | { | ||
7249 | struct perf_event_context *ctx; | ||
7250 | struct pmu *pmu; | ||
7251 | int idx; | ||
7252 | |||
7253 | idx = srcu_read_lock(&pmus_srcu); | ||
7254 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
7255 | ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; | ||
7256 | |||
7257 | mutex_lock(&ctx->mutex); | ||
7258 | smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); | ||
7259 | mutex_unlock(&ctx->mutex); | ||
7260 | } | ||
7261 | srcu_read_unlock(&pmus_srcu, idx); | ||
5739 | } | 7262 | } |
7263 | |||
5740 | static void perf_event_exit_cpu(int cpu) | 7264 | static void perf_event_exit_cpu(int cpu) |
5741 | { | 7265 | { |
5742 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 7266 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
5743 | struct perf_event_context *ctx = &cpuctx->ctx; | ||
5744 | 7267 | ||
5745 | mutex_lock(&cpuctx->hlist_mutex); | 7268 | mutex_lock(&swhash->hlist_mutex); |
5746 | swevent_hlist_release(cpuctx); | 7269 | swevent_hlist_release(swhash); |
5747 | mutex_unlock(&cpuctx->hlist_mutex); | 7270 | mutex_unlock(&swhash->hlist_mutex); |
5748 | 7271 | ||
5749 | mutex_lock(&ctx->mutex); | 7272 | perf_event_exit_cpu_context(cpu); |
5750 | smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); | ||
5751 | mutex_unlock(&ctx->mutex); | ||
5752 | } | 7273 | } |
5753 | #else | 7274 | #else |
5754 | static inline void perf_event_exit_cpu(int cpu) { } | 7275 | static inline void perf_event_exit_cpu(int cpu) { } |
5755 | #endif | 7276 | #endif |
5756 | 7277 | ||
7278 | static int | ||
7279 | perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) | ||
7280 | { | ||
7281 | int cpu; | ||
7282 | |||
7283 | for_each_online_cpu(cpu) | ||
7284 | perf_event_exit_cpu(cpu); | ||
7285 | |||
7286 | return NOTIFY_OK; | ||
7287 | } | ||
7288 | |||
7289 | /* | ||
7290 | * Run the perf reboot notifier at the very last possible moment so that | ||
7291 | * the generic watchdog code runs as long as possible. | ||
7292 | */ | ||
7293 | static struct notifier_block perf_reboot_notifier = { | ||
7294 | .notifier_call = perf_reboot, | ||
7295 | .priority = INT_MIN, | ||
7296 | }; | ||
7297 | |||
5757 | static int __cpuinit | 7298 | static int __cpuinit |
5758 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | 7299 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) |
5759 | { | 7300 | { |
@@ -5778,118 +7319,115 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
5778 | return NOTIFY_OK; | 7319 | return NOTIFY_OK; |
5779 | } | 7320 | } |
5780 | 7321 | ||
5781 | /* | ||
5782 | * This has to have a higher priority than migration_notifier in sched.c. | ||
5783 | */ | ||
5784 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | ||
5785 | .notifier_call = perf_cpu_notify, | ||
5786 | .priority = 20, | ||
5787 | }; | ||
5788 | |||
5789 | void __init perf_event_init(void) | 7322 | void __init perf_event_init(void) |
5790 | { | 7323 | { |
7324 | int ret; | ||
7325 | |||
7326 | idr_init(&pmu_idr); | ||
7327 | |||
5791 | perf_event_init_all_cpus(); | 7328 | perf_event_init_all_cpus(); |
5792 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | 7329 | init_srcu_struct(&pmus_srcu); |
5793 | (void *)(long)smp_processor_id()); | 7330 | perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); |
5794 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, | 7331 | perf_pmu_register(&perf_cpu_clock, NULL, -1); |
5795 | (void *)(long)smp_processor_id()); | 7332 | perf_pmu_register(&perf_task_clock, NULL, -1); |
5796 | register_cpu_notifier(&perf_cpu_nb); | 7333 | perf_tp_register(); |
5797 | } | 7334 | perf_cpu_notifier(perf_cpu_notify); |
7335 | register_reboot_notifier(&perf_reboot_notifier); | ||
5798 | 7336 | ||
5799 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, | 7337 | ret = init_hw_breakpoint(); |
5800 | struct sysdev_class_attribute *attr, | 7338 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); |
5801 | char *buf) | ||
5802 | { | ||
5803 | return sprintf(buf, "%d\n", perf_reserved_percpu); | ||
5804 | } | 7339 | } |
5805 | 7340 | ||
5806 | static ssize_t | 7341 | static int __init perf_event_sysfs_init(void) |
5807 | perf_set_reserve_percpu(struct sysdev_class *class, | ||
5808 | struct sysdev_class_attribute *attr, | ||
5809 | const char *buf, | ||
5810 | size_t count) | ||
5811 | { | 7342 | { |
5812 | struct perf_cpu_context *cpuctx; | 7343 | struct pmu *pmu; |
5813 | unsigned long val; | 7344 | int ret; |
5814 | int err, cpu, mpt; | ||
5815 | 7345 | ||
5816 | err = strict_strtoul(buf, 10, &val); | 7346 | mutex_lock(&pmus_lock); |
5817 | if (err) | 7347 | |
5818 | return err; | 7348 | ret = bus_register(&pmu_bus); |
5819 | if (val > perf_max_events) | 7349 | if (ret) |
5820 | return -EINVAL; | 7350 | goto unlock; |
7351 | |||
7352 | list_for_each_entry(pmu, &pmus, entry) { | ||
7353 | if (!pmu->name || pmu->type < 0) | ||
7354 | continue; | ||
5821 | 7355 | ||
5822 | spin_lock(&perf_resource_lock); | 7356 | ret = pmu_dev_alloc(pmu); |
5823 | perf_reserved_percpu = val; | 7357 | WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); |
5824 | for_each_online_cpu(cpu) { | ||
5825 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
5826 | raw_spin_lock_irq(&cpuctx->ctx.lock); | ||
5827 | mpt = min(perf_max_events - cpuctx->ctx.nr_events, | ||
5828 | perf_max_events - perf_reserved_percpu); | ||
5829 | cpuctx->max_pertask = mpt; | ||
5830 | raw_spin_unlock_irq(&cpuctx->ctx.lock); | ||
5831 | } | 7358 | } |
5832 | spin_unlock(&perf_resource_lock); | 7359 | pmu_bus_running = 1; |
7360 | ret = 0; | ||
5833 | 7361 | ||
5834 | return count; | 7362 | unlock: |
5835 | } | 7363 | mutex_unlock(&pmus_lock); |
5836 | 7364 | ||
5837 | static ssize_t perf_show_overcommit(struct sysdev_class *class, | 7365 | return ret; |
5838 | struct sysdev_class_attribute *attr, | ||
5839 | char *buf) | ||
5840 | { | ||
5841 | return sprintf(buf, "%d\n", perf_overcommit); | ||
5842 | } | 7366 | } |
7367 | device_initcall(perf_event_sysfs_init); | ||
5843 | 7368 | ||
5844 | static ssize_t | 7369 | #ifdef CONFIG_CGROUP_PERF |
5845 | perf_set_overcommit(struct sysdev_class *class, | 7370 | static struct cgroup_subsys_state *perf_cgroup_create( |
5846 | struct sysdev_class_attribute *attr, | 7371 | struct cgroup_subsys *ss, struct cgroup *cont) |
5847 | const char *buf, size_t count) | ||
5848 | { | 7372 | { |
5849 | unsigned long val; | 7373 | struct perf_cgroup *jc; |
5850 | int err; | ||
5851 | 7374 | ||
5852 | err = strict_strtoul(buf, 10, &val); | 7375 | jc = kzalloc(sizeof(*jc), GFP_KERNEL); |
5853 | if (err) | 7376 | if (!jc) |
5854 | return err; | 7377 | return ERR_PTR(-ENOMEM); |
5855 | if (val > 1) | ||
5856 | return -EINVAL; | ||
5857 | 7378 | ||
5858 | spin_lock(&perf_resource_lock); | 7379 | jc->info = alloc_percpu(struct perf_cgroup_info); |
5859 | perf_overcommit = val; | 7380 | if (!jc->info) { |
5860 | spin_unlock(&perf_resource_lock); | 7381 | kfree(jc); |
7382 | return ERR_PTR(-ENOMEM); | ||
7383 | } | ||
5861 | 7384 | ||
5862 | return count; | 7385 | return &jc->css; |
5863 | } | 7386 | } |
5864 | 7387 | ||
5865 | static SYSDEV_CLASS_ATTR( | 7388 | static void perf_cgroup_destroy(struct cgroup_subsys *ss, |
5866 | reserve_percpu, | 7389 | struct cgroup *cont) |
5867 | 0644, | 7390 | { |
5868 | perf_show_reserve_percpu, | 7391 | struct perf_cgroup *jc; |
5869 | perf_set_reserve_percpu | 7392 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), |
5870 | ); | 7393 | struct perf_cgroup, css); |
5871 | 7394 | free_percpu(jc->info); | |
5872 | static SYSDEV_CLASS_ATTR( | 7395 | kfree(jc); |
5873 | overcommit, | 7396 | } |
5874 | 0644, | ||
5875 | perf_show_overcommit, | ||
5876 | perf_set_overcommit | ||
5877 | ); | ||
5878 | 7397 | ||
5879 | static struct attribute *perfclass_attrs[] = { | 7398 | static int __perf_cgroup_move(void *info) |
5880 | &attr_reserve_percpu.attr, | 7399 | { |
5881 | &attr_overcommit.attr, | 7400 | struct task_struct *task = info; |
5882 | NULL | 7401 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); |
5883 | }; | 7402 | return 0; |
7403 | } | ||
5884 | 7404 | ||
5885 | static struct attribute_group perfclass_attr_group = { | 7405 | static void |
5886 | .attrs = perfclass_attrs, | 7406 | perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) |
5887 | .name = "perf_events", | 7407 | { |
5888 | }; | 7408 | task_function_call(task, __perf_cgroup_move, task); |
7409 | } | ||
5889 | 7410 | ||
5890 | static int __init perf_event_sysfs_init(void) | 7411 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
7412 | struct cgroup *old_cgrp, struct task_struct *task) | ||
5891 | { | 7413 | { |
5892 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | 7414 | /* |
5893 | &perfclass_attr_group); | 7415 | * cgroup_exit() is called in the copy_process() failure path. |
7416 | * Ignore this case since the task hasn't run yet; this avoids | ||
7417 | * trying to poke a half-freed task state from generic code. | ||
7418 | */ | ||
7419 | if (!(task->flags & PF_EXITING)) | ||
7420 | return; | ||
7421 | |||
7422 | perf_cgroup_attach_task(cgrp, task); | ||
5894 | } | 7423 | } |
5895 | device_initcall(perf_event_sysfs_init); | 7424 | |
7425 | struct cgroup_subsys perf_subsys = { | ||
7426 | .name = "perf_event", | ||
7427 | .subsys_id = perf_subsys_id, | ||
7428 | .create = perf_cgroup_create, | ||
7429 | .destroy = perf_cgroup_destroy, | ||
7430 | .exit = perf_cgroup_exit, | ||
7431 | .attach_task = perf_cgroup_attach_task, | ||
7432 | }; | ||
7433 | #endif /* CONFIG_CGROUP_PERF */ | ||
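For reference, the counters that perf_event_init() registers above ("software", cpu-clock, task-clock) are reached from userspace through the perf_event_open() syscall. A minimal, self-contained sketch of that path follows; it only assumes a Linux toolchain with <linux/perf_event.h>, and the busy loop is an arbitrary placeholder workload, not anything taken from the patch.

/*
 * Count the task-clock software event for a short busy loop.
 * perf_event_open() has no glibc wrapper, so it goes through syscall(2).
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        volatile long i;
        uint64_t count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_SOFTWARE;
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_SW_TASK_CLOCK;
        attr.disabled = 1;
        attr.exclude_kernel = 1;

        fd = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

        for (i = 0; i < 10000000; i++)
                ;                               /* placeholder workload */

        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("task-clock: %llu ns\n", (unsigned long long)count);
        close(fd);
        return 0;
}

Task-clock is a software event, so this works without PMU hardware; self-monitoring is normally permitted for unprivileged users, depending on perf_event_paranoid.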
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index c7c2aed9e2dc..086adf25a55e 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
113 | */ | 113 | */ |
114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) | 114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) |
115 | { | 115 | { |
116 | struct perf_event_context *ctx = bp->ctx; | 116 | struct task_struct *tsk = bp->hw.bp_target; |
117 | struct perf_event *iter; | 117 | struct perf_event *iter; |
118 | int count = 0; | 118 | int count = 0; |
119 | 119 | ||
120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
121 | if (iter->ctx == ctx && find_slot_idx(iter) == type) | 121 | if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) |
122 | count += hw_breakpoint_weight(iter); | 122 | count += hw_breakpoint_weight(iter); |
123 | } | 123 | } |
124 | 124 | ||
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
134 | enum bp_type_idx type) | 134 | enum bp_type_idx type) |
135 | { | 135 | { |
136 | int cpu = bp->cpu; | 136 | int cpu = bp->cpu; |
137 | struct task_struct *tsk = bp->ctx->task; | 137 | struct task_struct *tsk = bp->hw.bp_target; |
138 | 138 | ||
139 | if (cpu >= 0) { | 139 | if (cpu >= 0) { |
140 | slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); | 140 | slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); |
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, | |||
213 | int weight) | 213 | int weight) |
214 | { | 214 | { |
215 | int cpu = bp->cpu; | 215 | int cpu = bp->cpu; |
216 | struct task_struct *tsk = bp->ctx->task; | 216 | struct task_struct *tsk = bp->hw.bp_target; |
217 | 217 | ||
218 | /* Pinned counter cpu profiling */ | 218 | /* Pinned counter cpu profiling */ |
219 | if (!tsk) { | 219 | if (!tsk) { |
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, | |||
433 | perf_overflow_handler_t triggered, | 433 | perf_overflow_handler_t triggered, |
434 | struct task_struct *tsk) | 434 | struct task_struct *tsk) |
435 | { | 435 | { |
436 | return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), | 436 | return perf_event_create_kernel_counter(attr, -1, tsk, triggered); |
437 | triggered); | ||
438 | } | 437 | } |
439 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | 438 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); |
440 | 439 | ||
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, | |||
516 | get_online_cpus(); | 515 | get_online_cpus(); |
517 | for_each_online_cpu(cpu) { | 516 | for_each_online_cpu(cpu) { |
518 | pevent = per_cpu_ptr(cpu_events, cpu); | 517 | pevent = per_cpu_ptr(cpu_events, cpu); |
519 | bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); | 518 | bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); |
520 | 519 | ||
521 | *pevent = bp; | 520 | *pevent = bp; |
522 | 521 | ||
@@ -566,7 +565,62 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { | |||
566 | .priority = 0x7fffffff | 565 | .priority = 0x7fffffff |
567 | }; | 566 | }; |
568 | 567 | ||
569 | static int __init init_hw_breakpoint(void) | 568 | static void bp_perf_event_destroy(struct perf_event *event) |
569 | { | ||
570 | release_bp_slot(event); | ||
571 | } | ||
572 | |||
573 | static int hw_breakpoint_event_init(struct perf_event *bp) | ||
574 | { | ||
575 | int err; | ||
576 | |||
577 | if (bp->attr.type != PERF_TYPE_BREAKPOINT) | ||
578 | return -ENOENT; | ||
579 | |||
580 | err = register_perf_hw_breakpoint(bp); | ||
581 | if (err) | ||
582 | return err; | ||
583 | |||
584 | bp->destroy = bp_perf_event_destroy; | ||
585 | |||
586 | return 0; | ||
587 | } | ||
588 | |||
589 | static int hw_breakpoint_add(struct perf_event *bp, int flags) | ||
590 | { | ||
591 | if (!(flags & PERF_EF_START)) | ||
592 | bp->hw.state = PERF_HES_STOPPED; | ||
593 | |||
594 | return arch_install_hw_breakpoint(bp); | ||
595 | } | ||
596 | |||
597 | static void hw_breakpoint_del(struct perf_event *bp, int flags) | ||
598 | { | ||
599 | arch_uninstall_hw_breakpoint(bp); | ||
600 | } | ||
601 | |||
602 | static void hw_breakpoint_start(struct perf_event *bp, int flags) | ||
603 | { | ||
604 | bp->hw.state = 0; | ||
605 | } | ||
606 | |||
607 | static void hw_breakpoint_stop(struct perf_event *bp, int flags) | ||
608 | { | ||
609 | bp->hw.state = PERF_HES_STOPPED; | ||
610 | } | ||
611 | |||
612 | static struct pmu perf_breakpoint = { | ||
613 | .task_ctx_nr = perf_sw_context, /* could eventually get its own */ | ||
614 | |||
615 | .event_init = hw_breakpoint_event_init, | ||
616 | .add = hw_breakpoint_add, | ||
617 | .del = hw_breakpoint_del, | ||
618 | .start = hw_breakpoint_start, | ||
619 | .stop = hw_breakpoint_stop, | ||
620 | .read = hw_breakpoint_pmu_read, | ||
621 | }; | ||
622 | |||
623 | int __init init_hw_breakpoint(void) | ||
570 | { | 624 | { |
571 | unsigned int **task_bp_pinned; | 625 | unsigned int **task_bp_pinned; |
572 | int cpu, err_cpu; | 626 | int cpu, err_cpu; |
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void) | |||
587 | 641 | ||
588 | constraints_initialized = 1; | 642 | constraints_initialized = 1; |
589 | 643 | ||
644 | perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); | ||
645 | |||
590 | return register_die_notifier(&hw_breakpoint_exceptions_nb); | 646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); |
591 | 647 | ||
592 | err_alloc: | 648 | err_alloc: |
@@ -599,11 +655,5 @@ static int __init init_hw_breakpoint(void) | |||
599 | 655 | ||
600 | return -ENOMEM; | 656 | return -ENOMEM; |
601 | } | 657 | } |
602 | core_initcall(init_hw_breakpoint); | ||
603 | 658 | ||
604 | 659 | ||
605 | struct pmu perf_ops_bp = { | ||
606 | .enable = arch_install_hw_breakpoint, | ||
607 | .disable = arch_uninstall_hw_breakpoint, | ||
608 | .read = hw_breakpoint_pmu_read, | ||
609 | }; | ||
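The struct pmu above (event_init/add/del/start/stop) replaces the old perf_ops_bp enable/disable pair, and init_hw_breakpoint() now registers it under PERF_TYPE_BREAKPOINT, so breakpoints are created through the same perf_event_open() path as every other event. A hedged userspace sketch of that interface follows; the watched variable and loop count are made-up examples, and running it may need root or a relaxed perf_event_paranoid setting plus an architecture with breakpoint registers.

/* Arm a hardware write watchpoint on a global and count how often it fires. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static volatile int watched;                    /* hypothetical variable to watch */

int main(void)
{
        struct perf_event_attr attr;
        uint64_t hits = 0;
        int fd, i;

        memset(&attr, 0, sizeof(attr));
        attr.type    = PERF_TYPE_BREAKPOINT;
        attr.size    = sizeof(attr);
        attr.bp_type = HW_BREAKPOINT_W;         /* trap on writes */
        attr.bp_addr = (uintptr_t)&watched;
        attr.bp_len  = HW_BREAKPOINT_LEN_4;

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open(PERF_TYPE_BREAKPOINT)");
                return 1;
        }

        for (i = 0; i < 5; i++)
                watched = i;                    /* each store should be counted */

        if (read(fd, &hits, sizeof(hits)) == sizeof(hits))
                printf("write watchpoint fired %llu times\n",
                       (unsigned long long)hits);
        close(fd);
        return 0;
}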
diff --git a/kernel/exit.c b/kernel/exit.c index b9d3bc6c21ec..64879bdff921 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <linux/perf_event.h> | 50 | #include <linux/perf_event.h> |
51 | #include <trace/events/sched.h> | 51 | #include <trace/events/sched.h> |
52 | #include <linux/hw_breakpoint.h> | 52 | #include <linux/hw_breakpoint.h> |
53 | #include <linux/oom.h> | ||
53 | 54 | ||
54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
55 | #include <asm/unistd.h> | 56 | #include <asm/unistd.h> |
@@ -70,7 +71,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
70 | 71 | ||
71 | list_del_rcu(&p->tasks); | 72 | list_del_rcu(&p->tasks); |
72 | list_del_init(&p->sibling); | 73 | list_del_init(&p->sibling); |
73 | __get_cpu_var(process_counts)--; | 74 | __this_cpu_dec(process_counts); |
74 | } | 75 | } |
75 | list_del_rcu(&p->thread_group); | 76 | list_del_rcu(&p->thread_group); |
76 | } | 77 | } |
@@ -97,6 +98,14 @@ static void __exit_signal(struct task_struct *tsk) | |||
97 | sig->tty = NULL; | 98 | sig->tty = NULL; |
98 | } else { | 99 | } else { |
99 | /* | 100 | /* |
101 | * This can only happen if the caller is de_thread(). | ||
102 | * FIXME: this is the temporary hack, we should teach | ||
103 | * posix-cpu-timers to handle this case correctly. | ||
104 | */ | ||
105 | if (unlikely(has_group_leader_pid(tsk))) | ||
106 | posix_cpu_timers_exit_group(tsk); | ||
107 | |||
108 | /* | ||
100 | * If there is any task waiting for the group exit | 109 | * If there is any task waiting for the group exit |
101 | * then notify it: | 110 | * then notify it: |
102 | */ | 111 | */ |
@@ -151,9 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
151 | { | 160 | { |
152 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); | 161 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); |
153 | 162 | ||
154 | #ifdef CONFIG_PERF_EVENTS | 163 | perf_event_delayed_put(tsk); |
155 | WARN_ON_ONCE(tsk->perf_event_ctxp); | ||
156 | #endif | ||
157 | trace_sched_process_free(tsk); | 164 | trace_sched_process_free(tsk); |
158 | put_task_struct(tsk); | 165 | put_task_struct(tsk); |
159 | } | 166 | } |
@@ -556,29 +563,28 @@ void exit_files(struct task_struct *tsk) | |||
556 | 563 | ||
557 | #ifdef CONFIG_MM_OWNER | 564 | #ifdef CONFIG_MM_OWNER |
558 | /* | 565 | /* |
559 | * Task p is exiting and it owned mm, lets find a new owner for it | 566 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
560 | */ | 567 | */ |
561 | static inline int | ||
562 | mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) | ||
563 | { | ||
564 | /* | ||
565 | * If there are other users of the mm and the owner (us) is exiting | ||
566 | * we need to find a new owner to take on the responsibility. | ||
567 | */ | ||
568 | if (atomic_read(&mm->mm_users) <= 1) | ||
569 | return 0; | ||
570 | if (mm->owner != p) | ||
571 | return 0; | ||
572 | return 1; | ||
573 | } | ||
574 | |||
575 | void mm_update_next_owner(struct mm_struct *mm) | 568 | void mm_update_next_owner(struct mm_struct *mm) |
576 | { | 569 | { |
577 | struct task_struct *c, *g, *p = current; | 570 | struct task_struct *c, *g, *p = current; |
578 | 571 | ||
579 | retry: | 572 | retry: |
580 | if (!mm_need_new_owner(mm, p)) | 573 | /* |
574 | * If the exiting or execing task is not the owner, it's | ||
575 | * someone else's problem. | ||
576 | */ | ||
577 | if (mm->owner != p) | ||
578 | return; | ||
579 | /* | ||
580 | * The current owner is exiting/execing and there are no other | ||
581 | * candidates. Do not leave the mm pointing to a possibly | ||
582 | * freed task structure. | ||
583 | */ | ||
584 | if (atomic_read(&mm->mm_users) <= 1) { | ||
585 | mm->owner = NULL; | ||
581 | return; | 586 | return; |
587 | } | ||
582 | 588 | ||
583 | read_lock(&tasklist_lock); | 589 | read_lock(&tasklist_lock); |
584 | /* | 590 | /* |
@@ -691,6 +697,8 @@ static void exit_mm(struct task_struct * tsk) | |||
691 | enter_lazy_tlb(mm, current); | 697 | enter_lazy_tlb(mm, current); |
692 | /* We don't want this task to be frozen prematurely */ | 698 | /* We don't want this task to be frozen prematurely */ |
693 | clear_freeze_flag(tsk); | 699 | clear_freeze_flag(tsk); |
700 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
701 | atomic_dec(&mm->oom_disable_count); | ||
694 | task_unlock(tsk); | 702 | task_unlock(tsk); |
695 | mm_update_next_owner(mm); | 703 | mm_update_next_owner(mm); |
696 | mmput(mm); | 704 | mmput(mm); |
@@ -704,6 +712,8 @@ static void exit_mm(struct task_struct * tsk) | |||
704 | * space. | 712 | * space. |
705 | */ | 713 | */ |
706 | static struct task_struct *find_new_reaper(struct task_struct *father) | 714 | static struct task_struct *find_new_reaper(struct task_struct *father) |
715 | __releases(&tasklist_lock) | ||
716 | __acquires(&tasklist_lock) | ||
707 | { | 717 | { |
708 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | 718 | struct pid_namespace *pid_ns = task_active_pid_ns(father); |
709 | struct task_struct *thread; | 719 | struct task_struct *thread; |
@@ -832,7 +842,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
832 | /* Let father know we died | 842 | /* Let father know we died |
833 | * | 843 | * |
834 | * Thread signals are configurable, but you aren't going to use | 844 | * Thread signals are configurable, but you aren't going to use |
835 | * that to send signals to arbitary processes. | 845 | * that to send signals to arbitrary processes. |
836 | * That stops right now. | 846 | * That stops right now. |
837 | * | 847 | * |
838 | * If the parent exec id doesn't match the exec id we saved | 848 | * If the parent exec id doesn't match the exec id we saved |
@@ -899,12 +909,22 @@ NORET_TYPE void do_exit(long code) | |||
899 | profile_task_exit(tsk); | 909 | profile_task_exit(tsk); |
900 | 910 | ||
901 | WARN_ON(atomic_read(&tsk->fs_excl)); | 911 | WARN_ON(atomic_read(&tsk->fs_excl)); |
912 | WARN_ON(blk_needs_flush_plug(tsk)); | ||
902 | 913 | ||
903 | if (unlikely(in_interrupt())) | 914 | if (unlikely(in_interrupt())) |
904 | panic("Aiee, killing interrupt handler!"); | 915 | panic("Aiee, killing interrupt handler!"); |
905 | if (unlikely(!tsk->pid)) | 916 | if (unlikely(!tsk->pid)) |
906 | panic("Attempted to kill the idle task!"); | 917 | panic("Attempted to kill the idle task!"); |
907 | 918 | ||
919 | /* | ||
920 | * If do_exit is called because this process oopsed, it's possible | ||
921 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before | ||
922 | * continuing. Amongst other possible reasons, this is to prevent | ||
923 | * mm_release()->clear_child_tid() from writing to a user-controlled | ||
924 | * kernel address. | ||
925 | */ | ||
926 | set_fs(USER_DS); | ||
927 | |||
908 | tracehook_report_exit(&code); | 928 | tracehook_report_exit(&code); |
909 | 929 | ||
910 | validate_creds_for_do_exit(tsk); | 930 | validate_creds_for_do_exit(tsk); |
@@ -978,6 +998,15 @@ NORET_TYPE void do_exit(long code) | |||
978 | exit_fs(tsk); | 998 | exit_fs(tsk); |
979 | check_stack_usage(); | 999 | check_stack_usage(); |
980 | exit_thread(); | 1000 | exit_thread(); |
1001 | |||
1002 | /* | ||
1003 | * Flush inherited counters to the parent - before the parent | ||
1004 | * gets woken up by child-exit notifications. | ||
1005 | * | ||
1006 | * Because of cgroup mode, this must be called before cgroup_exit(). | ||
1007 | */ | ||
1008 | perf_event_exit_task(tsk); | ||
1009 | |||
981 | cgroup_exit(tsk, 1); | 1010 | cgroup_exit(tsk, 1); |
982 | 1011 | ||
983 | if (group_dead) | 1012 | if (group_dead) |
@@ -990,12 +1019,7 @@ NORET_TYPE void do_exit(long code) | |||
990 | /* | 1019 | /* |
991 | * FIXME: do that only when needed, using sched_exit tracepoint | 1020 | * FIXME: do that only when needed, using sched_exit tracepoint |
992 | */ | 1021 | */ |
993 | flush_ptrace_hw_breakpoint(tsk); | 1022 | ptrace_put_breakpoints(tsk); |
994 | /* | ||
995 | * Flush inherited counters to the parent - before the parent | ||
996 | * gets woken up by child-exit notifications. | ||
997 | */ | ||
998 | perf_event_exit_task(tsk); | ||
999 | 1023 | ||
1000 | exit_notify(tsk, group_dead); | 1024 | exit_notify(tsk, group_dead); |
1001 | #ifdef CONFIG_NUMA | 1025 | #ifdef CONFIG_NUMA |
@@ -1356,11 +1380,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) | |||
1356 | return NULL; | 1380 | return NULL; |
1357 | } | 1381 | } |
1358 | 1382 | ||
1359 | /* | 1383 | /** |
1360 | * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold | 1384 | * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED |
1361 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | 1385 | * @wo: wait options |
1362 | * the lock and this task is uninteresting. If we return nonzero, we have | 1386 | * @ptrace: is the wait for ptrace |
1363 | * released the lock and the system call should return. | 1387 | * @p: task to wait for |
1388 | * | ||
1389 | * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. | ||
1390 | * | ||
1391 | * CONTEXT: | ||
1392 | * read_lock(&tasklist_lock), which is released if return value is | ||
1393 | * non-zero. Also, grabs and releases @p->sighand->siglock. | ||
1394 | * | ||
1395 | * RETURNS: | ||
1396 | * 0 if wait condition didn't exist and search for other wait conditions | ||
1397 | * should continue. Non-zero return, -errno on failure and @p's pid on | ||
1398 | * success, implies that tasklist_lock is released and wait condition | ||
1399 | * search should terminate. | ||
1364 | */ | 1400 | */ |
1365 | static int wait_task_stopped(struct wait_opts *wo, | 1401 | static int wait_task_stopped(struct wait_opts *wo, |
1366 | int ptrace, struct task_struct *p) | 1402 | int ptrace, struct task_struct *p) |
@@ -1376,6 +1412,9 @@ static int wait_task_stopped(struct wait_opts *wo, | |||
1376 | if (!ptrace && !(wo->wo_flags & WUNTRACED)) | 1412 | if (!ptrace && !(wo->wo_flags & WUNTRACED)) |
1377 | return 0; | 1413 | return 0; |
1378 | 1414 | ||
1415 | if (!task_stopped_code(p, ptrace)) | ||
1416 | return 0; | ||
1417 | |||
1379 | exit_code = 0; | 1418 | exit_code = 0; |
1380 | spin_lock_irq(&p->sighand->siglock); | 1419 | spin_lock_irq(&p->sighand->siglock); |
1381 | 1420 | ||
@@ -1517,33 +1556,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
1517 | return 0; | 1556 | return 0; |
1518 | } | 1557 | } |
1519 | 1558 | ||
1520 | if (likely(!ptrace) && unlikely(task_ptrace(p))) { | 1559 | /* dead body doesn't have much to contribute */ |
1560 | if (p->exit_state == EXIT_DEAD) | ||
1561 | return 0; | ||
1562 | |||
1563 | /* slay zombie? */ | ||
1564 | if (p->exit_state == EXIT_ZOMBIE) { | ||
1521 | /* | 1565 | /* |
1522 | * This child is hidden by ptrace. | 1566 | * A zombie ptracee is only visible to its ptracer. |
1523 | * We aren't allowed to see it now, but eventually we will. | 1567 | * Notification and reaping will be cascaded to the real |
1568 | * parent when the ptracer detaches. | ||
1569 | */ | ||
1570 | if (likely(!ptrace) && unlikely(task_ptrace(p))) { | ||
1571 | /* it will become visible, clear notask_error */ | ||
1572 | wo->notask_error = 0; | ||
1573 | return 0; | ||
1574 | } | ||
1575 | |||
1576 | /* we don't reap group leaders with subthreads */ | ||
1577 | if (!delay_group_leader(p)) | ||
1578 | return wait_task_zombie(wo, p); | ||
1579 | |||
1580 | /* | ||
1581 | * Allow access to stopped/continued state via zombie by | ||
1582 | * falling through. Clearing of notask_error is complex. | ||
1583 | * | ||
1584 | * When !@ptrace: | ||
1585 | * | ||
1586 | * If WEXITED is set, notask_error should naturally be | ||
1587 | * cleared. If not, a subset of WSTOPPED|WCONTINUED is set, | ||
1588 | * so, if there are live subthreads, there are events to | ||
1589 | * wait for. If all subthreads are dead, it's still safe | ||
1590 | * to clear - this function will be called again in a finite | ||
1591 | * amount of time once all the subthreads are released and | ||
1592 | * will then return without clearing. | ||
1593 | * | ||
1594 | * When @ptrace: | ||
1595 | * | ||
1596 | * Stopped state is per-task and thus can't change once the | ||
1597 | * target task dies. Only continued and exited can happen. | ||
1598 | * Clear notask_error if WCONTINUED | WEXITED. | ||
1599 | */ | ||
1600 | if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) | ||
1601 | wo->notask_error = 0; | ||
1602 | } else { | ||
1603 | /* | ||
1604 | * If @p is ptraced by a task in its real parent's group, | ||
1605 | * hide group stop/continued state when looking at @p as | ||
1606 | * the real parent; otherwise, a single stop can be | ||
1607 | * reported twice as group and ptrace stops. | ||
1608 | * | ||
1609 | * If a ptracer wants to distinguish the two events for its | ||
1610 | * own children, it should create a separate process which | ||
1611 | * takes the role of real parent. | ||
1612 | */ | ||
1613 | if (likely(!ptrace) && task_ptrace(p) && | ||
1614 | same_thread_group(p->parent, p->real_parent)) | ||
1615 | return 0; | ||
1616 | |||
1617 | /* | ||
1618 | * @p is alive and it's going to stop, continue or exit, so | ||
1619 | * there always is something to wait for. | ||
1524 | */ | 1620 | */ |
1525 | wo->notask_error = 0; | 1621 | wo->notask_error = 0; |
1526 | return 0; | ||
1527 | } | 1622 | } |
1528 | 1623 | ||
1529 | if (p->exit_state == EXIT_DEAD) | ||
1530 | return 0; | ||
1531 | |||
1532 | /* | 1624 | /* |
1533 | * We don't reap group leaders with subthreads. | 1625 | * Wait for stopped. Depending on @ptrace, different stopped state |
1626 | * is used and the two don't interact with each other. | ||
1534 | */ | 1627 | */ |
1535 | if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) | 1628 | ret = wait_task_stopped(wo, ptrace, p); |
1536 | return wait_task_zombie(wo, p); | 1629 | if (ret) |
1630 | return ret; | ||
1537 | 1631 | ||
1538 | /* | 1632 | /* |
1539 | * It's stopped or running now, so it might | 1633 | * Wait for continued. There's only one continued state and the |
1540 | * later continue, exit, or stop again. | 1634 | * ptracer can consume it which can confuse the real parent. Don't |
1635 | * use WCONTINUED from ptracer. You don't need or want it. | ||
1541 | */ | 1636 | */ |
1542 | wo->notask_error = 0; | ||
1543 | |||
1544 | if (task_stopped_code(p, ptrace)) | ||
1545 | return wait_task_stopped(wo, ptrace, p); | ||
1546 | |||
1547 | return wait_task_continued(wo, p); | 1637 | return wait_task_continued(wo, p); |
1548 | } | 1638 | } |
1549 | 1639 | ||
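The wait_consider_task() rework above separates zombie reaping from the stopped/continued paths and documents wait_task_stopped() as the handler for %TASK_STOPPED/%TASK_TRACED. From userspace those same states are observed through waitpid(); the short sketch below is only an illustration of that surface, not code derived from the patch.

/* Stop a child, reap the stop with WUNTRACED, then reap the continue. */
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
        int status;
        pid_t pid = fork();

        if (pid < 0)
                return 1;
        if (pid == 0) {                         /* child just idles */
                for (;;)
                        pause();
        }

        kill(pid, SIGSTOP);
        waitpid(pid, &status, WUNTRACED);
        if (WIFSTOPPED(status))
                printf("child stopped by signal %d\n", WSTOPSIG(status));

        kill(pid, SIGCONT);
        waitpid(pid, &status, WCONTINUED);
        if (WIFCONTINUED(status))
                printf("child continued\n");

        kill(pid, SIGKILL);                     /* clean up */
        waitpid(pid, &status, 0);
        return 0;
}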
diff --git a/kernel/extable.c b/kernel/extable.c index 7f8f263f8524..5339705b8241 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr) | |||
72 | return 0; | 72 | return 0; |
73 | } | 73 | } |
74 | 74 | ||
75 | /** | ||
76 | * core_kernel_data - tell if addr points to kernel data | ||
77 | * @addr: address to test | ||
78 | * | ||
79 | * Returns true if @addr passed in is from the core kernel data | ||
80 | * section. | ||
81 | * | ||
82 | * Note: On some archs it may return true for core RODATA, and false | ||
83 | * for others. But it will always be true for core RW data. | ||
84 | */ | ||
85 | int core_kernel_data(unsigned long addr) | ||
86 | { | ||
87 | if (addr >= (unsigned long)_sdata && | ||
88 | addr < (unsigned long)_edata) | ||
89 | return 1; | ||
90 | return 0; | ||
91 | } | ||
92 | |||
75 | int __kernel_text_address(unsigned long addr) | 93 | int __kernel_text_address(unsigned long addr) |
76 | { | 94 | { |
77 | if (core_kernel_text(addr)) | 95 | if (core_kernel_text(addr)) |
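core_kernel_data() added above is a plain bounds check against the kernel image's _sdata.._edata window. The same idea can be sketched in userspace with the GNU linker's data-segment symbols; the exact symbol names (__data_start, _edata) are an assumption about a typical Linux/ELF toolchain, not something taken from the patch.

#include <stdio.h>
#include <stdint.h>

/* Linker-provided boundaries of the initialized data segment (assumed). */
extern char __data_start[];
extern char _edata[];

static int in_static_data(const void *addr)
{
        uintptr_t a = (uintptr_t)addr;

        return a >= (uintptr_t)__data_start && a < (uintptr_t)_edata;
}

static int initialized_global = 42;             /* lands in .data */

int main(void)
{
        int on_stack = 0;

        printf("&initialized_global in .data: %d\n",
               in_static_data(&initialized_global));
        printf("&on_stack in .data:           %d\n",
               in_static_data(&on_stack));
        return 0;
}

As with the kernel helper, whether read-only data falls inside the window depends on the link layout; initialized read-write globals always do.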
diff --git a/kernel/fork.c b/kernel/fork.c index ab7f29d906c7..25c6111fe3a6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/tracehook.h> | 40 | #include <linux/tracehook.h> |
41 | #include <linux/futex.h> | 41 | #include <linux/futex.h> |
42 | #include <linux/compat.h> | 42 | #include <linux/compat.h> |
43 | #include <linux/kthread.h> | ||
43 | #include <linux/task_io_accounting_ops.h> | 44 | #include <linux/task_io_accounting_ops.h> |
44 | #include <linux/rcupdate.h> | 45 | #include <linux/rcupdate.h> |
45 | #include <linux/ptrace.h> | 46 | #include <linux/ptrace.h> |
@@ -58,13 +59,14 @@ | |||
58 | #include <linux/taskstats_kern.h> | 59 | #include <linux/taskstats_kern.h> |
59 | #include <linux/random.h> | 60 | #include <linux/random.h> |
60 | #include <linux/tty.h> | 61 | #include <linux/tty.h> |
61 | #include <linux/proc_fs.h> | ||
62 | #include <linux/blkdev.h> | 62 | #include <linux/blkdev.h> |
63 | #include <linux/fs_struct.h> | 63 | #include <linux/fs_struct.h> |
64 | #include <linux/magic.h> | 64 | #include <linux/magic.h> |
65 | #include <linux/perf_event.h> | 65 | #include <linux/perf_event.h> |
66 | #include <linux/posix-timers.h> | 66 | #include <linux/posix-timers.h> |
67 | #include <linux/user-return-notifier.h> | 67 | #include <linux/user-return-notifier.h> |
68 | #include <linux/oom.h> | ||
69 | #include <linux/khugepaged.h> | ||
68 | 70 | ||
69 | #include <asm/pgtable.h> | 71 | #include <asm/pgtable.h> |
70 | #include <asm/pgalloc.h> | 72 | #include <asm/pgalloc.h> |
@@ -110,20 +112,25 @@ int nr_processes(void) | |||
110 | } | 112 | } |
111 | 113 | ||
112 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 114 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR |
113 | # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) | 115 | # define alloc_task_struct_node(node) \ |
114 | # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) | 116 | kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) |
117 | # define free_task_struct(tsk) \ | ||
118 | kmem_cache_free(task_struct_cachep, (tsk)) | ||
115 | static struct kmem_cache *task_struct_cachep; | 119 | static struct kmem_cache *task_struct_cachep; |
116 | #endif | 120 | #endif |
117 | 121 | ||
118 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 122 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR |
119 | static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) | 123 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
124 | int node) | ||
120 | { | 125 | { |
121 | #ifdef CONFIG_DEBUG_STACK_USAGE | 126 | #ifdef CONFIG_DEBUG_STACK_USAGE |
122 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; | 127 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; |
123 | #else | 128 | #else |
124 | gfp_t mask = GFP_KERNEL; | 129 | gfp_t mask = GFP_KERNEL; |
125 | #endif | 130 | #endif |
126 | return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); | 131 | struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); |
132 | |||
133 | return page ? page_address(page) : NULL; | ||
127 | } | 134 | } |
128 | 135 | ||
129 | static inline void free_thread_info(struct thread_info *ti) | 136 | static inline void free_thread_info(struct thread_info *ti) |
@@ -171,6 +178,7 @@ EXPORT_SYMBOL(free_task); | |||
171 | static inline void free_signal_struct(struct signal_struct *sig) | 178 | static inline void free_signal_struct(struct signal_struct *sig) |
172 | { | 179 | { |
173 | taskstats_tgid_free(sig); | 180 | taskstats_tgid_free(sig); |
181 | sched_autogroup_exit(sig); | ||
174 | kmem_cache_free(signal_cachep, sig); | 182 | kmem_cache_free(signal_cachep, sig); |
175 | } | 183 | } |
176 | 184 | ||
@@ -194,6 +202,7 @@ void __put_task_struct(struct task_struct *tsk) | |||
194 | if (!profile_handoff_task(tsk)) | 202 | if (!profile_handoff_task(tsk)) |
195 | free_task(tsk); | 203 | free_task(tsk); |
196 | } | 204 | } |
205 | EXPORT_SYMBOL_GPL(__put_task_struct); | ||
197 | 206 | ||
198 | /* | 207 | /* |
199 | * macro override instead of weak attribute alias, to workaround | 208 | * macro override instead of weak attribute alias, to workaround |
@@ -249,16 +258,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
249 | struct task_struct *tsk; | 258 | struct task_struct *tsk; |
250 | struct thread_info *ti; | 259 | struct thread_info *ti; |
251 | unsigned long *stackend; | 260 | unsigned long *stackend; |
252 | 261 | int node = tsk_fork_get_node(orig); | |
253 | int err; | 262 | int err; |
254 | 263 | ||
255 | prepare_to_copy(orig); | 264 | prepare_to_copy(orig); |
256 | 265 | ||
257 | tsk = alloc_task_struct(); | 266 | tsk = alloc_task_struct_node(node); |
258 | if (!tsk) | 267 | if (!tsk) |
259 | return NULL; | 268 | return NULL; |
260 | 269 | ||
261 | ti = alloc_thread_info(tsk); | 270 | ti = alloc_thread_info_node(tsk, node); |
262 | if (!ti) { | 271 | if (!ti) { |
263 | free_task_struct(tsk); | 272 | free_task_struct(tsk); |
264 | return NULL; | 273 | return NULL; |
@@ -279,6 +288,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
279 | 288 | ||
280 | setup_thread_stack(tsk, orig); | 289 | setup_thread_stack(tsk, orig); |
281 | clear_user_return_notifier(tsk); | 290 | clear_user_return_notifier(tsk); |
291 | clear_tsk_need_resched(tsk); | ||
282 | stackend = end_of_stack(tsk); | 292 | stackend = end_of_stack(tsk); |
283 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | 293 | *stackend = STACK_END_MAGIC; /* for overflow detection */ |
284 | 294 | ||
@@ -334,6 +344,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
334 | retval = ksm_fork(mm, oldmm); | 344 | retval = ksm_fork(mm, oldmm); |
335 | if (retval) | 345 | if (retval) |
336 | goto out; | 346 | goto out; |
347 | retval = khugepaged_fork(mm, oldmm); | ||
348 | if (retval) | ||
349 | goto out; | ||
337 | 350 | ||
338 | prev = NULL; | 351 | prev = NULL; |
339 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { | 352 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { |
@@ -376,15 +389,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
376 | get_file(file); | 389 | get_file(file); |
377 | if (tmp->vm_flags & VM_DENYWRITE) | 390 | if (tmp->vm_flags & VM_DENYWRITE) |
378 | atomic_dec(&inode->i_writecount); | 391 | atomic_dec(&inode->i_writecount); |
379 | spin_lock(&mapping->i_mmap_lock); | 392 | mutex_lock(&mapping->i_mmap_mutex); |
380 | if (tmp->vm_flags & VM_SHARED) | 393 | if (tmp->vm_flags & VM_SHARED) |
381 | mapping->i_mmap_writable++; | 394 | mapping->i_mmap_writable++; |
382 | tmp->vm_truncate_count = mpnt->vm_truncate_count; | ||
383 | flush_dcache_mmap_lock(mapping); | 395 | flush_dcache_mmap_lock(mapping); |
384 | /* insert tmp into the share list, just after mpnt */ | 396 | /* insert tmp into the share list, just after mpnt */ |
385 | vma_prio_tree_add(tmp, mpnt); | 397 | vma_prio_tree_add(tmp, mpnt); |
386 | flush_dcache_mmap_unlock(mapping); | 398 | flush_dcache_mmap_unlock(mapping); |
387 | spin_unlock(&mapping->i_mmap_lock); | 399 | mutex_unlock(&mapping->i_mmap_mutex); |
388 | } | 400 | } |
389 | 401 | ||
390 | /* | 402 | /* |
@@ -495,6 +507,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
495 | mm->cached_hole_size = ~0UL; | 507 | mm->cached_hole_size = ~0UL; |
496 | mm_init_aio(mm); | 508 | mm_init_aio(mm); |
497 | mm_init_owner(mm, p); | 509 | mm_init_owner(mm, p); |
510 | atomic_set(&mm->oom_disable_count, 0); | ||
498 | 511 | ||
499 | if (likely(!mm_alloc_pgd(mm))) { | 512 | if (likely(!mm_alloc_pgd(mm))) { |
500 | mm->def_flags = 0; | 513 | mm->def_flags = 0; |
@@ -514,11 +527,12 @@ struct mm_struct * mm_alloc(void) | |||
514 | struct mm_struct * mm; | 527 | struct mm_struct * mm; |
515 | 528 | ||
516 | mm = allocate_mm(); | 529 | mm = allocate_mm(); |
517 | if (mm) { | 530 | if (!mm) |
518 | memset(mm, 0, sizeof(*mm)); | 531 | return NULL; |
519 | mm = mm_init(mm, current); | 532 | |
520 | } | 533 | memset(mm, 0, sizeof(*mm)); |
521 | return mm; | 534 | mm_init_cpumask(mm); |
535 | return mm_init(mm, current); | ||
522 | } | 536 | } |
523 | 537 | ||
524 | /* | 538 | /* |
@@ -532,6 +546,9 @@ void __mmdrop(struct mm_struct *mm) | |||
532 | mm_free_pgd(mm); | 546 | mm_free_pgd(mm); |
533 | destroy_context(mm); | 547 | destroy_context(mm); |
534 | mmu_notifier_mm_destroy(mm); | 548 | mmu_notifier_mm_destroy(mm); |
549 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
550 | VM_BUG_ON(mm->pmd_huge_pte); | ||
551 | #endif | ||
535 | free_mm(mm); | 552 | free_mm(mm); |
536 | } | 553 | } |
537 | EXPORT_SYMBOL_GPL(__mmdrop); | 554 | EXPORT_SYMBOL_GPL(__mmdrop); |
@@ -546,6 +563,7 @@ void mmput(struct mm_struct *mm) | |||
546 | if (atomic_dec_and_test(&mm->mm_users)) { | 563 | if (atomic_dec_and_test(&mm->mm_users)) { |
547 | exit_aio(mm); | 564 | exit_aio(mm); |
548 | ksm_exit(mm); | 565 | ksm_exit(mm); |
566 | khugepaged_exit(mm); /* must run before exit_mmap */ | ||
549 | exit_mmap(mm); | 567 | exit_mmap(mm); |
550 | set_mm_exe_file(mm, NULL); | 568 | set_mm_exe_file(mm, NULL); |
551 | if (!list_empty(&mm->mmlist)) { | 569 | if (!list_empty(&mm->mmlist)) { |
@@ -561,6 +579,57 @@ void mmput(struct mm_struct *mm) | |||
561 | } | 579 | } |
562 | EXPORT_SYMBOL_GPL(mmput); | 580 | EXPORT_SYMBOL_GPL(mmput); |
563 | 581 | ||
582 | /* | ||
583 | * We added or removed a vma mapping the executable. The vmas are only mapped | ||
584 | * during exec and are not mapped with the mmap system call. | ||
585 | * Callers must hold down_write() on the mm's mmap_sem for these operations. | ||
586 | */ | ||
587 | void added_exe_file_vma(struct mm_struct *mm) | ||
588 | { | ||
589 | mm->num_exe_file_vmas++; | ||
590 | } | ||
591 | |||
592 | void removed_exe_file_vma(struct mm_struct *mm) | ||
593 | { | ||
594 | mm->num_exe_file_vmas--; | ||
595 | if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { | ||
596 | fput(mm->exe_file); | ||
597 | mm->exe_file = NULL; | ||
598 | } | ||
599 | |||
600 | } | ||
601 | |||
602 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | ||
603 | { | ||
604 | if (new_exe_file) | ||
605 | get_file(new_exe_file); | ||
606 | if (mm->exe_file) | ||
607 | fput(mm->exe_file); | ||
608 | mm->exe_file = new_exe_file; | ||
609 | mm->num_exe_file_vmas = 0; | ||
610 | } | ||
611 | |||
612 | struct file *get_mm_exe_file(struct mm_struct *mm) | ||
613 | { | ||
614 | struct file *exe_file; | ||
615 | |||
616 | /* We need mmap_sem to protect against races with removal of | ||
617 | * VM_EXECUTABLE vmas */ | ||
618 | down_read(&mm->mmap_sem); | ||
619 | exe_file = mm->exe_file; | ||
620 | if (exe_file) | ||
621 | get_file(exe_file); | ||
622 | up_read(&mm->mmap_sem); | ||
623 | return exe_file; | ||
624 | } | ||
625 | |||
626 | static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) | ||
627 | { | ||
628 | /* It's safe to write the exe_file pointer without exe_file_lock because | ||
629 | * this is called during fork when the task is not yet in /proc */ | ||
630 | newmm->exe_file = get_mm_exe_file(oldmm); | ||
631 | } | ||
632 | |||
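The exe_file bookkeeping added to fork.c above (set_mm_exe_file(), get_mm_exe_file() and the VM_EXECUTABLE vma counting) is what ultimately backs the /proc/<pid>/exe symlink. A small sketch of that consumer side, with an arbitrary buffer size:

#include <stdio.h>
#include <unistd.h>
#include <limits.h>

int main(void)
{
        char path[PATH_MAX];
        ssize_t n = readlink("/proc/self/exe", path, sizeof(path) - 1);

        if (n < 0) {
                perror("readlink");
                return 1;
        }
        path[n] = '\0';
        printf("this process was exec'd from %s\n", path);
        return 0;
}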
564 | /** | 633 | /** |
565 | * get_task_mm - acquire a reference to the task's mm | 634 | * get_task_mm - acquire a reference to the task's mm |
566 | * | 635 | * |
@@ -667,11 +736,16 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
667 | goto fail_nomem; | 736 | goto fail_nomem; |
668 | 737 | ||
669 | memcpy(mm, oldmm, sizeof(*mm)); | 738 | memcpy(mm, oldmm, sizeof(*mm)); |
739 | mm_init_cpumask(mm); | ||
670 | 740 | ||
671 | /* Initializing for Swap token stuff */ | 741 | /* Initializing for Swap token stuff */ |
672 | mm->token_priority = 0; | 742 | mm->token_priority = 0; |
673 | mm->last_interval = 0; | 743 | mm->last_interval = 0; |
674 | 744 | ||
745 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
746 | mm->pmd_huge_pte = NULL; | ||
747 | #endif | ||
748 | |||
675 | if (!mm_init(mm, tsk)) | 749 | if (!mm_init(mm, tsk)) |
676 | goto fail_nomem; | 750 | goto fail_nomem; |
677 | 751 | ||
@@ -748,6 +822,8 @@ good_mm: | |||
748 | /* Initializing for Swap token stuff */ | 822 | /* Initializing for Swap token stuff */ |
749 | mm->token_priority = 0; | 823 | mm->token_priority = 0; |
750 | mm->last_interval = 0; | 824 | mm->last_interval = 0; |
825 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
826 | atomic_inc(&mm->oom_disable_count); | ||
751 | 827 | ||
752 | tsk->mm = mm; | 828 | tsk->mm = mm; |
753 | tsk->active_mm = mm; | 829 | tsk->active_mm = mm; |
@@ -907,9 +983,17 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
907 | posix_cpu_timers_init_group(sig); | 983 | posix_cpu_timers_init_group(sig); |
908 | 984 | ||
909 | tty_audit_fork(sig); | 985 | tty_audit_fork(sig); |
986 | sched_autogroup_fork(sig); | ||
987 | |||
988 | #ifdef CONFIG_CGROUPS | ||
989 | init_rwsem(&sig->threadgroup_fork_lock); | ||
990 | #endif | ||
910 | 991 | ||
911 | sig->oom_adj = current->signal->oom_adj; | 992 | sig->oom_adj = current->signal->oom_adj; |
912 | sig->oom_score_adj = current->signal->oom_score_adj; | 993 | sig->oom_score_adj = current->signal->oom_score_adj; |
994 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | ||
995 | |||
996 | mutex_init(&sig->cred_guard_mutex); | ||
913 | 997 | ||
914 | return 0; | 998 | return 0; |
915 | } | 999 | } |
@@ -1081,12 +1165,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1081 | 1165 | ||
1082 | posix_cpu_timers_init(p); | 1166 | posix_cpu_timers_init(p); |
1083 | 1167 | ||
1084 | p->lock_depth = -1; /* -1 = no lock */ | ||
1085 | do_posix_clock_monotonic_gettime(&p->start_time); | 1168 | do_posix_clock_monotonic_gettime(&p->start_time); |
1086 | p->real_start_time = p->start_time; | 1169 | p->real_start_time = p->start_time; |
1087 | monotonic_to_bootbased(&p->real_start_time); | 1170 | monotonic_to_bootbased(&p->real_start_time); |
1088 | p->io_context = NULL; | 1171 | p->io_context = NULL; |
1089 | p->audit_context = NULL; | 1172 | p->audit_context = NULL; |
1173 | if (clone_flags & CLONE_THREAD) | ||
1174 | threadgroup_fork_read_lock(current); | ||
1090 | cgroup_fork(p); | 1175 | cgroup_fork(p); |
1091 | #ifdef CONFIG_NUMA | 1176 | #ifdef CONFIG_NUMA |
1092 | p->mempolicy = mpol_dup(p->mempolicy); | 1177 | p->mempolicy = mpol_dup(p->mempolicy); |
@@ -1131,7 +1216,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1131 | #endif | 1216 | #endif |
1132 | 1217 | ||
1133 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1218 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1134 | sched_fork(p, clone_flags); | 1219 | sched_fork(p); |
1135 | 1220 | ||
1136 | retval = perf_event_init_task(p); | 1221 | retval = perf_event_init_task(p); |
1137 | if (retval) | 1222 | if (retval) |
@@ -1165,12 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1165 | pid = alloc_pid(p->nsproxy->pid_ns); | 1250 | pid = alloc_pid(p->nsproxy->pid_ns); |
1166 | if (!pid) | 1251 | if (!pid) |
1167 | goto bad_fork_cleanup_io; | 1252 | goto bad_fork_cleanup_io; |
1168 | |||
1169 | if (clone_flags & CLONE_NEWPID) { | ||
1170 | retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); | ||
1171 | if (retval < 0) | ||
1172 | goto bad_fork_free_pid; | ||
1173 | } | ||
1174 | } | 1253 | } |
1175 | 1254 | ||
1176 | p->pid = pid_nr(pid); | 1255 | p->pid = pid_nr(pid); |
@@ -1178,17 +1257,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1178 | if (clone_flags & CLONE_THREAD) | 1257 | if (clone_flags & CLONE_THREAD) |
1179 | p->tgid = current->tgid; | 1258 | p->tgid = current->tgid; |
1180 | 1259 | ||
1181 | if (current->nsproxy != p->nsproxy) { | ||
1182 | retval = ns_cgroup_clone(p, pid); | ||
1183 | if (retval) | ||
1184 | goto bad_fork_free_pid; | ||
1185 | } | ||
1186 | |||
1187 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | 1260 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
1188 | /* | 1261 | /* |
1189 | * Clear TID on mm_release()? | 1262 | * Clear TID on mm_release()? |
1190 | */ | 1263 | */ |
1191 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; | 1264 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; |
1265 | #ifdef CONFIG_BLOCK | ||
1266 | p->plug = NULL; | ||
1267 | #endif | ||
1192 | #ifdef CONFIG_FUTEX | 1268 | #ifdef CONFIG_FUTEX |
1193 | p->robust_list = NULL; | 1269 | p->robust_list = NULL; |
1194 | #ifdef CONFIG_COMPAT | 1270 | #ifdef CONFIG_COMPAT |
@@ -1274,7 +1350,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1274 | tracehook_finish_clone(p, clone_flags, trace); | 1350 | tracehook_finish_clone(p, clone_flags, trace); |
1275 | 1351 | ||
1276 | if (thread_group_leader(p)) { | 1352 | if (thread_group_leader(p)) { |
1277 | if (clone_flags & CLONE_NEWPID) | 1353 | if (is_child_reaper(pid)) |
1278 | p->nsproxy->pid_ns->child_reaper = p; | 1354 | p->nsproxy->pid_ns->child_reaper = p; |
1279 | 1355 | ||
1280 | p->signal->leader_pid = pid; | 1356 | p->signal->leader_pid = pid; |
@@ -1283,7 +1359,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1283 | attach_pid(p, PIDTYPE_SID, task_session(current)); | 1359 | attach_pid(p, PIDTYPE_SID, task_session(current)); |
1284 | list_add_tail(&p->sibling, &p->real_parent->children); | 1360 | list_add_tail(&p->sibling, &p->real_parent->children); |
1285 | list_add_tail_rcu(&p->tasks, &init_task.tasks); | 1361 | list_add_tail_rcu(&p->tasks, &init_task.tasks); |
1286 | __get_cpu_var(process_counts)++; | 1362 | __this_cpu_inc(process_counts); |
1287 | } | 1363 | } |
1288 | attach_pid(p, PIDTYPE_PID, pid); | 1364 | attach_pid(p, PIDTYPE_PID, pid); |
1289 | nr_threads++; | 1365 | nr_threads++; |
@@ -1294,6 +1370,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1294 | write_unlock_irq(&tasklist_lock); | 1370 | write_unlock_irq(&tasklist_lock); |
1295 | proc_fork_connector(p); | 1371 | proc_fork_connector(p); |
1296 | cgroup_post_fork(p); | 1372 | cgroup_post_fork(p); |
1373 | if (clone_flags & CLONE_THREAD) | ||
1374 | threadgroup_fork_read_unlock(current); | ||
1297 | perf_event_fork(p); | 1375 | perf_event_fork(p); |
1298 | return p; | 1376 | return p; |
1299 | 1377 | ||
@@ -1306,8 +1384,13 @@ bad_fork_cleanup_io: | |||
1306 | bad_fork_cleanup_namespaces: | 1384 | bad_fork_cleanup_namespaces: |
1307 | exit_task_namespaces(p); | 1385 | exit_task_namespaces(p); |
1308 | bad_fork_cleanup_mm: | 1386 | bad_fork_cleanup_mm: |
1309 | if (p->mm) | 1387 | if (p->mm) { |
1388 | task_lock(p); | ||
1389 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
1390 | atomic_dec(&p->mm->oom_disable_count); | ||
1391 | task_unlock(p); | ||
1310 | mmput(p->mm); | 1392 | mmput(p->mm); |
1393 | } | ||
1311 | bad_fork_cleanup_signal: | 1394 | bad_fork_cleanup_signal: |
1312 | if (!(clone_flags & CLONE_THREAD)) | 1395 | if (!(clone_flags & CLONE_THREAD)) |
1313 | free_signal_struct(p->signal); | 1396 | free_signal_struct(p->signal); |
@@ -1327,6 +1410,8 @@ bad_fork_cleanup_policy: | |||
1327 | mpol_put(p->mempolicy); | 1410 | mpol_put(p->mempolicy); |
1328 | bad_fork_cleanup_cgroup: | 1411 | bad_fork_cleanup_cgroup: |
1329 | #endif | 1412 | #endif |
1413 | if (clone_flags & CLONE_THREAD) | ||
1414 | threadgroup_fork_read_unlock(current); | ||
1330 | cgroup_exit(p, cgroup_callbacks_done); | 1415 | cgroup_exit(p, cgroup_callbacks_done); |
1331 | delayacct_tsk_free(p); | 1416 | delayacct_tsk_free(p); |
1332 | module_put(task_thread_info(p)->exec_domain->module); | 1417 | module_put(task_thread_info(p)->exec_domain->module); |
@@ -1403,23 +1488,6 @@ long do_fork(unsigned long clone_flags, | |||
1403 | } | 1488 | } |
1404 | 1489 | ||
1405 | /* | 1490 | /* |
1406 | * We hope to recycle these flags after 2.6.26 | ||
1407 | */ | ||
1408 | if (unlikely(clone_flags & CLONE_STOPPED)) { | ||
1409 | static int __read_mostly count = 100; | ||
1410 | |||
1411 | if (count > 0 && printk_ratelimit()) { | ||
1412 | char comm[TASK_COMM_LEN]; | ||
1413 | |||
1414 | count--; | ||
1415 | printk(KERN_INFO "fork(): process `%s' used deprecated " | ||
1416 | "clone flags 0x%lx\n", | ||
1417 | get_task_comm(comm, current), | ||
1418 | clone_flags & CLONE_STOPPED); | ||
1419 | } | ||
1420 | } | ||
1421 | |||
1422 | /* | ||
1423 | * When called from kernel_thread, don't do user tracing stuff. | 1491 | * When called from kernel_thread, don't do user tracing stuff. |
1424 | */ | 1492 | */ |
1425 | if (likely(user_mode(regs))) | 1493 | if (likely(user_mode(regs))) |
@@ -1457,16 +1525,7 @@ long do_fork(unsigned long clone_flags, | |||
1457 | */ | 1525 | */ |
1458 | p->flags &= ~PF_STARTING; | 1526 | p->flags &= ~PF_STARTING; |
1459 | 1527 | ||
1460 | if (unlikely(clone_flags & CLONE_STOPPED)) { | 1528 | wake_up_new_task(p); |
1461 | /* | ||
1462 | * We'll start up with an immediate SIGSTOP. | ||
1463 | */ | ||
1464 | sigaddset(&p->pending.signal, SIGSTOP); | ||
1465 | set_tsk_thread_flag(p, TIF_SIGPENDING); | ||
1466 | __set_task_state(p, TASK_STOPPED); | ||
1467 | } else { | ||
1468 | wake_up_new_task(p, clone_flags); | ||
1469 | } | ||
1470 | 1529 | ||
1471 | tracehook_report_clone_complete(trace, regs, | 1530 | tracehook_report_clone_complete(trace, regs, |
1472 | clone_flags, nr, p); | 1531 | clone_flags, nr, p); |
@@ -1510,6 +1569,13 @@ void __init proc_caches_init(void) | |||
1510 | fs_cachep = kmem_cache_create("fs_cache", | 1569 | fs_cachep = kmem_cache_create("fs_cache", |
1511 | sizeof(struct fs_struct), 0, | 1570 | sizeof(struct fs_struct), 0, |
1512 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | 1571 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); |
1572 | /* | ||
1573 | * FIXME! The "sizeof(struct mm_struct)" currently includes the | ||
1574 | * whole struct cpumask for the OFFSTACK case. We could change | ||
1575 | * this to *only* allocate as much of it as required by the | ||
1576 | * maximum number of CPU's we can ever have. The cpumask_allocation | ||
1577 | * is at the end of the structure, exactly for that reason. | ||
1578 | */ | ||
1513 | mm_cachep = kmem_cache_create("mm_struct", | 1579 | mm_cachep = kmem_cache_create("mm_struct", |
1514 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, | 1580 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, |
1515 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | 1581 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); |
@@ -1518,38 +1584,24 @@ void __init proc_caches_init(void) | |||
1518 | } | 1584 | } |
1519 | 1585 | ||
1520 | /* | 1586 | /* |
1521 | * Check constraints on flags passed to the unshare system call and | 1587 | * Check constraints on flags passed to the unshare system call. |
1522 | * force unsharing of additional process context as appropriate. | ||
1523 | */ | 1588 | */ |
1524 | static void check_unshare_flags(unsigned long *flags_ptr) | 1589 | static int check_unshare_flags(unsigned long unshare_flags) |
1525 | { | 1590 | { |
1591 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | ||
1592 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | ||
1593 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) | ||
1594 | return -EINVAL; | ||
1526 | /* | 1595 | /* |
1527 | * If unsharing a thread from a thread group, must also | 1596 | * Not implemented, but pretend it works if there is nothing to |
1528 | * unshare vm. | 1597 | * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND |
1529 | */ | 1598 | * needs to unshare vm. |
1530 | if (*flags_ptr & CLONE_THREAD) | ||
1531 | *flags_ptr |= CLONE_VM; | ||
1532 | |||
1533 | /* | ||
1534 | * If unsharing vm, must also unshare signal handlers. | ||
1535 | */ | ||
1536 | if (*flags_ptr & CLONE_VM) | ||
1537 | *flags_ptr |= CLONE_SIGHAND; | ||
1538 | |||
1539 | /* | ||
1540 | * If unsharing namespace, must also unshare filesystem information. | ||
1541 | */ | 1599 | */ |
1542 | if (*flags_ptr & CLONE_NEWNS) | 1600 | if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { |
1543 | *flags_ptr |= CLONE_FS; | 1601 | /* FIXME: get_task_mm() increments ->mm_users */ |
1544 | } | 1602 | if (atomic_read(¤t->mm->mm_users) > 1) |
1545 | 1603 | return -EINVAL; | |
1546 | /* | 1604 | } |
1547 | * Unsharing of tasks created with CLONE_THREAD is not supported yet | ||
1548 | */ | ||
1549 | static int unshare_thread(unsigned long unshare_flags) | ||
1550 | { | ||
1551 | if (unshare_flags & CLONE_THREAD) | ||
1552 | return -EINVAL; | ||
1553 | 1605 | ||
1554 | return 0; | 1606 | return 0; |
1555 | } | 1607 | } |
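Editor's note: the reworked check_unshare_flags() above rejects unsupported flag combinations with -EINVAL up front instead of silently widening the flag set; the old CLONE_NEWNS-implies-CLONE_FS rule moves into sys_unshare() itself in a later hunk. A small userspace probe, written only against the documented unshare(2) behaviour (names and expected errno values are illustrative, not taken from this diff), shows the observable effect:

/* Hedged userspace probe (not kernel code): exercises the stricter flag
 * validation performed by the reworked check_unshare_flags(). */
#define _GNU_SOURCE
#include <sched.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Bits outside the supported mask are rejected outright. */
	if (unshare(CLONE_PTRACE) == -1)
		printf("unshare(CLONE_PTRACE): %s\n", strerror(errno));	/* EINVAL */

	/* CLONE_VM/CLONE_SIGHAND/CLONE_THREAD only "pretend to work" when
	 * nothing is actually shared; with mm_users > 1 they would fail. */
	if (unshare(CLONE_VM) == -1)
		printf("unshare(CLONE_VM): %s\n", strerror(errno));
	else
		printf("unshare(CLONE_VM): ok (single-threaded caller)\n");

	/* CLONE_NEWNS stays supported; CLONE_FS is now added in sys_unshare().
	 * Expect EPERM here without CAP_SYS_ADMIN. */
	if (unshare(CLONE_NEWNS) == -1)
		printf("unshare(CLONE_NEWNS): %s\n", strerror(errno));
	return 0;
}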
@@ -1576,34 +1628,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) | |||
1576 | } | 1628 | } |
1577 | 1629 | ||
1578 | /* | 1630 | /* |
1579 | * Unsharing of sighand is not supported yet | ||
1580 | */ | ||
1581 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) | ||
1582 | { | ||
1583 | struct sighand_struct *sigh = current->sighand; | ||
1584 | |||
1585 | if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) | ||
1586 | return -EINVAL; | ||
1587 | else | ||
1588 | return 0; | ||
1589 | } | ||
1590 | |||
1591 | /* | ||
1592 | * Unshare vm if it is being shared | ||
1593 | */ | ||
1594 | static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) | ||
1595 | { | ||
1596 | struct mm_struct *mm = current->mm; | ||
1597 | |||
1598 | if ((unshare_flags & CLONE_VM) && | ||
1599 | (mm && atomic_read(&mm->mm_users) > 1)) { | ||
1600 | return -EINVAL; | ||
1601 | } | ||
1602 | |||
1603 | return 0; | ||
1604 | } | ||
1605 | |||
1606 | /* | ||
1607 | * Unshare file descriptor table if it is being shared | 1631 | * Unshare file descriptor table if it is being shared |
1608 | */ | 1632 | */ |
1609 | static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) | 1633 | static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) |
@@ -1631,45 +1655,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp | |||
1631 | */ | 1655 | */ |
1632 | SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | 1656 | SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) |
1633 | { | 1657 | { |
1634 | int err = 0; | ||
1635 | struct fs_struct *fs, *new_fs = NULL; | 1658 | struct fs_struct *fs, *new_fs = NULL; |
1636 | struct sighand_struct *new_sigh = NULL; | ||
1637 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; | ||
1638 | struct files_struct *fd, *new_fd = NULL; | 1659 | struct files_struct *fd, *new_fd = NULL; |
1639 | struct nsproxy *new_nsproxy = NULL; | 1660 | struct nsproxy *new_nsproxy = NULL; |
1640 | int do_sysvsem = 0; | 1661 | int do_sysvsem = 0; |
1662 | int err; | ||
1641 | 1663 | ||
1642 | check_unshare_flags(&unshare_flags); | 1664 | err = check_unshare_flags(unshare_flags); |
1643 | 1665 | if (err) | |
1644 | /* Return -EINVAL for all unsupported flags */ | ||
1645 | err = -EINVAL; | ||
1646 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| | ||
1647 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| | ||
1648 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) | ||
1649 | goto bad_unshare_out; | 1666 | goto bad_unshare_out; |
1650 | 1667 | ||
1651 | /* | 1668 | /* |
1669 | * If unsharing namespace, must also unshare filesystem information. | ||
1670 | */ | ||
1671 | if (unshare_flags & CLONE_NEWNS) | ||
1672 | unshare_flags |= CLONE_FS; | ||
1673 | /* | ||
1652 | * CLONE_NEWIPC must also detach from the undolist: after switching | 1674 | * CLONE_NEWIPC must also detach from the undolist: after switching |
1653 | * to a new ipc namespace, the semaphore arrays from the old | 1675 | * to a new ipc namespace, the semaphore arrays from the old |
1654 | * namespace are unreachable. | 1676 | * namespace are unreachable. |
1655 | */ | 1677 | */ |
1656 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) | 1678 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) |
1657 | do_sysvsem = 1; | 1679 | do_sysvsem = 1; |
1658 | if ((err = unshare_thread(unshare_flags))) | ||
1659 | goto bad_unshare_out; | ||
1660 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1680 | if ((err = unshare_fs(unshare_flags, &new_fs))) |
1661 | goto bad_unshare_cleanup_thread; | 1681 | goto bad_unshare_out; |
1662 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) | ||
1663 | goto bad_unshare_cleanup_fs; | ||
1664 | if ((err = unshare_vm(unshare_flags, &new_mm))) | ||
1665 | goto bad_unshare_cleanup_sigh; | ||
1666 | if ((err = unshare_fd(unshare_flags, &new_fd))) | 1682 | if ((err = unshare_fd(unshare_flags, &new_fd))) |
1667 | goto bad_unshare_cleanup_vm; | 1683 | goto bad_unshare_cleanup_fs; |
1668 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, | 1684 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, |
1669 | new_fs))) | 1685 | new_fs))) |
1670 | goto bad_unshare_cleanup_fd; | 1686 | goto bad_unshare_cleanup_fd; |
1671 | 1687 | ||
1672 | if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { | 1688 | if (new_fs || new_fd || do_sysvsem || new_nsproxy) { |
1673 | if (do_sysvsem) { | 1689 | if (do_sysvsem) { |
1674 | /* | 1690 | /* |
1675 | * CLONE_SYSVSEM is equivalent to sys_exit(). | 1691 | * CLONE_SYSVSEM is equivalent to sys_exit(). |
@@ -1695,15 +1711,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1695 | spin_unlock(&fs->lock); | 1711 | spin_unlock(&fs->lock); |
1696 | } | 1712 | } |
1697 | 1713 | ||
1698 | if (new_mm) { | ||
1699 | mm = current->mm; | ||
1700 | active_mm = current->active_mm; | ||
1701 | current->mm = new_mm; | ||
1702 | current->active_mm = new_mm; | ||
1703 | activate_mm(active_mm, new_mm); | ||
1704 | new_mm = mm; | ||
1705 | } | ||
1706 | |||
1707 | if (new_fd) { | 1714 | if (new_fd) { |
1708 | fd = current->files; | 1715 | fd = current->files; |
1709 | current->files = new_fd; | 1716 | current->files = new_fd; |
@@ -1720,20 +1727,10 @@ bad_unshare_cleanup_fd: | |||
1720 | if (new_fd) | 1727 | if (new_fd) |
1721 | put_files_struct(new_fd); | 1728 | put_files_struct(new_fd); |
1722 | 1729 | ||
1723 | bad_unshare_cleanup_vm: | ||
1724 | if (new_mm) | ||
1725 | mmput(new_mm); | ||
1726 | |||
1727 | bad_unshare_cleanup_sigh: | ||
1728 | if (new_sigh) | ||
1729 | if (atomic_dec_and_test(&new_sigh->count)) | ||
1730 | kmem_cache_free(sighand_cachep, new_sigh); | ||
1731 | |||
1732 | bad_unshare_cleanup_fs: | 1730 | bad_unshare_cleanup_fs: |
1733 | if (new_fs) | 1731 | if (new_fs) |
1734 | free_fs_struct(new_fs); | 1732 | free_fs_struct(new_fs); |
1735 | 1733 | ||
1736 | bad_unshare_cleanup_thread: | ||
1737 | bad_unshare_out: | 1734 | bad_unshare_out: |
1738 | return err; | 1735 | return err; |
1739 | } | 1736 | } |
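Editor's note: with unshare_thread(), unshare_sighand() and unshare_vm() removed, sys_unshare() keeps only the fs and fd legs of its goto-based unwind chain (bad_unshare_cleanup_fd / bad_unshare_cleanup_fs / bad_unshare_out). For readers less familiar with that idiom, here is a minimal, self-contained sketch of the same "jump to the label that frees exactly what has been set up so far" pattern; the types and labels are illustrative stand-ins, not kernel code:

#include <errno.h>
#include <stdlib.h>

struct fs_copy { int unused; };
struct fd_copy { int unused; };

static int unshare_sketch(void)
{
	struct fs_copy *new_fs;
	struct fd_copy *new_fd;
	int err;

	new_fs = malloc(sizeof(*new_fs));	/* stands in for unshare_fs() */
	if (!new_fs) {
		err = -ENOMEM;
		goto bad_unshare_out;
	}
	new_fd = malloc(sizeof(*new_fd));	/* stands in for unshare_fd() */
	if (!new_fd) {
		err = -ENOMEM;
		goto bad_unshare_cleanup_fs;
	}

	/* Success path: the kernel installs the new copies on current; the
	 * sketch simply releases them again so it stays leak-free. */
	free(new_fd);
	free(new_fs);
	return 0;

bad_unshare_cleanup_fs:
	free(new_fs);
bad_unshare_out:
	return err;
}

int main(void)
{
	return unshare_sketch();
}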
diff --git a/kernel/freezer.c b/kernel/freezer.c index bd1d42b17cb2..7b01de98bb6a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -17,7 +17,7 @@ static inline void frozen_process(void) | |||
17 | { | 17 | { |
18 | if (!unlikely(current->flags & PF_NOFREEZE)) { | 18 | if (!unlikely(current->flags & PF_NOFREEZE)) { |
19 | current->flags |= PF_FROZEN; | 19 | current->flags |= PF_FROZEN; |
20 | wmb(); | 20 | smp_wmb(); |
21 | } | 21 | } |
22 | clear_freeze_flag(current); | 22 | clear_freeze_flag(current); |
23 | } | 23 | } |
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only) | |||
93 | * the task as frozen and next clears its TIF_FREEZE. | 93 | * the task as frozen and next clears its TIF_FREEZE. |
94 | */ | 94 | */ |
95 | if (!freezing(p)) { | 95 | if (!freezing(p)) { |
96 | rmb(); | 96 | smp_rmb(); |
97 | if (frozen(p)) | 97 | if (frozen(p)) |
98 | return false; | 98 | return false; |
99 | 99 | ||
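Editor's note: the two conversions above (wmb() -> smp_wmb() in frozen_process(), rmb() -> smp_rmb() in freeze_task()) keep the same writer/reader pairing but use the SMP-conditional barriers, which reduce to compiler barriers on UP builds. A rough userspace analogue of that pairing, with C11 fences standing in for the kernel barrier macros (illustrative only, not the freezer code):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool frozen;			/* ~ PF_FROZEN  */
static atomic_bool freeze_requested = true;	/* ~ TIF_FREEZE */

static void frozen_process_sketch(void)	/* writer side */
{
	atomic_store_explicit(&frozen, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);		/* ~ smp_wmb() */
	atomic_store_explicit(&freeze_requested, false, memory_order_relaxed);
}

static bool freeze_task_sketch(void)		/* reader side */
{
	if (!atomic_load_explicit(&freeze_requested, memory_order_relaxed)) {
		atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
		if (atomic_load_explicit(&frozen, memory_order_relaxed))
			return false;	/* already frozen, nothing to do */
	}
	return true;				/* keep trying to freeze */
}

int main(void)
{
	frozen_process_sketch();
	return freeze_task_sketch() ? 1 : 0;	/* expect 0: frozen is seen */
}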
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only) | |||
104 | } | 104 | } |
105 | 105 | ||
106 | if (should_send_signal(p)) { | 106 | if (should_send_signal(p)) { |
107 | if (!signal_pending(p)) | 107 | fake_signal_wake_up(p); |
108 | fake_signal_wake_up(p); | 108 | /* |
109 | * fake_signal_wake_up() goes through p's scheduler | ||
110 | * lock and guarantees that TASK_STOPPED/TRACED -> | ||
111 | * TASK_RUNNING transition can't race with task state | ||
112 | * testing in try_to_freeze_tasks(). | ||
113 | */ | ||
109 | } else if (sig_only) { | 114 | } else if (sig_only) { |
110 | return false; | 115 | return false; |
111 | } else { | 116 | } else { |
diff --git a/kernel/futex.c b/kernel/futex.c index 6a3a5fa1526d..fe28dc282eae 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled; | |||
69 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 69 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
70 | 70 | ||
71 | /* | 71 | /* |
72 | * Futex flags used to encode options to functions and preserve them across | ||
73 | * restarts. | ||
74 | */ | ||
75 | #define FLAGS_SHARED 0x01 | ||
76 | #define FLAGS_CLOCKRT 0x02 | ||
77 | #define FLAGS_HAS_TIMEOUT 0x04 | ||
78 | |||
79 | /* | ||
72 | * Priority Inheritance state: | 80 | * Priority Inheritance state: |
73 | */ | 81 | */ |
74 | struct futex_pi_state { | 82 | struct futex_pi_state { |
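Editor's note: FLAGS_SHARED, FLAGS_CLOCKRT and FLAGS_HAS_TIMEOUT move to the top of futex.c because, as the rest of this diff shows, every futex operation now takes a single flags word instead of a bare fshared int. The exact do_futex() decoding is outside this excerpt, so the helper below is only a plausible userspace model of how such a word is derived from the futex op; the FLAGS_* values are copied from the hunk above, the FUTEX_* constants come from <linux/futex.h>:

#include <linux/futex.h>
#include <stdio.h>

#define FLAGS_SHARED		0x01
#define FLAGS_CLOCKRT		0x02
#define FLAGS_HAS_TIMEOUT	0x04

static unsigned int decode_futex_flags(int op)
{
	unsigned int flags = 0;

	if (!(op & FUTEX_PRIVATE_FLAG))		/* default futexes are shared */
		flags |= FLAGS_SHARED;
	if (op & FUTEX_CLOCK_REALTIME)		/* e.g. FUTEX_WAIT_BITSET     */
		flags |= FLAGS_CLOCKRT;
	return flags;
}

int main(void)
{
	printf("FUTEX_WAIT_PRIVATE                    -> flags 0x%x\n",
	       decode_futex_flags(FUTEX_WAIT_PRIVATE));
	printf("FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME -> flags 0x%x\n",
	       decode_futex_flags(FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME));
	return 0;
}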
@@ -91,6 +99,7 @@ struct futex_pi_state { | |||
91 | 99 | ||
92 | /** | 100 | /** |
93 | * struct futex_q - The hashed futex queue entry, one per waiting task | 101 | * struct futex_q - The hashed futex queue entry, one per waiting task |
102 | * @list: priority-sorted list of tasks waiting on this futex | ||
94 | * @task: the task waiting on the futex | 103 | * @task: the task waiting on the futex |
95 | * @lock_ptr: the hash bucket lock | 104 | * @lock_ptr: the hash bucket lock |
96 | * @key: the key the futex is hashed on | 105 | * @key: the key the futex is hashed on |
@@ -104,7 +113,7 @@ struct futex_pi_state { | |||
104 | * | 113 | * |
105 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. | 114 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. |
106 | * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. | 115 | * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. |
107 | * The order of wakup is always to make the first condition true, then | 116 | * The order of wakeup is always to make the first condition true, then |
108 | * the second. | 117 | * the second. |
109 | * | 118 | * |
110 | * PI futexes are typically woken before they are removed from the hash list via | 119 | * PI futexes are typically woken before they are removed from the hash list via |
@@ -122,6 +131,12 @@ struct futex_q { | |||
122 | u32 bitset; | 131 | u32 bitset; |
123 | }; | 132 | }; |
124 | 133 | ||
134 | static const struct futex_q futex_q_init = { | ||
135 | /* list gets initialized in queue_me()*/ | ||
136 | .key = FUTEX_KEY_INIT, | ||
137 | .bitset = FUTEX_BITSET_MATCH_ANY | ||
138 | }; | ||
139 | |||
125 | /* | 140 | /* |
126 | * Hash buckets are shared by all the futex_keys that hash to the same | 141 | * Hash buckets are shared by all the futex_keys that hash to the same |
127 | * location. Each key may have multiple futex_q structures, one for each task | 142 | * location. Each key may have multiple futex_q structures, one for each task |
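Editor's note: futex_q_init gives every on-stack futex_q one source of default field values; later hunks replace the scattered "q.pi_state = NULL; q.rt_waiter = NULL; ..." assignments with "struct futex_q q = futex_q_init". The idiom in miniature, using a toy struct rather than the kernel one:

#include <stdio.h>

struct waiter {
	int bitset;
	void *pi_state;
	void *rt_waiter;
};

/* One place defines the defaults ... */
static const struct waiter waiter_init = {
	.bitset = ~0,	/* ~ FUTEX_BITSET_MATCH_ANY */
	/* members not named here (pi_state, rt_waiter) are implicitly zeroed */
};

int main(void)
{
	/* ... and every user starts from a fully initialized copy. */
	struct waiter w = waiter_init;

	w.bitset = 0x1;	/* override only what differs per call */
	printf("pi_state=%p rt_waiter=%p bitset=%#x\n",
	       w.pi_state, w.rt_waiter, w.bitset);
	return 0;
}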
@@ -168,7 +183,7 @@ static void get_futex_key_refs(union futex_key *key) | |||
168 | 183 | ||
169 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { | 184 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
170 | case FUT_OFF_INODE: | 185 | case FUT_OFF_INODE: |
171 | atomic_inc(&key->shared.inode->i_count); | 186 | ihold(key->shared.inode); |
172 | break; | 187 | break; |
173 | case FUT_OFF_MMSHARED: | 188 | case FUT_OFF_MMSHARED: |
174 | atomic_inc(&key->private.mm->mm_count); | 189 | atomic_inc(&key->private.mm->mm_count); |
@@ -218,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) | |||
218 | { | 233 | { |
219 | unsigned long address = (unsigned long)uaddr; | 234 | unsigned long address = (unsigned long)uaddr; |
220 | struct mm_struct *mm = current->mm; | 235 | struct mm_struct *mm = current->mm; |
221 | struct page *page; | 236 | struct page *page, *page_head; |
222 | int err; | 237 | int err; |
223 | 238 | ||
224 | /* | 239 | /* |
@@ -250,11 +265,46 @@ again: | |||
250 | if (err < 0) | 265 | if (err < 0) |
251 | return err; | 266 | return err; |
252 | 267 | ||
253 | page = compound_head(page); | 268 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
254 | lock_page(page); | 269 | page_head = page; |
255 | if (!page->mapping) { | 270 | if (unlikely(PageTail(page))) { |
256 | unlock_page(page); | ||
257 | put_page(page); | 271 | put_page(page); |
272 | /* serialize against __split_huge_page_splitting() */ | ||
273 | local_irq_disable(); | ||
274 | if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { | ||
275 | page_head = compound_head(page); | ||
276 | /* | ||
277 | * page_head is valid pointer but we must pin | ||
278 | * it before taking the PG_lock and/or | ||
279 | * PG_compound_lock. The moment we re-enable | ||
280 | * irqs __split_huge_page_splitting() can | ||
281 | * return and the head page can be freed from | ||
282 | * under us. We can't take the PG_lock and/or | ||
283 | * PG_compound_lock on a page that could be | ||
284 | * freed from under us. | ||
285 | */ | ||
286 | if (page != page_head) { | ||
287 | get_page(page_head); | ||
288 | put_page(page); | ||
289 | } | ||
290 | local_irq_enable(); | ||
291 | } else { | ||
292 | local_irq_enable(); | ||
293 | goto again; | ||
294 | } | ||
295 | } | ||
296 | #else | ||
297 | page_head = compound_head(page); | ||
298 | if (page != page_head) { | ||
299 | get_page(page_head); | ||
300 | put_page(page); | ||
301 | } | ||
302 | #endif | ||
303 | |||
304 | lock_page(page_head); | ||
305 | if (!page_head->mapping) { | ||
306 | unlock_page(page_head); | ||
307 | put_page(page_head); | ||
258 | goto again; | 308 | goto again; |
259 | } | 309 | } |
260 | 310 | ||
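Editor's note: both the transparent-hugepage branch and the #else branch above follow the same rule: take a reference on page_head before dropping the reference held through the tail page. A minimal model of why that ordering matters, simplified to the case where tail and head share a single reference count (plain C, not kernel code, and it ignores the irq-disable serialization against __split_huge_page_splitting()):

#include <assert.h>
#include <stdbool.h>

struct compound { int refcount; bool freed; };
struct page	{ struct compound *c; };

static void get_page(struct page *p) { p->c->refcount++; }
static void put_page(struct page *p)
{
	if (--p->c->refcount == 0)
		p->c->freed = true;	/* page goes back to the allocator */
}

int main(void)
{
	struct compound comp = { .refcount = 1, .freed = false };
	struct page head = { &comp }, tail = { &comp };

	/* We arrive holding only the reference obtained through the tail. */
	get_page(&head);	/* pin the head first ...             */
	put_page(&tail);	/* ... then dropping the tail is safe */
	assert(!comp.freed);	/* the head page is still alive       */

	put_page(&head);	/* final reference */
	return 0;
}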
@@ -265,25 +315,24 @@ again: | |||
265 | * it's a read-only handle, it's expected that futexes attach to | 315 | * it's a read-only handle, it's expected that futexes attach to |
266 | * the object not the particular process. | 316 | * the object not the particular process. |
267 | */ | 317 | */ |
268 | if (PageAnon(page)) { | 318 | if (PageAnon(page_head)) { |
269 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ | 319 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ |
270 | key->private.mm = mm; | 320 | key->private.mm = mm; |
271 | key->private.address = address; | 321 | key->private.address = address; |
272 | } else { | 322 | } else { |
273 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ | 323 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
274 | key->shared.inode = page->mapping->host; | 324 | key->shared.inode = page_head->mapping->host; |
275 | key->shared.pgoff = page->index; | 325 | key->shared.pgoff = page_head->index; |
276 | } | 326 | } |
277 | 327 | ||
278 | get_futex_key_refs(key); | 328 | get_futex_key_refs(key); |
279 | 329 | ||
280 | unlock_page(page); | 330 | unlock_page(page_head); |
281 | put_page(page); | 331 | put_page(page_head); |
282 | return 0; | 332 | return 0; |
283 | } | 333 | } |
284 | 334 | ||
285 | static inline | 335 | static inline void put_futex_key(union futex_key *key) |
286 | void put_futex_key(int fshared, union futex_key *key) | ||
287 | { | 336 | { |
288 | drop_futex_key_refs(key); | 337 | drop_futex_key_refs(key); |
289 | } | 338 | } |
@@ -295,7 +344,7 @@ void put_futex_key(int fshared, union futex_key *key) | |||
295 | * Slow path to fixup the fault we just took in the atomic write | 344 | * Slow path to fixup the fault we just took in the atomic write |
296 | * access to @uaddr. | 345 | * access to @uaddr. |
297 | * | 346 | * |
298 | * We have no generic implementation of a non destructive write to the | 347 | * We have no generic implementation of a non-destructive write to the |
299 | * user address. We know that we faulted in the atomic pagefault | 348 | * user address. We know that we faulted in the atomic pagefault |
300 | * disabled section so we can as well avoid the #PF overhead by | 349 | * disabled section so we can as well avoid the #PF overhead by |
301 | * calling get_user_pages() right away. | 350 | * calling get_user_pages() right away. |
@@ -332,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, | |||
332 | return NULL; | 381 | return NULL; |
333 | } | 382 | } |
334 | 383 | ||
335 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) | 384 | static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, |
385 | u32 uval, u32 newval) | ||
336 | { | 386 | { |
337 | u32 curval; | 387 | int ret; |
338 | 388 | ||
339 | pagefault_disable(); | 389 | pagefault_disable(); |
340 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | 390 | ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); |
341 | pagefault_enable(); | 391 | pagefault_enable(); |
342 | 392 | ||
343 | return curval; | 393 | return ret; |
344 | } | 394 | } |
345 | 395 | ||
346 | static int get_futex_value_locked(u32 *dest, u32 __user *from) | 396 | static int get_futex_value_locked(u32 *dest, u32 __user *from) |
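Editor's note: the old cmpxchg_futex_value_locked() overloaded its u32 return value to also carry -EFAULT, which is ambiguous. The new signature returns the fault status and hands the observed futex word back through an output pointer, as every converted call site below shows. A userspace sketch of the same calling convention, with __atomic_compare_exchange_n standing in for the architecture's futex_atomic_cmpxchg_inatomic() (illustrative, not the kernel helper):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static int cmpxchg_futex_value(uint32_t *curval, uint32_t *uaddr,
			       uint32_t uval, uint32_t newval)
{
	uint32_t expected = uval;

	if (!uaddr)		/* stand-in for a faulting user access */
		return -EFAULT;

	__atomic_compare_exchange_n(uaddr, &expected, newval, 0,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	*curval = expected;	/* value found at uaddr, success or not */
	return 0;
}

int main(void)
{
	uint32_t word = 0, curval;

	/* Caller pattern used throughout the diff:
	 * fault -> -EFAULT, value changed -> retry, else proceed. */
	if (cmpxchg_futex_value(&curval, &word, 0, 42))
		return 1;				/* fault path */
	if (curval != 0)
		return 2;				/* retry path */
	printf("futex word is now %u\n", word);		/* prints 42  */
	return 0;
}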
@@ -515,7 +565,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
515 | */ | 565 | */ |
516 | pi_state = this->pi_state; | 566 | pi_state = this->pi_state; |
517 | /* | 567 | /* |
518 | * Userspace might have messed up non PI and PI futexes | 568 | * Userspace might have messed up non-PI and PI futexes |
519 | */ | 569 | */ |
520 | if (unlikely(!pi_state)) | 570 | if (unlikely(!pi_state)) |
521 | return -EINVAL; | 571 | return -EINVAL; |
@@ -625,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | |||
625 | struct task_struct *task, int set_waiters) | 675 | struct task_struct *task, int set_waiters) |
626 | { | 676 | { |
627 | int lock_taken, ret, ownerdied = 0; | 677 | int lock_taken, ret, ownerdied = 0; |
628 | u32 uval, newval, curval; | 678 | u32 uval, newval, curval, vpid = task_pid_vnr(task); |
629 | 679 | ||
630 | retry: | 680 | retry: |
631 | ret = lock_taken = 0; | 681 | ret = lock_taken = 0; |
@@ -635,19 +685,17 @@ retry: | |||
635 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | 685 | * (by doing a 0 -> TID atomic cmpxchg), while holding all |
636 | * the locks. It will most likely not succeed. | 686 | * the locks. It will most likely not succeed. |
637 | */ | 687 | */ |
638 | newval = task_pid_vnr(task); | 688 | newval = vpid; |
639 | if (set_waiters) | 689 | if (set_waiters) |
640 | newval |= FUTEX_WAITERS; | 690 | newval |= FUTEX_WAITERS; |
641 | 691 | ||
642 | curval = cmpxchg_futex_value_locked(uaddr, 0, newval); | 692 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) |
643 | |||
644 | if (unlikely(curval == -EFAULT)) | ||
645 | return -EFAULT; | 693 | return -EFAULT; |
646 | 694 | ||
647 | /* | 695 | /* |
648 | * Detect deadlocks. | 696 | * Detect deadlocks. |
649 | */ | 697 | */ |
650 | if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) | 698 | if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) |
651 | return -EDEADLK; | 699 | return -EDEADLK; |
652 | 700 | ||
653 | /* | 701 | /* |
@@ -674,14 +722,12 @@ retry: | |||
674 | */ | 722 | */ |
675 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | 723 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { |
676 | /* Keep the OWNER_DIED bit */ | 724 | /* Keep the OWNER_DIED bit */ |
677 | newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); | 725 | newval = (curval & ~FUTEX_TID_MASK) | vpid; |
678 | ownerdied = 0; | 726 | ownerdied = 0; |
679 | lock_taken = 1; | 727 | lock_taken = 1; |
680 | } | 728 | } |
681 | 729 | ||
682 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | 730 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) |
683 | |||
684 | if (unlikely(curval == -EFAULT)) | ||
685 | return -EFAULT; | 731 | return -EFAULT; |
686 | if (unlikely(curval != uval)) | 732 | if (unlikely(curval != uval)) |
687 | goto retry; | 733 | goto retry; |
@@ -726,6 +772,24 @@ retry: | |||
726 | return ret; | 772 | return ret; |
727 | } | 773 | } |
728 | 774 | ||
775 | /** | ||
776 | * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket | ||
777 | * @q: The futex_q to unqueue | ||
778 | * | ||
779 | * The q->lock_ptr must not be NULL and must be held by the caller. | ||
780 | */ | ||
781 | static void __unqueue_futex(struct futex_q *q) | ||
782 | { | ||
783 | struct futex_hash_bucket *hb; | ||
784 | |||
785 | if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) | ||
786 | || WARN_ON(plist_node_empty(&q->list))) | ||
787 | return; | ||
788 | |||
789 | hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); | ||
790 | plist_del(&q->list, &hb->chain); | ||
791 | } | ||
792 | |||
729 | /* | 793 | /* |
730 | * The hash bucket lock must be held when this is called. | 794 | * The hash bucket lock must be held when this is called. |
731 | * Afterwards, the futex_q must not be accessed. | 795 | * Afterwards, the futex_q must not be accessed. |
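Editor's note: __unqueue_futex() recovers the futex_hash_bucket from q->lock_ptr with container_of(), i.e. by subtracting the offset of the lock member from the pointer it holds. A self-contained illustration of that step, using toy types rather than the kernel structures:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct hash_bucket {
	int lock;	/* stand-in for the spinlock_t 'lock' field */
	int chain;	/* stand-in for the plist head              */
};

int main(void)
{
	struct hash_bucket hb = { 0, 0 };
	int *lock_ptr = &hb.lock;	/* what futex_q::lock_ptr stores */

	struct hash_bucket *found =
		container_of(lock_ptr, struct hash_bucket, lock);
	printf("recovered bucket: %p (expected %p)\n",
	       (void *)found, (void *)&hb);
	return 0;
}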
@@ -736,14 +800,14 @@ static void wake_futex(struct futex_q *q) | |||
736 | 800 | ||
737 | /* | 801 | /* |
738 | * We set q->lock_ptr = NULL _before_ we wake up the task. If | 802 | * We set q->lock_ptr = NULL _before_ we wake up the task. If |
739 | * a non futex wake up happens on another CPU then the task | 803 | * a non-futex wake up happens on another CPU then the task |
740 | * might exit and p would dereference a non existing task | 804 | * might exit and p would dereference a non-existing task |
741 | * struct. Prevent this by holding a reference on p across the | 805 | * struct. Prevent this by holding a reference on p across the |
742 | * wake up. | 806 | * wake up. |
743 | */ | 807 | */ |
744 | get_task_struct(p); | 808 | get_task_struct(p); |
745 | 809 | ||
746 | plist_del(&q->list, &q->list.plist); | 810 | __unqueue_futex(q); |
747 | /* | 811 | /* |
748 | * The waiting task can free the futex_q as soon as | 812 | * The waiting task can free the futex_q as soon as |
749 | * q->lock_ptr = NULL is written, without taking any locks. A | 813 | * q->lock_ptr = NULL is written, without taking any locks. A |
@@ -777,10 +841,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
777 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | 841 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); |
778 | 842 | ||
779 | /* | 843 | /* |
780 | * This happens when we have stolen the lock and the original | 844 | * It is possible that the next waiter (the one that brought |
781 | * pending owner did not enqueue itself back on the rt_mutex. | 845 | * this owner to the kernel) timed out and is no longer |
782 | * Thats not a tragedy. We know that way, that a lock waiter | 846 | * waiting on the lock. |
783 | * is on the fly. We make the futex_q waiter the pending owner. | ||
784 | */ | 847 | */ |
785 | if (!new_owner) | 848 | if (!new_owner) |
786 | new_owner = this->task; | 849 | new_owner = this->task; |
@@ -795,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
795 | 858 | ||
796 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); | 859 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); |
797 | 860 | ||
798 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | 861 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
799 | |||
800 | if (curval == -EFAULT) | ||
801 | ret = -EFAULT; | 862 | ret = -EFAULT; |
802 | else if (curval != uval) | 863 | else if (curval != uval) |
803 | ret = -EINVAL; | 864 | ret = -EINVAL; |
@@ -832,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | |||
832 | * There is no waiter, so we unlock the futex. The owner died | 893 | * There is no waiter, so we unlock the futex. The owner died |
833 | * bit has not to be preserved here. We are the owner: | 894 | * bit has not to be preserved here. We are the owner: |
834 | */ | 895 | */ |
835 | oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); | 896 | if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) |
836 | 897 | return -EFAULT; | |
837 | if (oldval == -EFAULT) | ||
838 | return oldval; | ||
839 | if (oldval != uval) | 898 | if (oldval != uval) |
840 | return -EAGAIN; | 899 | return -EAGAIN; |
841 | 900 | ||
@@ -869,7 +928,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) | |||
869 | /* | 928 | /* |
870 | * Wake up waiters matching bitset queued on this futex (uaddr). | 929 | * Wake up waiters matching bitset queued on this futex (uaddr). |
871 | */ | 930 | */ |
872 | static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | 931 | static int |
932 | futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | ||
873 | { | 933 | { |
874 | struct futex_hash_bucket *hb; | 934 | struct futex_hash_bucket *hb; |
875 | struct futex_q *this, *next; | 935 | struct futex_q *this, *next; |
@@ -880,7 +940,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
880 | if (!bitset) | 940 | if (!bitset) |
881 | return -EINVAL; | 941 | return -EINVAL; |
882 | 942 | ||
883 | ret = get_futex_key(uaddr, fshared, &key); | 943 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); |
884 | if (unlikely(ret != 0)) | 944 | if (unlikely(ret != 0)) |
885 | goto out; | 945 | goto out; |
886 | 946 | ||
@@ -906,7 +966,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
906 | } | 966 | } |
907 | 967 | ||
908 | spin_unlock(&hb->lock); | 968 | spin_unlock(&hb->lock); |
909 | put_futex_key(fshared, &key); | 969 | put_futex_key(&key); |
910 | out: | 970 | out: |
911 | return ret; | 971 | return ret; |
912 | } | 972 | } |
@@ -916,7 +976,7 @@ out: | |||
916 | * to this virtual address: | 976 | * to this virtual address: |
917 | */ | 977 | */ |
918 | static int | 978 | static int |
919 | futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | 979 | futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, |
920 | int nr_wake, int nr_wake2, int op) | 980 | int nr_wake, int nr_wake2, int op) |
921 | { | 981 | { |
922 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 982 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
@@ -926,10 +986,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | |||
926 | int ret, op_ret; | 986 | int ret, op_ret; |
927 | 987 | ||
928 | retry: | 988 | retry: |
929 | ret = get_futex_key(uaddr1, fshared, &key1); | 989 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); |
930 | if (unlikely(ret != 0)) | 990 | if (unlikely(ret != 0)) |
931 | goto out; | 991 | goto out; |
932 | ret = get_futex_key(uaddr2, fshared, &key2); | 992 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); |
933 | if (unlikely(ret != 0)) | 993 | if (unlikely(ret != 0)) |
934 | goto out_put_key1; | 994 | goto out_put_key1; |
935 | 995 | ||
@@ -961,11 +1021,11 @@ retry_private: | |||
961 | if (ret) | 1021 | if (ret) |
962 | goto out_put_keys; | 1022 | goto out_put_keys; |
963 | 1023 | ||
964 | if (!fshared) | 1024 | if (!(flags & FLAGS_SHARED)) |
965 | goto retry_private; | 1025 | goto retry_private; |
966 | 1026 | ||
967 | put_futex_key(fshared, &key2); | 1027 | put_futex_key(&key2); |
968 | put_futex_key(fshared, &key1); | 1028 | put_futex_key(&key1); |
969 | goto retry; | 1029 | goto retry; |
970 | } | 1030 | } |
971 | 1031 | ||
@@ -995,9 +1055,9 @@ retry_private: | |||
995 | 1055 | ||
996 | double_unlock_hb(hb1, hb2); | 1056 | double_unlock_hb(hb1, hb2); |
997 | out_put_keys: | 1057 | out_put_keys: |
998 | put_futex_key(fshared, &key2); | 1058 | put_futex_key(&key2); |
999 | out_put_key1: | 1059 | out_put_key1: |
1000 | put_futex_key(fshared, &key1); | 1060 | put_futex_key(&key1); |
1001 | out: | 1061 | out: |
1002 | return ret; | 1062 | return ret; |
1003 | } | 1063 | } |
@@ -1022,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, | |||
1022 | plist_del(&q->list, &hb1->chain); | 1082 | plist_del(&q->list, &hb1->chain); |
1023 | plist_add(&q->list, &hb2->chain); | 1083 | plist_add(&q->list, &hb2->chain); |
1024 | q->lock_ptr = &hb2->lock; | 1084 | q->lock_ptr = &hb2->lock; |
1025 | #ifdef CONFIG_DEBUG_PI_LIST | ||
1026 | q->list.plist.spinlock = &hb2->lock; | ||
1027 | #endif | ||
1028 | } | 1085 | } |
1029 | get_futex_key_refs(key2); | 1086 | get_futex_key_refs(key2); |
1030 | q->key = *key2; | 1087 | q->key = *key2; |
@@ -1051,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | |||
1051 | get_futex_key_refs(key); | 1108 | get_futex_key_refs(key); |
1052 | q->key = *key; | 1109 | q->key = *key; |
1053 | 1110 | ||
1054 | WARN_ON(plist_node_empty(&q->list)); | 1111 | __unqueue_futex(q); |
1055 | plist_del(&q->list, &q->list.plist); | ||
1056 | 1112 | ||
1057 | WARN_ON(!q->rt_waiter); | 1113 | WARN_ON(!q->rt_waiter); |
1058 | q->rt_waiter = NULL; | 1114 | q->rt_waiter = NULL; |
1059 | 1115 | ||
1060 | q->lock_ptr = &hb->lock; | 1116 | q->lock_ptr = &hb->lock; |
1061 | #ifdef CONFIG_DEBUG_PI_LIST | ||
1062 | q->list.plist.spinlock = &hb->lock; | ||
1063 | #endif | ||
1064 | 1117 | ||
1065 | wake_up_state(q->task, TASK_NORMAL); | 1118 | wake_up_state(q->task, TASK_NORMAL); |
1066 | } | 1119 | } |
@@ -1131,12 +1184,14 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
1131 | 1184 | ||
1132 | /** | 1185 | /** |
1133 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | 1186 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 |
1134 | * uaddr1: source futex user address | 1187 | * @uaddr1: source futex user address |
1135 | * uaddr2: target futex user address | 1188 | * @flags: futex flags (FLAGS_SHARED, etc.) |
1136 | * nr_wake: number of waiters to wake (must be 1 for requeue_pi) | 1189 | * @uaddr2: target futex user address |
1137 | * nr_requeue: number of waiters to requeue (0-INT_MAX) | 1190 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) |
1138 | * requeue_pi: if we are attempting to requeue from a non-pi futex to a | 1191 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) |
1139 | * pi futex (pi to pi requeue is not supported) | 1192 | * @cmpval: @uaddr1 expected value (or %NULL) |
1193 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a | ||
1194 | * pi futex (pi to pi requeue is not supported) | ||
1140 | * | 1195 | * |
1141 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | 1196 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire |
1142 | * uaddr2 atomically on behalf of the top waiter. | 1197 | * uaddr2 atomically on behalf of the top waiter. |
@@ -1145,9 +1200,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
1145 | * >=0 - on success, the number of tasks requeued or woken | 1200 | * >=0 - on success, the number of tasks requeued or woken |
1146 | * <0 - on error | 1201 | * <0 - on error |
1147 | */ | 1202 | */ |
1148 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | 1203 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, |
1149 | int nr_wake, int nr_requeue, u32 *cmpval, | 1204 | u32 __user *uaddr2, int nr_wake, int nr_requeue, |
1150 | int requeue_pi) | 1205 | u32 *cmpval, int requeue_pi) |
1151 | { | 1206 | { |
1152 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 1207 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
1153 | int drop_count = 0, task_count = 0, ret; | 1208 | int drop_count = 0, task_count = 0, ret; |
@@ -1188,10 +1243,10 @@ retry: | |||
1188 | pi_state = NULL; | 1243 | pi_state = NULL; |
1189 | } | 1244 | } |
1190 | 1245 | ||
1191 | ret = get_futex_key(uaddr1, fshared, &key1); | 1246 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); |
1192 | if (unlikely(ret != 0)) | 1247 | if (unlikely(ret != 0)) |
1193 | goto out; | 1248 | goto out; |
1194 | ret = get_futex_key(uaddr2, fshared, &key2); | 1249 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); |
1195 | if (unlikely(ret != 0)) | 1250 | if (unlikely(ret != 0)) |
1196 | goto out_put_key1; | 1251 | goto out_put_key1; |
1197 | 1252 | ||
@@ -1213,11 +1268,11 @@ retry_private: | |||
1213 | if (ret) | 1268 | if (ret) |
1214 | goto out_put_keys; | 1269 | goto out_put_keys; |
1215 | 1270 | ||
1216 | if (!fshared) | 1271 | if (!(flags & FLAGS_SHARED)) |
1217 | goto retry_private; | 1272 | goto retry_private; |
1218 | 1273 | ||
1219 | put_futex_key(fshared, &key2); | 1274 | put_futex_key(&key2); |
1220 | put_futex_key(fshared, &key1); | 1275 | put_futex_key(&key1); |
1221 | goto retry; | 1276 | goto retry; |
1222 | } | 1277 | } |
1223 | if (curval != *cmpval) { | 1278 | if (curval != *cmpval) { |
@@ -1257,8 +1312,8 @@ retry_private: | |||
1257 | break; | 1312 | break; |
1258 | case -EFAULT: | 1313 | case -EFAULT: |
1259 | double_unlock_hb(hb1, hb2); | 1314 | double_unlock_hb(hb1, hb2); |
1260 | put_futex_key(fshared, &key2); | 1315 | put_futex_key(&key2); |
1261 | put_futex_key(fshared, &key1); | 1316 | put_futex_key(&key1); |
1262 | ret = fault_in_user_writeable(uaddr2); | 1317 | ret = fault_in_user_writeable(uaddr2); |
1263 | if (!ret) | 1318 | if (!ret) |
1264 | goto retry; | 1319 | goto retry; |
@@ -1266,8 +1321,8 @@ retry_private: | |||
1266 | case -EAGAIN: | 1321 | case -EAGAIN: |
1267 | /* The owner was exiting, try again. */ | 1322 | /* The owner was exiting, try again. */ |
1268 | double_unlock_hb(hb1, hb2); | 1323 | double_unlock_hb(hb1, hb2); |
1269 | put_futex_key(fshared, &key2); | 1324 | put_futex_key(&key2); |
1270 | put_futex_key(fshared, &key1); | 1325 | put_futex_key(&key1); |
1271 | cond_resched(); | 1326 | cond_resched(); |
1272 | goto retry; | 1327 | goto retry; |
1273 | default: | 1328 | default: |
@@ -1349,9 +1404,9 @@ out_unlock: | |||
1349 | drop_futex_key_refs(&key1); | 1404 | drop_futex_key_refs(&key1); |
1350 | 1405 | ||
1351 | out_put_keys: | 1406 | out_put_keys: |
1352 | put_futex_key(fshared, &key2); | 1407 | put_futex_key(&key2); |
1353 | out_put_key1: | 1408 | out_put_key1: |
1354 | put_futex_key(fshared, &key1); | 1409 | put_futex_key(&key1); |
1355 | out: | 1410 | out: |
1356 | if (pi_state != NULL) | 1411 | if (pi_state != NULL) |
1357 | free_pi_state(pi_state); | 1412 | free_pi_state(pi_state); |
@@ -1360,10 +1415,10 @@ out: | |||
1360 | 1415 | ||
1361 | /* The key must be already stored in q->key. */ | 1416 | /* The key must be already stored in q->key. */ |
1362 | static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | 1417 | static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) |
1418 | __acquires(&hb->lock) | ||
1363 | { | 1419 | { |
1364 | struct futex_hash_bucket *hb; | 1420 | struct futex_hash_bucket *hb; |
1365 | 1421 | ||
1366 | get_futex_key_refs(&q->key); | ||
1367 | hb = hash_futex(&q->key); | 1422 | hb = hash_futex(&q->key); |
1368 | q->lock_ptr = &hb->lock; | 1423 | q->lock_ptr = &hb->lock; |
1369 | 1424 | ||
@@ -1373,9 +1428,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | |||
1373 | 1428 | ||
1374 | static inline void | 1429 | static inline void |
1375 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | 1430 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
1431 | __releases(&hb->lock) | ||
1376 | { | 1432 | { |
1377 | spin_unlock(&hb->lock); | 1433 | spin_unlock(&hb->lock); |
1378 | drop_futex_key_refs(&q->key); | ||
1379 | } | 1434 | } |
1380 | 1435 | ||
1381 | /** | 1436 | /** |
@@ -1391,6 +1446,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | |||
1391 | * an example). | 1446 | * an example). |
1392 | */ | 1447 | */ |
1393 | static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | 1448 | static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) |
1449 | __releases(&hb->lock) | ||
1394 | { | 1450 | { |
1395 | int prio; | 1451 | int prio; |
1396 | 1452 | ||
@@ -1405,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | |||
1405 | prio = min(current->normal_prio, MAX_RT_PRIO); | 1461 | prio = min(current->normal_prio, MAX_RT_PRIO); |
1406 | 1462 | ||
1407 | plist_node_init(&q->list, prio); | 1463 | plist_node_init(&q->list, prio); |
1408 | #ifdef CONFIG_DEBUG_PI_LIST | ||
1409 | q->list.plist.spinlock = &hb->lock; | ||
1410 | #endif | ||
1411 | plist_add(&q->list, &hb->chain); | 1464 | plist_add(&q->list, &hb->chain); |
1412 | q->task = current; | 1465 | q->task = current; |
1413 | spin_unlock(&hb->lock); | 1466 | spin_unlock(&hb->lock); |
@@ -1452,8 +1505,7 @@ retry: | |||
1452 | spin_unlock(lock_ptr); | 1505 | spin_unlock(lock_ptr); |
1453 | goto retry; | 1506 | goto retry; |
1454 | } | 1507 | } |
1455 | WARN_ON(plist_node_empty(&q->list)); | 1508 | __unqueue_futex(q); |
1456 | plist_del(&q->list, &q->list.plist); | ||
1457 | 1509 | ||
1458 | BUG_ON(q->pi_state); | 1510 | BUG_ON(q->pi_state); |
1459 | 1511 | ||
@@ -1471,17 +1523,15 @@ retry: | |||
1471 | * and dropped here. | 1523 | * and dropped here. |
1472 | */ | 1524 | */ |
1473 | static void unqueue_me_pi(struct futex_q *q) | 1525 | static void unqueue_me_pi(struct futex_q *q) |
1526 | __releases(q->lock_ptr) | ||
1474 | { | 1527 | { |
1475 | WARN_ON(plist_node_empty(&q->list)); | 1528 | __unqueue_futex(q); |
1476 | plist_del(&q->list, &q->list.plist); | ||
1477 | 1529 | ||
1478 | BUG_ON(!q->pi_state); | 1530 | BUG_ON(!q->pi_state); |
1479 | free_pi_state(q->pi_state); | 1531 | free_pi_state(q->pi_state); |
1480 | q->pi_state = NULL; | 1532 | q->pi_state = NULL; |
1481 | 1533 | ||
1482 | spin_unlock(q->lock_ptr); | 1534 | spin_unlock(q->lock_ptr); |
1483 | |||
1484 | drop_futex_key_refs(&q->key); | ||
1485 | } | 1535 | } |
1486 | 1536 | ||
1487 | /* | 1537 | /* |
@@ -1491,7 +1541,7 @@ static void unqueue_me_pi(struct futex_q *q) | |||
1491 | * private futexes. | 1541 | * private futexes. |
1492 | */ | 1542 | */ |
1493 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | 1543 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, |
1494 | struct task_struct *newowner, int fshared) | 1544 | struct task_struct *newowner) |
1495 | { | 1545 | { |
1496 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; | 1546 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; |
1497 | struct futex_pi_state *pi_state = q->pi_state; | 1547 | struct futex_pi_state *pi_state = q->pi_state; |
@@ -1505,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
1505 | 1555 | ||
1506 | /* | 1556 | /* |
1507 | * We are here either because we stole the rtmutex from the | 1557 | * We are here either because we stole the rtmutex from the |
1508 | * pending owner or we are the pending owner which failed to | 1558 | * previous highest priority waiter or we are the highest priority |
1509 | * get the rtmutex. We have to replace the pending owner TID | 1559 | * waiter but failed to get the rtmutex the first time. |
1510 | * in the user space variable. This must be atomic as we have | 1560 | * We have to replace the newowner TID in the user space variable. |
1511 | * to preserve the owner died bit here. | 1561 | * This must be atomic as we have to preserve the owner died bit here. |
1512 | * | 1562 | * |
1513 | * Note: We write the user space value _before_ changing the pi_state | 1563 | * Note: We write the user space value _before_ changing the pi_state |
1514 | * because we can fault here. Imagine swapped out pages or a fork | 1564 | * because we can fault here. Imagine swapped out pages or a fork |
@@ -1527,9 +1577,7 @@ retry: | |||
1527 | while (1) { | 1577 | while (1) { |
1528 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | 1578 | newval = (uval & FUTEX_OWNER_DIED) | newtid; |
1529 | 1579 | ||
1530 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | 1580 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
1531 | |||
1532 | if (curval == -EFAULT) | ||
1533 | goto handle_fault; | 1581 | goto handle_fault; |
1534 | if (curval == uval) | 1582 | if (curval == uval) |
1535 | break; | 1583 | break; |
@@ -1557,8 +1605,8 @@ retry: | |||
1557 | 1605 | ||
1558 | /* | 1606 | /* |
1559 | * To handle the page fault we need to drop the hash bucket | 1607 | * To handle the page fault we need to drop the hash bucket |
1560 | * lock here. That gives the other task (either the pending | 1608 | * lock here. That gives the other task (either the highest priority |
1561 | * owner itself or the task which stole the rtmutex) the | 1609 | * waiter itself or the task which stole the rtmutex) the |
1562 | * chance to try the fixup of the pi_state. So once we are | 1610 | * chance to try the fixup of the pi_state. So once we are |
1563 | * back from handling the fault we need to check the pi_state | 1611 | * back from handling the fault we need to check the pi_state |
1564 | * after reacquiring the hash bucket lock and before trying to | 1612 | * after reacquiring the hash bucket lock and before trying to |
@@ -1584,20 +1632,11 @@ handle_fault: | |||
1584 | goto retry; | 1632 | goto retry; |
1585 | } | 1633 | } |
1586 | 1634 | ||
1587 | /* | ||
1588 | * In case we must use restart_block to restart a futex_wait, | ||
1589 | * we encode in the 'flags' shared capability | ||
1590 | */ | ||
1591 | #define FLAGS_SHARED 0x01 | ||
1592 | #define FLAGS_CLOCKRT 0x02 | ||
1593 | #define FLAGS_HAS_TIMEOUT 0x04 | ||
1594 | |||
1595 | static long futex_wait_restart(struct restart_block *restart); | 1635 | static long futex_wait_restart(struct restart_block *restart); |
1596 | 1636 | ||
1597 | /** | 1637 | /** |
1598 | * fixup_owner() - Post lock pi_state and corner case management | 1638 | * fixup_owner() - Post lock pi_state and corner case management |
1599 | * @uaddr: user address of the futex | 1639 | * @uaddr: user address of the futex |
1600 | * @fshared: whether the futex is shared (1) or not (0) | ||
1601 | * @q: futex_q (contains pi_state and access to the rt_mutex) | 1640 | * @q: futex_q (contains pi_state and access to the rt_mutex) |
1602 | * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) | 1641 | * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) |
1603 | * | 1642 | * |
@@ -1610,8 +1649,7 @@ static long futex_wait_restart(struct restart_block *restart); | |||
1610 | * 0 - success, lock not taken | 1649 | * 0 - success, lock not taken |
1611 | * <0 - on error (-EFAULT) | 1650 | * <0 - on error (-EFAULT) |
1612 | */ | 1651 | */ |
1613 | static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | 1652 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) |
1614 | int locked) | ||
1615 | { | 1653 | { |
1616 | struct task_struct *owner; | 1654 | struct task_struct *owner; |
1617 | int ret = 0; | 1655 | int ret = 0; |
@@ -1622,7 +1660,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | |||
1622 | * did a lock-steal - fix up the PI-state in that case: | 1660 | * did a lock-steal - fix up the PI-state in that case: |
1623 | */ | 1661 | */ |
1624 | if (q->pi_state->owner != current) | 1662 | if (q->pi_state->owner != current) |
1625 | ret = fixup_pi_state_owner(uaddr, q, current, fshared); | 1663 | ret = fixup_pi_state_owner(uaddr, q, current); |
1626 | goto out; | 1664 | goto out; |
1627 | } | 1665 | } |
1628 | 1666 | ||
@@ -1644,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | |||
1644 | /* | 1682 | /* |
1645 | * pi_state is incorrect, some other task did a lock steal and | 1683 | * pi_state is incorrect, some other task did a lock steal and |
1646 | * we returned due to timeout or signal without taking the | 1684 | * we returned due to timeout or signal without taking the |
1647 | * rt_mutex. Too late. We can access the rt_mutex_owner without | 1685 | * rt_mutex. Too late. |
1648 | * locking, as the other task is now blocked on the hash bucket | ||
1649 | * lock. Fix the state up. | ||
1650 | */ | 1686 | */ |
1687 | raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); | ||
1651 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); | 1688 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); |
1652 | ret = fixup_pi_state_owner(uaddr, q, owner, fshared); | 1689 | if (!owner) |
1690 | owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); | ||
1691 | raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); | ||
1692 | ret = fixup_pi_state_owner(uaddr, q, owner); | ||
1653 | goto out; | 1693 | goto out; |
1654 | } | 1694 | } |
1655 | 1695 | ||
1656 | /* | 1696 | /* |
1657 | * Paranoia check. If we did not take the lock, then we should not be | 1697 | * Paranoia check. If we did not take the lock, then we should not be |
1658 | * the owner, nor the pending owner, of the rt_mutex. | 1698 | * the owner of the rt_mutex. |
1659 | */ | 1699 | */ |
1660 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) | 1700 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) |
1661 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " | 1701 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " |
@@ -1712,7 +1752,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
1712 | * futex_wait_setup() - Prepare to wait on a futex | 1752 | * futex_wait_setup() - Prepare to wait on a futex |
1713 | * @uaddr: the futex userspace address | 1753 | * @uaddr: the futex userspace address |
1714 | * @val: the expected value | 1754 | * @val: the expected value |
1715 | * @fshared: whether the futex is shared (1) or not (0) | 1755 | * @flags: futex flags (FLAGS_SHARED, etc.) |
1716 | * @q: the associated futex_q | 1756 | * @q: the associated futex_q |
1717 | * @hb: storage for hash_bucket pointer to be returned to caller | 1757 | * @hb: storage for hash_bucket pointer to be returned to caller |
1718 | * | 1758 | * |
@@ -1725,7 +1765,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
1725 | * 0 - uaddr contains val and hb has been locked | 1765 | * 0 - uaddr contains val and hb has been locked |
1726 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked | 1766 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked |
1727 | */ | 1767 | */ |
1728 | static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, | 1768 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, |
1729 | struct futex_q *q, struct futex_hash_bucket **hb) | 1769 | struct futex_q *q, struct futex_hash_bucket **hb) |
1730 | { | 1770 | { |
1731 | u32 uval; | 1771 | u32 uval; |
@@ -1740,17 +1780,17 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, | |||
1740 | * | 1780 | * |
1741 | * The basic logical guarantee of a futex is that it blocks ONLY | 1781 | * The basic logical guarantee of a futex is that it blocks ONLY |
1742 | * if cond(var) is known to be true at the time of blocking, for | 1782 | * if cond(var) is known to be true at the time of blocking, for |
1743 | * any cond. If we queued after testing *uaddr, that would open | 1783 | * any cond. If we locked the hash-bucket after testing *uaddr, that |
1744 | * a race condition where we could block indefinitely with | 1784 | * would open a race condition where we could block indefinitely with |
1745 | * cond(var) false, which would violate the guarantee. | 1785 | * cond(var) false, which would violate the guarantee. |
1746 | * | 1786 | * |
1747 | * A consequence is that futex_wait() can return zero and absorb | 1787 | * On the other hand, we insert q and release the hash-bucket only |
1748 | * a wakeup when *uaddr != val on entry to the syscall. This is | 1788 | * after testing *uaddr. This guarantees that futex_wait() will NOT |
1749 | * rare, but normal. | 1789 | * absorb a wakeup if *uaddr does not match the desired values |
1790 | * while the syscall executes. | ||
1750 | */ | 1791 | */ |
1751 | retry: | 1792 | retry: |
1752 | q->key = FUTEX_KEY_INIT; | 1793 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); |
1753 | ret = get_futex_key(uaddr, fshared, &q->key); | ||
1754 | if (unlikely(ret != 0)) | 1794 | if (unlikely(ret != 0)) |
1755 | return ret; | 1795 | return ret; |
1756 | 1796 | ||
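Editor's note: the reworded comment above states the contract from the kernel side: the hash-bucket lock is taken before *uaddr is tested and the waiter is only queued afterwards, so futex_wait() neither blocks nor absorbs a wakeup when the word no longer holds the expected value. Seen from userspace that is the familiar check/wait/recheck loop; a small sketch under those assumptions (raw futex syscall, error handling trimmed, no waker thread shown):

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>
#include <errno.h>

static atomic_int futex_word;		/* 0 = not ready, 1 = ready */

static long futex_wait_syscall(atomic_int *uaddr, int expected)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAIT, expected, NULL, NULL, 0);
}

static void wait_until_ready(void)
{
	while (atomic_load(&futex_word) == 0) {
		/* If the word changed between the load and the syscall, the
		 * kernel returns EAGAIN instead of blocking -- that is the
		 * "blocks only if cond(var) is true" guarantee above. */
		if (futex_wait_syscall(&futex_word, 0) == -1 &&
		    errno != EAGAIN && errno != EINTR)
			break;
	}
}

int main(void)
{
	atomic_store(&futex_word, 1);	/* condition already true ...        */
	wait_until_ready();		/* ... so this returns immediately   */
	return 0;
}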
@@ -1766,10 +1806,10 @@ retry_private: | |||
1766 | if (ret) | 1806 | if (ret) |
1767 | goto out; | 1807 | goto out; |
1768 | 1808 | ||
1769 | if (!fshared) | 1809 | if (!(flags & FLAGS_SHARED)) |
1770 | goto retry_private; | 1810 | goto retry_private; |
1771 | 1811 | ||
1772 | put_futex_key(fshared, &q->key); | 1812 | put_futex_key(&q->key); |
1773 | goto retry; | 1813 | goto retry; |
1774 | } | 1814 | } |
1775 | 1815 | ||
@@ -1780,40 +1820,40 @@ retry_private: | |||
1780 | 1820 | ||
1781 | out: | 1821 | out: |
1782 | if (ret) | 1822 | if (ret) |
1783 | put_futex_key(fshared, &q->key); | 1823 | put_futex_key(&q->key); |
1784 | return ret; | 1824 | return ret; |
1785 | } | 1825 | } |
1786 | 1826 | ||
1787 | static int futex_wait(u32 __user *uaddr, int fshared, | 1827 | static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, |
1788 | u32 val, ktime_t *abs_time, u32 bitset, int clockrt) | 1828 | ktime_t *abs_time, u32 bitset) |
1789 | { | 1829 | { |
1790 | struct hrtimer_sleeper timeout, *to = NULL; | 1830 | struct hrtimer_sleeper timeout, *to = NULL; |
1791 | struct restart_block *restart; | 1831 | struct restart_block *restart; |
1792 | struct futex_hash_bucket *hb; | 1832 | struct futex_hash_bucket *hb; |
1793 | struct futex_q q; | 1833 | struct futex_q q = futex_q_init; |
1794 | int ret; | 1834 | int ret; |
1795 | 1835 | ||
1796 | if (!bitset) | 1836 | if (!bitset) |
1797 | return -EINVAL; | 1837 | return -EINVAL; |
1798 | |||
1799 | q.pi_state = NULL; | ||
1800 | q.bitset = bitset; | 1838 | q.bitset = bitset; |
1801 | q.rt_waiter = NULL; | ||
1802 | q.requeue_pi_key = NULL; | ||
1803 | 1839 | ||
1804 | if (abs_time) { | 1840 | if (abs_time) { |
1805 | to = &timeout; | 1841 | to = &timeout; |
1806 | 1842 | ||
1807 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | 1843 | hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? |
1808 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 1844 | CLOCK_REALTIME : CLOCK_MONOTONIC, |
1845 | HRTIMER_MODE_ABS); | ||
1809 | hrtimer_init_sleeper(to, current); | 1846 | hrtimer_init_sleeper(to, current); |
1810 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | 1847 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, |
1811 | current->timer_slack_ns); | 1848 | current->timer_slack_ns); |
1812 | } | 1849 | } |
1813 | 1850 | ||
1814 | retry: | 1851 | retry: |
1815 | /* Prepare to wait on uaddr. */ | 1852 | /* |
1816 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 1853 | * Prepare to wait on uaddr. On success, holds hb lock and increments |
1854 | * q.key refs. | ||
1855 | */ | ||
1856 | ret = futex_wait_setup(uaddr, val, flags, &q, &hb); | ||
1817 | if (ret) | 1857 | if (ret) |
1818 | goto out; | 1858 | goto out; |
1819 | 1859 | ||
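The FLAGS_CLOCKRT branch above corresponds to userspace passing FUTEX_CLOCK_REALTIME, which do_futex() further down only accepts together with FUTEX_WAIT_BITSET or FUTEX_WAIT_REQUEUE_PI. A hedged userspace sketch of an absolute CLOCK_REALTIME deadline wait (the wrapper name is made up for illustration):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* FUTEX_WAIT_BITSET takes an absolute timeout; adding FUTEX_CLOCK_REALTIME
 * measures that deadline on CLOCK_REALTIME instead of CLOCK_MONOTONIC,
 * which is what selects CLOCK_REALTIME in the hrtimer setup above. */
static long futex_wait_abs_realtime(int *uaddr, int expected,
                                    const struct timespec *deadline)
{
        return syscall(SYS_futex, uaddr,
                       FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG |
                       FUTEX_CLOCK_REALTIME,
                       expected, deadline, NULL, FUTEX_BITSET_MATCH_ANY);
}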
@@ -1822,42 +1862,34 @@ retry: | |||
1822 | 1862 | ||
1823 | /* If we were woken (and unqueued), we succeeded, whatever. */ | 1863 | /* If we were woken (and unqueued), we succeeded, whatever. */ |
1824 | ret = 0; | 1864 | ret = 0; |
1865 | /* unqueue_me() drops q.key ref */ | ||
1825 | if (!unqueue_me(&q)) | 1866 | if (!unqueue_me(&q)) |
1826 | goto out_put_key; | 1867 | goto out; |
1827 | ret = -ETIMEDOUT; | 1868 | ret = -ETIMEDOUT; |
1828 | if (to && !to->task) | 1869 | if (to && !to->task) |
1829 | goto out_put_key; | 1870 | goto out; |
1830 | 1871 | ||
1831 | /* | 1872 | /* |
1832 | * We expect signal_pending(current), but we might be the | 1873 | * We expect signal_pending(current), but we might be the |
1833 | * victim of a spurious wakeup as well. | 1874 | * victim of a spurious wakeup as well. |
1834 | */ | 1875 | */ |
1835 | if (!signal_pending(current)) { | 1876 | if (!signal_pending(current)) |
1836 | put_futex_key(fshared, &q.key); | ||
1837 | goto retry; | 1877 | goto retry; |
1838 | } | ||
1839 | 1878 | ||
1840 | ret = -ERESTARTSYS; | 1879 | ret = -ERESTARTSYS; |
1841 | if (!abs_time) | 1880 | if (!abs_time) |
1842 | goto out_put_key; | 1881 | goto out; |
1843 | 1882 | ||
1844 | restart = ¤t_thread_info()->restart_block; | 1883 | restart = ¤t_thread_info()->restart_block; |
1845 | restart->fn = futex_wait_restart; | 1884 | restart->fn = futex_wait_restart; |
1846 | restart->futex.uaddr = (u32 *)uaddr; | 1885 | restart->futex.uaddr = uaddr; |
1847 | restart->futex.val = val; | 1886 | restart->futex.val = val; |
1848 | restart->futex.time = abs_time->tv64; | 1887 | restart->futex.time = abs_time->tv64; |
1849 | restart->futex.bitset = bitset; | 1888 | restart->futex.bitset = bitset; |
1850 | restart->futex.flags = FLAGS_HAS_TIMEOUT; | 1889 | restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; |
1851 | |||
1852 | if (fshared) | ||
1853 | restart->futex.flags |= FLAGS_SHARED; | ||
1854 | if (clockrt) | ||
1855 | restart->futex.flags |= FLAGS_CLOCKRT; | ||
1856 | 1890 | ||
1857 | ret = -ERESTART_RESTARTBLOCK; | 1891 | ret = -ERESTART_RESTARTBLOCK; |
1858 | 1892 | ||
1859 | out_put_key: | ||
1860 | put_futex_key(fshared, &q.key); | ||
1861 | out: | 1893 | out: |
1862 | if (to) { | 1894 | if (to) { |
1863 | hrtimer_cancel(&to->timer); | 1895 | hrtimer_cancel(&to->timer); |
@@ -1869,8 +1901,7 @@ out: | |||
1869 | 1901 | ||
1870 | static long futex_wait_restart(struct restart_block *restart) | 1902 | static long futex_wait_restart(struct restart_block *restart) |
1871 | { | 1903 | { |
1872 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; | 1904 | u32 __user *uaddr = restart->futex.uaddr; |
1873 | int fshared = 0; | ||
1874 | ktime_t t, *tp = NULL; | 1905 | ktime_t t, *tp = NULL; |
1875 | 1906 | ||
1876 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { | 1907 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { |
@@ -1878,11 +1909,9 @@ static long futex_wait_restart(struct restart_block *restart) | |||
1878 | tp = &t; | 1909 | tp = &t; |
1879 | } | 1910 | } |
1880 | restart->fn = do_no_restart_syscall; | 1911 | restart->fn = do_no_restart_syscall; |
1881 | if (restart->futex.flags & FLAGS_SHARED) | 1912 | |
1882 | fshared = 1; | 1913 | return (long)futex_wait(uaddr, restart->futex.flags, |
1883 | return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, | 1914 | restart->futex.val, tp, restart->futex.bitset); |
1884 | restart->futex.bitset, | ||
1885 | restart->futex.flags & FLAGS_CLOCKRT); | ||
1886 | } | 1915 | } |
1887 | 1916 | ||
1888 | 1917 | ||
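The restart bookkeeping above works because the per-call options are now one bitmask that can be stored and replayed verbatim. A sketch of that encoding; the FLAGS_* values are the ones this series defines near the top of futex.c (not shown in this excerpt, so treat them as assumed), and op_to_flags() is a hypothetical helper mirroring what do_futex() does inline:

#define FLAGS_SHARED            0x01    /* key may span address spaces */
#define FLAGS_CLOCKRT           0x02    /* timeouts measured on CLOCK_REALTIME */
#define FLAGS_HAS_TIMEOUT       0x04    /* restart block carries a deadline */

static unsigned int op_to_flags(int op)
{
        unsigned int flags = 0;

        if (!(op & FUTEX_PRIVATE_FLAG))
                flags |= FLAGS_SHARED;
        if (op & FUTEX_CLOCK_REALTIME)
                flags |= FLAGS_CLOCKRT;
        return flags;
}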
@@ -1892,12 +1921,12 @@ static long futex_wait_restart(struct restart_block *restart) | |||
1892 | * if there are waiters then it will block, it does PI, etc. (Due to | 1921 | * if there are waiters then it will block, it does PI, etc. (Due to |
1893 | * races the kernel might see a 0 value of the futex too.) | 1922 | * races the kernel might see a 0 value of the futex too.) |
1894 | */ | 1923 | */ |
1895 | static int futex_lock_pi(u32 __user *uaddr, int fshared, | 1924 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, |
1896 | int detect, ktime_t *time, int trylock) | 1925 | ktime_t *time, int trylock) |
1897 | { | 1926 | { |
1898 | struct hrtimer_sleeper timeout, *to = NULL; | 1927 | struct hrtimer_sleeper timeout, *to = NULL; |
1899 | struct futex_hash_bucket *hb; | 1928 | struct futex_hash_bucket *hb; |
1900 | struct futex_q q; | 1929 | struct futex_q q = futex_q_init; |
1901 | int res, ret; | 1930 | int res, ret; |
1902 | 1931 | ||
1903 | if (refill_pi_state_cache()) | 1932 | if (refill_pi_state_cache()) |
@@ -1911,12 +1940,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1911 | hrtimer_set_expires(&to->timer, *time); | 1940 | hrtimer_set_expires(&to->timer, *time); |
1912 | } | 1941 | } |
1913 | 1942 | ||
1914 | q.pi_state = NULL; | ||
1915 | q.rt_waiter = NULL; | ||
1916 | q.requeue_pi_key = NULL; | ||
1917 | retry: | 1943 | retry: |
1918 | q.key = FUTEX_KEY_INIT; | 1944 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); |
1919 | ret = get_futex_key(uaddr, fshared, &q.key); | ||
1920 | if (unlikely(ret != 0)) | 1945 | if (unlikely(ret != 0)) |
1921 | goto out; | 1946 | goto out; |
1922 | 1947 | ||
@@ -1938,7 +1963,7 @@ retry_private: | |||
1938 | * exit to complete. | 1963 | * exit to complete. |
1939 | */ | 1964 | */ |
1940 | queue_unlock(&q, hb); | 1965 | queue_unlock(&q, hb); |
1941 | put_futex_key(fshared, &q.key); | 1966 | put_futex_key(&q.key); |
1942 | cond_resched(); | 1967 | cond_resched(); |
1943 | goto retry; | 1968 | goto retry; |
1944 | default: | 1969 | default: |
@@ -1968,7 +1993,7 @@ retry_private: | |||
1968 | * Fixup the pi_state owner and possibly acquire the lock if we | 1993 | * Fixup the pi_state owner and possibly acquire the lock if we |
1969 | * haven't already. | 1994 | * haven't already. |
1970 | */ | 1995 | */ |
1971 | res = fixup_owner(uaddr, fshared, &q, !ret); | 1996 | res = fixup_owner(uaddr, &q, !ret); |
1972 | /* | 1997 | /* |
1973 | * If fixup_owner() returned an error, propagate that. If it acquired | 1998 | * If fixup_owner() returned an error, propagate that. If it acquired |
1974 | * the lock, clear our -ETIMEDOUT or -EINTR. | 1999 | * the lock, clear our -ETIMEDOUT or -EINTR. |
@@ -1992,7 +2017,7 @@ out_unlock_put_key: | |||
1992 | queue_unlock(&q, hb); | 2017 | queue_unlock(&q, hb); |
1993 | 2018 | ||
1994 | out_put_key: | 2019 | out_put_key: |
1995 | put_futex_key(fshared, &q.key); | 2020 | put_futex_key(&q.key); |
1996 | out: | 2021 | out: |
1997 | if (to) | 2022 | if (to) |
1998 | destroy_hrtimer_on_stack(&to->timer); | 2023 | destroy_hrtimer_on_stack(&to->timer); |
@@ -2005,10 +2030,10 @@ uaddr_faulted: | |||
2005 | if (ret) | 2030 | if (ret) |
2006 | goto out_put_key; | 2031 | goto out_put_key; |
2007 | 2032 | ||
2008 | if (!fshared) | 2033 | if (!(flags & FLAGS_SHARED)) |
2009 | goto retry_private; | 2034 | goto retry_private; |
2010 | 2035 | ||
2011 | put_futex_key(fshared, &q.key); | 2036 | put_futex_key(&q.key); |
2012 | goto retry; | 2037 | goto retry; |
2013 | } | 2038 | } |
2014 | 2039 | ||
@@ -2017,13 +2042,13 @@ uaddr_faulted: | |||
2017 | * This is the in-kernel slowpath: we look up the PI state (if any), | 2042 | * This is the in-kernel slowpath: we look up the PI state (if any), |
2018 | * and do the rt-mutex unlock. | 2043 | * and do the rt-mutex unlock. |
2019 | */ | 2044 | */ |
2020 | static int futex_unlock_pi(u32 __user *uaddr, int fshared) | 2045 | static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) |
2021 | { | 2046 | { |
2022 | struct futex_hash_bucket *hb; | 2047 | struct futex_hash_bucket *hb; |
2023 | struct futex_q *this, *next; | 2048 | struct futex_q *this, *next; |
2024 | u32 uval; | ||
2025 | struct plist_head *head; | 2049 | struct plist_head *head; |
2026 | union futex_key key = FUTEX_KEY_INIT; | 2050 | union futex_key key = FUTEX_KEY_INIT; |
2051 | u32 uval, vpid = task_pid_vnr(current); | ||
2027 | int ret; | 2052 | int ret; |
2028 | 2053 | ||
2029 | retry: | 2054 | retry: |
@@ -2032,10 +2057,10 @@ retry: | |||
2032 | /* | 2057 | /* |
2033 | * We release only a lock we actually own: | 2058 | * We release only a lock we actually own: |
2034 | */ | 2059 | */ |
2035 | if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) | 2060 | if ((uval & FUTEX_TID_MASK) != vpid) |
2036 | return -EPERM; | 2061 | return -EPERM; |
2037 | 2062 | ||
2038 | ret = get_futex_key(uaddr, fshared, &key); | 2063 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); |
2039 | if (unlikely(ret != 0)) | 2064 | if (unlikely(ret != 0)) |
2040 | goto out; | 2065 | goto out; |
2041 | 2066 | ||
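The ownership test relies on the PI futex word layout: the low bits carry the owner's TID (FUTEX_TID_MASK), with FUTEX_WAITERS and FUTEX_OWNER_DIED as flag bits above them. A hedged sketch of the matching userspace unlock fast path, per the documented PI futex protocol (the helper itself is illustrative):

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

/* If the word is exactly our TID (owner alive, no waiters flagged), it can
 * be released in userspace; otherwise FUTEX_UNLOCK_PI runs the slow path
 * shown in futex_unlock_pi() above. */
static void pi_unlock(atomic_uint *uaddr, unsigned int my_tid)
{
        unsigned int expected = my_tid;         /* no FUTEX_WAITERS set */

        if (!atomic_compare_exchange_strong(uaddr, &expected, 0))
                syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}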
@@ -2047,17 +2072,14 @@ retry: | |||
2047 | * again. If it succeeds then we can return without waking | 2072 | * again. If it succeeds then we can return without waking |
2048 | * anyone else up: | 2073 | * anyone else up: |
2049 | */ | 2074 | */ |
2050 | if (!(uval & FUTEX_OWNER_DIED)) | 2075 | if (!(uval & FUTEX_OWNER_DIED) && |
2051 | uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); | 2076 | cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) |
2052 | |||
2053 | |||
2054 | if (unlikely(uval == -EFAULT)) | ||
2055 | goto pi_faulted; | 2077 | goto pi_faulted; |
2056 | /* | 2078 | /* |
2057 | * Rare case: we managed to release the lock atomically, | 2079 | * Rare case: we managed to release the lock atomically, |
2058 | * no need to wake anyone else up: | 2080 | * no need to wake anyone else up: |
2059 | */ | 2081 | */ |
2060 | if (unlikely(uval == task_pid_vnr(current))) | 2082 | if (unlikely(uval == vpid)) |
2061 | goto out_unlock; | 2083 | goto out_unlock; |
2062 | 2084 | ||
2063 | /* | 2085 | /* |
@@ -2090,14 +2112,14 @@ retry: | |||
2090 | 2112 | ||
2091 | out_unlock: | 2113 | out_unlock: |
2092 | spin_unlock(&hb->lock); | 2114 | spin_unlock(&hb->lock); |
2093 | put_futex_key(fshared, &key); | 2115 | put_futex_key(&key); |
2094 | 2116 | ||
2095 | out: | 2117 | out: |
2096 | return ret; | 2118 | return ret; |
2097 | 2119 | ||
2098 | pi_faulted: | 2120 | pi_faulted: |
2099 | spin_unlock(&hb->lock); | 2121 | spin_unlock(&hb->lock); |
2100 | put_futex_key(fshared, &key); | 2122 | put_futex_key(&key); |
2101 | 2123 | ||
2102 | ret = fault_in_user_writeable(uaddr); | 2124 | ret = fault_in_user_writeable(uaddr); |
2103 | if (!ret) | 2125 | if (!ret) |
@@ -2142,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2142 | * We were woken prior to requeue by a timeout or a signal. | 2164 | * We were woken prior to requeue by a timeout or a signal. |
2143 | * Unqueue the futex_q and determine which it was. | 2165 | * Unqueue the futex_q and determine which it was. |
2144 | */ | 2166 | */ |
2145 | plist_del(&q->list, &q->list.plist); | 2167 | plist_del(&q->list, &hb->chain); |
2146 | 2168 | ||
2147 | /* Handle spurious wakeups gracefully */ | 2169 | /* Handle spurious wakeups gracefully */ |
2148 | ret = -EWOULDBLOCK; | 2170 | ret = -EWOULDBLOCK; |
@@ -2157,7 +2179,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2157 | /** | 2179 | /** |
2158 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 | 2180 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 |
2159 | * @uaddr: the futex we initially wait on (non-pi) | 2181 | * @uaddr: the futex we initially wait on (non-pi) |
2160 | * @fshared: whether the futexes are shared (1) or not (0). They must be | 2182 | * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be |
2161 | * the same type, no requeueing from private to shared, etc. | 2183 | * the same type, no requeueing from private to shared, etc. |
2162 | * @val: the expected value of uaddr | 2184 | * @val: the expected value of uaddr |
2163 | * @abs_time: absolute timeout | 2185 | * @abs_time: absolute timeout |
@@ -2195,16 +2217,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2195 | * 0 - On success | 2217 | * 0 - On success |
2196 | * <0 - On error | 2218 | * <0 - On error |
2197 | */ | 2219 | */ |
2198 | static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | 2220 | static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
2199 | u32 val, ktime_t *abs_time, u32 bitset, | 2221 | u32 val, ktime_t *abs_time, u32 bitset, |
2200 | int clockrt, u32 __user *uaddr2) | 2222 | u32 __user *uaddr2) |
2201 | { | 2223 | { |
2202 | struct hrtimer_sleeper timeout, *to = NULL; | 2224 | struct hrtimer_sleeper timeout, *to = NULL; |
2203 | struct rt_mutex_waiter rt_waiter; | 2225 | struct rt_mutex_waiter rt_waiter; |
2204 | struct rt_mutex *pi_mutex = NULL; | 2226 | struct rt_mutex *pi_mutex = NULL; |
2205 | struct futex_hash_bucket *hb; | 2227 | struct futex_hash_bucket *hb; |
2206 | union futex_key key2; | 2228 | union futex_key key2 = FUTEX_KEY_INIT; |
2207 | struct futex_q q; | 2229 | struct futex_q q = futex_q_init; |
2208 | int res, ret; | 2230 | int res, ret; |
2209 | 2231 | ||
2210 | if (!bitset) | 2232 | if (!bitset) |
@@ -2212,8 +2234,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2212 | 2234 | ||
2213 | if (abs_time) { | 2235 | if (abs_time) { |
2214 | to = &timeout; | 2236 | to = &timeout; |
2215 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | 2237 | hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? |
2216 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 2238 | CLOCK_REALTIME : CLOCK_MONOTONIC, |
2239 | HRTIMER_MODE_ABS); | ||
2217 | hrtimer_init_sleeper(to, current); | 2240 | hrtimer_init_sleeper(to, current); |
2218 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | 2241 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, |
2219 | current->timer_slack_ns); | 2242 | current->timer_slack_ns); |
@@ -2226,18 +2249,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2226 | debug_rt_mutex_init_waiter(&rt_waiter); | 2249 | debug_rt_mutex_init_waiter(&rt_waiter); |
2227 | rt_waiter.task = NULL; | 2250 | rt_waiter.task = NULL; |
2228 | 2251 | ||
2229 | key2 = FUTEX_KEY_INIT; | 2252 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); |
2230 | ret = get_futex_key(uaddr2, fshared, &key2); | ||
2231 | if (unlikely(ret != 0)) | 2253 | if (unlikely(ret != 0)) |
2232 | goto out; | 2254 | goto out; |
2233 | 2255 | ||
2234 | q.pi_state = NULL; | ||
2235 | q.bitset = bitset; | 2256 | q.bitset = bitset; |
2236 | q.rt_waiter = &rt_waiter; | 2257 | q.rt_waiter = &rt_waiter; |
2237 | q.requeue_pi_key = &key2; | 2258 | q.requeue_pi_key = &key2; |
2238 | 2259 | ||
2239 | /* Prepare to wait on uaddr. */ | 2260 | /* |
2240 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 2261 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref |
2262 | * count. | ||
2263 | */ | ||
2264 | ret = futex_wait_setup(uaddr, val, flags, &q, &hb); | ||
2241 | if (ret) | 2265 | if (ret) |
2242 | goto out_key2; | 2266 | goto out_key2; |
2243 | 2267 | ||
@@ -2254,7 +2278,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2254 | * In order for us to be here, we know our q.key == key2, and since | 2278 | * In order for us to be here, we know our q.key == key2, and since |
2255 | * we took the hb->lock above, we also know that futex_requeue() has | 2279 | * we took the hb->lock above, we also know that futex_requeue() has |
2256 | * completed and we no longer have to concern ourselves with a wakeup | 2280 | * completed and we no longer have to concern ourselves with a wakeup |
2257 | * race with the atomic proxy lock acquition by the requeue code. | 2281 | * race with the atomic proxy lock acquisition by the requeue code. The |
2282 | * futex_requeue dropped our key1 reference and incremented our key2 | ||
2283 | * reference count. | ||
2258 | */ | 2284 | */ |
2259 | 2285 | ||
2260 | /* Check if the requeue code acquired the second futex for us. */ | 2286 | /* Check if the requeue code acquired the second futex for us. */ |
@@ -2265,8 +2291,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2265 | */ | 2291 | */ |
2266 | if (q.pi_state && (q.pi_state->owner != current)) { | 2292 | if (q.pi_state && (q.pi_state->owner != current)) { |
2267 | spin_lock(q.lock_ptr); | 2293 | spin_lock(q.lock_ptr); |
2268 | ret = fixup_pi_state_owner(uaddr2, &q, current, | 2294 | ret = fixup_pi_state_owner(uaddr2, &q, current); |
2269 | fshared); | ||
2270 | spin_unlock(q.lock_ptr); | 2295 | spin_unlock(q.lock_ptr); |
2271 | } | 2296 | } |
2272 | } else { | 2297 | } else { |
@@ -2285,7 +2310,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2285 | * Fixup the pi_state owner and possibly acquire the lock if we | 2310 | * Fixup the pi_state owner and possibly acquire the lock if we |
2286 | * haven't already. | 2311 | * haven't already. |
2287 | */ | 2312 | */ |
2288 | res = fixup_owner(uaddr2, fshared, &q, !ret); | 2313 | res = fixup_owner(uaddr2, &q, !ret); |
2289 | /* | 2314 | /* |
2290 | * If fixup_owner() returned an error, propagate that. If it | 2315 | * If fixup_owner() returned an error, propagate that. If it |
2291 | * acquired the lock, clear -ETIMEDOUT or -EINTR. | 2316 | * acquired the lock, clear -ETIMEDOUT or -EINTR. |
@@ -2316,9 +2341,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2316 | } | 2341 | } |
2317 | 2342 | ||
2318 | out_put_keys: | 2343 | out_put_keys: |
2319 | put_futex_key(fshared, &q.key); | 2344 | put_futex_key(&q.key); |
2320 | out_key2: | 2345 | out_key2: |
2321 | put_futex_key(fshared, &key2); | 2346 | put_futex_key(&key2); |
2322 | 2347 | ||
2323 | out: | 2348 | out: |
2324 | if (to) { | 2349 | if (to) { |
@@ -2393,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, | |||
2393 | goto err_unlock; | 2418 | goto err_unlock; |
2394 | ret = -EPERM; | 2419 | ret = -EPERM; |
2395 | pcred = __task_cred(p); | 2420 | pcred = __task_cred(p); |
2421 | /* If victim is in different user_ns, then uids are not | ||
2422 | comparable, so we must have CAP_SYS_PTRACE */ | ||
2423 | if (cred->user->user_ns != pcred->user->user_ns) { | ||
2424 | if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) | ||
2425 | goto err_unlock; | ||
2426 | goto ok; | ||
2427 | } | ||
2428 | /* If victim is in same user_ns, then uids are comparable */ | ||
2396 | if (cred->euid != pcred->euid && | 2429 | if (cred->euid != pcred->euid && |
2397 | cred->euid != pcred->uid && | 2430 | cred->euid != pcred->uid && |
2398 | !capable(CAP_SYS_PTRACE)) | 2431 | !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) |
2399 | goto err_unlock; | 2432 | goto err_unlock; |
2433 | ok: | ||
2400 | head = p->robust_list; | 2434 | head = p->robust_list; |
2401 | rcu_read_unlock(); | 2435 | rcu_read_unlock(); |
2402 | } | 2436 | } |
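The new branch makes the permission rule explicit: across user namespaces only CAP_SYS_PTRACE held in the victim's namespace is accepted, while within the same namespace a euid/uid match works as before. The same check, folded into a single predicate for clarity (illustrative helper, not part of the patch):

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/sched.h>

static bool may_read_robust_list(const struct cred *cred,
                                 const struct cred *pcred)
{
        if (cred->user->user_ns != pcred->user->user_ns)
                return ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE);

        return cred->euid == pcred->euid || cred->euid == pcred->uid ||
               ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE);
}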
@@ -2435,11 +2469,20 @@ retry: | |||
2435 | * userspace. | 2469 | * userspace. |
2436 | */ | 2470 | */ |
2437 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; | 2471 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; |
2438 | nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); | 2472 | /* |
2439 | 2473 | * We are not holding a lock here, but we want to have | |
2440 | if (nval == -EFAULT) | 2474 | * the pagefault_disable/enable() protection because |
2441 | return -1; | 2475 | * we want to handle the fault gracefully. If the |
2442 | 2476 | * access fails we try to fault in the futex with R/W | |
2477 | * verification via get_user_pages. get_user() above | ||
2478 | * does not guarantee R/W access. If that fails we | ||
2479 | * give up and leave the futex locked. | ||
2480 | */ | ||
2481 | if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { | ||
2482 | if (fault_in_user_writeable(uaddr)) | ||
2483 | return -1; | ||
2484 | goto retry; | ||
2485 | } | ||
2443 | if (nval != uval) | 2486 | if (nval != uval) |
2444 | goto retry; | 2487 | goto retry; |
2445 | 2488 | ||
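Besides the fault handling, the hunk also switches calling conventions: the old helper folded -EFAULT into its return value, which is ambiguous when the futex word could legitimately hold that bit pattern, whereas the new one returns a status and hands the previous word back through a pointer. Condensed from the lines above, with the rationale being a reading of the change rather than a quote from it:

        /* old: value and fault status share one return */
        nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
        if (nval == -EFAULT)
                return -1;

        /* new: status is the return value, the previous word arrives in
         * &nval; a fault is handled by making the page writable and
         * retrying instead of giving up immediately */
        if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
                if (fault_in_user_writeable(uaddr))
                        return -1;
                goto retry;
        }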
@@ -2458,7 +2501,7 @@ retry: | |||
2458 | */ | 2501 | */ |
2459 | static inline int fetch_robust_entry(struct robust_list __user **entry, | 2502 | static inline int fetch_robust_entry(struct robust_list __user **entry, |
2460 | struct robust_list __user * __user *head, | 2503 | struct robust_list __user * __user *head, |
2461 | int *pi) | 2504 | unsigned int *pi) |
2462 | { | 2505 | { |
2463 | unsigned long uentry; | 2506 | unsigned long uentry; |
2464 | 2507 | ||
@@ -2481,7 +2524,8 @@ void exit_robust_list(struct task_struct *curr) | |||
2481 | { | 2524 | { |
2482 | struct robust_list_head __user *head = curr->robust_list; | 2525 | struct robust_list_head __user *head = curr->robust_list; |
2483 | struct robust_list __user *entry, *next_entry, *pending; | 2526 | struct robust_list __user *entry, *next_entry, *pending; |
2484 | unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; | 2527 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; |
2528 | unsigned int uninitialized_var(next_pi); | ||
2485 | unsigned long futex_offset; | 2529 | unsigned long futex_offset; |
2486 | int rc; | 2530 | int rc; |
2487 | 2531 | ||
@@ -2542,58 +2586,57 @@ void exit_robust_list(struct task_struct *curr) | |||
2542 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | 2586 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, |
2543 | u32 __user *uaddr2, u32 val2, u32 val3) | 2587 | u32 __user *uaddr2, u32 val2, u32 val3) |
2544 | { | 2588 | { |
2545 | int clockrt, ret = -ENOSYS; | 2589 | int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; |
2546 | int cmd = op & FUTEX_CMD_MASK; | 2590 | unsigned int flags = 0; |
2547 | int fshared = 0; | ||
2548 | 2591 | ||
2549 | if (!(op & FUTEX_PRIVATE_FLAG)) | 2592 | if (!(op & FUTEX_PRIVATE_FLAG)) |
2550 | fshared = 1; | 2593 | flags |= FLAGS_SHARED; |
2551 | 2594 | ||
2552 | clockrt = op & FUTEX_CLOCK_REALTIME; | 2595 | if (op & FUTEX_CLOCK_REALTIME) { |
2553 | if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) | 2596 | flags |= FLAGS_CLOCKRT; |
2554 | return -ENOSYS; | 2597 | if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) |
2598 | return -ENOSYS; | ||
2599 | } | ||
2555 | 2600 | ||
2556 | switch (cmd) { | 2601 | switch (cmd) { |
2557 | case FUTEX_WAIT: | 2602 | case FUTEX_WAIT: |
2558 | val3 = FUTEX_BITSET_MATCH_ANY; | 2603 | val3 = FUTEX_BITSET_MATCH_ANY; |
2559 | case FUTEX_WAIT_BITSET: | 2604 | case FUTEX_WAIT_BITSET: |
2560 | ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); | 2605 | ret = futex_wait(uaddr, flags, val, timeout, val3); |
2561 | break; | 2606 | break; |
2562 | case FUTEX_WAKE: | 2607 | case FUTEX_WAKE: |
2563 | val3 = FUTEX_BITSET_MATCH_ANY; | 2608 | val3 = FUTEX_BITSET_MATCH_ANY; |
2564 | case FUTEX_WAKE_BITSET: | 2609 | case FUTEX_WAKE_BITSET: |
2565 | ret = futex_wake(uaddr, fshared, val, val3); | 2610 | ret = futex_wake(uaddr, flags, val, val3); |
2566 | break; | 2611 | break; |
2567 | case FUTEX_REQUEUE: | 2612 | case FUTEX_REQUEUE: |
2568 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); | 2613 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); |
2569 | break; | 2614 | break; |
2570 | case FUTEX_CMP_REQUEUE: | 2615 | case FUTEX_CMP_REQUEUE: |
2571 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | 2616 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); |
2572 | 0); | ||
2573 | break; | 2617 | break; |
2574 | case FUTEX_WAKE_OP: | 2618 | case FUTEX_WAKE_OP: |
2575 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); | 2619 | ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); |
2576 | break; | 2620 | break; |
2577 | case FUTEX_LOCK_PI: | 2621 | case FUTEX_LOCK_PI: |
2578 | if (futex_cmpxchg_enabled) | 2622 | if (futex_cmpxchg_enabled) |
2579 | ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); | 2623 | ret = futex_lock_pi(uaddr, flags, val, timeout, 0); |
2580 | break; | 2624 | break; |
2581 | case FUTEX_UNLOCK_PI: | 2625 | case FUTEX_UNLOCK_PI: |
2582 | if (futex_cmpxchg_enabled) | 2626 | if (futex_cmpxchg_enabled) |
2583 | ret = futex_unlock_pi(uaddr, fshared); | 2627 | ret = futex_unlock_pi(uaddr, flags); |
2584 | break; | 2628 | break; |
2585 | case FUTEX_TRYLOCK_PI: | 2629 | case FUTEX_TRYLOCK_PI: |
2586 | if (futex_cmpxchg_enabled) | 2630 | if (futex_cmpxchg_enabled) |
2587 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); | 2631 | ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); |
2588 | break; | 2632 | break; |
2589 | case FUTEX_WAIT_REQUEUE_PI: | 2633 | case FUTEX_WAIT_REQUEUE_PI: |
2590 | val3 = FUTEX_BITSET_MATCH_ANY; | 2634 | val3 = FUTEX_BITSET_MATCH_ANY; |
2591 | ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, | 2635 | ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, |
2592 | clockrt, uaddr2); | 2636 | uaddr2); |
2593 | break; | 2637 | break; |
2594 | case FUTEX_CMP_REQUEUE_PI: | 2638 | case FUTEX_CMP_REQUEUE_PI: |
2595 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | 2639 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); |
2596 | 1); | ||
2597 | break; | 2640 | break; |
2598 | default: | 2641 | default: |
2599 | ret = -ENOSYS; | 2642 | ret = -ENOSYS; |
@@ -2647,11 +2690,10 @@ static int __init futex_init(void) | |||
2647 | * of the complex code paths. Also we want to prevent | 2690 | * of the complex code paths. Also we want to prevent |
2648 | * registration of robust lists in that case. NULL is | 2691 | * registration of robust lists in that case. NULL is |
2649 | * guaranteed to fault and we get -EFAULT on functional | 2692 | * guaranteed to fault and we get -EFAULT on functional |
2650 | * implementation, the non functional ones will return | 2693 | * implementation, the non-functional ones will return |
2651 | * -ENOSYS. | 2694 | * -ENOSYS. |
2652 | */ | 2695 | */ |
2653 | curval = cmpxchg_futex_value_locked(NULL, 0, 0); | 2696 | if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) |
2654 | if (curval == -EFAULT) | ||
2655 | futex_cmpxchg_enabled = 1; | 2697 | futex_cmpxchg_enabled = 1; |
2656 | 2698 | ||
2657 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { | 2699 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d49afb2395e5..5f9e689dc8f0 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -19,7 +19,7 @@ | |||
19 | */ | 19 | */ |
20 | static inline int | 20 | static inline int |
21 | fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, | 21 | fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, |
22 | compat_uptr_t __user *head, int *pi) | 22 | compat_uptr_t __user *head, unsigned int *pi) |
23 | { | 23 | { |
24 | if (get_user(*uentry, head)) | 24 | if (get_user(*uentry, head)) |
25 | return -EFAULT; | 25 | return -EFAULT; |
@@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
49 | { | 49 | { |
50 | struct compat_robust_list_head __user *head = curr->compat_robust_list; | 50 | struct compat_robust_list_head __user *head = curr->compat_robust_list; |
51 | struct robust_list __user *entry, *next_entry, *pending; | 51 | struct robust_list __user *entry, *next_entry, *pending; |
52 | unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; | 52 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; |
53 | unsigned int uninitialized_var(next_pi); | ||
53 | compat_uptr_t uentry, next_uentry, upending; | 54 | compat_uptr_t uentry, next_uentry, upending; |
54 | compat_long_t futex_offset; | 55 | compat_long_t futex_offset; |
55 | int rc; | 56 | int rc; |
@@ -152,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
152 | goto err_unlock; | 153 | goto err_unlock; |
153 | ret = -EPERM; | 154 | ret = -EPERM; |
154 | pcred = __task_cred(p); | 155 | pcred = __task_cred(p); |
156 | /* If victim is in different user_ns, then uids are not | ||
157 | comparable, so we must have CAP_SYS_PTRACE */ | ||
158 | if (cred->user->user_ns != pcred->user->user_ns) { | ||
159 | if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) | ||
160 | goto err_unlock; | ||
161 | goto ok; | ||
162 | } | ||
163 | /* If victim is in same user_ns, then uids are comparable */ | ||
155 | if (cred->euid != pcred->euid && | 164 | if (cred->euid != pcred->euid && |
156 | cred->euid != pcred->uid && | 165 | cred->euid != pcred->uid && |
157 | !capable(CAP_SYS_PTRACE)) | 166 | !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) |
158 | goto err_unlock; | 167 | goto err_unlock; |
168 | ok: | ||
159 | head = p->compat_robust_list; | 169 | head = p->compat_robust_list; |
160 | rcu_read_unlock(); | 170 | rcu_read_unlock(); |
161 | } | 171 | } |
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 70a298d6da71..5bf924d80b5c 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
@@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling" | |||
2 | 2 | ||
3 | config GCOV_KERNEL | 3 | config GCOV_KERNEL |
4 | bool "Enable gcov-based kernel profiling" | 4 | bool "Enable gcov-based kernel profiling" |
5 | depends on DEBUG_FS && CONSTRUCTORS | 5 | depends on DEBUG_FS |
6 | select CONSTRUCTORS | ||
6 | default n | 7 | default n |
7 | ---help--- | 8 | ---help--- |
8 | This option enables gcov-based code profiling (e.g. for code coverage | 9 | This option enables gcov-based code profiling (e.g. for code coverage |
@@ -34,7 +35,7 @@ config GCOV_KERNEL | |||
34 | config GCOV_PROFILE_ALL | 35 | config GCOV_PROFILE_ALL |
35 | bool "Profile entire Kernel" | 36 | bool "Profile entire Kernel" |
36 | depends on GCOV_KERNEL | 37 | depends on GCOV_KERNEL |
37 | depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE | 38 | depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE |
38 | default n | 39 | default n |
39 | ---help--- | 40 | ---help--- |
40 | This option activates profiling for the entire kernel. | 41 | This option activates profiling for the entire kernel. |
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 3f761001d517..e97ca59e2520 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile | |||
@@ -1,3 +1,3 @@ | |||
1 | EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' | 1 | ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' |
2 | 2 | ||
3 | obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o | 3 | obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o |
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index f83972b16564..9bd0934f6c33 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
@@ -561,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len, | |||
561 | static const struct file_operations gcov_reset_fops = { | 561 | static const struct file_operations gcov_reset_fops = { |
562 | .write = reset_write, | 562 | .write = reset_write, |
563 | .read = reset_read, | 563 | .read = reset_read, |
564 | .llseek = noop_llseek, | ||
564 | }; | 565 | }; |
565 | 566 | ||
566 | /* | 567 | /* |
diff --git a/kernel/groups.c b/kernel/groups.c index 253dc0f35cf4..1cc476d52dd3 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
233 | struct group_info *group_info; | 233 | struct group_info *group_info; |
234 | int retval; | 234 | int retval; |
235 | 235 | ||
236 | if (!capable(CAP_SETGID)) | 236 | if (!nsown_capable(CAP_SETGID)) |
237 | return -EPERM; | 237 | return -EPERM; |
238 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 238 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
239 | return -EINVAL; | 239 | return -EINVAL; |
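nsown_capable() is, roughly, a capability check against the caller's own user namespace rather than the initial one, so CAP_SETGID granted inside a namespace is now enough to call setgroups() there. A hedged sketch of its shape (the real definition lives in kernel/capability.c and may differ in detail):

#include <linux/capability.h>
#include <linux/cred.h>

static inline bool nsown_capable_sketch(int cap)
{
        return ns_capable(current_user_ns(), cap);
}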
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cb49883b64e5..11e896903828 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -55,11 +55,10 @@ | |||
55 | /* | 55 | /* |
56 | * The timer bases: | 56 | * The timer bases: |
57 | * | 57 | * |
58 | * Note: If we want to add new timer bases, we have to skip the two | 58 | * There are more clockids than hrtimer bases. Thus, we index |
59 | * clock ids captured by the cpu-timers. We do this by holding empty | 59 | * into the timer bases by the hrtimer_base_type enum. When trying |
60 | * entries rather than doing math adjustment of the clock ids. | 60 | * to reach a base using a clockid, hrtimer_clockid_to_base() |
61 | * This ensures that we capture erroneous accesses to these clock ids | 61 | * is used to convert from clockid to the proper hrtimer_base_type. |
62 | * rather than moving them into the range of valid clock id's. | ||
63 | */ | 62 | */ |
64 | DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | 63 | DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = |
65 | { | 64 | { |
@@ -67,39 +66,55 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | |||
67 | .clock_base = | 66 | .clock_base = |
68 | { | 67 | { |
69 | { | 68 | { |
70 | .index = CLOCK_REALTIME, | 69 | .index = HRTIMER_BASE_MONOTONIC, |
70 | .clockid = CLOCK_MONOTONIC, | ||
71 | .get_time = &ktime_get, | ||
72 | .resolution = KTIME_LOW_RES, | ||
73 | }, | ||
74 | { | ||
75 | .index = HRTIMER_BASE_REALTIME, | ||
76 | .clockid = CLOCK_REALTIME, | ||
71 | .get_time = &ktime_get_real, | 77 | .get_time = &ktime_get_real, |
72 | .resolution = KTIME_LOW_RES, | 78 | .resolution = KTIME_LOW_RES, |
73 | }, | 79 | }, |
74 | { | 80 | { |
75 | .index = CLOCK_MONOTONIC, | 81 | .index = HRTIMER_BASE_BOOTTIME, |
76 | .get_time = &ktime_get, | 82 | .clockid = CLOCK_BOOTTIME, |
83 | .get_time = &ktime_get_boottime, | ||
77 | .resolution = KTIME_LOW_RES, | 84 | .resolution = KTIME_LOW_RES, |
78 | }, | 85 | }, |
79 | } | 86 | } |
80 | }; | 87 | }; |
81 | 88 | ||
89 | static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { | ||
90 | [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, | ||
91 | [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, | ||
92 | [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, | ||
93 | }; | ||
94 | |||
95 | static inline int hrtimer_clockid_to_base(clockid_t clock_id) | ||
96 | { | ||
97 | return hrtimer_clock_to_base_table[clock_id]; | ||
98 | } | ||
99 | |||
100 | |||
82 | /* | 101 | /* |
83 | * Get the coarse grained time at the softirq based on xtime and | 102 | * Get the coarse grained time at the softirq based on xtime and |
84 | * wall_to_monotonic. | 103 | * wall_to_monotonic. |
85 | */ | 104 | */ |
86 | static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | 105 | static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) |
87 | { | 106 | { |
88 | ktime_t xtim, tomono; | 107 | ktime_t xtim, mono, boot; |
89 | struct timespec xts, tom; | 108 | struct timespec xts, tom, slp; |
90 | unsigned long seq; | ||
91 | 109 | ||
92 | do { | 110 | get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); |
93 | seq = read_seqbegin(&xtime_lock); | ||
94 | xts = __current_kernel_time(); | ||
95 | tom = __get_wall_to_monotonic(); | ||
96 | } while (read_seqretry(&xtime_lock, seq)); | ||
97 | 111 | ||
98 | xtim = timespec_to_ktime(xts); | 112 | xtim = timespec_to_ktime(xts); |
99 | tomono = timespec_to_ktime(tom); | 113 | mono = ktime_add(xtim, timespec_to_ktime(tom)); |
100 | base->clock_base[CLOCK_REALTIME].softirq_time = xtim; | 114 | boot = ktime_add(mono, timespec_to_ktime(slp)); |
101 | base->clock_base[CLOCK_MONOTONIC].softirq_time = | 115 | base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; |
102 | ktime_add(xtim, tomono); | 116 | base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; |
117 | base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; | ||
103 | } | 118 | } |
104 | 119 | ||
105 | /* | 120 | /* |
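The translation table is needed because clockid values are not dense array indices: CLOCK_REALTIME is 0 and CLOCK_MONOTONIC is 1, but CLOCK_BOOTTIME is 7, while the hrtimer bases form a compact 0..HRTIMER_MAX_CLOCK_BASES-1 enum, so the old clock_base[clock_id] indexing would overrun for the new base. A small usage sketch (hypothetical helper, only meaningful inside this file where hrtimer_clockid_to_base() is visible):

#include <linux/hrtimer.h>

static struct hrtimer_clock_base *
clock_base_for(struct hrtimer_cpu_base *cpu_base, clockid_t which_clock)
{
        /* e.g. CLOCK_BOOTTIME (7) -> HRTIMER_BASE_BOOTTIME (2) */
        return &cpu_base->clock_base[hrtimer_clockid_to_base(which_clock)];
}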
@@ -186,10 +201,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, | |||
186 | struct hrtimer_cpu_base *new_cpu_base; | 201 | struct hrtimer_cpu_base *new_cpu_base; |
187 | int this_cpu = smp_processor_id(); | 202 | int this_cpu = smp_processor_id(); |
188 | int cpu = hrtimer_get_target(this_cpu, pinned); | 203 | int cpu = hrtimer_get_target(this_cpu, pinned); |
204 | int basenum = base->index; | ||
189 | 205 | ||
190 | again: | 206 | again: |
191 | new_cpu_base = &per_cpu(hrtimer_bases, cpu); | 207 | new_cpu_base = &per_cpu(hrtimer_bases, cpu); |
192 | new_base = &new_cpu_base->clock_base[base->index]; | 208 | new_base = &new_cpu_base->clock_base[basenum]; |
193 | 209 | ||
194 | if (base != new_base) { | 210 | if (base != new_base) { |
195 | /* | 211 | /* |
@@ -336,6 +352,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); | |||
336 | 352 | ||
337 | static struct debug_obj_descr hrtimer_debug_descr; | 353 | static struct debug_obj_descr hrtimer_debug_descr; |
338 | 354 | ||
355 | static void *hrtimer_debug_hint(void *addr) | ||
356 | { | ||
357 | return ((struct hrtimer *) addr)->function; | ||
358 | } | ||
359 | |||
339 | /* | 360 | /* |
340 | * fixup_init is called when: | 361 | * fixup_init is called when: |
341 | * - an active object is initialized | 362 | * - an active object is initialized |
@@ -395,6 +416,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) | |||
395 | 416 | ||
396 | static struct debug_obj_descr hrtimer_debug_descr = { | 417 | static struct debug_obj_descr hrtimer_debug_descr = { |
397 | .name = "hrtimer", | 418 | .name = "hrtimer", |
419 | .debug_hint = hrtimer_debug_hint, | ||
398 | .fixup_init = hrtimer_fixup_init, | 420 | .fixup_init = hrtimer_fixup_init, |
399 | .fixup_activate = hrtimer_fixup_activate, | 421 | .fixup_activate = hrtimer_fixup_activate, |
400 | .fixup_free = hrtimer_fixup_free, | 422 | .fixup_free = hrtimer_fixup_free, |
@@ -499,7 +521,7 @@ static inline int hrtimer_is_hres_enabled(void) | |||
499 | */ | 521 | */ |
500 | static inline int hrtimer_hres_active(void) | 522 | static inline int hrtimer_hres_active(void) |
501 | { | 523 | { |
502 | return __get_cpu_var(hrtimer_bases).hres_active; | 524 | return __this_cpu_read(hrtimer_bases.hres_active); |
503 | } | 525 | } |
504 | 526 | ||
505 | /* | 527 | /* |
@@ -518,10 +540,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | |||
518 | 540 | ||
519 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | 541 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { |
520 | struct hrtimer *timer; | 542 | struct hrtimer *timer; |
543 | struct timerqueue_node *next; | ||
521 | 544 | ||
522 | if (!base->first) | 545 | next = timerqueue_getnext(&base->active); |
546 | if (!next) | ||
523 | continue; | 547 | continue; |
524 | timer = rb_entry(base->first, struct hrtimer, node); | 548 | timer = container_of(next, struct hrtimer, node); |
549 | |||
525 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | 550 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); |
526 | /* | 551 | /* |
527 | * clock_was_set() has changed base->offset so the | 552 | * clock_was_set() has changed base->offset so the |
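The open-coded rbtree handling in this file is being replaced by the timerqueue helpers, which keep nodes sorted by their expires field and cache the earliest one, so "first pending timer" becomes a single pointer dereference. A minimal, self-contained sketch of that API outside the hrtimer context (names other than the timerqueue_* calls are made up):

#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/timerqueue.h>

struct my_event {
        struct timerqueue_node node;    /* embeds rb_node + expires */
        void (*fire)(struct my_event *);
};

static struct timerqueue_head queue;    /* timerqueue_init_head(&queue) at setup */

static void my_event_add(struct my_event *ev, ktime_t expires)
{
        timerqueue_init(&ev->node);
        ev->node.expires = expires;
        timerqueue_add(&queue, &ev->node);
}

static struct my_event *my_event_peek(void)
{
        struct timerqueue_node *next = timerqueue_getnext(&queue);

        return next ? container_of(next, struct my_event, node) : NULL;
}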
@@ -601,67 +626,6 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
601 | return res; | 626 | return res; |
602 | } | 627 | } |
603 | 628 | ||
604 | |||
605 | /* | ||
606 | * Retrigger next event is called after clock was set | ||
607 | * | ||
608 | * Called with interrupts disabled via on_each_cpu() | ||
609 | */ | ||
610 | static void retrigger_next_event(void *arg) | ||
611 | { | ||
612 | struct hrtimer_cpu_base *base; | ||
613 | struct timespec realtime_offset, wtm; | ||
614 | unsigned long seq; | ||
615 | |||
616 | if (!hrtimer_hres_active()) | ||
617 | return; | ||
618 | |||
619 | do { | ||
620 | seq = read_seqbegin(&xtime_lock); | ||
621 | wtm = __get_wall_to_monotonic(); | ||
622 | } while (read_seqretry(&xtime_lock, seq)); | ||
623 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
624 | |||
625 | base = &__get_cpu_var(hrtimer_bases); | ||
626 | |||
627 | /* Adjust CLOCK_REALTIME offset */ | ||
628 | raw_spin_lock(&base->lock); | ||
629 | base->clock_base[CLOCK_REALTIME].offset = | ||
630 | timespec_to_ktime(realtime_offset); | ||
631 | |||
632 | hrtimer_force_reprogram(base, 0); | ||
633 | raw_spin_unlock(&base->lock); | ||
634 | } | ||
635 | |||
636 | /* | ||
637 | * Clock realtime was set | ||
638 | * | ||
639 | * Change the offset of the realtime clock vs. the monotonic | ||
640 | * clock. | ||
641 | * | ||
642 | * We might have to reprogram the high resolution timer interrupt. On | ||
643 | * SMP we call the architecture specific code to retrigger _all_ high | ||
644 | * resolution timer interrupts. On UP we just disable interrupts and | ||
645 | * call the high resolution interrupt code. | ||
646 | */ | ||
647 | void clock_was_set(void) | ||
648 | { | ||
649 | /* Retrigger the CPU local events everywhere */ | ||
650 | on_each_cpu(retrigger_next_event, NULL, 1); | ||
651 | } | ||
652 | |||
653 | /* | ||
654 | * During resume we might have to reprogram the high resolution timer | ||
655 | * interrupt (on the local CPU): | ||
656 | */ | ||
657 | void hres_timers_resume(void) | ||
658 | { | ||
659 | WARN_ONCE(!irqs_disabled(), | ||
660 | KERN_INFO "hres_timers_resume() called with IRQs enabled!"); | ||
661 | |||
662 | retrigger_next_event(NULL); | ||
663 | } | ||
664 | |||
665 | /* | 629 | /* |
666 | * Initialize the high resolution related parts of cpu_base | 630 | * Initialize the high resolution related parts of cpu_base |
667 | */ | 631 | */ |
@@ -672,14 +636,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |||
672 | } | 636 | } |
673 | 637 | ||
674 | /* | 638 | /* |
675 | * Initialize the high resolution related parts of a hrtimer | ||
676 | */ | ||
677 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | ||
678 | { | ||
679 | } | ||
680 | |||
681 | |||
682 | /* | ||
683 | * When High resolution timers are active, try to reprogram. Note, that in case | 639 | * When High resolution timers are active, try to reprogram. Note, that in case |
684 | * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry | 640 | * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry |
685 | * check happens. The timer gets enqueued into the rbtree. The reprogramming | 641 | * check happens. The timer gets enqueued into the rbtree. The reprogramming |
@@ -704,11 +660,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
704 | } | 660 | } |
705 | 661 | ||
706 | /* | 662 | /* |
663 | * Retrigger next event is called after clock was set | ||
664 | * | ||
665 | * Called with interrupts disabled via on_each_cpu() | ||
666 | */ | ||
667 | static void retrigger_next_event(void *arg) | ||
668 | { | ||
669 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | ||
670 | struct timespec realtime_offset, xtim, wtm, sleep; | ||
671 | |||
672 | if (!hrtimer_hres_active()) | ||
673 | return; | ||
674 | |||
675 | /* Optimized out for !HIGH_RES */ | ||
676 | get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); | ||
677 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
678 | |||
679 | /* Adjust CLOCK_REALTIME offset */ | ||
680 | raw_spin_lock(&base->lock); | ||
681 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | ||
682 | timespec_to_ktime(realtime_offset); | ||
683 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
684 | timespec_to_ktime(sleep); | ||
685 | |||
686 | hrtimer_force_reprogram(base, 0); | ||
687 | raw_spin_unlock(&base->lock); | ||
688 | } | ||
689 | |||
690 | /* | ||
707 | * Switch to high resolution mode | 691 | * Switch to high resolution mode |
708 | */ | 692 | */ |
709 | static int hrtimer_switch_to_hres(void) | 693 | static int hrtimer_switch_to_hres(void) |
710 | { | 694 | { |
711 | int cpu = smp_processor_id(); | 695 | int i, cpu = smp_processor_id(); |
712 | struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); | 696 | struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); |
713 | unsigned long flags; | 697 | unsigned long flags; |
714 | 698 | ||
@@ -724,8 +708,8 @@ static int hrtimer_switch_to_hres(void) | |||
724 | return 0; | 708 | return 0; |
725 | } | 709 | } |
726 | base->hres_active = 1; | 710 | base->hres_active = 1; |
727 | base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; | 711 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
728 | base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; | 712 | base->clock_base[i].resolution = KTIME_HIGH_RES; |
729 | 713 | ||
730 | tick_setup_sched_timer(); | 714 | tick_setup_sched_timer(); |
731 | 715 | ||
@@ -749,10 +733,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
749 | return 0; | 733 | return 0; |
750 | } | 734 | } |
751 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 735 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
752 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | 736 | static inline void retrigger_next_event(void *arg) { } |
753 | 737 | ||
754 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 738 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
755 | 739 | ||
740 | /* | ||
741 | * Clock realtime was set | ||
742 | * | ||
743 | * Change the offset of the realtime clock vs. the monotonic | ||
744 | * clock. | ||
745 | * | ||
746 | * We might have to reprogram the high resolution timer interrupt. On | ||
747 | * SMP we call the architecture specific code to retrigger _all_ high | ||
748 | * resolution timer interrupts. On UP we just disable interrupts and | ||
749 | * call the high resolution interrupt code. | ||
750 | */ | ||
751 | void clock_was_set(void) | ||
752 | { | ||
753 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
754 | /* Retrigger the CPU local events everywhere */ | ||
755 | on_each_cpu(retrigger_next_event, NULL, 1); | ||
756 | #endif | ||
757 | timerfd_clock_was_set(); | ||
758 | } | ||
759 | |||
760 | /* | ||
761 | * During resume we might have to reprogram the high resolution timer | ||
762 | * interrupt (on the local CPU): | ||
763 | */ | ||
764 | void hrtimers_resume(void) | ||
765 | { | ||
766 | WARN_ONCE(!irqs_disabled(), | ||
767 | KERN_INFO "hrtimers_resume() called with IRQs enabled!"); | ||
768 | |||
769 | retrigger_next_event(NULL); | ||
770 | timerfd_clock_was_set(); | ||
771 | } | ||
772 | |||
756 | static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) | 773 | static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) |
757 | { | 774 | { |
758 | #ifdef CONFIG_TIMER_STATS | 775 | #ifdef CONFIG_TIMER_STATS |
@@ -842,48 +859,18 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); | |||
842 | static int enqueue_hrtimer(struct hrtimer *timer, | 859 | static int enqueue_hrtimer(struct hrtimer *timer, |
843 | struct hrtimer_clock_base *base) | 860 | struct hrtimer_clock_base *base) |
844 | { | 861 | { |
845 | struct rb_node **link = &base->active.rb_node; | ||
846 | struct rb_node *parent = NULL; | ||
847 | struct hrtimer *entry; | ||
848 | int leftmost = 1; | ||
849 | |||
850 | debug_activate(timer); | 862 | debug_activate(timer); |
851 | 863 | ||
852 | /* | 864 | timerqueue_add(&base->active, &timer->node); |
853 | * Find the right place in the rbtree: | 865 | base->cpu_base->active_bases |= 1 << base->index; |
854 | */ | ||
855 | while (*link) { | ||
856 | parent = *link; | ||
857 | entry = rb_entry(parent, struct hrtimer, node); | ||
858 | /* | ||
859 | * We dont care about collisions. Nodes with | ||
860 | * the same expiry time stay together. | ||
861 | */ | ||
862 | if (hrtimer_get_expires_tv64(timer) < | ||
863 | hrtimer_get_expires_tv64(entry)) { | ||
864 | link = &(*link)->rb_left; | ||
865 | } else { | ||
866 | link = &(*link)->rb_right; | ||
867 | leftmost = 0; | ||
868 | } | ||
869 | } | ||
870 | |||
871 | /* | ||
872 | * Insert the timer to the rbtree and check whether it | ||
873 | * replaces the first pending timer | ||
874 | */ | ||
875 | if (leftmost) | ||
876 | base->first = &timer->node; | ||
877 | 866 | ||
878 | rb_link_node(&timer->node, parent, link); | ||
879 | rb_insert_color(&timer->node, &base->active); | ||
880 | /* | 867 | /* |
881 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the | 868 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the |
882 | * state of a possibly running callback. | 869 | * state of a possibly running callback. |
883 | */ | 870 | */ |
884 | timer->state |= HRTIMER_STATE_ENQUEUED; | 871 | timer->state |= HRTIMER_STATE_ENQUEUED; |
885 | 872 | ||
886 | return leftmost; | 873 | return (&timer->node == base->active.next); |
887 | } | 874 | } |
888 | 875 | ||
889 | /* | 876 | /* |
@@ -903,12 +890,7 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
903 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) | 890 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) |
904 | goto out; | 891 | goto out; |
905 | 892 | ||
906 | /* | 893 | if (&timer->node == timerqueue_getnext(&base->active)) { |
907 | * Remove the timer from the rbtree and replace the first | ||
908 | * entry pointer if necessary. | ||
909 | */ | ||
910 | if (base->first == &timer->node) { | ||
911 | base->first = rb_next(&timer->node); | ||
912 | #ifdef CONFIG_HIGH_RES_TIMERS | 894 | #ifdef CONFIG_HIGH_RES_TIMERS |
913 | /* Reprogram the clock event device. if enabled */ | 895 | /* Reprogram the clock event device. if enabled */ |
914 | if (reprogram && hrtimer_hres_active()) { | 896 | if (reprogram && hrtimer_hres_active()) { |
@@ -921,7 +903,9 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
921 | } | 903 | } |
922 | #endif | 904 | #endif |
923 | } | 905 | } |
924 | rb_erase(&timer->node, &base->active); | 906 | timerqueue_del(&base->active, &timer->node); |
907 | if (!timerqueue_getnext(&base->active)) | ||
908 | base->cpu_base->active_bases &= ~(1 << base->index); | ||
925 | out: | 909 | out: |
926 | timer->state = newstate; | 910 | timer->state = newstate; |
927 | } | 911 | } |
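Paired with the enqueue side earlier in the patch, cpu_base->active_bases keeps one bit per clock base so hot paths can skip bases that have no armed timers without touching their locks or queues. Gathered from the hunks of this patch into one picture:

        cpu_base->active_bases |= 1 << base->index;             /* enqueue_hrtimer() */

        if (!timerqueue_getnext(&base->active))                 /* __remove_hrtimer() */
                cpu_base->active_bases &= ~(1 << base->index);

        if (!(cpu_base->active_bases & (1 << i)))               /* hrtimer_interrupt() */
                continue;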
@@ -1222,11 +1206,13 @@ ktime_t hrtimer_get_next_event(void) | |||
1222 | if (!hrtimer_hres_active()) { | 1206 | if (!hrtimer_hres_active()) { |
1223 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | 1207 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { |
1224 | struct hrtimer *timer; | 1208 | struct hrtimer *timer; |
1209 | struct timerqueue_node *next; | ||
1225 | 1210 | ||
1226 | if (!base->first) | 1211 | next = timerqueue_getnext(&base->active); |
1212 | if (!next) | ||
1227 | continue; | 1213 | continue; |
1228 | 1214 | ||
1229 | timer = rb_entry(base->first, struct hrtimer, node); | 1215 | timer = container_of(next, struct hrtimer, node); |
1230 | delta.tv64 = hrtimer_get_expires_tv64(timer); | 1216 | delta.tv64 = hrtimer_get_expires_tv64(timer); |
1231 | delta = ktime_sub(delta, base->get_time()); | 1217 | delta = ktime_sub(delta, base->get_time()); |
1232 | if (delta.tv64 < mindelta.tv64) | 1218 | if (delta.tv64 < mindelta.tv64) |
@@ -1246,6 +1232,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
1246 | enum hrtimer_mode mode) | 1232 | enum hrtimer_mode mode) |
1247 | { | 1233 | { |
1248 | struct hrtimer_cpu_base *cpu_base; | 1234 | struct hrtimer_cpu_base *cpu_base; |
1235 | int base; | ||
1249 | 1236 | ||
1250 | memset(timer, 0, sizeof(struct hrtimer)); | 1237 | memset(timer, 0, sizeof(struct hrtimer)); |
1251 | 1238 | ||
@@ -1254,8 +1241,9 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
1254 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) | 1241 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) |
1255 | clock_id = CLOCK_MONOTONIC; | 1242 | clock_id = CLOCK_MONOTONIC; |
1256 | 1243 | ||
1257 | timer->base = &cpu_base->clock_base[clock_id]; | 1244 | base = hrtimer_clockid_to_base(clock_id); |
1258 | hrtimer_init_timer_hres(timer); | 1245 | timer->base = &cpu_base->clock_base[base]; |
1246 | timerqueue_init(&timer->node); | ||
1259 | 1247 | ||
1260 | #ifdef CONFIG_TIMER_STATS | 1248 | #ifdef CONFIG_TIMER_STATS |
1261 | timer->start_site = NULL; | 1249 | timer->start_site = NULL; |
@@ -1289,9 +1277,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init); | |||
1289 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | 1277 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) |
1290 | { | 1278 | { |
1291 | struct hrtimer_cpu_base *cpu_base; | 1279 | struct hrtimer_cpu_base *cpu_base; |
1280 | int base = hrtimer_clockid_to_base(which_clock); | ||
1292 | 1281 | ||
1293 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); | 1282 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); |
1294 | *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); | 1283 | *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); |
1295 | 1284 | ||
1296 | return 0; | 1285 | return 0; |
1297 | } | 1286 | } |
@@ -1346,7 +1335,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) | |||
1346 | void hrtimer_interrupt(struct clock_event_device *dev) | 1335 | void hrtimer_interrupt(struct clock_event_device *dev) |
1347 | { | 1336 | { |
1348 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1337 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
1349 | struct hrtimer_clock_base *base; | ||
1350 | ktime_t expires_next, now, entry_time, delta; | 1338 | ktime_t expires_next, now, entry_time, delta; |
1351 | int i, retries = 0; | 1339 | int i, retries = 0; |
1352 | 1340 | ||
@@ -1368,18 +1356,21 @@ retry: | |||
1368 | */ | 1356 | */ |
1369 | cpu_base->expires_next.tv64 = KTIME_MAX; | 1357 | cpu_base->expires_next.tv64 = KTIME_MAX; |
1370 | 1358 | ||
1371 | base = cpu_base->clock_base; | ||
1372 | |||
1373 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 1359 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
1360 | struct hrtimer_clock_base *base; | ||
1361 | struct timerqueue_node *node; | ||
1374 | ktime_t basenow; | 1362 | ktime_t basenow; |
1375 | struct rb_node *node; | ||
1376 | 1363 | ||
1364 | if (!(cpu_base->active_bases & (1 << i))) | ||
1365 | continue; | ||
1366 | |||
1367 | base = cpu_base->clock_base + i; | ||
1377 | basenow = ktime_add(now, base->offset); | 1368 | basenow = ktime_add(now, base->offset); |
1378 | 1369 | ||
1379 | while ((node = base->first)) { | 1370 | while ((node = timerqueue_getnext(&base->active))) { |
1380 | struct hrtimer *timer; | 1371 | struct hrtimer *timer; |
1381 | 1372 | ||
1382 | timer = rb_entry(node, struct hrtimer, node); | 1373 | timer = container_of(node, struct hrtimer, node); |
1383 | 1374 | ||
1384 | /* | 1375 | /* |
1385 | * The immediate goal for using the softexpires is | 1376 | * The immediate goal for using the softexpires is |
@@ -1406,7 +1397,6 @@ retry: | |||
1406 | 1397 | ||
1407 | __run_hrtimer(timer, &basenow); | 1398 | __run_hrtimer(timer, &basenow); |
1408 | } | 1399 | } |
1409 | base++; | ||
1410 | } | 1400 | } |
1411 | 1401 | ||
1412 | /* | 1402 | /* |
@@ -1535,7 +1525,7 @@ void hrtimer_run_pending(void) | |||
1535 | */ | 1525 | */ |
1536 | void hrtimer_run_queues(void) | 1526 | void hrtimer_run_queues(void) |
1537 | { | 1527 | { |
1538 | struct rb_node *node; | 1528 | struct timerqueue_node *node; |
1539 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1529 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
1540 | struct hrtimer_clock_base *base; | 1530 | struct hrtimer_clock_base *base; |
1541 | int index, gettime = 1; | 1531 | int index, gettime = 1; |
@@ -1545,8 +1535,7 @@ void hrtimer_run_queues(void) | |||
1545 | 1535 | ||
1546 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { | 1536 | for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { |
1547 | base = &cpu_base->clock_base[index]; | 1537 | base = &cpu_base->clock_base[index]; |
1548 | 1538 | if (!timerqueue_getnext(&base->active)) | |
1549 | if (!base->first) | ||
1550 | continue; | 1539 | continue; |
1551 | 1540 | ||
1552 | if (gettime) { | 1541 | if (gettime) { |
@@ -1556,10 +1545,10 @@ void hrtimer_run_queues(void) | |||
1556 | 1545 | ||
1557 | raw_spin_lock(&cpu_base->lock); | 1546 | raw_spin_lock(&cpu_base->lock); |
1558 | 1547 | ||
1559 | while ((node = base->first)) { | 1548 | while ((node = timerqueue_getnext(&base->active))) { |
1560 | struct hrtimer *timer; | 1549 | struct hrtimer *timer; |
1561 | 1550 | ||
1562 | timer = rb_entry(node, struct hrtimer, node); | 1551 | timer = container_of(node, struct hrtimer, node); |
1563 | if (base->softirq_time.tv64 <= | 1552 | if (base->softirq_time.tv64 <= |
1564 | hrtimer_get_expires_tv64(timer)) | 1553 | hrtimer_get_expires_tv64(timer)) |
1565 | break; | 1554 | break; |
@@ -1638,7 +1627,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) | |||
1638 | struct timespec __user *rmtp; | 1627 | struct timespec __user *rmtp; |
1639 | int ret = 0; | 1628 | int ret = 0; |
1640 | 1629 | ||
1641 | hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, | 1630 | hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, |
1642 | HRTIMER_MODE_ABS); | 1631 | HRTIMER_MODE_ABS); |
1643 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); | 1632 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); |
1644 | 1633 | ||
@@ -1690,7 +1679,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
1690 | 1679 | ||
1691 | restart = &current_thread_info()->restart_block; | 1680 | restart = &current_thread_info()->restart_block; |
1692 | restart->fn = hrtimer_nanosleep_restart; | 1681 | restart->fn = hrtimer_nanosleep_restart; |
1693 | restart->nanosleep.index = t.timer.base->index; | 1682 | restart->nanosleep.clockid = t.timer.base->clockid; |
1694 | restart->nanosleep.rmtp = rmtp; | 1683 | restart->nanosleep.rmtp = rmtp; |
1695 | restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); | 1684 | restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); |
1696 | 1685 | ||
@@ -1724,8 +1713,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu) | |||
1724 | 1713 | ||
1725 | raw_spin_lock_init(&cpu_base->lock); | 1714 | raw_spin_lock_init(&cpu_base->lock); |
1726 | 1715 | ||
1727 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1716 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
1728 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1717 | cpu_base->clock_base[i].cpu_base = cpu_base; |
1718 | timerqueue_init_head(&cpu_base->clock_base[i].active); | ||
1719 | } | ||
1729 | 1720 | ||
1730 | hrtimer_init_hres(cpu_base); | 1721 | hrtimer_init_hres(cpu_base); |
1731 | INIT_LIST_HEAD(&cpu_base->to_pull); | 1722 | INIT_LIST_HEAD(&cpu_base->to_pull); |
@@ -1737,10 +1728,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, | |||
1737 | struct hrtimer_clock_base *new_base) | 1728 | struct hrtimer_clock_base *new_base) |
1738 | { | 1729 | { |
1739 | struct hrtimer *timer; | 1730 | struct hrtimer *timer; |
1740 | struct rb_node *node; | 1731 | struct timerqueue_node *node; |
1741 | 1732 | ||
1742 | while ((node = rb_first(&old_base->active))) { | 1733 | while ((node = timerqueue_getnext(&old_base->active))) { |
1743 | timer = rb_entry(node, struct hrtimer, node); | 1734 | timer = container_of(node, struct hrtimer, node); |
1744 | BUG_ON(hrtimer_callback_running(timer)); | 1735 | BUG_ON(hrtimer_callback_running(timer)); |
1745 | debug_deactivate(timer); | 1736 | debug_deactivate(timer); |
1746 | 1737 | ||
@@ -1869,7 +1860,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, | |||
1869 | } | 1860 | } |
1870 | 1861 | ||
1871 | /* | 1862 | /* |
1872 | * A NULL parameter means "inifinte" | 1863 | * A NULL parameter means "infinite" |
1873 | */ | 1864 | */ |
1874 | if (!expires) { | 1865 | if (!expires) { |
1875 | schedule(); | 1866 | schedule(); |
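Aside for readers who have not met the timerqueue API that the hrtimer hunks above migrate to: it is an rbtree-backed queue keyed on expiry time, walked with timerqueue_getnext() plus container_of(), exactly as in hrtimer_interrupt() and migrate_hrtimer_list() above. A minimal sketch under invented names (my_event, my_queue; not part of the patch):

/*
 * Sketch only: the generic timerqueue pattern the hrtimer code now uses.
 * "my_event"/"my_queue" and the expiry handling are illustrative.
 */
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/timerqueue.h>

struct my_event {
	struct timerqueue_node node;	/* embedded, like hrtimer::node above */
	/* payload ... */
};

static struct timerqueue_head my_queue;

static void my_queue_setup(void)
{
	timerqueue_init_head(&my_queue);	/* cf. init_hrtimers_cpu() above */
}

static void my_event_add(struct my_event *e, ktime_t expires)
{
	timerqueue_init(&e->node);
	e->node.expires = expires;		/* rbtree key */
	timerqueue_add(&my_queue, &e->node);
}

/* Expire everything due at 'now', mirroring the loops in hrtimer_interrupt(). */
static void my_expire(ktime_t now)
{
	struct timerqueue_node *node;

	while ((node = timerqueue_getnext(&my_queue))) {
		struct my_event *e = container_of(node, struct my_event, node);

		if (now.tv64 < node->expires.tv64)
			break;
		timerqueue_del(&my_queue, &e->node);
		/* run e's handler here */
	}
}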
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c642d51aac2..ea640120ab86 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; | |||
33 | /* | 33 | /* |
34 | * Zero means infinite timeout - no checking done: | 34 | * Zero means infinite timeout - no checking done: |
35 | */ | 35 | */ |
36 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; | 36 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; |
37 | 37 | ||
38 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; | 38 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; |
39 | 39 | ||
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
98 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 98 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" |
99 | " disables this message.\n"); | 99 | " disables this message.\n"); |
100 | sched_show_task(t); | 100 | sched_show_task(t); |
101 | __debug_show_held_locks(t); | 101 | debug_show_held_locks(t); |
102 | 102 | ||
103 | touch_nmi_watchdog(); | 103 | touch_nmi_watchdog(); |
104 | 104 | ||
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
111 | * periodically exit the critical section and enter a new one. | 111 | * periodically exit the critical section and enter a new one. |
112 | * | 112 | * |
113 | * For preemptible RCU it is sufficient to call rcu_read_unlock in order | 113 | * For preemptible RCU it is sufficient to call rcu_read_unlock in order |
114 | * exit the grace period. For classic RCU, a reschedule is required. | 114 | * to exit the grace period. For classic RCU, a reschedule is required. |
115 | */ | 115 | */ |
116 | static void rcu_lock_break(struct task_struct *g, struct task_struct *t) | 116 | static void rcu_lock_break(struct task_struct *g, struct task_struct *t) |
117 | { | 117 | { |
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig new file mode 100644 index 000000000000..d1d051b38e0b --- /dev/null +++ b/kernel/irq/Kconfig | |||
@@ -0,0 +1,74 @@ | |||
1 | # Select this to activate the generic irq options below | ||
2 | config HAVE_GENERIC_HARDIRQS | ||
3 | bool | ||
4 | |||
5 | if HAVE_GENERIC_HARDIRQS | ||
6 | menu "IRQ subsystem" | ||
7 | # | ||
8 | # Interrupt subsystem related configuration options | ||
9 | # | ||
10 | config GENERIC_HARDIRQS | ||
11 | def_bool y | ||
12 | |||
13 | # Options selectable by the architecture code | ||
14 | |||
15 | # Make sparse irq Kconfig switch below available | ||
16 | config HAVE_SPARSE_IRQ | ||
17 | bool | ||
18 | |||
19 | # Enable the generic irq autoprobe mechanism | ||
20 | config GENERIC_IRQ_PROBE | ||
21 | bool | ||
22 | |||
23 | # Use the generic /proc/interrupts implementation | ||
24 | config GENERIC_IRQ_SHOW | ||
25 | bool | ||
26 | |||
27 | # Print level/edge extra information | ||
28 | config GENERIC_IRQ_SHOW_LEVEL | ||
29 | bool | ||
30 | |||
31 | # Support for delayed migration from interrupt context | ||
32 | config GENERIC_PENDING_IRQ | ||
33 | bool | ||
34 | |||
35 | # Alpha specific irq affinity mechanism | ||
36 | config AUTO_IRQ_AFFINITY | ||
37 | bool | ||
38 | |||
39 | # Tasklet based software resend for pending interrupts on enable_irq() | ||
40 | config HARDIRQS_SW_RESEND | ||
41 | bool | ||
42 | |||
43 | # Preflow handler support for fasteoi (sparc64) | ||
44 | config IRQ_PREFLOW_FASTEOI | ||
45 | bool | ||
46 | |||
47 | # Edge style eoi based handler (cell) | ||
48 | config IRQ_EDGE_EOI_HANDLER | ||
49 | bool | ||
50 | |||
51 | # Generic configurable interrupt chip implementation | ||
52 | config GENERIC_IRQ_CHIP | ||
53 | bool | ||
54 | |||
55 | # Support forced irq threading | ||
56 | config IRQ_FORCED_THREADING | ||
57 | bool | ||
58 | |||
59 | config SPARSE_IRQ | ||
60 | bool "Support sparse irq numbering" | ||
61 | depends on HAVE_SPARSE_IRQ | ||
62 | ---help--- | ||
63 | |||
64 | Sparse irq numbering is useful for distro kernels that want | ||
65 | to define a high CONFIG_NR_CPUS value but still want to have | ||
66 | low kernel memory footprint on smaller machines. | ||
67 | |||
68 | ( Sparse irqs can also be beneficial on NUMA boxes, as they spread | ||
69 | out the interrupt descriptors in a more NUMA-friendly way. ) | ||
70 | |||
71 | If you don't know what to do here, say N. | ||
72 | |||
73 | endmenu | ||
74 | endif | ||
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 7d047808419d..73290056cfb6 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -1,7 +1,7 @@ | |||
1 | 1 | ||
2 | obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o | 2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o |
3 | obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o | ||
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 4 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
4 | obj-$(CONFIG_PROC_FS) += proc.o | 5 | obj-$(CONFIG_PROC_FS) += proc.o |
5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 6 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
6 | obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o | ||
7 | obj-$(CONFIG_PM_SLEEP) += pm.o | 7 | obj-$(CONFIG_PM_SLEEP) += pm.o |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 2295a31ef110..342d8f44e401 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
@@ -17,7 +17,7 @@ | |||
17 | /* | 17 | /* |
18 | * Autodetection depends on the fact that any interrupt that | 18 | * Autodetection depends on the fact that any interrupt that |
19 | * comes in on to an unassigned handler will get stuck with | 19 | * comes in on to an unassigned handler will get stuck with |
20 | * "IRQ_WAITING" cleared and the interrupt disabled. | 20 | * "IRQS_WAITING" cleared and the interrupt disabled. |
21 | */ | 21 | */ |
22 | static DEFINE_MUTEX(probing_active); | 22 | static DEFINE_MUTEX(probing_active); |
23 | 23 | ||
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void) | |||
32 | { | 32 | { |
33 | struct irq_desc *desc; | 33 | struct irq_desc *desc; |
34 | unsigned long mask = 0; | 34 | unsigned long mask = 0; |
35 | unsigned int status; | ||
36 | int i; | 35 | int i; |
37 | 36 | ||
38 | /* | 37 | /* |
@@ -46,20 +45,15 @@ unsigned long probe_irq_on(void) | |||
46 | */ | 45 | */ |
47 | for_each_irq_desc_reverse(i, desc) { | 46 | for_each_irq_desc_reverse(i, desc) { |
48 | raw_spin_lock_irq(&desc->lock); | 47 | raw_spin_lock_irq(&desc->lock); |
49 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { | 48 | if (!desc->action && irq_settings_can_probe(desc)) { |
50 | /* | ||
51 | * An old-style architecture might still have | ||
52 | * the handle_bad_irq handler there: | ||
53 | */ | ||
54 | compat_irq_chip_set_default_handler(desc); | ||
55 | |||
56 | /* | 49 | /* |
57 | * Some chips need to know about probing in | 50 | * Some chips need to know about probing in |
58 | * progress: | 51 | * progress: |
59 | */ | 52 | */ |
60 | if (desc->chip->set_type) | 53 | if (desc->irq_data.chip->irq_set_type) |
61 | desc->chip->set_type(i, IRQ_TYPE_PROBE); | 54 | desc->irq_data.chip->irq_set_type(&desc->irq_data, |
62 | desc->chip->startup(i); | 55 | IRQ_TYPE_PROBE); |
56 | irq_startup(desc); | ||
63 | } | 57 | } |
64 | raw_spin_unlock_irq(&desc->lock); | 58 | raw_spin_unlock_irq(&desc->lock); |
65 | } | 59 | } |
@@ -74,10 +68,10 @@ unsigned long probe_irq_on(void) | |||
74 | */ | 68 | */ |
75 | for_each_irq_desc_reverse(i, desc) { | 69 | for_each_irq_desc_reverse(i, desc) { |
76 | raw_spin_lock_irq(&desc->lock); | 70 | raw_spin_lock_irq(&desc->lock); |
77 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { | 71 | if (!desc->action && irq_settings_can_probe(desc)) { |
78 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; | 72 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; |
79 | if (desc->chip->startup(i)) | 73 | if (irq_startup(desc)) |
80 | desc->status |= IRQ_PENDING; | 74 | desc->istate |= IRQS_PENDING; |
81 | } | 75 | } |
82 | raw_spin_unlock_irq(&desc->lock); | 76 | raw_spin_unlock_irq(&desc->lock); |
83 | } | 77 | } |
@@ -92,13 +86,12 @@ unsigned long probe_irq_on(void) | |||
92 | */ | 86 | */ |
93 | for_each_irq_desc(i, desc) { | 87 | for_each_irq_desc(i, desc) { |
94 | raw_spin_lock_irq(&desc->lock); | 88 | raw_spin_lock_irq(&desc->lock); |
95 | status = desc->status; | ||
96 | 89 | ||
97 | if (status & IRQ_AUTODETECT) { | 90 | if (desc->istate & IRQS_AUTODETECT) { |
98 | /* It triggered already - consider it spurious. */ | 91 | /* It triggered already - consider it spurious. */ |
99 | if (!(status & IRQ_WAITING)) { | 92 | if (!(desc->istate & IRQS_WAITING)) { |
100 | desc->status = status & ~IRQ_AUTODETECT; | 93 | desc->istate &= ~IRQS_AUTODETECT; |
101 | desc->chip->shutdown(i); | 94 | irq_shutdown(desc); |
102 | } else | 95 | } else |
103 | if (i < 32) | 96 | if (i < 32) |
104 | mask |= 1 << i; | 97 | mask |= 1 << i; |
@@ -124,20 +117,18 @@ EXPORT_SYMBOL(probe_irq_on); | |||
124 | */ | 117 | */ |
125 | unsigned int probe_irq_mask(unsigned long val) | 118 | unsigned int probe_irq_mask(unsigned long val) |
126 | { | 119 | { |
127 | unsigned int status, mask = 0; | 120 | unsigned int mask = 0; |
128 | struct irq_desc *desc; | 121 | struct irq_desc *desc; |
129 | int i; | 122 | int i; |
130 | 123 | ||
131 | for_each_irq_desc(i, desc) { | 124 | for_each_irq_desc(i, desc) { |
132 | raw_spin_lock_irq(&desc->lock); | 125 | raw_spin_lock_irq(&desc->lock); |
133 | status = desc->status; | 126 | if (desc->istate & IRQS_AUTODETECT) { |
134 | 127 | if (i < 16 && !(desc->istate & IRQS_WAITING)) | |
135 | if (status & IRQ_AUTODETECT) { | ||
136 | if (i < 16 && !(status & IRQ_WAITING)) | ||
137 | mask |= 1 << i; | 128 | mask |= 1 << i; |
138 | 129 | ||
139 | desc->status = status & ~IRQ_AUTODETECT; | 130 | desc->istate &= ~IRQS_AUTODETECT; |
140 | desc->chip->shutdown(i); | 131 | irq_shutdown(desc); |
141 | } | 132 | } |
142 | raw_spin_unlock_irq(&desc->lock); | 133 | raw_spin_unlock_irq(&desc->lock); |
143 | } | 134 | } |
@@ -168,20 +159,18 @@ int probe_irq_off(unsigned long val) | |||
168 | { | 159 | { |
169 | int i, irq_found = 0, nr_of_irqs = 0; | 160 | int i, irq_found = 0, nr_of_irqs = 0; |
170 | struct irq_desc *desc; | 161 | struct irq_desc *desc; |
171 | unsigned int status; | ||
172 | 162 | ||
173 | for_each_irq_desc(i, desc) { | 163 | for_each_irq_desc(i, desc) { |
174 | raw_spin_lock_irq(&desc->lock); | 164 | raw_spin_lock_irq(&desc->lock); |
175 | status = desc->status; | ||
176 | 165 | ||
177 | if (status & IRQ_AUTODETECT) { | 166 | if (desc->istate & IRQS_AUTODETECT) { |
178 | if (!(status & IRQ_WAITING)) { | 167 | if (!(desc->istate & IRQS_WAITING)) { |
179 | if (!nr_of_irqs) | 168 | if (!nr_of_irqs) |
180 | irq_found = i; | 169 | irq_found = i; |
181 | nr_of_irqs++; | 170 | nr_of_irqs++; |
182 | } | 171 | } |
183 | desc->status = status & ~IRQ_AUTODETECT; | 172 | desc->istate &= ~IRQS_AUTODETECT; |
184 | desc->chip->shutdown(i); | 173 | irq_shutdown(desc); |
185 | } | 174 | } |
186 | raw_spin_unlock_irq(&desc->lock); | 175 | raw_spin_unlock_irq(&desc->lock); |
187 | } | 176 | } |
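For context, the autoprobe rework above only changes the internals; the driver-facing pattern stays the classic probe_irq_on()/probe_irq_off() pair. A hedged sketch of that driver-side use, where trigger_device_irq() and quiesce_device() stand in for hardware-specific code and are invented for illustration:

/* Sketch only: legacy IRQ autoprobing as seen from a driver. */
#include <linux/interrupt.h>
#include <linux/delay.h>

static void trigger_device_irq(void) { /* poke the device so it raises its IRQ */ }
static void quiesce_device(void)     { /* stop it again */ }

static int my_autoprobe(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();		/* arm all currently unassigned lines */
	trigger_device_irq();
	msleep(10);			/* give the interrupt time to arrive */
	irq = probe_irq_off(mask);	/* >0 one IRQ found, 0 none, <0 several */
	quiesce_device();

	return irq;
}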
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b7091d5ca2f8..d5a3009da71a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -18,363 +18,217 @@ | |||
18 | 18 | ||
19 | #include "internals.h" | 19 | #include "internals.h" |
20 | 20 | ||
21 | static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) | ||
22 | { | ||
23 | struct irq_desc *desc; | ||
24 | unsigned long flags; | ||
25 | |||
26 | desc = irq_to_desc(irq); | ||
27 | if (!desc) { | ||
28 | WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); | ||
29 | return; | ||
30 | } | ||
31 | |||
32 | /* Ensure we don't have left over values from a previous use of this irq */ | ||
33 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
34 | desc->status = IRQ_DISABLED; | ||
35 | desc->chip = &no_irq_chip; | ||
36 | desc->handle_irq = handle_bad_irq; | ||
37 | desc->depth = 1; | ||
38 | desc->msi_desc = NULL; | ||
39 | desc->handler_data = NULL; | ||
40 | if (!keep_chip_data) | ||
41 | desc->chip_data = NULL; | ||
42 | desc->action = NULL; | ||
43 | desc->irq_count = 0; | ||
44 | desc->irqs_unhandled = 0; | ||
45 | #ifdef CONFIG_SMP | ||
46 | cpumask_setall(desc->affinity); | ||
47 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
48 | cpumask_clear(desc->pending_mask); | ||
49 | #endif | ||
50 | #endif | ||
51 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
52 | } | ||
53 | |||
54 | /** | 21 | /** |
55 | * dynamic_irq_init - initialize a dynamically allocated irq | 22 | * irq_set_chip - set the irq chip for an irq |
56 | * @irq: irq number to initialize | ||
57 | */ | ||
58 | void dynamic_irq_init(unsigned int irq) | ||
59 | { | ||
60 | dynamic_irq_init_x(irq, false); | ||
61 | } | ||
62 | |||
63 | /** | ||
64 | * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq | ||
65 | * @irq: irq number to initialize | ||
66 | * | ||
67 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
68 | */ | ||
69 | void dynamic_irq_init_keep_chip_data(unsigned int irq) | ||
70 | { | ||
71 | dynamic_irq_init_x(irq, true); | ||
72 | } | ||
73 | |||
74 | static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) | ||
75 | { | ||
76 | struct irq_desc *desc = irq_to_desc(irq); | ||
77 | unsigned long flags; | ||
78 | |||
79 | if (!desc) { | ||
80 | WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); | ||
81 | return; | ||
82 | } | ||
83 | |||
84 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
85 | if (desc->action) { | ||
86 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
87 | WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", | ||
88 | irq); | ||
89 | return; | ||
90 | } | ||
91 | desc->msi_desc = NULL; | ||
92 | desc->handler_data = NULL; | ||
93 | if (!keep_chip_data) | ||
94 | desc->chip_data = NULL; | ||
95 | desc->handle_irq = handle_bad_irq; | ||
96 | desc->chip = &no_irq_chip; | ||
97 | desc->name = NULL; | ||
98 | clear_kstat_irqs(desc); | ||
99 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
100 | } | ||
101 | |||
102 | /** | ||
103 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
104 | * @irq: irq number to initialize | ||
105 | */ | ||
106 | void dynamic_irq_cleanup(unsigned int irq) | ||
107 | { | ||
108 | dynamic_irq_cleanup_x(irq, false); | ||
109 | } | ||
110 | |||
111 | /** | ||
112 | * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq | ||
113 | * @irq: irq number to initialize | ||
114 | * | ||
115 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
116 | */ | ||
117 | void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) | ||
118 | { | ||
119 | dynamic_irq_cleanup_x(irq, true); | ||
120 | } | ||
121 | |||
122 | |||
123 | /** | ||
124 | * set_irq_chip - set the irq chip for an irq | ||
125 | * @irq: irq number | 23 | * @irq: irq number |
126 | * @chip: pointer to irq chip description structure | 24 | * @chip: pointer to irq chip description structure |
127 | */ | 25 | */ |
128 | int set_irq_chip(unsigned int irq, struct irq_chip *chip) | 26 | int irq_set_chip(unsigned int irq, struct irq_chip *chip) |
129 | { | 27 | { |
130 | struct irq_desc *desc = irq_to_desc(irq); | ||
131 | unsigned long flags; | 28 | unsigned long flags; |
29 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
132 | 30 | ||
133 | if (!desc) { | 31 | if (!desc) |
134 | WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); | ||
135 | return -EINVAL; | 32 | return -EINVAL; |
136 | } | ||
137 | 33 | ||
138 | if (!chip) | 34 | if (!chip) |
139 | chip = &no_irq_chip; | 35 | chip = &no_irq_chip; |
140 | 36 | ||
141 | raw_spin_lock_irqsave(&desc->lock, flags); | 37 | desc->irq_data.chip = chip; |
142 | irq_chip_set_defaults(chip); | 38 | irq_put_desc_unlock(desc, flags); |
143 | desc->chip = chip; | 39 | /* |
144 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 40 | * For !CONFIG_SPARSE_IRQ make the irq show up in |
145 | 41 | * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is | |
42 | * already marked, and this call is harmless. | ||
43 | */ | ||
44 | irq_reserve_irq(irq); | ||
146 | return 0; | 45 | return 0; |
147 | } | 46 | } |
148 | EXPORT_SYMBOL(set_irq_chip); | 47 | EXPORT_SYMBOL(irq_set_chip); |
149 | 48 | ||
150 | /** | 49 | /** |
151 | * set_irq_type - set the irq trigger type for an irq | 50 | * irq_set_type - set the irq trigger type for an irq |
152 | * @irq: irq number | 51 | * @irq: irq number |
153 | * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h | 52 | * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h |
154 | */ | 53 | */ |
155 | int set_irq_type(unsigned int irq, unsigned int type) | 54 | int irq_set_irq_type(unsigned int irq, unsigned int type) |
156 | { | 55 | { |
157 | struct irq_desc *desc = irq_to_desc(irq); | ||
158 | unsigned long flags; | 56 | unsigned long flags; |
159 | int ret = -ENXIO; | 57 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); |
58 | int ret = 0; | ||
160 | 59 | ||
161 | if (!desc) { | 60 | if (!desc) |
162 | printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); | 61 | return -EINVAL; |
163 | return -ENODEV; | ||
164 | } | ||
165 | 62 | ||
166 | type &= IRQ_TYPE_SENSE_MASK; | 63 | type &= IRQ_TYPE_SENSE_MASK; |
167 | if (type == IRQ_TYPE_NONE) | 64 | if (type != IRQ_TYPE_NONE) |
168 | return 0; | 65 | ret = __irq_set_trigger(desc, irq, type); |
169 | 66 | irq_put_desc_busunlock(desc, flags); | |
170 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
171 | ret = __irq_set_trigger(desc, irq, type); | ||
172 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
173 | return ret; | 67 | return ret; |
174 | } | 68 | } |
175 | EXPORT_SYMBOL(set_irq_type); | 69 | EXPORT_SYMBOL(irq_set_irq_type); |
176 | 70 | ||
177 | /** | 71 | /** |
178 | * set_irq_data - set irq type data for an irq | 72 | * irq_set_handler_data - set irq handler data for an irq |
179 | * @irq: Interrupt number | 73 | * @irq: Interrupt number |
180 | * @data: Pointer to interrupt specific data | 74 | * @data: Pointer to interrupt specific data |
181 | * | 75 | * |
182 | * Set the hardware irq controller data for an irq | 76 | * Set the hardware irq controller data for an irq |
183 | */ | 77 | */ |
184 | int set_irq_data(unsigned int irq, void *data) | 78 | int irq_set_handler_data(unsigned int irq, void *data) |
185 | { | 79 | { |
186 | struct irq_desc *desc = irq_to_desc(irq); | ||
187 | unsigned long flags; | 80 | unsigned long flags; |
81 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
188 | 82 | ||
189 | if (!desc) { | 83 | if (!desc) |
190 | printk(KERN_ERR | ||
191 | "Trying to install controller data for IRQ%d\n", irq); | ||
192 | return -EINVAL; | 84 | return -EINVAL; |
193 | } | 85 | desc->irq_data.handler_data = data; |
194 | 86 | irq_put_desc_unlock(desc, flags); | |
195 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
196 | desc->handler_data = data; | ||
197 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
198 | return 0; | 87 | return 0; |
199 | } | 88 | } |
200 | EXPORT_SYMBOL(set_irq_data); | 89 | EXPORT_SYMBOL(irq_set_handler_data); |
201 | 90 | ||
202 | /** | 91 | /** |
203 | * set_irq_msi - set MSI descriptor data for an irq | 92 | * irq_set_msi_desc - set MSI descriptor data for an irq |
204 | * @irq: Interrupt number | 93 | * @irq: Interrupt number |
205 | * @entry: Pointer to MSI descriptor data | 94 | * @entry: Pointer to MSI descriptor data |
206 | * | 95 | * |
207 | * Set the MSI descriptor entry for an irq | 96 | * Set the MSI descriptor entry for an irq |
208 | */ | 97 | */ |
209 | int set_irq_msi(unsigned int irq, struct msi_desc *entry) | 98 | int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) |
210 | { | 99 | { |
211 | struct irq_desc *desc = irq_to_desc(irq); | ||
212 | unsigned long flags; | 100 | unsigned long flags; |
101 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
213 | 102 | ||
214 | if (!desc) { | 103 | if (!desc) |
215 | printk(KERN_ERR | ||
216 | "Trying to install msi data for IRQ%d\n", irq); | ||
217 | return -EINVAL; | 104 | return -EINVAL; |
218 | } | 105 | desc->irq_data.msi_desc = entry; |
219 | |||
220 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
221 | desc->msi_desc = entry; | ||
222 | if (entry) | 106 | if (entry) |
223 | entry->irq = irq; | 107 | entry->irq = irq; |
224 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 108 | irq_put_desc_unlock(desc, flags); |
225 | return 0; | 109 | return 0; |
226 | } | 110 | } |
227 | 111 | ||
228 | /** | 112 | /** |
229 | * set_irq_chip_data - set irq chip data for an irq | 113 | * irq_set_chip_data - set irq chip data for an irq |
230 | * @irq: Interrupt number | 114 | * @irq: Interrupt number |
231 | * @data: Pointer to chip specific data | 115 | * @data: Pointer to chip specific data |
232 | * | 116 | * |
233 | * Set the hardware irq chip data for an irq | 117 | * Set the hardware irq chip data for an irq |
234 | */ | 118 | */ |
235 | int set_irq_chip_data(unsigned int irq, void *data) | 119 | int irq_set_chip_data(unsigned int irq, void *data) |
236 | { | 120 | { |
237 | struct irq_desc *desc = irq_to_desc(irq); | ||
238 | unsigned long flags; | 121 | unsigned long flags; |
122 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
239 | 123 | ||
240 | if (!desc) { | 124 | if (!desc) |
241 | printk(KERN_ERR | ||
242 | "Trying to install chip data for IRQ%d\n", irq); | ||
243 | return -EINVAL; | ||
244 | } | ||
245 | |||
246 | if (!desc->chip) { | ||
247 | printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); | ||
248 | return -EINVAL; | 125 | return -EINVAL; |
249 | } | 126 | desc->irq_data.chip_data = data; |
250 | 127 | irq_put_desc_unlock(desc, flags); | |
251 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
252 | desc->chip_data = data; | ||
253 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
254 | |||
255 | return 0; | 128 | return 0; |
256 | } | 129 | } |
257 | EXPORT_SYMBOL(set_irq_chip_data); | 130 | EXPORT_SYMBOL(irq_set_chip_data); |
258 | 131 | ||
259 | /** | 132 | struct irq_data *irq_get_irq_data(unsigned int irq) |
260 | * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq | ||
261 | * | ||
262 | * @irq: Interrupt number | ||
263 | * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag | ||
264 | * | ||
265 | * The IRQ_NESTED_THREAD flag indicates that on | ||
266 | * request_threaded_irq() no separate interrupt thread should be | ||
267 | * created for the irq as the handler are called nested in the | ||
268 | * context of a demultiplexing interrupt handler thread. | ||
269 | */ | ||
270 | void set_irq_nested_thread(unsigned int irq, int nest) | ||
271 | { | 133 | { |
272 | struct irq_desc *desc = irq_to_desc(irq); | 134 | struct irq_desc *desc = irq_to_desc(irq); |
273 | unsigned long flags; | ||
274 | 135 | ||
275 | if (!desc) | 136 | return desc ? &desc->irq_data : NULL; |
276 | return; | 137 | } |
138 | EXPORT_SYMBOL_GPL(irq_get_irq_data); | ||
277 | 139 | ||
278 | raw_spin_lock_irqsave(&desc->lock, flags); | 140 | static void irq_state_clr_disabled(struct irq_desc *desc) |
279 | if (nest) | 141 | { |
280 | desc->status |= IRQ_NESTED_THREAD; | 142 | irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); |
281 | else | ||
282 | desc->status &= ~IRQ_NESTED_THREAD; | ||
283 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
284 | } | 143 | } |
285 | EXPORT_SYMBOL_GPL(set_irq_nested_thread); | ||
286 | 144 | ||
287 | /* | 145 | static void irq_state_set_disabled(struct irq_desc *desc) |
288 | * default enable function | ||
289 | */ | ||
290 | static void default_enable(unsigned int irq) | ||
291 | { | 146 | { |
292 | struct irq_desc *desc = irq_to_desc(irq); | 147 | irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); |
148 | } | ||
293 | 149 | ||
294 | desc->chip->unmask(irq); | 150 | static void irq_state_clr_masked(struct irq_desc *desc) |
295 | desc->status &= ~IRQ_MASKED; | 151 | { |
152 | irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); | ||
296 | } | 153 | } |
297 | 154 | ||
298 | /* | 155 | static void irq_state_set_masked(struct irq_desc *desc) |
299 | * default disable function | ||
300 | */ | ||
301 | static void default_disable(unsigned int irq) | ||
302 | { | 156 | { |
157 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); | ||
303 | } | 158 | } |
304 | 159 | ||
305 | /* | 160 | int irq_startup(struct irq_desc *desc) |
306 | * default startup function | ||
307 | */ | ||
308 | static unsigned int default_startup(unsigned int irq) | ||
309 | { | 161 | { |
310 | struct irq_desc *desc = irq_to_desc(irq); | 162 | irq_state_clr_disabled(desc); |
163 | desc->depth = 0; | ||
311 | 164 | ||
312 | desc->chip->enable(irq); | 165 | if (desc->irq_data.chip->irq_startup) { |
166 | int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); | ||
167 | irq_state_clr_masked(desc); | ||
168 | return ret; | ||
169 | } | ||
170 | |||
171 | irq_enable(desc); | ||
313 | return 0; | 172 | return 0; |
314 | } | 173 | } |
315 | 174 | ||
316 | /* | 175 | void irq_shutdown(struct irq_desc *desc) |
317 | * default shutdown function | ||
318 | */ | ||
319 | static void default_shutdown(unsigned int irq) | ||
320 | { | 176 | { |
321 | struct irq_desc *desc = irq_to_desc(irq); | 177 | irq_state_set_disabled(desc); |
178 | desc->depth = 1; | ||
179 | if (desc->irq_data.chip->irq_shutdown) | ||
180 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | ||
181 | if (desc->irq_data.chip->irq_disable) | ||
182 | desc->irq_data.chip->irq_disable(&desc->irq_data); | ||
183 | else | ||
184 | desc->irq_data.chip->irq_mask(&desc->irq_data); | ||
185 | irq_state_set_masked(desc); | ||
186 | } | ||
322 | 187 | ||
323 | desc->chip->mask(irq); | 188 | void irq_enable(struct irq_desc *desc) |
324 | desc->status |= IRQ_MASKED; | 189 | { |
190 | irq_state_clr_disabled(desc); | ||
191 | if (desc->irq_data.chip->irq_enable) | ||
192 | desc->irq_data.chip->irq_enable(&desc->irq_data); | ||
193 | else | ||
194 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | ||
195 | irq_state_clr_masked(desc); | ||
325 | } | 196 | } |
326 | 197 | ||
327 | /* | 198 | void irq_disable(struct irq_desc *desc) |
328 | * Fixup enable/disable function pointers | ||
329 | */ | ||
330 | void irq_chip_set_defaults(struct irq_chip *chip) | ||
331 | { | 199 | { |
332 | if (!chip->enable) | 200 | irq_state_set_disabled(desc); |
333 | chip->enable = default_enable; | 201 | if (desc->irq_data.chip->irq_disable) { |
334 | if (!chip->disable) | 202 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
335 | chip->disable = default_disable; | 203 | irq_state_set_masked(desc); |
336 | if (!chip->startup) | 204 | } |
337 | chip->startup = default_startup; | ||
338 | /* | ||
339 | * We use chip->disable, when the user provided its own. When | ||
340 | * we have default_disable set for chip->disable, then we need | ||
341 | * to use default_shutdown, otherwise the irq line is not | ||
342 | * disabled on free_irq(): | ||
343 | */ | ||
344 | if (!chip->shutdown) | ||
345 | chip->shutdown = chip->disable != default_disable ? | ||
346 | chip->disable : default_shutdown; | ||
347 | if (!chip->name) | ||
348 | chip->name = chip->typename; | ||
349 | if (!chip->end) | ||
350 | chip->end = dummy_irq_chip.end; | ||
351 | } | 205 | } |
352 | 206 | ||
353 | static inline void mask_ack_irq(struct irq_desc *desc, int irq) | 207 | static inline void mask_ack_irq(struct irq_desc *desc) |
354 | { | 208 | { |
355 | if (desc->chip->mask_ack) | 209 | if (desc->irq_data.chip->irq_mask_ack) |
356 | desc->chip->mask_ack(irq); | 210 | desc->irq_data.chip->irq_mask_ack(&desc->irq_data); |
357 | else { | 211 | else { |
358 | desc->chip->mask(irq); | 212 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
359 | if (desc->chip->ack) | 213 | if (desc->irq_data.chip->irq_ack) |
360 | desc->chip->ack(irq); | 214 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
361 | } | 215 | } |
362 | desc->status |= IRQ_MASKED; | 216 | irq_state_set_masked(desc); |
363 | } | 217 | } |
364 | 218 | ||
365 | static inline void mask_irq(struct irq_desc *desc, int irq) | 219 | void mask_irq(struct irq_desc *desc) |
366 | { | 220 | { |
367 | if (desc->chip->mask) { | 221 | if (desc->irq_data.chip->irq_mask) { |
368 | desc->chip->mask(irq); | 222 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
369 | desc->status |= IRQ_MASKED; | 223 | irq_state_set_masked(desc); |
370 | } | 224 | } |
371 | } | 225 | } |
372 | 226 | ||
373 | static inline void unmask_irq(struct irq_desc *desc, int irq) | 227 | void unmask_irq(struct irq_desc *desc) |
374 | { | 228 | { |
375 | if (desc->chip->unmask) { | 229 | if (desc->irq_data.chip->irq_unmask) { |
376 | desc->chip->unmask(irq); | 230 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
377 | desc->status &= ~IRQ_MASKED; | 231 | irq_state_clr_masked(desc); |
378 | } | 232 | } |
379 | } | 233 | } |
380 | 234 | ||
@@ -399,10 +253,10 @@ void handle_nested_irq(unsigned int irq) | |||
399 | kstat_incr_irqs_this_cpu(irq, desc); | 253 | kstat_incr_irqs_this_cpu(irq, desc); |
400 | 254 | ||
401 | action = desc->action; | 255 | action = desc->action; |
402 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | 256 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) |
403 | goto out_unlock; | 257 | goto out_unlock; |
404 | 258 | ||
405 | desc->status |= IRQ_INPROGRESS; | 259 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
406 | raw_spin_unlock_irq(&desc->lock); | 260 | raw_spin_unlock_irq(&desc->lock); |
407 | 261 | ||
408 | action_ret = action->thread_fn(action->irq, action->dev_id); | 262 | action_ret = action->thread_fn(action->irq, action->dev_id); |
@@ -410,13 +264,20 @@ void handle_nested_irq(unsigned int irq) | |||
410 | note_interrupt(irq, desc, action_ret); | 264 | note_interrupt(irq, desc, action_ret); |
411 | 265 | ||
412 | raw_spin_lock_irq(&desc->lock); | 266 | raw_spin_lock_irq(&desc->lock); |
413 | desc->status &= ~IRQ_INPROGRESS; | 267 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
414 | 268 | ||
415 | out_unlock: | 269 | out_unlock: |
416 | raw_spin_unlock_irq(&desc->lock); | 270 | raw_spin_unlock_irq(&desc->lock); |
417 | } | 271 | } |
418 | EXPORT_SYMBOL_GPL(handle_nested_irq); | 272 | EXPORT_SYMBOL_GPL(handle_nested_irq); |
419 | 273 | ||
274 | static bool irq_check_poll(struct irq_desc *desc) | ||
275 | { | ||
276 | if (!(desc->istate & IRQS_POLL_INPROGRESS)) | ||
277 | return false; | ||
278 | return irq_wait_for_poll(desc); | ||
279 | } | ||
280 | |||
420 | /** | 281 | /** |
421 | * handle_simple_irq - Simple and software-decoded IRQs. | 282 | * handle_simple_irq - Simple and software-decoded IRQs. |
422 | * @irq: the interrupt number | 283 | * @irq: the interrupt number |
@@ -432,32 +293,24 @@ EXPORT_SYMBOL_GPL(handle_nested_irq); | |||
432 | void | 293 | void |
433 | handle_simple_irq(unsigned int irq, struct irq_desc *desc) | 294 | handle_simple_irq(unsigned int irq, struct irq_desc *desc) |
434 | { | 295 | { |
435 | struct irqaction *action; | ||
436 | irqreturn_t action_ret; | ||
437 | |||
438 | raw_spin_lock(&desc->lock); | 296 | raw_spin_lock(&desc->lock); |
439 | 297 | ||
440 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 298 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) |
441 | goto out_unlock; | 299 | if (!irq_check_poll(desc)) |
442 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 300 | goto out_unlock; |
301 | |||
302 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
443 | kstat_incr_irqs_this_cpu(irq, desc); | 303 | kstat_incr_irqs_this_cpu(irq, desc); |
444 | 304 | ||
445 | action = desc->action; | 305 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) |
446 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | ||
447 | goto out_unlock; | 306 | goto out_unlock; |
448 | 307 | ||
449 | desc->status |= IRQ_INPROGRESS; | 308 | handle_irq_event(desc); |
450 | raw_spin_unlock(&desc->lock); | ||
451 | 309 | ||
452 | action_ret = handle_IRQ_event(irq, action); | ||
453 | if (!noirqdebug) | ||
454 | note_interrupt(irq, desc, action_ret); | ||
455 | |||
456 | raw_spin_lock(&desc->lock); | ||
457 | desc->status &= ~IRQ_INPROGRESS; | ||
458 | out_unlock: | 310 | out_unlock: |
459 | raw_spin_unlock(&desc->lock); | 311 | raw_spin_unlock(&desc->lock); |
460 | } | 312 | } |
313 | EXPORT_SYMBOL_GPL(handle_simple_irq); | ||
461 | 314 | ||
462 | /** | 315 | /** |
463 | * handle_level_irq - Level type irq handler | 316 | * handle_level_irq - Level type irq handler |
@@ -472,42 +325,42 @@ out_unlock: | |||
472 | void | 325 | void |
473 | handle_level_irq(unsigned int irq, struct irq_desc *desc) | 326 | handle_level_irq(unsigned int irq, struct irq_desc *desc) |
474 | { | 327 | { |
475 | struct irqaction *action; | ||
476 | irqreturn_t action_ret; | ||
477 | |||
478 | raw_spin_lock(&desc->lock); | 328 | raw_spin_lock(&desc->lock); |
479 | mask_ack_irq(desc, irq); | 329 | mask_ack_irq(desc); |
480 | 330 | ||
481 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 331 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) |
482 | goto out_unlock; | 332 | if (!irq_check_poll(desc)) |
483 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 333 | goto out_unlock; |
334 | |||
335 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
484 | kstat_incr_irqs_this_cpu(irq, desc); | 336 | kstat_incr_irqs_this_cpu(irq, desc); |
485 | 337 | ||
486 | /* | 338 | /* |
487 | * If its disabled or no action available | 339 | * If its disabled or no action available |
488 | * keep it masked and get out of here | 340 | * keep it masked and get out of here |
489 | */ | 341 | */ |
490 | action = desc->action; | 342 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) |
491 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | ||
492 | goto out_unlock; | 343 | goto out_unlock; |
493 | 344 | ||
494 | desc->status |= IRQ_INPROGRESS; | 345 | handle_irq_event(desc); |
495 | raw_spin_unlock(&desc->lock); | ||
496 | |||
497 | action_ret = handle_IRQ_event(irq, action); | ||
498 | if (!noirqdebug) | ||
499 | note_interrupt(irq, desc, action_ret); | ||
500 | |||
501 | raw_spin_lock(&desc->lock); | ||
502 | desc->status &= ~IRQ_INPROGRESS; | ||
503 | 346 | ||
504 | if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) | 347 | if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) |
505 | unmask_irq(desc, irq); | 348 | unmask_irq(desc); |
506 | out_unlock: | 349 | out_unlock: |
507 | raw_spin_unlock(&desc->lock); | 350 | raw_spin_unlock(&desc->lock); |
508 | } | 351 | } |
509 | EXPORT_SYMBOL_GPL(handle_level_irq); | 352 | EXPORT_SYMBOL_GPL(handle_level_irq); |
510 | 353 | ||
354 | #ifdef CONFIG_IRQ_PREFLOW_FASTEOI | ||
355 | static inline void preflow_handler(struct irq_desc *desc) | ||
356 | { | ||
357 | if (desc->preflow_handler) | ||
358 | desc->preflow_handler(&desc->irq_data); | ||
359 | } | ||
360 | #else | ||
361 | static inline void preflow_handler(struct irq_desc *desc) { } | ||
362 | #endif | ||
363 | |||
511 | /** | 364 | /** |
512 | * handle_fasteoi_irq - irq handler for transparent controllers | 365 | * handle_fasteoi_irq - irq handler for transparent controllers |
513 | * @irq: the interrupt number | 366 | * @irq: the interrupt number |
@@ -521,42 +374,40 @@ EXPORT_SYMBOL_GPL(handle_level_irq); | |||
521 | void | 374 | void |
522 | handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | 375 | handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) |
523 | { | 376 | { |
524 | struct irqaction *action; | ||
525 | irqreturn_t action_ret; | ||
526 | |||
527 | raw_spin_lock(&desc->lock); | 377 | raw_spin_lock(&desc->lock); |
528 | 378 | ||
529 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 379 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) |
530 | goto out; | 380 | if (!irq_check_poll(desc)) |
381 | goto out; | ||
531 | 382 | ||
532 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 383 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
533 | kstat_incr_irqs_this_cpu(irq, desc); | 384 | kstat_incr_irqs_this_cpu(irq, desc); |
534 | 385 | ||
535 | /* | 386 | /* |
536 | * If its disabled or no action available | 387 | * If its disabled or no action available |
537 | * then mask it and get out of here: | 388 | * then mask it and get out of here: |
538 | */ | 389 | */ |
539 | action = desc->action; | 390 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
540 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 391 | desc->istate |= IRQS_PENDING; |
541 | desc->status |= IRQ_PENDING; | 392 | mask_irq(desc); |
542 | mask_irq(desc, irq); | ||
543 | goto out; | 393 | goto out; |
544 | } | 394 | } |
545 | 395 | ||
546 | desc->status |= IRQ_INPROGRESS; | 396 | if (desc->istate & IRQS_ONESHOT) |
547 | desc->status &= ~IRQ_PENDING; | 397 | mask_irq(desc); |
548 | raw_spin_unlock(&desc->lock); | ||
549 | |||
550 | action_ret = handle_IRQ_event(irq, action); | ||
551 | if (!noirqdebug) | ||
552 | note_interrupt(irq, desc, action_ret); | ||
553 | 398 | ||
554 | raw_spin_lock(&desc->lock); | 399 | preflow_handler(desc); |
555 | desc->status &= ~IRQ_INPROGRESS; | 400 | handle_irq_event(desc); |
556 | out: | ||
557 | desc->chip->eoi(irq); | ||
558 | 401 | ||
402 | out_eoi: | ||
403 | desc->irq_data.chip->irq_eoi(&desc->irq_data); | ||
404 | out_unlock: | ||
559 | raw_spin_unlock(&desc->lock); | 405 | raw_spin_unlock(&desc->lock); |
406 | return; | ||
407 | out: | ||
408 | if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) | ||
409 | goto out_eoi; | ||
410 | goto out_unlock; | ||
560 | } | 411 | } |
561 | 412 | ||
562 | /** | 413 | /** |
@@ -565,7 +416,7 @@ out: | |||
565 | * @desc: the interrupt description structure for this irq | 416 | * @desc: the interrupt description structure for this irq |
566 | * | 417 | * |
567 | * Interrupt occures on the falling and/or rising edge of a hardware | 418 | * Interrupt occures on the falling and/or rising edge of a hardware |
568 | * signal. The occurence is latched into the irq controller hardware | 419 | * signal. The occurrence is latched into the irq controller hardware |
569 | * and must be acked in order to be reenabled. After the ack another | 420 | * and must be acked in order to be reenabled. After the ack another |
570 | * interrupt can happen on the same source even before the first one | 421 | * interrupt can happen on the same source even before the first one |
571 | * is handled by the associated event handler. If this happens it | 422 | * is handled by the associated event handler. If this happens it |
@@ -580,34 +431,28 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
580 | { | 431 | { |
581 | raw_spin_lock(&desc->lock); | 432 | raw_spin_lock(&desc->lock); |
582 | 433 | ||
583 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | 434 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
584 | |||
585 | /* | 435 | /* |
586 | * If we're currently running this IRQ, or its disabled, | 436 | * If we're currently running this IRQ, or its disabled, |
587 | * we shouldn't process the IRQ. Mark it pending, handle | 437 | * we shouldn't process the IRQ. Mark it pending, handle |
588 | * the necessary masking and go out | 438 | * the necessary masking and go out |
589 | */ | 439 | */ |
590 | if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || | 440 | if (unlikely(irqd_irq_disabled(&desc->irq_data) || |
591 | !desc->action)) { | 441 | irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { |
592 | desc->status |= (IRQ_PENDING | IRQ_MASKED); | 442 | if (!irq_check_poll(desc)) { |
593 | mask_ack_irq(desc, irq); | 443 | desc->istate |= IRQS_PENDING; |
594 | goto out_unlock; | 444 | mask_ack_irq(desc); |
445 | goto out_unlock; | ||
446 | } | ||
595 | } | 447 | } |
596 | kstat_incr_irqs_this_cpu(irq, desc); | 448 | kstat_incr_irqs_this_cpu(irq, desc); |
597 | 449 | ||
598 | /* Start handling the irq */ | 450 | /* Start handling the irq */ |
599 | if (desc->chip->ack) | 451 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
600 | desc->chip->ack(irq); | ||
601 | |||
602 | /* Mark the IRQ currently in progress.*/ | ||
603 | desc->status |= IRQ_INPROGRESS; | ||
604 | 452 | ||
605 | do { | 453 | do { |
606 | struct irqaction *action = desc->action; | 454 | if (unlikely(!desc->action)) { |
607 | irqreturn_t action_ret; | 455 | mask_irq(desc); |
608 | |||
609 | if (unlikely(!action)) { | ||
610 | mask_irq(desc, irq); | ||
611 | goto out_unlock; | 456 | goto out_unlock; |
612 | } | 457 | } |
613 | 458 | ||
@@ -616,26 +461,66 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
616 | * one, we could have masked the irq. | 461 | * one, we could have masked the irq. |
617 | * Renable it, if it was not disabled in meantime. | 462 | * Renable it, if it was not disabled in meantime. |
618 | */ | 463 | */ |
619 | if (unlikely((desc->status & | 464 | if (unlikely(desc->istate & IRQS_PENDING)) { |
620 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == | 465 | if (!irqd_irq_disabled(&desc->irq_data) && |
621 | (IRQ_PENDING | IRQ_MASKED))) { | 466 | irqd_irq_masked(&desc->irq_data)) |
622 | unmask_irq(desc, irq); | 467 | unmask_irq(desc); |
623 | } | 468 | } |
624 | 469 | ||
625 | desc->status &= ~IRQ_PENDING; | 470 | handle_irq_event(desc); |
626 | raw_spin_unlock(&desc->lock); | ||
627 | action_ret = handle_IRQ_event(irq, action); | ||
628 | if (!noirqdebug) | ||
629 | note_interrupt(irq, desc, action_ret); | ||
630 | raw_spin_lock(&desc->lock); | ||
631 | 471 | ||
632 | } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); | 472 | } while ((desc->istate & IRQS_PENDING) && |
473 | !irqd_irq_disabled(&desc->irq_data)); | ||
633 | 474 | ||
634 | desc->status &= ~IRQ_INPROGRESS; | ||
635 | out_unlock: | 475 | out_unlock: |
636 | raw_spin_unlock(&desc->lock); | 476 | raw_spin_unlock(&desc->lock); |
637 | } | 477 | } |
638 | 478 | ||
479 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER | ||
480 | /** | ||
481 | * handle_edge_eoi_irq - edge eoi type IRQ handler | ||
482 | * @irq: the interrupt number | ||
483 | * @desc: the interrupt description structure for this irq | ||
484 | * | ||
485 | * Similar as the above handle_edge_irq, but using eoi and w/o the | ||
486 | * mask/unmask logic. | ||
487 | */ | ||
488 | void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) | ||
489 | { | ||
490 | struct irq_chip *chip = irq_desc_get_chip(desc); | ||
491 | |||
492 | raw_spin_lock(&desc->lock); | ||
493 | |||
494 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
495 | /* | ||
496 | * If we're currently running this IRQ, or its disabled, | ||
497 | * we shouldn't process the IRQ. Mark it pending, handle | ||
498 | * the necessary masking and go out | ||
499 | */ | ||
500 | if (unlikely(irqd_irq_disabled(&desc->irq_data) || | ||
501 | irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { | ||
502 | if (!irq_check_poll(desc)) { | ||
503 | desc->istate |= IRQS_PENDING; | ||
504 | goto out_eoi; | ||
505 | } | ||
506 | } | ||
507 | kstat_incr_irqs_this_cpu(irq, desc); | ||
508 | |||
509 | do { | ||
510 | if (unlikely(!desc->action)) | ||
511 | goto out_eoi; | ||
512 | |||
513 | handle_irq_event(desc); | ||
514 | |||
515 | } while ((desc->istate & IRQS_PENDING) && | ||
516 | !irqd_irq_disabled(&desc->irq_data)); | ||
517 | |||
518 | out_eoi: | ||
519 | chip->irq_eoi(&desc->irq_data); | ||
520 | raw_spin_unlock(&desc->lock); | ||
521 | } | ||
522 | #endif | ||
523 | |||
639 | /** | 524 | /** |
640 | * handle_percpu_irq - Per CPU local irq handler | 525 | * handle_percpu_irq - Per CPU local irq handler |
641 | * @irq: the interrupt number | 526 | * @irq: the interrupt number |
@@ -646,115 +531,147 @@ out_unlock: | |||
646 | void | 531 | void |
647 | handle_percpu_irq(unsigned int irq, struct irq_desc *desc) | 532 | handle_percpu_irq(unsigned int irq, struct irq_desc *desc) |
648 | { | 533 | { |
649 | irqreturn_t action_ret; | 534 | struct irq_chip *chip = irq_desc_get_chip(desc); |
650 | 535 | ||
651 | kstat_incr_irqs_this_cpu(irq, desc); | 536 | kstat_incr_irqs_this_cpu(irq, desc); |
652 | 537 | ||
653 | if (desc->chip->ack) | 538 | if (chip->irq_ack) |
654 | desc->chip->ack(irq); | 539 | chip->irq_ack(&desc->irq_data); |
655 | 540 | ||
656 | action_ret = handle_IRQ_event(irq, desc->action); | 541 | handle_irq_event_percpu(desc, desc->action); |
657 | if (!noirqdebug) | ||
658 | note_interrupt(irq, desc, action_ret); | ||
659 | 542 | ||
660 | if (desc->chip->eoi) | 543 | if (chip->irq_eoi) |
661 | desc->chip->eoi(irq); | 544 | chip->irq_eoi(&desc->irq_data); |
662 | } | 545 | } |
663 | 546 | ||
664 | void | 547 | void |
665 | __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | 548 | __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, |
666 | const char *name) | 549 | const char *name) |
667 | { | 550 | { |
668 | struct irq_desc *desc = irq_to_desc(irq); | ||
669 | unsigned long flags; | 551 | unsigned long flags; |
552 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
670 | 553 | ||
671 | if (!desc) { | 554 | if (!desc) |
672 | printk(KERN_ERR | ||
673 | "Trying to install type control for IRQ%d\n", irq); | ||
674 | return; | 555 | return; |
675 | } | ||
676 | 556 | ||
677 | if (!handle) | 557 | if (!handle) { |
678 | handle = handle_bad_irq; | 558 | handle = handle_bad_irq; |
679 | else if (desc->chip == &no_irq_chip) { | 559 | } else { |
680 | printk(KERN_WARNING "Trying to install %sinterrupt handler " | 560 | if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) |
681 | "for IRQ%d\n", is_chained ? "chained " : "", irq); | 561 | goto out; |
682 | /* | ||
683 | * Some ARM implementations install a handler for really dumb | ||
684 | * interrupt hardware without setting an irq_chip. This worked | ||
685 | * with the ARM no_irq_chip but the check in setup_irq would | ||
686 | * prevent us to setup the interrupt at all. Switch it to | ||
687 | * dummy_irq_chip for easy transition. | ||
688 | */ | ||
689 | desc->chip = &dummy_irq_chip; | ||
690 | } | 562 | } |
691 | 563 | ||
692 | chip_bus_lock(irq, desc); | ||
693 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
694 | |||
695 | /* Uninstall? */ | 564 | /* Uninstall? */ |
696 | if (handle == handle_bad_irq) { | 565 | if (handle == handle_bad_irq) { |
697 | if (desc->chip != &no_irq_chip) | 566 | if (desc->irq_data.chip != &no_irq_chip) |
698 | mask_ack_irq(desc, irq); | 567 | mask_ack_irq(desc); |
699 | desc->status |= IRQ_DISABLED; | 568 | irq_state_set_disabled(desc); |
700 | desc->depth = 1; | 569 | desc->depth = 1; |
701 | } | 570 | } |
702 | desc->handle_irq = handle; | 571 | desc->handle_irq = handle; |
703 | desc->name = name; | 572 | desc->name = name; |
704 | 573 | ||
705 | if (handle != handle_bad_irq && is_chained) { | 574 | if (handle != handle_bad_irq && is_chained) { |
706 | desc->status &= ~IRQ_DISABLED; | 575 | irq_settings_set_noprobe(desc); |
707 | desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; | 576 | irq_settings_set_norequest(desc); |
708 | desc->depth = 0; | 577 | irq_settings_set_nothread(desc); |
709 | desc->chip->startup(irq); | 578 | irq_startup(desc); |
710 | } | 579 | } |
711 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 580 | out: |
712 | chip_bus_sync_unlock(irq, desc); | 581 | irq_put_desc_busunlock(desc, flags); |
713 | } | 582 | } |
714 | EXPORT_SYMBOL_GPL(__set_irq_handler); | 583 | EXPORT_SYMBOL_GPL(__irq_set_handler); |
715 | 584 | ||
716 | void | 585 | void |
717 | set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, | 586 | irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, |
718 | irq_flow_handler_t handle) | 587 | irq_flow_handler_t handle, const char *name) |
719 | { | 588 | { |
720 | set_irq_chip(irq, chip); | 589 | irq_set_chip(irq, chip); |
721 | __set_irq_handler(irq, handle, 0, NULL); | 590 | __irq_set_handler(irq, handle, 0, name); |
722 | } | 591 | } |
723 | 592 | ||
724 | void | 593 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) |
725 | set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, | ||
726 | irq_flow_handler_t handle, const char *name) | ||
727 | { | 594 | { |
728 | set_irq_chip(irq, chip); | 595 | unsigned long flags; |
729 | __set_irq_handler(irq, handle, 0, name); | 596 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); |
597 | |||
598 | if (!desc) | ||
599 | return; | ||
600 | irq_settings_clr_and_set(desc, clr, set); | ||
601 | |||
602 | irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | | ||
603 | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); | ||
604 | if (irq_settings_has_no_balance_set(desc)) | ||
605 | irqd_set(&desc->irq_data, IRQD_NO_BALANCING); | ||
606 | if (irq_settings_is_per_cpu(desc)) | ||
607 | irqd_set(&desc->irq_data, IRQD_PER_CPU); | ||
608 | if (irq_settings_can_move_pcntxt(desc)) | ||
609 | irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); | ||
610 | if (irq_settings_is_level(desc)) | ||
611 | irqd_set(&desc->irq_data, IRQD_LEVEL); | ||
612 | |||
613 | irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); | ||
614 | |||
615 | irq_put_desc_unlock(desc, flags); | ||
730 | } | 616 | } |
617 | EXPORT_SYMBOL_GPL(irq_modify_status); | ||
731 | 618 | ||
732 | void set_irq_noprobe(unsigned int irq) | 619 | /** |
620 | * irq_cpu_online - Invoke all irq_cpu_online functions. | ||
621 | * | ||
622 | * Iterate through all irqs and invoke the chip.irq_cpu_online() | ||
623 | * for each. | ||
624 | */ | ||
625 | void irq_cpu_online(void) | ||
733 | { | 626 | { |
734 | struct irq_desc *desc = irq_to_desc(irq); | 627 | struct irq_desc *desc; |
628 | struct irq_chip *chip; | ||
735 | unsigned long flags; | 629 | unsigned long flags; |
630 | unsigned int irq; | ||
736 | 631 | ||
737 | if (!desc) { | 632 | for_each_active_irq(irq) { |
738 | printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); | 633 | desc = irq_to_desc(irq); |
739 | return; | 634 | if (!desc) |
740 | } | 635 | continue; |
636 | |||
637 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
741 | 638 | ||
742 | raw_spin_lock_irqsave(&desc->lock, flags); | 639 | chip = irq_data_get_irq_chip(&desc->irq_data); |
743 | desc->status |= IRQ_NOPROBE; | 640 | if (chip && chip->irq_cpu_online && |
744 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 641 | (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || |
642 | !irqd_irq_disabled(&desc->irq_data))) | ||
643 | chip->irq_cpu_online(&desc->irq_data); | ||
644 | |||
645 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
646 | } | ||
745 | } | 647 | } |
746 | 648 | ||
747 | void set_irq_probe(unsigned int irq) | 649 | /** |
650 | * irq_cpu_offline - Invoke all irq_cpu_offline functions. | ||
651 | * | ||
652 | * Iterate through all irqs and invoke the chip.irq_cpu_offline() | ||
653 | * for each. | ||
654 | */ | ||
655 | void irq_cpu_offline(void) | ||
748 | { | 656 | { |
749 | struct irq_desc *desc = irq_to_desc(irq); | 657 | struct irq_desc *desc; |
658 | struct irq_chip *chip; | ||
750 | unsigned long flags; | 659 | unsigned long flags; |
660 | unsigned int irq; | ||
751 | 661 | ||
752 | if (!desc) { | 662 | for_each_active_irq(irq) { |
753 | printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); | 663 | desc = irq_to_desc(irq); |
754 | return; | 664 | if (!desc) |
755 | } | 665 | continue; |
666 | |||
667 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
756 | 668 | ||
757 | raw_spin_lock_irqsave(&desc->lock, flags); | 669 | chip = irq_data_get_irq_chip(&desc->irq_data); |
758 | desc->status &= ~IRQ_NOPROBE; | 670 | if (chip && chip->irq_cpu_offline && |
759 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 671 | (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || |
672 | !irqd_irq_disabled(&desc->irq_data))) | ||
673 | chip->irq_cpu_offline(&desc->irq_data); | ||
674 | |||
675 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
676 | } | ||
760 | } | 677 | } |
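The chip.c rewrite above replaces the set_irq_* accessors with irq_set_* variants that operate on struct irq_data. A minimal sketch of how an interrupt-controller driver wires up a line with the renamed calls; my_chip, my_mask/my_unmask/my_ack and my_priv are invented names, not part of the patch:

/* Sketch only: per-line setup with the irq_set_* accessors introduced above. */
#include <linux/irq.h>

static void my_mask(struct irq_data *d)   { /* mask the line in hardware */ }
static void my_unmask(struct irq_data *d) { /* unmask it again */ }
static void my_ack(struct irq_data *d)    { /* clear the latched event */ }

static struct irq_chip my_chip = {
	.name		= "my-intc",
	.irq_ack	= my_ack,
	.irq_mask	= my_mask,
	.irq_unmask	= my_unmask,
};

static void my_setup_line(unsigned int irq, void *my_priv)
{
	irq_set_chip_and_handler_name(irq, &my_chip, handle_level_irq, "level");
	irq_set_chip_data(irq, my_priv);	/* read back via irq_data in the callbacks */
	irq_modify_status(irq, 0, IRQ_NOPROBE);	/* keep autoprobing away from this line */
}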
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h new file mode 100644 index 000000000000..97a8bfadc88a --- /dev/null +++ b/kernel/irq/debug.h | |||
@@ -0,0 +1,45 @@ | |||
1 | /* | ||
2 | * Debugging printout: | ||
3 | */ | ||
4 | |||
5 | #include <linux/kallsyms.h> | ||
6 | |||
7 | #define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) | ||
8 | #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) | ||
9 | /* FIXME */ | ||
10 | #define PD(f) do { } while (0) | ||
11 | |||
12 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
13 | { | ||
14 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | ||
15 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | ||
16 | printk("->handle_irq(): %p, ", desc->handle_irq); | ||
17 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | ||
18 | printk("->irq_data.chip(): %p, ", desc->irq_data.chip); | ||
19 | print_symbol("%s\n", (unsigned long)desc->irq_data.chip); | ||
20 | printk("->action(): %p\n", desc->action); | ||
21 | if (desc->action) { | ||
22 | printk("->action->handler(): %p, ", desc->action->handler); | ||
23 | print_symbol("%s\n", (unsigned long)desc->action->handler); | ||
24 | } | ||
25 | |||
26 | P(IRQ_LEVEL); | ||
27 | P(IRQ_PER_CPU); | ||
28 | P(IRQ_NOPROBE); | ||
29 | P(IRQ_NOREQUEST); | ||
30 | P(IRQ_NOTHREAD); | ||
31 | P(IRQ_NOAUTOEN); | ||
32 | |||
33 | PS(IRQS_AUTODETECT); | ||
34 | PS(IRQS_REPLAY); | ||
35 | PS(IRQS_WAITING); | ||
36 | PS(IRQS_PENDING); | ||
37 | |||
38 | PD(IRQS_INPROGRESS); | ||
39 | PD(IRQS_DISABLED); | ||
40 | PD(IRQS_MASKED); | ||
41 | } | ||
42 | |||
43 | #undef P | ||
44 | #undef PS | ||
45 | #undef PD | ||
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c new file mode 100644 index 000000000000..b5fcd96c7102 --- /dev/null +++ b/kernel/irq/dummychip.c | |||
@@ -0,0 +1,59 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
3 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
4 | * | ||
5 | * This file contains the dummy interrupt chip implementation | ||
6 | */ | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/irq.h> | ||
9 | |||
10 | #include "internals.h" | ||
11 | |||
12 | /* | ||
13 | * What should we do if we get a hw irq event on an illegal vector? | ||
14 | * Each architecture has to answer this itself. | ||
15 | */ | ||
16 | static void ack_bad(struct irq_data *data) | ||
17 | { | ||
18 | struct irq_desc *desc = irq_data_to_desc(data); | ||
19 | |||
20 | print_irq_desc(data->irq, desc); | ||
21 | ack_bad_irq(data->irq); | ||
22 | } | ||
23 | |||
24 | /* | ||
25 | * NOP functions | ||
26 | */ | ||
27 | static void noop(struct irq_data *data) { } | ||
28 | |||
29 | static unsigned int noop_ret(struct irq_data *data) | ||
30 | { | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | /* | ||
35 | * Generic no controller implementation | ||
36 | */ | ||
37 | struct irq_chip no_irq_chip = { | ||
38 | .name = "none", | ||
39 | .irq_startup = noop_ret, | ||
40 | .irq_shutdown = noop, | ||
41 | .irq_enable = noop, | ||
42 | .irq_disable = noop, | ||
43 | .irq_ack = ack_bad, | ||
44 | }; | ||
45 | |||
46 | /* | ||
47 | * Generic dummy implementation which can be used for | ||
48 | * real dumb interrupt sources | ||
49 | */ | ||
50 | struct irq_chip dummy_irq_chip = { | ||
51 | .name = "dummy", | ||
52 | .irq_startup = noop_ret, | ||
53 | .irq_shutdown = noop, | ||
54 | .irq_enable = noop, | ||
55 | .irq_disable = noop, | ||
56 | .irq_ack = noop, | ||
57 | .irq_mask = noop, | ||
58 | .irq_unmask = noop, | ||
59 | }; | ||
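no_irq_chip and dummy_irq_chip give interrupt lines without a real controller a set of harmless callbacks: no_irq_chip still reports stray events through ack_bad(), while dummy_irq_chip silently absorbs everything. A sketch of a common use, assuming a purely virtual (demultiplexed) interrupt with no hardware to mask or ack; the function name and irq number are illustrative, and since dummy_irq_chip carries no export here this would be built-in code:

    #include <linux/irq.h>

    /* Attach the dummy chip to a soft/demuxed interrupt so that the flow
     * handler has chip callbacks to call without touching any hardware. */
    static void setup_virtual_irq(unsigned int virt_irq)
    {
            irq_set_chip_and_handler(virt_irq, &dummy_irq_chip, handle_simple_irq);
    }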
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c new file mode 100644 index 000000000000..3a2cab407b93 --- /dev/null +++ b/kernel/irq/generic-chip.c | |||
@@ -0,0 +1,368 @@ | |||
1 | /* | ||
2 | * Library implementing the most common irq chip callback functions | ||
3 | * | ||
4 | * Copyright (C) 2011, Thomas Gleixner | ||
5 | */ | ||
6 | #include <linux/io.h> | ||
7 | #include <linux/irq.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/interrupt.h> | ||
10 | #include <linux/kernel_stat.h> | ||
11 | #include <linux/syscore_ops.h> | ||
12 | |||
13 | #include "internals.h" | ||
14 | |||
15 | static LIST_HEAD(gc_list); | ||
16 | static DEFINE_RAW_SPINLOCK(gc_lock); | ||
17 | |||
18 | static inline struct irq_chip_regs *cur_regs(struct irq_data *d) | ||
19 | { | ||
20 | return &container_of(d->chip, struct irq_chip_type, chip)->regs; | ||
21 | } | ||
22 | |||
23 | /** | ||
24 | * irq_gc_noop - NOOP function | ||
25 | * @d: irq_data | ||
26 | */ | ||
27 | void irq_gc_noop(struct irq_data *d) | ||
28 | { | ||
29 | } | ||
30 | |||
31 | /** | ||
32 | * irq_gc_mask_disable_reg - Mask chip via disable register | ||
33 | * @d: irq_data | ||
34 | * | ||
35 | * Chip has separate enable/disable registers instead of a single mask | ||
36 | * register. | ||
37 | */ | ||
38 | void irq_gc_mask_disable_reg(struct irq_data *d) | ||
39 | { | ||
40 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
41 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
42 | |||
43 | irq_gc_lock(gc); | ||
44 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); | ||
45 | gc->mask_cache &= ~mask; | ||
46 | irq_gc_unlock(gc); | ||
47 | } | ||
48 | |||
49 | /** | ||
50 | * irq_gc_mask_set_bit - Mask chip via setting bit in mask register | ||
51 | * @d: irq_data | ||
52 | * | ||
53 | * Chip has a single mask register. Values of this register are cached | ||
54 | * and protected by gc->lock | ||
55 | */ | ||
56 | void irq_gc_mask_set_bit(struct irq_data *d) | ||
57 | { | ||
58 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
59 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
60 | |||
61 | irq_gc_lock(gc); | ||
62 | gc->mask_cache |= mask; | ||
63 | irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); | ||
64 | irq_gc_unlock(gc); | ||
65 | } | ||
66 | |||
67 | /** | ||
68 | * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register | ||
69 | * @d: irq_data | ||
70 | * | ||
71 | * Chip has a single mask register. Values of this register are cached | ||
72 | * and protected by gc->lock | ||
73 | */ | ||
74 | void irq_gc_mask_clr_bit(struct irq_data *d) | ||
75 | { | ||
76 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
77 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
78 | |||
79 | irq_gc_lock(gc); | ||
80 | gc->mask_cache &= ~mask; | ||
81 | irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); | ||
82 | irq_gc_unlock(gc); | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * irq_gc_unmask_enable_reg - Unmask chip via enable register | ||
87 | * @d: irq_data | ||
88 | * | ||
89 | * Chip has separate enable/disable registers instead of a single mask | ||
90 | * register. | ||
91 | */ | ||
92 | void irq_gc_unmask_enable_reg(struct irq_data *d) | ||
93 | { | ||
94 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
95 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
96 | |||
97 | irq_gc_lock(gc); | ||
98 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); | ||
99 | gc->mask_cache |= mask; | ||
100 | irq_gc_unlock(gc); | ||
101 | } | ||
102 | |||
103 | /** | ||
104 | * irq_gc_ack_set_bit - Ack pending interrupt via setting bit | ||
105 | * @d: irq_data | ||
106 | */ | ||
107 | void irq_gc_ack_set_bit(struct irq_data *d) | ||
108 | { | ||
109 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
110 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
111 | |||
112 | irq_gc_lock(gc); | ||
113 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
114 | irq_gc_unlock(gc); | ||
115 | } | ||
116 | |||
117 | /** | ||
118 | * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit | ||
119 | * @d: irq_data | ||
120 | */ | ||
121 | void irq_gc_ack_clr_bit(struct irq_data *d) | ||
122 | { | ||
123 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
124 | u32 mask = ~(1 << (d->irq - gc->irq_base)); | ||
125 | |||
126 | irq_gc_lock(gc); | ||
127 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
128 | irq_gc_unlock(gc); | ||
129 | } | ||
130 | |||
131 | /** | ||
132 | * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt | ||
133 | * @d: irq_data | ||
134 | */ | ||
135 | void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) | ||
136 | { | ||
137 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
138 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
139 | |||
140 | irq_gc_lock(gc); | ||
141 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); | ||
142 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
143 | irq_gc_unlock(gc); | ||
144 | } | ||
145 | |||
146 | /** | ||
147 | * irq_gc_eoi - EOI interrupt | ||
148 | * @d: irq_data | ||
149 | */ | ||
150 | void irq_gc_eoi(struct irq_data *d) | ||
151 | { | ||
152 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
153 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
154 | |||
155 | irq_gc_lock(gc); | ||
156 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); | ||
157 | irq_gc_unlock(gc); | ||
158 | } | ||
159 | |||
160 | /** | ||
161 | * irq_gc_set_wake - Set/clr wake bit for an interrupt | ||
162 | * @d: irq_data | ||
163 | * | ||
164 | * For chips where the wake from suspend functionality is not | ||
165 | * configured in a separate register and the wakeup active state is | ||
166 | * just stored in a bitmask. | ||
167 | */ | ||
168 | int irq_gc_set_wake(struct irq_data *d, unsigned int on) | ||
169 | { | ||
170 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
171 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
172 | |||
173 | if (!(mask & gc->wake_enabled)) | ||
174 | return -EINVAL; | ||
175 | |||
176 | irq_gc_lock(gc); | ||
177 | if (on) | ||
178 | gc->wake_active |= mask; | ||
179 | else | ||
180 | gc->wake_active &= ~mask; | ||
181 | irq_gc_unlock(gc); | ||
182 | return 0; | ||
183 | } | ||
184 | |||
185 | /** | ||
186 | * irq_alloc_generic_chip - Allocate a generic chip and initialize it | ||
187 | * @name: Name of the irq chip | ||
188 | * @num_ct: Number of irq_chip_type instances associated with this chip | ||
189 | * @irq_base: Interrupt base nr for this chip | ||
190 | * @reg_base: Register base address (virtual) | ||
191 | * @handler: Default flow handler associated with this chip | ||
192 | * | ||
193 | * Returns an initialized irq_chip_generic structure. The chip defaults | ||
194 | * to the primary (index 0) irq_chip_type and @handler | ||
195 | */ | ||
196 | struct irq_chip_generic * | ||
197 | irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, | ||
198 | void __iomem *reg_base, irq_flow_handler_t handler) | ||
199 | { | ||
200 | struct irq_chip_generic *gc; | ||
201 | unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); | ||
202 | |||
203 | gc = kzalloc(sz, GFP_KERNEL); | ||
204 | if (gc) { | ||
205 | raw_spin_lock_init(&gc->lock); | ||
206 | gc->num_ct = num_ct; | ||
207 | gc->irq_base = irq_base; | ||
208 | gc->reg_base = reg_base; | ||
209 | gc->chip_types->chip.name = name; | ||
210 | gc->chip_types->handler = handler; | ||
211 | } | ||
212 | return gc; | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * Separate lockdep class for interrupt chip which can nest irq_desc | ||
217 | * lock. | ||
218 | */ | ||
219 | static struct lock_class_key irq_nested_lock_class; | ||
220 | |||
221 | /** | ||
222 | * irq_setup_generic_chip - Setup a range of interrupts with a generic chip | ||
223 | * @gc: Generic irq chip holding all data | ||
224 | * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base | ||
225 | * @flags: Flags for initialization | ||
226 | * @clr: IRQ_* bits to clear | ||
227 | * @set: IRQ_* bits to set | ||
228 | * | ||
229 | * Set up max. 32 interrupts starting from gc->irq_base. Note, this | ||
230 | * initializes all interrupts to the primary irq_chip_type and its | ||
231 | * associated handler. | ||
232 | */ | ||
233 | void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, | ||
234 | enum irq_gc_flags flags, unsigned int clr, | ||
235 | unsigned int set) | ||
236 | { | ||
237 | struct irq_chip_type *ct = gc->chip_types; | ||
238 | unsigned int i; | ||
239 | |||
240 | raw_spin_lock(&gc_lock); | ||
241 | list_add_tail(&gc->list, &gc_list); | ||
242 | raw_spin_unlock(&gc_lock); | ||
243 | |||
244 | /* Init mask cache ? */ | ||
245 | if (flags & IRQ_GC_INIT_MASK_CACHE) | ||
246 | gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); | ||
247 | |||
248 | for (i = gc->irq_base; msk; msk >>= 1, i++) { | ||
249 | if (!(msk & 0x01)) | ||
250 | continue; | ||
251 | |||
252 | if (flags & IRQ_GC_INIT_NESTED_LOCK) | ||
253 | irq_set_lockdep_class(i, &irq_nested_lock_class); | ||
254 | |||
255 | irq_set_chip_and_handler(i, &ct->chip, ct->handler); | ||
256 | irq_set_chip_data(i, gc); | ||
257 | irq_modify_status(i, clr, set); | ||
258 | } | ||
259 | gc->irq_cnt = i - gc->irq_base; | ||
260 | } | ||
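Taken together, irq_alloc_generic_chip() and irq_setup_generic_chip() replace a hand-written irq_chip with table-driven register accesses: the driver fills in the irq_chip_type's callbacks and register offsets, then binds up to 32 consecutive interrupts to the primary type. A sketch of that sequence for a hypothetical controller with separate enable/disable registers; the MY_INTC_* offsets and the init function are assumptions, not part of this patch:

    #include <linux/init.h>
    #include <linux/irq.h>

    #define MY_INTC_ENABLE  0x08    /* hypothetical register offsets */
    #define MY_INTC_DISABLE 0x0c

    static void __init my_intc_init(void __iomem *base, unsigned int irq_base)
    {
            struct irq_chip_generic *gc;
            struct irq_chip_type *ct;

            gc = irq_alloc_generic_chip("MY_INTC", 1, irq_base, base,
                                        handle_level_irq);
            if (!gc)
                    return;

            ct = gc->chip_types;
            ct->chip.irq_mask   = irq_gc_mask_disable_reg;
            ct->chip.irq_unmask = irq_gc_unmask_enable_reg;
            ct->regs.enable     = MY_INTC_ENABLE;
            ct->regs.disable    = MY_INTC_DISABLE;

            /* bind 32 interrupts starting at irq_base and make them requestable */
            irq_setup_generic_chip(gc, 0xffffffff, 0, IRQ_NOREQUEST, 0);
    }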
261 | |||
262 | /** | ||
263 | * irq_setup_alt_chip - Switch to alternative chip | ||
264 | * @d: irq_data for this interrupt | ||
265 | * @type: Flow type to be initialized | ||
266 | * | ||
267 | * Only to be called from chip->irq_set_type() callbacks. | ||
268 | */ | ||
269 | int irq_setup_alt_chip(struct irq_data *d, unsigned int type) | ||
270 | { | ||
271 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
272 | struct irq_chip_type *ct = gc->chip_types; | ||
273 | unsigned int i; | ||
274 | |||
275 | for (i = 0; i < gc->num_ct; i++, ct++) { | ||
276 | if (ct->type & type) { | ||
277 | d->chip = &ct->chip; | ||
278 | irq_data_to_desc(d)->handle_irq = ct->handler; | ||
279 | return 0; | ||
280 | } | ||
281 | } | ||
282 | return -EINVAL; | ||
283 | } | ||
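irq_setup_alt_chip() is only meant to be called from a chip's irq_set_type() callback: once the hardware trigger mode has been programmed, it switches irq_data->chip and the flow handler to whichever irq_chip_type lists the requested type in ct->type. A sketch of such a callback, assuming a generic chip that carries, say, one level-triggered and one edge-triggered irq_chip_type (the function name is illustrative):

    static int my_intc_set_type(struct irq_data *d, unsigned int type)
    {
            /* ... program the trigger mode into the hardware here ... */

            /* switch chip and handler to the irq_chip_type matching 'type' */
            return irq_setup_alt_chip(d, type);
    }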
284 | |||
285 | /** | ||
286 | * irq_remove_generic_chip - Remove a chip | ||
287 | * @gc: Generic irq chip holding all data | ||
288 | * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base | ||
289 | * @clr: IRQ_* bits to clear | ||
290 | * @set: IRQ_* bits to set | ||
291 | * | ||
292 | * Remove up to 32 interrupts starting from gc->irq_base. | ||
293 | */ | ||
294 | void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, | ||
295 | unsigned int clr, unsigned int set) | ||
296 | { | ||
297 | unsigned int i = gc->irq_base; | ||
298 | |||
299 | raw_spin_lock(&gc_lock); | ||
300 | list_del(&gc->list); | ||
301 | raw_spin_unlock(&gc_lock); | ||
302 | |||
303 | for (; msk; msk >>= 1, i++) { | ||
304 | if (!(msk & 0x01)) | ||
305 | continue; | ||
306 | |||
307 | /* Remove handler first. That will mask the irq line */ | ||
308 | irq_set_handler(i, NULL); | ||
309 | irq_set_chip(i, &no_irq_chip); | ||
310 | irq_set_chip_data(i, NULL); | ||
311 | irq_modify_status(i, clr, set); | ||
312 | } | ||
313 | } | ||
314 | |||
315 | #ifdef CONFIG_PM | ||
316 | static int irq_gc_suspend(void) | ||
317 | { | ||
318 | struct irq_chip_generic *gc; | ||
319 | |||
320 | list_for_each_entry(gc, &gc_list, list) { | ||
321 | struct irq_chip_type *ct = gc->chip_types; | ||
322 | |||
323 | if (ct->chip.irq_suspend) | ||
324 | ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); | ||
325 | } | ||
326 | return 0; | ||
327 | } | ||
328 | |||
329 | static void irq_gc_resume(void) | ||
330 | { | ||
331 | struct irq_chip_generic *gc; | ||
332 | |||
333 | list_for_each_entry(gc, &gc_list, list) { | ||
334 | struct irq_chip_type *ct = gc->chip_types; | ||
335 | |||
336 | if (ct->chip.irq_resume) | ||
337 | ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); | ||
338 | } | ||
339 | } | ||
340 | #else | ||
341 | #define irq_gc_suspend NULL | ||
342 | #define irq_gc_resume NULL | ||
343 | #endif | ||
344 | |||
345 | static void irq_gc_shutdown(void) | ||
346 | { | ||
347 | struct irq_chip_generic *gc; | ||
348 | |||
349 | list_for_each_entry(gc, &gc_list, list) { | ||
350 | struct irq_chip_type *ct = gc->chip_types; | ||
351 | |||
352 | if (ct->chip.irq_pm_shutdown) | ||
353 | ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); | ||
354 | } | ||
355 | } | ||
356 | |||
357 | static struct syscore_ops irq_gc_syscore_ops = { | ||
358 | .suspend = irq_gc_suspend, | ||
359 | .resume = irq_gc_resume, | ||
360 | .shutdown = irq_gc_shutdown, | ||
361 | }; | ||
362 | |||
363 | static int __init irq_gc_init_ops(void) | ||
364 | { | ||
365 | register_syscore_ops(&irq_gc_syscore_ops); | ||
366 | return 0; | ||
367 | } | ||
368 | device_initcall(irq_gc_init_ops); | ||
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 27e5c6911223..470d08c82bbe 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -11,24 +11,15 @@ | |||
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/irq.h> | 13 | #include <linux/irq.h> |
14 | #include <linux/sched.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/random.h> | 14 | #include <linux/random.h> |
15 | #include <linux/sched.h> | ||
18 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
19 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
20 | #include <linux/rculist.h> | 18 | |
21 | #include <linux/hash.h> | ||
22 | #include <linux/radix-tree.h> | ||
23 | #include <trace/events/irq.h> | 19 | #include <trace/events/irq.h> |
24 | 20 | ||
25 | #include "internals.h" | 21 | #include "internals.h" |
26 | 22 | ||
27 | /* | ||
28 | * lockdep: we want to handle all irq_desc locks as a single lock-class: | ||
29 | */ | ||
30 | struct lock_class_key irq_desc_lock_class; | ||
31 | |||
32 | /** | 23 | /** |
33 | * handle_bad_irq - handle spurious and unhandled irqs | 24 | * handle_bad_irq - handle spurious and unhandled irqs |
34 | * @irq: the interrupt number | 25 | * @irq: the interrupt number |
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) | |||
43 | ack_bad_irq(irq); | 34 | ack_bad_irq(irq); |
44 | } | 35 | } |
45 | 36 | ||
46 | #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) | ||
47 | static void __init init_irq_default_affinity(void) | ||
48 | { | ||
49 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); | ||
50 | cpumask_setall(irq_default_affinity); | ||
51 | } | ||
52 | #else | ||
53 | static void __init init_irq_default_affinity(void) | ||
54 | { | ||
55 | } | ||
56 | #endif | ||
57 | |||
58 | /* | ||
59 | * Linux has a controller-independent interrupt architecture. | ||
60 | * Every controller has a 'controller-template', that is used | ||
61 | * by the main code to do the right thing. Each driver-visible | ||
62 | * interrupt source is transparently wired to the appropriate | ||
63 | * controller. Thus drivers need not be aware of the | ||
64 | * interrupt-controller. | ||
65 | * | ||
66 | * The code is designed to be easily extended with new/different | ||
67 | * interrupt controllers, without having to do assembly magic or | ||
68 | * having to touch the generic code. | ||
69 | * | ||
70 | * Controller mappings for all interrupt sources: | ||
71 | */ | ||
72 | int nr_irqs = NR_IRQS; | ||
73 | EXPORT_SYMBOL_GPL(nr_irqs); | ||
74 | |||
75 | #ifdef CONFIG_SPARSE_IRQ | ||
76 | |||
77 | static struct irq_desc irq_desc_init = { | ||
78 | .irq = -1, | ||
79 | .status = IRQ_DISABLED, | ||
80 | .chip = &no_irq_chip, | ||
81 | .handle_irq = handle_bad_irq, | ||
82 | .depth = 1, | ||
83 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), | ||
84 | }; | ||
85 | |||
86 | void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) | ||
87 | { | ||
88 | void *ptr; | ||
89 | |||
90 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), | ||
91 | GFP_ATOMIC, node); | ||
92 | |||
93 | /* | ||
94 | * don't overwrite if we cannot get a new one | ||
95 | * init_copy_kstat_irqs() could still use old one | ||
96 | */ | ||
97 | if (ptr) { | ||
98 | printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); | ||
99 | desc->kstat_irqs = ptr; | ||
100 | } | ||
101 | } | ||
102 | |||
103 | static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) | ||
104 | { | ||
105 | memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); | ||
106 | |||
107 | raw_spin_lock_init(&desc->lock); | ||
108 | desc->irq = irq; | ||
109 | #ifdef CONFIG_SMP | ||
110 | desc->node = node; | ||
111 | #endif | ||
112 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
113 | init_kstat_irqs(desc, node, nr_cpu_ids); | ||
114 | if (!desc->kstat_irqs) { | ||
115 | printk(KERN_ERR "can not alloc kstat_irqs\n"); | ||
116 | BUG_ON(1); | ||
117 | } | ||
118 | if (!alloc_desc_masks(desc, node, false)) { | ||
119 | printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); | ||
120 | BUG_ON(1); | ||
121 | } | ||
122 | init_desc_masks(desc); | ||
123 | arch_init_chip_data(desc, node); | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Protect the sparse_irqs: | ||
128 | */ | ||
129 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); | ||
130 | |||
131 | static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); | ||
132 | |||
133 | static void set_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
134 | { | ||
135 | radix_tree_insert(&irq_desc_tree, irq, desc); | ||
136 | } | ||
137 | |||
138 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
139 | { | ||
140 | return radix_tree_lookup(&irq_desc_tree, irq); | ||
141 | } | ||
142 | |||
143 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
144 | { | ||
145 | void **ptr; | ||
146 | |||
147 | ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); | ||
148 | if (ptr) | ||
149 | radix_tree_replace_slot(ptr, desc); | ||
150 | } | ||
151 | |||
152 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { | ||
153 | [0 ... NR_IRQS_LEGACY-1] = { | ||
154 | .irq = -1, | ||
155 | .status = IRQ_DISABLED, | ||
156 | .chip = &no_irq_chip, | ||
157 | .handle_irq = handle_bad_irq, | ||
158 | .depth = 1, | ||
159 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), | ||
160 | } | ||
161 | }; | ||
162 | |||
163 | static unsigned int *kstat_irqs_legacy; | ||
164 | |||
165 | int __init early_irq_init(void) | ||
166 | { | ||
167 | struct irq_desc *desc; | ||
168 | int legacy_count; | ||
169 | int node; | ||
170 | int i; | ||
171 | |||
172 | init_irq_default_affinity(); | ||
173 | |||
174 | /* initialize nr_irqs based on nr_cpu_ids */ | ||
175 | arch_probe_nr_irqs(); | ||
176 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); | ||
177 | |||
178 | desc = irq_desc_legacy; | ||
179 | legacy_count = ARRAY_SIZE(irq_desc_legacy); | ||
180 | node = first_online_node; | ||
181 | |||
182 | /* allocate based on nr_cpu_ids */ | ||
183 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * | ||
184 | sizeof(int), GFP_NOWAIT, node); | ||
185 | |||
186 | for (i = 0; i < legacy_count; i++) { | ||
187 | desc[i].irq = i; | ||
188 | #ifdef CONFIG_SMP | ||
189 | desc[i].node = node; | ||
190 | #endif | ||
191 | desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; | ||
192 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | ||
193 | alloc_desc_masks(&desc[i], node, true); | ||
194 | init_desc_masks(&desc[i]); | ||
195 | set_irq_desc(i, &desc[i]); | ||
196 | } | ||
197 | |||
198 | return arch_early_irq_init(); | ||
199 | } | ||
200 | |||
201 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | ||
202 | { | ||
203 | struct irq_desc *desc; | ||
204 | unsigned long flags; | ||
205 | |||
206 | if (irq >= nr_irqs) { | ||
207 | WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", | ||
208 | irq, nr_irqs); | ||
209 | return NULL; | ||
210 | } | ||
211 | |||
212 | desc = irq_to_desc(irq); | ||
213 | if (desc) | ||
214 | return desc; | ||
215 | |||
216 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | ||
217 | |||
218 | /* We have to check it to avoid races with another CPU */ | ||
219 | desc = irq_to_desc(irq); | ||
220 | if (desc) | ||
221 | goto out_unlock; | ||
222 | |||
223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | ||
224 | |||
225 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); | ||
226 | if (!desc) { | ||
227 | printk(KERN_ERR "can not alloc irq_desc\n"); | ||
228 | BUG_ON(1); | ||
229 | } | ||
230 | init_one_irq_desc(irq, desc, node); | ||
231 | |||
232 | set_irq_desc(irq, desc); | ||
233 | |||
234 | out_unlock: | ||
235 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | ||
236 | |||
237 | return desc; | ||
238 | } | ||
239 | |||
240 | #else /* !CONFIG_SPARSE_IRQ */ | ||
241 | |||
242 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { | ||
243 | [0 ... NR_IRQS-1] = { | ||
244 | .status = IRQ_DISABLED, | ||
245 | .chip = &no_irq_chip, | ||
246 | .handle_irq = handle_bad_irq, | ||
247 | .depth = 1, | ||
248 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), | ||
249 | } | ||
250 | }; | ||
251 | |||
252 | static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; | ||
253 | int __init early_irq_init(void) | ||
254 | { | ||
255 | struct irq_desc *desc; | ||
256 | int count; | ||
257 | int i; | ||
258 | |||
259 | init_irq_default_affinity(); | ||
260 | |||
261 | printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); | ||
262 | |||
263 | desc = irq_desc; | ||
264 | count = ARRAY_SIZE(irq_desc); | ||
265 | |||
266 | for (i = 0; i < count; i++) { | ||
267 | desc[i].irq = i; | ||
268 | alloc_desc_masks(&desc[i], 0, true); | ||
269 | init_desc_masks(&desc[i]); | ||
270 | desc[i].kstat_irqs = kstat_irqs_all[i]; | ||
271 | } | ||
272 | return arch_early_irq_init(); | ||
273 | } | ||
274 | |||
275 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
276 | { | ||
277 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; | ||
278 | } | ||
279 | |||
280 | struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) | ||
281 | { | ||
282 | return irq_to_desc(irq); | ||
283 | } | ||
284 | #endif /* !CONFIG_SPARSE_IRQ */ | ||
285 | |||
286 | void clear_kstat_irqs(struct irq_desc *desc) | ||
287 | { | ||
288 | memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); | ||
289 | } | ||
290 | |||
291 | /* | ||
292 | * What should we do if we get a hw irq event on an illegal vector? | ||
293 | * Each architecture has to answer this itself. | ||
294 | */ | ||
295 | static void ack_bad(unsigned int irq) | ||
296 | { | ||
297 | struct irq_desc *desc = irq_to_desc(irq); | ||
298 | |||
299 | print_irq_desc(irq, desc); | ||
300 | ack_bad_irq(irq); | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * NOP functions | ||
305 | */ | ||
306 | static void noop(unsigned int irq) | ||
307 | { | ||
308 | } | ||
309 | |||
310 | static unsigned int noop_ret(unsigned int irq) | ||
311 | { | ||
312 | return 0; | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * Generic no controller implementation | ||
317 | */ | ||
318 | struct irq_chip no_irq_chip = { | ||
319 | .name = "none", | ||
320 | .startup = noop_ret, | ||
321 | .shutdown = noop, | ||
322 | .enable = noop, | ||
323 | .disable = noop, | ||
324 | .ack = ack_bad, | ||
325 | .end = noop, | ||
326 | }; | ||
327 | |||
328 | /* | ||
329 | * Generic dummy implementation which can be used for | ||
330 | * real dumb interrupt sources | ||
331 | */ | ||
332 | struct irq_chip dummy_irq_chip = { | ||
333 | .name = "dummy", | ||
334 | .startup = noop_ret, | ||
335 | .shutdown = noop, | ||
336 | .enable = noop, | ||
337 | .disable = noop, | ||
338 | .ack = noop, | ||
339 | .mask = noop, | ||
340 | .unmask = noop, | ||
341 | .end = noop, | ||
342 | }; | ||
343 | |||
344 | /* | 37 | /* |
345 | * Special, empty irq handler: | 38 | * Special, empty irq handler: |
346 | */ | 39 | */ |
@@ -358,31 +51,87 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) | |||
358 | "but no thread function available.", irq, action->name); | 51 | "but no thread function available.", irq, action->name); |
359 | } | 52 | } |
360 | 53 | ||
361 | /** | 54 | static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) |
362 | * handle_IRQ_event - irq action chain handler | 55 | { |
363 | * @irq: the interrupt number | 56 | /* |
364 | * @action: the interrupt action chain for this irq | 57 | * Wake up the handler thread for this action. In case the |
365 | * | 58 | * thread crashed and was killed we just pretend that we |
366 | * Handles the action chain of an irq event | 59 | * handled the interrupt. The hardirq handler has disabled the |
367 | */ | 60 | * device interrupt, so no irq storm is lurking. If the |
368 | irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) | 61 | * RUNTHREAD bit is already set, nothing to do. |
62 | */ | ||
63 | if (test_bit(IRQTF_DIED, &action->thread_flags) || | ||
64 | test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
65 | return; | ||
66 | |||
67 | /* | ||
68 | * It's safe to OR the mask lockless here. We have only two | ||
69 | * places which write to threads_oneshot: This code and the | ||
70 | * irq thread. | ||
71 | * | ||
72 | * This code is the hard irq context and can never run on two | ||
73 | * cpus in parallel. If it ever does we have more serious | ||
74 | * problems than this bitmask. | ||
75 | * | ||
76 | * The irq threads of this irq which clear their "running" bit | ||
77 | * in threads_oneshot are serialized via desc->lock against | ||
78 | * each other and they are serialized against this code by | ||
79 | * IRQS_INPROGRESS. | ||
80 | * | ||
81 | * Hard irq handler: | ||
82 | * | ||
83 | * spin_lock(desc->lock); | ||
84 | * desc->state |= IRQS_INPROGRESS; | ||
85 | * spin_unlock(desc->lock); | ||
86 | * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); | ||
87 | * desc->threads_oneshot |= mask; | ||
88 | * spin_lock(desc->lock); | ||
89 | * desc->state &= ~IRQS_INPROGRESS; | ||
90 | * spin_unlock(desc->lock); | ||
91 | * | ||
92 | * irq thread: | ||
93 | * | ||
94 | * again: | ||
95 | * spin_lock(desc->lock); | ||
96 | * if (desc->state & IRQS_INPROGRESS) { | ||
97 | * spin_unlock(desc->lock); | ||
98 | * while(desc->state & IRQS_INPROGRESS) | ||
99 | * cpu_relax(); | ||
100 | * goto again; | ||
101 | * } | ||
102 | * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
103 | * desc->threads_oneshot &= ~mask; | ||
104 | * spin_unlock(desc->lock); | ||
105 | * | ||
106 | * So either the thread waits for us to clear IRQS_INPROGRESS | ||
107 | * or we are waiting in the flow handler for desc->lock to be | ||
108 | * released before we reach this point. The thread also checks | ||
109 | * IRQTF_RUNTHREAD under desc->lock. If set it leaves | ||
110 | * threads_oneshot untouched and runs the thread another time. | ||
111 | */ | ||
112 | desc->threads_oneshot |= action->thread_mask; | ||
113 | wake_up_process(action->thread); | ||
114 | } | ||
115 | |||
116 | irqreturn_t | ||
117 | handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | ||
369 | { | 118 | { |
370 | irqreturn_t ret, retval = IRQ_NONE; | 119 | irqreturn_t retval = IRQ_NONE; |
371 | unsigned int status = 0; | 120 | unsigned int random = 0, irq = desc->irq_data.irq; |
372 | 121 | ||
373 | do { | 122 | do { |
123 | irqreturn_t res; | ||
124 | |||
374 | trace_irq_handler_entry(irq, action); | 125 | trace_irq_handler_entry(irq, action); |
375 | ret = action->handler(irq, action->dev_id); | 126 | res = action->handler(irq, action->dev_id); |
376 | trace_irq_handler_exit(irq, action, ret); | 127 | trace_irq_handler_exit(irq, action, res); |
377 | 128 | ||
378 | switch (ret) { | 129 | if (WARN_ONCE(!irqs_disabled(), "irq %u handler %pF enabled interrupts\n", |
379 | case IRQ_WAKE_THREAD: | 130 | irq, action->handler)) |
380 | /* | 131 | local_irq_disable(); |
381 | * Set result to handled so the spurious check | ||
382 | * does not trigger. | ||
383 | */ | ||
384 | ret = IRQ_HANDLED; | ||
385 | 132 | ||
133 | switch (res) { | ||
134 | case IRQ_WAKE_THREAD: | ||
386 | /* | 135 | /* |
387 | * Catch drivers which return WAKE_THREAD but | 136 | * Catch drivers which return WAKE_THREAD but |
388 | * did not set up a thread function | 137 | * did not set up a thread function |
@@ -392,165 +141,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) | |||
392 | break; | 141 | break; |
393 | } | 142 | } |
394 | 143 | ||
395 | /* | 144 | irq_wake_thread(desc, action); |
396 | * Wake up the handler thread for this | ||
397 | * action. In case the thread crashed and was | ||
398 | * killed we just pretend that we handled the | ||
399 | * interrupt. The hardirq handler above has | ||
400 | * disabled the device interrupt, so no irq | ||
401 | * storm is lurking. | ||
402 | */ | ||
403 | if (likely(!test_bit(IRQTF_DIED, | ||
404 | &action->thread_flags))) { | ||
405 | set_bit(IRQTF_RUNTHREAD, &action->thread_flags); | ||
406 | wake_up_process(action->thread); | ||
407 | } | ||
408 | 145 | ||
409 | /* Fall through to add to randomness */ | 146 | /* Fall through to add to randomness */ |
410 | case IRQ_HANDLED: | 147 | case IRQ_HANDLED: |
411 | status |= action->flags; | 148 | random |= action->flags; |
412 | break; | 149 | break; |
413 | 150 | ||
414 | default: | 151 | default: |
415 | break; | 152 | break; |
416 | } | 153 | } |
417 | 154 | ||
418 | retval |= ret; | 155 | retval |= res; |
419 | action = action->next; | 156 | action = action->next; |
420 | } while (action); | 157 | } while (action); |
421 | 158 | ||
422 | if (status & IRQF_SAMPLE_RANDOM) | 159 | if (random & IRQF_SAMPLE_RANDOM) |
423 | add_interrupt_randomness(irq); | 160 | add_interrupt_randomness(irq); |
424 | local_irq_disable(); | ||
425 | 161 | ||
162 | if (!noirqdebug) | ||
163 | note_interrupt(irq, desc, retval); | ||
426 | return retval; | 164 | return retval; |
427 | } | 165 | } |
428 | 166 | ||
429 | #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ | 167 | irqreturn_t handle_irq_event(struct irq_desc *desc) |
430 | |||
431 | #ifdef CONFIG_ENABLE_WARN_DEPRECATED | ||
432 | # warning __do_IRQ is deprecated. Please convert to proper flow handlers | ||
433 | #endif | ||
434 | |||
435 | /** | ||
436 | * __do_IRQ - original all in one highlevel IRQ handler | ||
437 | * @irq: the interrupt number | ||
438 | * | ||
439 | * __do_IRQ handles all normal device IRQ's (the special | ||
440 | * SMP cross-CPU interrupts have their own specific | ||
441 | * handlers). | ||
442 | * | ||
443 | * This is the original x86 implementation which is used for every | ||
444 | * interrupt type. | ||
445 | */ | ||
446 | unsigned int __do_IRQ(unsigned int irq) | ||
447 | { | 168 | { |
448 | struct irq_desc *desc = irq_to_desc(irq); | 169 | struct irqaction *action = desc->action; |
449 | struct irqaction *action; | 170 | irqreturn_t ret; |
450 | unsigned int status; | ||
451 | |||
452 | kstat_incr_irqs_this_cpu(irq, desc); | ||
453 | |||
454 | if (CHECK_IRQ_PER_CPU(desc->status)) { | ||
455 | irqreturn_t action_ret; | ||
456 | |||
457 | /* | ||
458 | * No locking required for CPU-local interrupts: | ||
459 | */ | ||
460 | if (desc->chip->ack) | ||
461 | desc->chip->ack(irq); | ||
462 | if (likely(!(desc->status & IRQ_DISABLED))) { | ||
463 | action_ret = handle_IRQ_event(irq, desc->action); | ||
464 | if (!noirqdebug) | ||
465 | note_interrupt(irq, desc, action_ret); | ||
466 | } | ||
467 | desc->chip->end(irq); | ||
468 | return 1; | ||
469 | } | ||
470 | |||
471 | raw_spin_lock(&desc->lock); | ||
472 | if (desc->chip->ack) | ||
473 | desc->chip->ack(irq); | ||
474 | /* | ||
475 | * REPLAY is when Linux resends an IRQ that was dropped earlier | ||
476 | * WAITING is used by probe to mark irqs that are being tested | ||
477 | */ | ||
478 | status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); | ||
479 | status |= IRQ_PENDING; /* we _want_ to handle it */ | ||
480 | |||
481 | /* | ||
482 | * If the IRQ is disabled for whatever reason, we cannot | ||
483 | * use the action we have. | ||
484 | */ | ||
485 | action = NULL; | ||
486 | if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { | ||
487 | action = desc->action; | ||
488 | status &= ~IRQ_PENDING; /* we commit to handling */ | ||
489 | status |= IRQ_INPROGRESS; /* we are handling it */ | ||
490 | } | ||
491 | desc->status = status; | ||
492 | 171 | ||
493 | /* | 172 | desc->istate &= ~IRQS_PENDING; |
494 | * If there is no IRQ handler or it was disabled, exit early. | 173 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
495 | * Since we set PENDING, if another processor is handling | ||
496 | * a different instance of this same irq, the other processor | ||
497 | * will take care of it. | ||
498 | */ | ||
499 | if (unlikely(!action)) | ||
500 | goto out; | ||
501 | |||
502 | /* | ||
503 | * Edge triggered interrupts need to remember | ||
504 | * pending events. | ||
505 | * This applies to any hw interrupts that allow a second | ||
506 | * instance of the same irq to arrive while we are in do_IRQ | ||
507 | * or in the handler. But the code here only handles the _second_ | ||
508 | * instance of the irq, not the third or fourth. So it is mostly | ||
509 | * useful for irq hardware that does not mask cleanly in an | ||
510 | * SMP environment. | ||
511 | */ | ||
512 | for (;;) { | ||
513 | irqreturn_t action_ret; | ||
514 | |||
515 | raw_spin_unlock(&desc->lock); | ||
516 | |||
517 | action_ret = handle_IRQ_event(irq, action); | ||
518 | if (!noirqdebug) | ||
519 | note_interrupt(irq, desc, action_ret); | ||
520 | |||
521 | raw_spin_lock(&desc->lock); | ||
522 | if (likely(!(desc->status & IRQ_PENDING))) | ||
523 | break; | ||
524 | desc->status &= ~IRQ_PENDING; | ||
525 | } | ||
526 | desc->status &= ~IRQ_INPROGRESS; | ||
527 | |||
528 | out: | ||
529 | /* | ||
530 | * The ->end() handler has to deal with interrupts which got | ||
531 | * disabled while the handler was running. | ||
532 | */ | ||
533 | desc->chip->end(irq); | ||
534 | raw_spin_unlock(&desc->lock); | 174 | raw_spin_unlock(&desc->lock); |
535 | 175 | ||
536 | return 1; | 176 | ret = handle_irq_event_percpu(desc, action); |
537 | } | ||
538 | #endif | ||
539 | |||
540 | void early_init_irq_lock_class(void) | ||
541 | { | ||
542 | struct irq_desc *desc; | ||
543 | int i; | ||
544 | |||
545 | for_each_irq_desc(i, desc) { | ||
546 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
547 | } | ||
548 | } | ||
549 | 177 | ||
550 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | 178 | raw_spin_lock(&desc->lock); |
551 | { | 179 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
552 | struct irq_desc *desc = irq_to_desc(irq); | 180 | return ret; |
553 | return desc ? desc->kstat_irqs[cpu] : 0; | ||
554 | } | 181 | } |
555 | EXPORT_SYMBOL(kstat_irqs_cpu); | ||
556 | |||
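handle_irq_event_percpu() is where a primary handler's IRQ_WAKE_THREAD return value turns into irq_wake_thread() setting IRQTF_RUNTHREAD and waking the irq thread, and where returning it without a thread function trips warn_no_thread(). The driver-side shape this path expects is the usual split handler; a sketch with hypothetical device names:

    #include <linux/interrupt.h>

    /* primary handler: hard irq context, just quiesce the device */
    static irqreturn_t my_dev_hardirq(int irq, void *dev_id)
    {
            /* e.g. read and mask the device's interrupt status */
            return IRQ_WAKE_THREAD;         /* defer the heavy work */
    }

    /* threaded handler: runs in the irq thread woken above, may sleep */
    static irqreturn_t my_dev_thread_fn(int irq, void *dev_id)
    {
            /* e.g. talk to the device over a slow bus */
            return IRQ_HANDLED;
    }

    static int my_dev_setup_irq(unsigned int irq, void *dev)
    {
            /* both handlers are supplied, so IRQ_WAKE_THREAD is legitimate */
            return request_threaded_irq(irq, my_dev_hardirq, my_dev_thread_fn,
                                        IRQF_ONESHOT, "my-dev", dev);
    }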
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c63f3bc88f0b..6546431447d7 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -1,95 +1,171 @@ | |||
1 | /* | 1 | /* |
2 | * IRQ subsystem internal functions and variables: | 2 | * IRQ subsystem internal functions and variables: |
3 | * | ||
4 | * Do not ever include this file from anything else than | ||
5 | * kernel/irq/. Do not even think about using any information outside | ||
6 | * of this file for your non core code. | ||
3 | */ | 7 | */ |
8 | #include <linux/irqdesc.h> | ||
9 | |||
10 | #ifdef CONFIG_SPARSE_IRQ | ||
11 | # define IRQ_BITMAP_BITS (NR_IRQS + 8196) | ||
12 | #else | ||
13 | # define IRQ_BITMAP_BITS NR_IRQS | ||
14 | #endif | ||
15 | |||
16 | #define istate core_internal_state__do_not_mess_with_it | ||
4 | 17 | ||
5 | extern int noirqdebug; | 18 | extern int noirqdebug; |
6 | 19 | ||
7 | /* Set default functions for irq_chip structures: */ | 20 | /* |
8 | extern void irq_chip_set_defaults(struct irq_chip *chip); | 21 | * Bits used by threaded handlers: |
22 | * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run | ||
23 | * IRQTF_DIED - handler thread died | ||
24 | * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed | ||
25 | * IRQTF_AFFINITY - irq thread is requested to adjust affinity | ||
26 | * IRQTF_FORCED_THREAD - irq action is force threaded | ||
27 | */ | ||
28 | enum { | ||
29 | IRQTF_RUNTHREAD, | ||
30 | IRQTF_DIED, | ||
31 | IRQTF_WARNED, | ||
32 | IRQTF_AFFINITY, | ||
33 | IRQTF_FORCED_THREAD, | ||
34 | }; | ||
9 | 35 | ||
10 | /* Set default handler: */ | 36 | /* |
11 | extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); | 37 | * Bit masks for desc->istate |
38 | * | ||
39 | * IRQS_AUTODETECT - autodetection in progress | ||
40 | * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt | ||
41 | * detection | ||
42 | * IRQS_POLL_INPROGRESS - polling in progress | ||
43 | * IRQS_ONESHOT - irq is not unmasked in primary handler | ||
44 | * IRQS_REPLAY - irq is replayed | ||
45 | * IRQS_WAITING - irq is waiting | ||
46 | * IRQS_PENDING - irq is pending and replayed later | ||
47 | * IRQS_SUSPENDED - irq is suspended | ||
48 | */ | ||
49 | enum { | ||
50 | IRQS_AUTODETECT = 0x00000001, | ||
51 | IRQS_SPURIOUS_DISABLED = 0x00000002, | ||
52 | IRQS_POLL_INPROGRESS = 0x00000008, | ||
53 | IRQS_ONESHOT = 0x00000020, | ||
54 | IRQS_REPLAY = 0x00000040, | ||
55 | IRQS_WAITING = 0x00000080, | ||
56 | IRQS_PENDING = 0x00000200, | ||
57 | IRQS_SUSPENDED = 0x00000800, | ||
58 | }; | ||
59 | |||
60 | #include "debug.h" | ||
61 | #include "settings.h" | ||
62 | |||
63 | #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) | ||
12 | 64 | ||
13 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 65 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, |
14 | unsigned long flags); | 66 | unsigned long flags); |
15 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); | 67 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); |
16 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); | 68 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); |
17 | 69 | ||
18 | extern struct lock_class_key irq_desc_lock_class; | 70 | extern int irq_startup(struct irq_desc *desc); |
71 | extern void irq_shutdown(struct irq_desc *desc); | ||
72 | extern void irq_enable(struct irq_desc *desc); | ||
73 | extern void irq_disable(struct irq_desc *desc); | ||
74 | extern void mask_irq(struct irq_desc *desc); | ||
75 | extern void unmask_irq(struct irq_desc *desc); | ||
76 | |||
19 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 77 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
20 | extern void clear_kstat_irqs(struct irq_desc *desc); | ||
21 | extern raw_spinlock_t sparse_irq_lock; | ||
22 | 78 | ||
23 | #ifdef CONFIG_SPARSE_IRQ | 79 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); |
24 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc); | 80 | irqreturn_t handle_irq_event(struct irq_desc *desc); |
25 | #endif | 81 | |
82 | /* Resending of interrupts: */ | ||
83 | void check_irq_resend(struct irq_desc *desc, unsigned int irq); | ||
84 | bool irq_wait_for_poll(struct irq_desc *desc); | ||
26 | 85 | ||
27 | #ifdef CONFIG_PROC_FS | 86 | #ifdef CONFIG_PROC_FS |
28 | extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); | 87 | extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); |
88 | extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); | ||
29 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); | 89 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); |
30 | extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); | 90 | extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); |
31 | #else | 91 | #else |
32 | static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } | 92 | static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } |
93 | static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { } | ||
33 | static inline void register_handler_proc(unsigned int irq, | 94 | static inline void register_handler_proc(unsigned int irq, |
34 | struct irqaction *action) { } | 95 | struct irqaction *action) { } |
35 | static inline void unregister_handler_proc(unsigned int irq, | 96 | static inline void unregister_handler_proc(unsigned int irq, |
36 | struct irqaction *action) { } | 97 | struct irqaction *action) { } |
37 | #endif | 98 | #endif |
38 | 99 | ||
39 | extern int irq_select_affinity_usr(unsigned int irq); | 100 | extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); |
40 | 101 | ||
41 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 102 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
42 | 103 | ||
43 | /* Inline functions for support of irq chips on slow busses */ | 104 | /* Inline functions for support of irq chips on slow busses */ |
44 | static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) | 105 | static inline void chip_bus_lock(struct irq_desc *desc) |
106 | { | ||
107 | if (unlikely(desc->irq_data.chip->irq_bus_lock)) | ||
108 | desc->irq_data.chip->irq_bus_lock(&desc->irq_data); | ||
109 | } | ||
110 | |||
111 | static inline void chip_bus_sync_unlock(struct irq_desc *desc) | ||
112 | { | ||
113 | if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock)) | ||
114 | desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); | ||
115 | } | ||
116 | |||
117 | struct irq_desc * | ||
118 | __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); | ||
119 | void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); | ||
120 | |||
121 | static inline struct irq_desc * | ||
122 | irq_get_desc_buslock(unsigned int irq, unsigned long *flags) | ||
123 | { | ||
124 | return __irq_get_desc_lock(irq, flags, true); | ||
125 | } | ||
126 | |||
127 | static inline void | ||
128 | irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) | ||
45 | { | 129 | { |
46 | if (unlikely(desc->chip->bus_lock)) | 130 | __irq_put_desc_unlock(desc, flags, true); |
47 | desc->chip->bus_lock(irq); | ||
48 | } | 131 | } |
49 | 132 | ||
50 | static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) | 133 | static inline struct irq_desc * |
134 | irq_get_desc_lock(unsigned int irq, unsigned long *flags) | ||
51 | { | 135 | { |
52 | if (unlikely(desc->chip->bus_sync_unlock)) | 136 | return __irq_get_desc_lock(irq, flags, false); |
53 | desc->chip->bus_sync_unlock(irq); | 137 | } |
138 | |||
139 | static inline void | ||
140 | irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) | ||
141 | { | ||
142 | __irq_put_desc_unlock(desc, flags, false); | ||
54 | } | 143 | } |
55 | 144 | ||
56 | /* | 145 | /* |
57 | * Debugging printout: | 146 | * Manipulation functions for irq_data.state |
58 | */ | 147 | */ |
148 | static inline void irqd_set_move_pending(struct irq_data *d) | ||
149 | { | ||
150 | d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; | ||
151 | } | ||
59 | 152 | ||
60 | #include <linux/kallsyms.h> | 153 | static inline void irqd_clr_move_pending(struct irq_data *d) |
61 | 154 | { | |
62 | #define P(f) if (desc->status & f) printk("%14s set\n", #f) | 155 | d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; |
156 | } | ||
63 | 157 | ||
64 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | 158 | static inline void irqd_clear(struct irq_data *d, unsigned int mask) |
65 | { | 159 | { |
66 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | 160 | d->state_use_accessors &= ~mask; |
67 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | ||
68 | printk("->handle_irq(): %p, ", desc->handle_irq); | ||
69 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | ||
70 | printk("->chip(): %p, ", desc->chip); | ||
71 | print_symbol("%s\n", (unsigned long)desc->chip); | ||
72 | printk("->action(): %p\n", desc->action); | ||
73 | if (desc->action) { | ||
74 | printk("->action->handler(): %p, ", desc->action->handler); | ||
75 | print_symbol("%s\n", (unsigned long)desc->action->handler); | ||
76 | } | ||
77 | |||
78 | P(IRQ_INPROGRESS); | ||
79 | P(IRQ_DISABLED); | ||
80 | P(IRQ_PENDING); | ||
81 | P(IRQ_REPLAY); | ||
82 | P(IRQ_AUTODETECT); | ||
83 | P(IRQ_WAITING); | ||
84 | P(IRQ_LEVEL); | ||
85 | P(IRQ_MASKED); | ||
86 | #ifdef CONFIG_IRQ_PER_CPU | ||
87 | P(IRQ_PER_CPU); | ||
88 | #endif | ||
89 | P(IRQ_NOPROBE); | ||
90 | P(IRQ_NOREQUEST); | ||
91 | P(IRQ_NOAUTOEN); | ||
92 | } | 161 | } |
93 | 162 | ||
94 | #undef P | 163 | static inline void irqd_set(struct irq_data *d, unsigned int mask) |
164 | { | ||
165 | d->state_use_accessors |= mask; | ||
166 | } | ||
95 | 167 | ||
168 | static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) | ||
169 | { | ||
170 | return d->state_use_accessors & mask; | ||
171 | } | ||
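chip_bus_lock()/chip_bus_sync_unlock() and the irq_get_desc_buslock()/irq_put_desc_busunlock() wrappers let the core bracket descriptor updates for chips that sit behind slow, sleeping buses. These are core-internal helpers; the sketch below only illustrates the locking shape they encode (example_poke_desc is not a real function):

    /* Mirrors the pattern used by the irq_set_* helpers in kernel/irq/chip.c. */
    static void example_poke_desc(unsigned int irq)
    {
            unsigned long flags;
            struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);

            if (!desc)
                    return;

            /* ... update desc / desc->irq_data state under desc->lock ... */

            /* drops desc->lock, then calls chip->irq_bus_sync_unlock() */
            irq_put_desc_busunlock(desc, flags);
    }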
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c new file mode 100644 index 000000000000..4c60a50e66b2 --- /dev/null +++ b/kernel/irq/irqdesc.c | |||
@@ -0,0 +1,466 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
3 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
4 | * | ||
5 | * This file contains the interrupt descriptor management code | ||
6 | * | ||
7 | * Detailed information is available in Documentation/DocBook/genericirq | ||
8 | * | ||
9 | */ | ||
10 | #include <linux/irq.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/interrupt.h> | ||
14 | #include <linux/kernel_stat.h> | ||
15 | #include <linux/radix-tree.h> | ||
16 | #include <linux/bitmap.h> | ||
17 | |||
18 | #include "internals.h" | ||
19 | |||
20 | /* | ||
21 | * lockdep: we want to handle all irq_desc locks as a single lock-class: | ||
22 | */ | ||
23 | static struct lock_class_key irq_desc_lock_class; | ||
24 | |||
25 | #if defined(CONFIG_SMP) | ||
26 | static void __init init_irq_default_affinity(void) | ||
27 | { | ||
28 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); | ||
29 | cpumask_setall(irq_default_affinity); | ||
30 | } | ||
31 | #else | ||
32 | static void __init init_irq_default_affinity(void) | ||
33 | { | ||
34 | } | ||
35 | #endif | ||
36 | |||
37 | #ifdef CONFIG_SMP | ||
38 | static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) | ||
39 | { | ||
40 | if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) | ||
41 | return -ENOMEM; | ||
42 | |||
43 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
44 | if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { | ||
45 | free_cpumask_var(desc->irq_data.affinity); | ||
46 | return -ENOMEM; | ||
47 | } | ||
48 | #endif | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | static void desc_smp_init(struct irq_desc *desc, int node) | ||
53 | { | ||
54 | desc->irq_data.node = node; | ||
55 | cpumask_copy(desc->irq_data.affinity, irq_default_affinity); | ||
56 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
57 | cpumask_clear(desc->pending_mask); | ||
58 | #endif | ||
59 | } | ||
60 | |||
61 | static inline int desc_node(struct irq_desc *desc) | ||
62 | { | ||
63 | return desc->irq_data.node; | ||
64 | } | ||
65 | |||
66 | #else | ||
67 | static inline int | ||
68 | alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } | ||
69 | static inline void desc_smp_init(struct irq_desc *desc, int node) { } | ||
70 | static inline int desc_node(struct irq_desc *desc) { return 0; } | ||
71 | #endif | ||
72 | |||
73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | ||
74 | { | ||
75 | int cpu; | ||
76 | |||
77 | desc->irq_data.irq = irq; | ||
78 | desc->irq_data.chip = &no_irq_chip; | ||
79 | desc->irq_data.chip_data = NULL; | ||
80 | desc->irq_data.handler_data = NULL; | ||
81 | desc->irq_data.msi_desc = NULL; | ||
82 | irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); | ||
83 | irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); | ||
84 | desc->handle_irq = handle_bad_irq; | ||
85 | desc->depth = 1; | ||
86 | desc->irq_count = 0; | ||
87 | desc->irqs_unhandled = 0; | ||
88 | desc->name = NULL; | ||
89 | for_each_possible_cpu(cpu) | ||
90 | *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; | ||
91 | desc_smp_init(desc, node); | ||
92 | } | ||
93 | |||
94 | int nr_irqs = NR_IRQS; | ||
95 | EXPORT_SYMBOL_GPL(nr_irqs); | ||
96 | |||
97 | static DEFINE_MUTEX(sparse_irq_lock); | ||
98 | static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); | ||
99 | |||
100 | #ifdef CONFIG_SPARSE_IRQ | ||
101 | |||
102 | static RADIX_TREE(irq_desc_tree, GFP_KERNEL); | ||
103 | |||
104 | static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) | ||
105 | { | ||
106 | radix_tree_insert(&irq_desc_tree, irq, desc); | ||
107 | } | ||
108 | |||
109 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
110 | { | ||
111 | return radix_tree_lookup(&irq_desc_tree, irq); | ||
112 | } | ||
113 | |||
114 | static void delete_irq_desc(unsigned int irq) | ||
115 | { | ||
116 | radix_tree_delete(&irq_desc_tree, irq); | ||
117 | } | ||
118 | |||
119 | #ifdef CONFIG_SMP | ||
120 | static void free_masks(struct irq_desc *desc) | ||
121 | { | ||
122 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
123 | free_cpumask_var(desc->pending_mask); | ||
124 | #endif | ||
125 | free_cpumask_var(desc->irq_data.affinity); | ||
126 | } | ||
127 | #else | ||
128 | static inline void free_masks(struct irq_desc *desc) { } | ||
129 | #endif | ||
130 | |||
131 | static struct irq_desc *alloc_desc(int irq, int node) | ||
132 | { | ||
133 | struct irq_desc *desc; | ||
134 | gfp_t gfp = GFP_KERNEL; | ||
135 | |||
136 | desc = kzalloc_node(sizeof(*desc), gfp, node); | ||
137 | if (!desc) | ||
138 | return NULL; | ||
139 | /* allocate based on nr_cpu_ids */ | ||
140 | desc->kstat_irqs = alloc_percpu(unsigned int); | ||
141 | if (!desc->kstat_irqs) | ||
142 | goto err_desc; | ||
143 | |||
144 | if (alloc_masks(desc, gfp, node)) | ||
145 | goto err_kstat; | ||
146 | |||
147 | raw_spin_lock_init(&desc->lock); | ||
148 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
149 | |||
150 | desc_set_defaults(irq, desc, node); | ||
151 | |||
152 | return desc; | ||
153 | |||
154 | err_kstat: | ||
155 | free_percpu(desc->kstat_irqs); | ||
156 | err_desc: | ||
157 | kfree(desc); | ||
158 | return NULL; | ||
159 | } | ||
160 | |||
161 | static void free_desc(unsigned int irq) | ||
162 | { | ||
163 | struct irq_desc *desc = irq_to_desc(irq); | ||
164 | |||
165 | unregister_irq_proc(irq, desc); | ||
166 | |||
167 | mutex_lock(&sparse_irq_lock); | ||
168 | delete_irq_desc(irq); | ||
169 | mutex_unlock(&sparse_irq_lock); | ||
170 | |||
171 | free_masks(desc); | ||
172 | free_percpu(desc->kstat_irqs); | ||
173 | kfree(desc); | ||
174 | } | ||
175 | |||
176 | static int alloc_descs(unsigned int start, unsigned int cnt, int node) | ||
177 | { | ||
178 | struct irq_desc *desc; | ||
179 | int i; | ||
180 | |||
181 | for (i = 0; i < cnt; i++) { | ||
182 | desc = alloc_desc(start + i, node); | ||
183 | if (!desc) | ||
184 | goto err; | ||
185 | mutex_lock(&sparse_irq_lock); | ||
186 | irq_insert_desc(start + i, desc); | ||
187 | mutex_unlock(&sparse_irq_lock); | ||
188 | } | ||
189 | return start; | ||
190 | |||
191 | err: | ||
192 | for (i--; i >= 0; i--) | ||
193 | free_desc(start + i); | ||
194 | |||
195 | mutex_lock(&sparse_irq_lock); | ||
196 | bitmap_clear(allocated_irqs, start, cnt); | ||
197 | mutex_unlock(&sparse_irq_lock); | ||
198 | return -ENOMEM; | ||
199 | } | ||
200 | |||
201 | static int irq_expand_nr_irqs(unsigned int nr) | ||
202 | { | ||
203 | if (nr > IRQ_BITMAP_BITS) | ||
204 | return -ENOMEM; | ||
205 | nr_irqs = nr; | ||
206 | return 0; | ||
207 | } | ||
208 | |||
209 | int __init early_irq_init(void) | ||
210 | { | ||
211 | int i, initcnt, node = first_online_node; | ||
212 | struct irq_desc *desc; | ||
213 | |||
214 | init_irq_default_affinity(); | ||
215 | |||
216 | /* Let arch update nr_irqs and return the nr of preallocated irqs */ | ||
217 | initcnt = arch_probe_nr_irqs(); | ||
218 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); | ||
219 | |||
220 | if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) | ||
221 | nr_irqs = IRQ_BITMAP_BITS; | ||
222 | |||
223 | if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) | ||
224 | initcnt = IRQ_BITMAP_BITS; | ||
225 | |||
226 | if (initcnt > nr_irqs) | ||
227 | nr_irqs = initcnt; | ||
228 | |||
229 | for (i = 0; i < initcnt; i++) { | ||
230 | desc = alloc_desc(i, node); | ||
231 | set_bit(i, allocated_irqs); | ||
232 | irq_insert_desc(i, desc); | ||
233 | } | ||
234 | return arch_early_irq_init(); | ||
235 | } | ||
236 | |||
237 | #else /* !CONFIG_SPARSE_IRQ */ | ||
238 | |||
239 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { | ||
240 | [0 ... NR_IRQS-1] = { | ||
241 | .handle_irq = handle_bad_irq, | ||
242 | .depth = 1, | ||
243 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), | ||
244 | } | ||
245 | }; | ||
246 | |||
247 | int __init early_irq_init(void) | ||
248 | { | ||
249 | int count, i, node = first_online_node; | ||
250 | struct irq_desc *desc; | ||
251 | |||
252 | init_irq_default_affinity(); | ||
253 | |||
254 | printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); | ||
255 | |||
256 | desc = irq_desc; | ||
257 | count = ARRAY_SIZE(irq_desc); | ||
258 | |||
259 | for (i = 0; i < count; i++) { | ||
260 | desc[i].kstat_irqs = alloc_percpu(unsigned int); | ||
261 | alloc_masks(&desc[i], GFP_KERNEL, node); | ||
262 | raw_spin_lock_init(&desc[i].lock); | ||
263 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | ||
264 | desc_set_defaults(i, &desc[i], node); | ||
265 | } | ||
266 | return arch_early_irq_init(); | ||
267 | } | ||
268 | |||
269 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
270 | { | ||
271 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; | ||
272 | } | ||
273 | |||
274 | static void free_desc(unsigned int irq) | ||
275 | { | ||
276 | dynamic_irq_cleanup(irq); | ||
277 | } | ||
278 | |||
279 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) | ||
280 | { | ||
281 | return start; | ||
282 | } | ||
283 | |||
284 | static int irq_expand_nr_irqs(unsigned int nr) | ||
285 | { | ||
286 | return -ENOMEM; | ||
287 | } | ||
288 | |||
289 | #endif /* !CONFIG_SPARSE_IRQ */ | ||
290 | |||
291 | /** | ||
292 | * generic_handle_irq - Invoke the handler for a particular irq | ||
293 | * @irq: The irq number to handle | ||
294 | * | ||
295 | */ | ||
296 | int generic_handle_irq(unsigned int irq) | ||
297 | { | ||
298 | struct irq_desc *desc = irq_to_desc(irq); | ||
299 | |||
300 | if (!desc) | ||
301 | return -EINVAL; | ||
302 | generic_handle_irq_desc(irq, desc); | ||
303 | return 0; | ||
304 | } | ||
305 | EXPORT_SYMBOL_GPL(generic_handle_irq); | ||
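A typical caller of generic_handle_irq() is a chained (demultiplexing) handler that turns a controller status word into per-line Linux irq numbers. The fragment below is only an illustrative sketch: my_gpio_chip, MY_GPIO_STAT and irq_base are invented names, not part of this patch.

	static irqreturn_t my_gpio_demux(int irq, void *dev_id)
	{
		struct my_gpio_chip *chip = dev_id;	/* hypothetical driver state */
		unsigned long pending = readl(chip->regs + MY_GPIO_STAT);
		int bit;

		/* hand each pending sub-interrupt to its own descriptor */
		for_each_set_bit(bit, &pending, 32)
			generic_handle_irq(chip->irq_base + bit);

		return IRQ_HANDLED;
	}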
306 | |||
307 | /* Dynamic interrupt handling */ | ||
308 | |||
309 | /** | ||
310 | * irq_free_descs - free irq descriptors | ||
311 | * @from: Start of descriptor range | ||
312 | * @cnt: Number of consecutive irqs to free | ||
313 | */ | ||
314 | void irq_free_descs(unsigned int from, unsigned int cnt) | ||
315 | { | ||
316 | int i; | ||
317 | |||
318 | if (from >= nr_irqs || (from + cnt) > nr_irqs) | ||
319 | return; | ||
320 | |||
321 | for (i = 0; i < cnt; i++) | ||
322 | free_desc(from + i); | ||
323 | |||
324 | mutex_lock(&sparse_irq_lock); | ||
325 | bitmap_clear(allocated_irqs, from, cnt); | ||
326 | mutex_unlock(&sparse_irq_lock); | ||
327 | } | ||
328 | EXPORT_SYMBOL_GPL(irq_free_descs); | ||
329 | |||
330 | /** | ||
331 | * irq_alloc_descs - allocate and initialize a range of irq descriptors | ||
332 | * @irq: Allocate for specific irq number if irq >= 0 | ||
333 | * @from: Start the search from this irq number | ||
334 | * @cnt: Number of consecutive irqs to allocate. | ||
335 | * @node: Preferred node on which the irq descriptor should be allocated | ||
336 | * | ||
337 | * Returns the first irq number or error code | ||
338 | */ | ||
339 | int __ref | ||
340 | irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) | ||
341 | { | ||
342 | int start, ret; | ||
343 | |||
344 | if (!cnt) | ||
345 | return -EINVAL; | ||
346 | |||
347 | if (irq >= 0) { | ||
348 | if (from > irq) | ||
349 | return -EINVAL; | ||
350 | from = irq; | ||
351 | } | ||
352 | |||
353 | mutex_lock(&sparse_irq_lock); | ||
354 | |||
355 | start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, | ||
356 | from, cnt, 0); | ||
357 | ret = -EEXIST; | ||
358 | if (irq >= 0 && start != irq) | ||
359 | goto err; | ||
360 | |||
361 | if (start + cnt > nr_irqs) { | ||
362 | ret = irq_expand_nr_irqs(start + cnt); | ||
363 | if (ret) | ||
364 | goto err; | ||
365 | } | ||
366 | |||
367 | bitmap_set(allocated_irqs, start, cnt); | ||
368 | mutex_unlock(&sparse_irq_lock); | ||
369 | return alloc_descs(start, cnt, node); | ||
370 | |||
371 | err: | ||
372 | mutex_unlock(&sparse_irq_lock); | ||
373 | return ret; | ||
374 | } | ||
375 | EXPORT_SYMBOL_GPL(irq_alloc_descs); | ||
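As a usage sketch (the numbers and the surrounding error handling are chosen arbitrarily), a driver for a secondary interrupt controller might pair irq_alloc_descs() with irq_free_descs() roughly like this:

	int base;

	/* 8 consecutive descriptors anywhere at or above irq 64,
	 * preferably on the caller's NUMA node */
	base = irq_alloc_descs(-1, 64, 8, numa_node_id());
	if (base < 0)
		return base;		/* -EINVAL, -EEXIST or -ENOMEM */

	/* ... install chip and flow handlers for base..base+7 ... */

	irq_free_descs(base, 8);	/* release the whole range on teardown */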
376 | |||
377 | /** | ||
378 | * irq_reserve_irqs - mark irqs allocated | ||
379 | * @from: mark from irq number | ||
380 | * @cnt: number of irqs to mark | ||
381 | * | ||
382 | * Returns 0 on success or an appropriate error code | ||
383 | */ | ||
384 | int irq_reserve_irqs(unsigned int from, unsigned int cnt) | ||
385 | { | ||
386 | unsigned int start; | ||
387 | int ret = 0; | ||
388 | |||
389 | if (!cnt || (from + cnt) > nr_irqs) | ||
390 | return -EINVAL; | ||
391 | |||
392 | mutex_lock(&sparse_irq_lock); | ||
393 | start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); | ||
394 | if (start == from) | ||
395 | bitmap_set(allocated_irqs, start, cnt); | ||
396 | else | ||
397 | ret = -EEXIST; | ||
398 | mutex_unlock(&sparse_irq_lock); | ||
399 | return ret; | ||
400 | } | ||
401 | |||
402 | /** | ||
403 | * irq_get_next_irq - get next allocated irq number | ||
404 | * @offset: where to start the search | ||
405 | * | ||
406 | * Returns next irq number after offset or nr_irqs if none is found. | ||
407 | */ | ||
408 | unsigned int irq_get_next_irq(unsigned int offset) | ||
409 | { | ||
410 | return find_next_bit(allocated_irqs, nr_irqs, offset); | ||
411 | } | ||
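Because the allocation state lives in the allocated_irqs bitmap, a caller can walk the sparse irq space with irq_get_next_irq(); the loop below is a sketch for illustration, not an in-tree user.

	unsigned int irq;

	for (irq = irq_get_next_irq(0); irq < nr_irqs;
	     irq = irq_get_next_irq(irq + 1)) {
		struct irq_desc *desc = irq_to_desc(irq);
		/* inspect desc; bitmap set and descriptor insertion are not
		 * atomic, so desc may still be NULL for a just-allocated irq */
	}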
412 | |||
413 | struct irq_desc * | ||
414 | __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) | ||
415 | { | ||
416 | struct irq_desc *desc = irq_to_desc(irq); | ||
417 | |||
418 | if (desc) { | ||
419 | if (bus) | ||
420 | chip_bus_lock(desc); | ||
421 | raw_spin_lock_irqsave(&desc->lock, *flags); | ||
422 | } | ||
423 | return desc; | ||
424 | } | ||
425 | |||
426 | void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) | ||
427 | { | ||
428 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
429 | if (bus) | ||
430 | chip_bus_sync_unlock(desc); | ||
431 | } | ||
432 | |||
433 | /** | ||
434 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
435 | * @irq: irq number to initialize | ||
436 | */ | ||
437 | void dynamic_irq_cleanup(unsigned int irq) | ||
438 | { | ||
439 | struct irq_desc *desc = irq_to_desc(irq); | ||
440 | unsigned long flags; | ||
441 | |||
442 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
443 | desc_set_defaults(irq, desc, desc_node(desc)); | ||
444 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
445 | } | ||
446 | |||
447 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | ||
448 | { | ||
449 | struct irq_desc *desc = irq_to_desc(irq); | ||
450 | |||
451 | return desc && desc->kstat_irqs ? | ||
452 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; | ||
453 | } | ||
454 | |||
455 | unsigned int kstat_irqs(unsigned int irq) | ||
456 | { | ||
457 | struct irq_desc *desc = irq_to_desc(irq); | ||
458 | int cpu; | ||
459 | int sum = 0; | ||
460 | |||
461 | if (!desc || !desc->kstat_irqs) | ||
462 | return 0; | ||
463 | for_each_possible_cpu(cpu) | ||
464 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); | ||
465 | return sum; | ||
466 | } | ||
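The same per-CPU counters back the /proc/interrupts output; a driver-style debug helper could dump them with kstat_irqs_cpu(), as in this sketch (irq is assumed to be the line of interest, already in scope):

	int cpu;

	for_each_online_cpu(cpu)
		pr_info("irq %u: %u on CPU%d\n",
			irq, kstat_irqs_cpu(irq, cpu), cpu);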
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c3003e9d91a3..0a7840aeb0fb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -17,6 +17,17 @@ | |||
17 | 17 | ||
18 | #include "internals.h" | 18 | #include "internals.h" |
19 | 19 | ||
20 | #ifdef CONFIG_IRQ_FORCED_THREADING | ||
21 | __read_mostly bool force_irqthreads; | ||
22 | |||
23 | static int __init setup_forced_irqthreads(char *arg) | ||
24 | { | ||
25 | force_irqthreads = true; | ||
26 | return 0; | ||
27 | } | ||
28 | early_param("threadirqs", setup_forced_irqthreads); | ||
29 | #endif | ||
30 | |||
20 | /** | 31 | /** |
21 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | 32 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) |
22 | * @irq: interrupt number to wait for | 33 | * @irq: interrupt number to wait for |
@@ -30,7 +41,7 @@ | |||
30 | void synchronize_irq(unsigned int irq) | 41 | void synchronize_irq(unsigned int irq) |
31 | { | 42 | { |
32 | struct irq_desc *desc = irq_to_desc(irq); | 43 | struct irq_desc *desc = irq_to_desc(irq); |
33 | unsigned int status; | 44 | bool inprogress; |
34 | 45 | ||
35 | if (!desc) | 46 | if (!desc) |
36 | return; | 47 | return; |
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq) | |||
42 | * Wait until we're out of the critical section. This might | 53 | * Wait until we're out of the critical section. This might |
43 | * give the wrong answer due to the lack of memory barriers. | 54 | * give the wrong answer due to the lack of memory barriers. |
44 | */ | 55 | */ |
45 | while (desc->status & IRQ_INPROGRESS) | 56 | while (irqd_irq_inprogress(&desc->irq_data)) |
46 | cpu_relax(); | 57 | cpu_relax(); |
47 | 58 | ||
48 | /* Ok, that indicated we're done: double-check carefully. */ | 59 | /* Ok, that indicated we're done: double-check carefully. */ |
49 | raw_spin_lock_irqsave(&desc->lock, flags); | 60 | raw_spin_lock_irqsave(&desc->lock, flags); |
50 | status = desc->status; | 61 | inprogress = irqd_irq_inprogress(&desc->irq_data); |
51 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 62 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
52 | 63 | ||
53 | /* Oops, that failed? */ | 64 | /* Oops, that failed? */ |
54 | } while (status & IRQ_INPROGRESS); | 65 | } while (inprogress); |
55 | 66 | ||
56 | /* | 67 | /* |
57 | * We made sure that no hardirq handler is running. Now verify | 68 | * We made sure that no hardirq handler is running. Now verify |
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq) | |||
73 | { | 84 | { |
74 | struct irq_desc *desc = irq_to_desc(irq); | 85 | struct irq_desc *desc = irq_to_desc(irq); |
75 | 86 | ||
76 | if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || | 87 | if (!desc || !irqd_can_balance(&desc->irq_data) || |
77 | !desc->chip->set_affinity) | 88 | !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) |
78 | return 0; | 89 | return 0; |
79 | 90 | ||
80 | return 1; | 91 | return 1; |
@@ -100,66 +111,180 @@ void irq_set_thread_affinity(struct irq_desc *desc) | |||
100 | } | 111 | } |
101 | } | 112 | } |
102 | 113 | ||
114 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
115 | static inline bool irq_can_move_pcntxt(struct irq_data *data) | ||
116 | { | ||
117 | return irqd_can_move_in_process_context(data); | ||
118 | } | ||
119 | static inline bool irq_move_pending(struct irq_data *data) | ||
120 | { | ||
121 | return irqd_is_setaffinity_pending(data); | ||
122 | } | ||
123 | static inline void | ||
124 | irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) | ||
125 | { | ||
126 | cpumask_copy(desc->pending_mask, mask); | ||
127 | } | ||
128 | static inline void | ||
129 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) | ||
130 | { | ||
131 | cpumask_copy(mask, desc->pending_mask); | ||
132 | } | ||
133 | #else | ||
134 | static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } | ||
135 | static inline bool irq_move_pending(struct irq_data *data) { return false; } | ||
136 | static inline void | ||
137 | irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } | ||
138 | static inline void | ||
139 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } | ||
140 | #endif | ||
141 | |||
142 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | ||
143 | { | ||
144 | struct irq_chip *chip = irq_data_get_irq_chip(data); | ||
145 | struct irq_desc *desc = irq_data_to_desc(data); | ||
146 | int ret = 0; | ||
147 | |||
148 | if (!chip || !chip->irq_set_affinity) | ||
149 | return -EINVAL; | ||
150 | |||
151 | if (irq_can_move_pcntxt(data)) { | ||
152 | ret = chip->irq_set_affinity(data, mask, false); | ||
153 | switch (ret) { | ||
154 | case IRQ_SET_MASK_OK: | ||
155 | cpumask_copy(data->affinity, mask); | ||
156 | case IRQ_SET_MASK_OK_NOCOPY: | ||
157 | irq_set_thread_affinity(desc); | ||
158 | ret = 0; | ||
159 | } | ||
160 | } else { | ||
161 | irqd_set_move_pending(data); | ||
162 | irq_copy_pending(desc, mask); | ||
163 | } | ||
164 | |||
165 | if (desc->affinity_notify) { | ||
166 | kref_get(&desc->affinity_notify->kref); | ||
167 | schedule_work(&desc->affinity_notify->work); | ||
168 | } | ||
169 | irqd_set(data, IRQD_AFFINITY_SET); | ||
170 | |||
171 | return ret; | ||
172 | } | ||
173 | |||
103 | /** | 174 | /** |
104 | * irq_set_affinity - Set the irq affinity of a given irq | 175 | * irq_set_affinity - Set the irq affinity of a given irq |
105 | * @irq: Interrupt to set affinity | 176 | * @irq: Interrupt to set affinity |
106 | * @cpumask: cpumask | 177 | * @mask: cpumask |
107 | * | 178 | * |
108 | */ | 179 | */ |
109 | int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) | 180 | int irq_set_affinity(unsigned int irq, const struct cpumask *mask) |
110 | { | 181 | { |
111 | struct irq_desc *desc = irq_to_desc(irq); | 182 | struct irq_desc *desc = irq_to_desc(irq); |
112 | unsigned long flags; | 183 | unsigned long flags; |
184 | int ret; | ||
113 | 185 | ||
114 | if (!desc->chip->set_affinity) | 186 | if (!desc) |
115 | return -EINVAL; | 187 | return -EINVAL; |
116 | 188 | ||
117 | raw_spin_lock_irqsave(&desc->lock, flags); | 189 | raw_spin_lock_irqsave(&desc->lock, flags); |
118 | 190 | ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); | |
119 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
120 | if (desc->status & IRQ_MOVE_PCNTXT) { | ||
121 | if (!desc->chip->set_affinity(irq, cpumask)) { | ||
122 | cpumask_copy(desc->affinity, cpumask); | ||
123 | irq_set_thread_affinity(desc); | ||
124 | } | ||
125 | } | ||
126 | else { | ||
127 | desc->status |= IRQ_MOVE_PENDING; | ||
128 | cpumask_copy(desc->pending_mask, cpumask); | ||
129 | } | ||
130 | #else | ||
131 | if (!desc->chip->set_affinity(irq, cpumask)) { | ||
132 | cpumask_copy(desc->affinity, cpumask); | ||
133 | irq_set_thread_affinity(desc); | ||
134 | } | ||
135 | #endif | ||
136 | desc->status |= IRQ_AFFINITY_SET; | ||
137 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 191 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
138 | return 0; | 192 | return ret; |
139 | } | 193 | } |
140 | 194 | ||
141 | int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) | 195 | int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) |
142 | { | 196 | { |
197 | unsigned long flags; | ||
198 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
199 | |||
200 | if (!desc) | ||
201 | return -EINVAL; | ||
202 | desc->affinity_hint = m; | ||
203 | irq_put_desc_unlock(desc, flags); | ||
204 | return 0; | ||
205 | } | ||
206 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); | ||
207 | |||
208 | static void irq_affinity_notify(struct work_struct *work) | ||
209 | { | ||
210 | struct irq_affinity_notify *notify = | ||
211 | container_of(work, struct irq_affinity_notify, work); | ||
212 | struct irq_desc *desc = irq_to_desc(notify->irq); | ||
213 | cpumask_var_t cpumask; | ||
214 | unsigned long flags; | ||
215 | |||
216 | if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) | ||
217 | goto out; | ||
218 | |||
219 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
220 | if (irq_move_pending(&desc->irq_data)) | ||
221 | irq_get_pending(cpumask, desc); | ||
222 | else | ||
223 | cpumask_copy(cpumask, desc->irq_data.affinity); | ||
224 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
225 | |||
226 | notify->notify(notify, cpumask); | ||
227 | |||
228 | free_cpumask_var(cpumask); | ||
229 | out: | ||
230 | kref_put(¬ify->kref, notify->release); | ||
231 | } | ||
232 | |||
233 | /** | ||
234 | * irq_set_affinity_notifier - control notification of IRQ affinity changes | ||
235 | * @irq: Interrupt for which to enable/disable notification | ||
236 | * @notify: Context for notification, or %NULL to disable | ||
237 | * notification. Function pointers must be initialised; | ||
238 | * the other fields will be initialised by this function. | ||
239 | * | ||
240 | * Must be called in process context. Notification may only be enabled | ||
241 | * after the IRQ is allocated and must be disabled before the IRQ is | ||
242 | * freed using free_irq(). | ||
243 | */ | ||
244 | int | ||
245 | irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) | ||
246 | { | ||
143 | struct irq_desc *desc = irq_to_desc(irq); | 247 | struct irq_desc *desc = irq_to_desc(irq); |
248 | struct irq_affinity_notify *old_notify; | ||
144 | unsigned long flags; | 249 | unsigned long flags; |
145 | 250 | ||
251 | /* The release function is promised process context */ | ||
252 | might_sleep(); | ||
253 | |||
146 | if (!desc) | 254 | if (!desc) |
147 | return -EINVAL; | 255 | return -EINVAL; |
148 | 256 | ||
257 | /* Complete initialisation of *notify */ | ||
258 | if (notify) { | ||
259 | notify->irq = irq; | ||
260 | kref_init(¬ify->kref); | ||
261 | INIT_WORK(¬ify->work, irq_affinity_notify); | ||
262 | } | ||
263 | |||
149 | raw_spin_lock_irqsave(&desc->lock, flags); | 264 | raw_spin_lock_irqsave(&desc->lock, flags); |
150 | desc->affinity_hint = m; | 265 | old_notify = desc->affinity_notify; |
266 | desc->affinity_notify = notify; | ||
151 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 267 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
152 | 268 | ||
269 | if (old_notify) | ||
270 | kref_put(&old_notify->kref, old_notify->release); | ||
271 | |||
153 | return 0; | 272 | return 0; |
154 | } | 273 | } |
155 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); | 274 | EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); |
156 | 275 | ||
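A consumer of the new affinity notification hooks supplies both callbacks itself; the sketch below shows the expected shape, with my_dev, the embedded notify member and the function names being placeholders rather than anything introduced by this patch. Disabling the notifier (passing NULL) before free_irq() is the caller's responsibility.

	#include <linux/interrupt.h>

	static void my_affinity_notify(struct irq_affinity_notify *notify,
				       const cpumask_t *mask)
	{
		/* re-steer queues/buffers to the CPUs in *mask */
	}

	static void my_affinity_release(struct kref *ref)
	{
		/* last reference dropped; nothing to free in this sketch */
	}

	static int my_dev_enable_notify(struct my_dev *dev)	/* hypothetical */
	{
		dev->notify.notify = my_affinity_notify;
		dev->notify.release = my_affinity_release;
		return irq_set_affinity_notifier(dev->irq, &dev->notify);
	}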
157 | #ifndef CONFIG_AUTO_IRQ_AFFINITY | 276 | #ifndef CONFIG_AUTO_IRQ_AFFINITY |
158 | /* | 277 | /* |
159 | * Generic version of the affinity autoselector. | 278 | * Generic version of the affinity autoselector. |
160 | */ | 279 | */ |
161 | static int setup_affinity(unsigned int irq, struct irq_desc *desc) | 280 | static int |
281 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | ||
162 | { | 282 | { |
283 | struct irq_chip *chip = irq_desc_get_chip(desc); | ||
284 | struct cpumask *set = irq_default_affinity; | ||
285 | int ret; | ||
286 | |||
287 | /* Excludes PER_CPU and NO_BALANCE interrupts */ | ||
163 | if (!irq_can_set_affinity(irq)) | 288 | if (!irq_can_set_affinity(irq)) |
164 | return 0; | 289 | return 0; |
165 | 290 | ||
@@ -167,22 +292,27 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) | |||
167 | * Preserve an userspace affinity setup, but make sure that | 292 | * Preserve an userspace affinity setup, but make sure that |
168 | * one of the targets is online. | 293 | * one of the targets is online. |
169 | */ | 294 | */ |
170 | if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { | 295 | if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { |
171 | if (cpumask_any_and(desc->affinity, cpu_online_mask) | 296 | if (cpumask_intersects(desc->irq_data.affinity, |
172 | < nr_cpu_ids) | 297 | cpu_online_mask)) |
173 | goto set_affinity; | 298 | set = desc->irq_data.affinity; |
174 | else | 299 | else |
175 | desc->status &= ~IRQ_AFFINITY_SET; | 300 | irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); |
176 | } | 301 | } |
177 | 302 | ||
178 | cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); | 303 | cpumask_and(mask, cpu_online_mask, set); |
179 | set_affinity: | 304 | ret = chip->irq_set_affinity(&desc->irq_data, mask, false); |
180 | desc->chip->set_affinity(irq, desc->affinity); | 305 | switch (ret) { |
181 | 306 | case IRQ_SET_MASK_OK: | |
307 | cpumask_copy(desc->irq_data.affinity, mask); | ||
308 | case IRQ_SET_MASK_OK_NOCOPY: | ||
309 | irq_set_thread_affinity(desc); | ||
310 | } | ||
182 | return 0; | 311 | return 0; |
183 | } | 312 | } |
184 | #else | 313 | #else |
185 | static inline int setup_affinity(unsigned int irq, struct irq_desc *d) | 314 | static inline int |
315 | setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) | ||
186 | { | 316 | { |
187 | return irq_select_affinity(irq); | 317 | return irq_select_affinity(irq); |
188 | } | 318 | } |
@@ -191,23 +321,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d) | |||
191 | /* | 321 | /* |
192 | * Called when affinity is set via /proc/irq | 322 | * Called when affinity is set via /proc/irq |
193 | */ | 323 | */ |
194 | int irq_select_affinity_usr(unsigned int irq) | 324 | int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) |
195 | { | 325 | { |
196 | struct irq_desc *desc = irq_to_desc(irq); | 326 | struct irq_desc *desc = irq_to_desc(irq); |
197 | unsigned long flags; | 327 | unsigned long flags; |
198 | int ret; | 328 | int ret; |
199 | 329 | ||
200 | raw_spin_lock_irqsave(&desc->lock, flags); | 330 | raw_spin_lock_irqsave(&desc->lock, flags); |
201 | ret = setup_affinity(irq, desc); | 331 | ret = setup_affinity(irq, desc, mask); |
202 | if (!ret) | ||
203 | irq_set_thread_affinity(desc); | ||
204 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 332 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
205 | |||
206 | return ret; | 333 | return ret; |
207 | } | 334 | } |
208 | 335 | ||
209 | #else | 336 | #else |
210 | static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) | 337 | static inline int |
338 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | ||
211 | { | 339 | { |
212 | return 0; | 340 | return 0; |
213 | } | 341 | } |
@@ -218,13 +346,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | |||
218 | if (suspend) { | 346 | if (suspend) { |
219 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) | 347 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) |
220 | return; | 348 | return; |
221 | desc->status |= IRQ_SUSPENDED; | 349 | desc->istate |= IRQS_SUSPENDED; |
222 | } | 350 | } |
223 | 351 | ||
224 | if (!desc->depth++) { | 352 | if (!desc->depth++) |
225 | desc->status |= IRQ_DISABLED; | 353 | irq_disable(desc); |
226 | desc->chip->disable(irq); | 354 | } |
227 | } | 355 | |
356 | static int __disable_irq_nosync(unsigned int irq) | ||
357 | { | ||
358 | unsigned long flags; | ||
359 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
360 | |||
361 | if (!desc) | ||
362 | return -EINVAL; | ||
363 | __disable_irq(desc, irq, false); | ||
364 | irq_put_desc_busunlock(desc, flags); | ||
365 | return 0; | ||
228 | } | 366 | } |
229 | 367 | ||
230 | /** | 368 | /** |
@@ -240,17 +378,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | |||
240 | */ | 378 | */ |
241 | void disable_irq_nosync(unsigned int irq) | 379 | void disable_irq_nosync(unsigned int irq) |
242 | { | 380 | { |
243 | struct irq_desc *desc = irq_to_desc(irq); | 381 | __disable_irq_nosync(irq); |
244 | unsigned long flags; | ||
245 | |||
246 | if (!desc) | ||
247 | return; | ||
248 | |||
249 | chip_bus_lock(irq, desc); | ||
250 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
251 | __disable_irq(desc, irq, false); | ||
252 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
253 | chip_bus_sync_unlock(irq, desc); | ||
254 | } | 382 | } |
255 | EXPORT_SYMBOL(disable_irq_nosync); | 383 | EXPORT_SYMBOL(disable_irq_nosync); |
256 | 384 | ||
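In driver terms the split matters: disable_irq_nosync() only bumps desc->depth and disables the line, while disable_irq() additionally waits for handlers that are still running. A hedged usage sketch (dev->irq is a placeholder):

	/* atomic context, e.g. a timer callback: must not sleep */
	disable_irq_nosync(dev->irq);

	/* process context during reconfiguration: also wait for handlers,
	 * but never while holding a lock the handler itself takes */
	disable_irq(dev->irq);
	/* ... touch hardware safely ... */
	enable_irq(dev->irq);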
@@ -268,21 +396,24 @@ EXPORT_SYMBOL(disable_irq_nosync); | |||
268 | */ | 396 | */ |
269 | void disable_irq(unsigned int irq) | 397 | void disable_irq(unsigned int irq) |
270 | { | 398 | { |
271 | struct irq_desc *desc = irq_to_desc(irq); | 399 | if (!__disable_irq_nosync(irq)) |
272 | |||
273 | if (!desc) | ||
274 | return; | ||
275 | |||
276 | disable_irq_nosync(irq); | ||
277 | if (desc->action) | ||
278 | synchronize_irq(irq); | 400 | synchronize_irq(irq); |
279 | } | 401 | } |
280 | EXPORT_SYMBOL(disable_irq); | 402 | EXPORT_SYMBOL(disable_irq); |
281 | 403 | ||
282 | void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | 404 | void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) |
283 | { | 405 | { |
284 | if (resume) | 406 | if (resume) { |
285 | desc->status &= ~IRQ_SUSPENDED; | 407 | if (!(desc->istate & IRQS_SUSPENDED)) { |
408 | if (!desc->action) | ||
409 | return; | ||
410 | if (!(desc->action->flags & IRQF_FORCE_RESUME)) | ||
411 | return; | ||
412 | /* Pretend that it got disabled ! */ | ||
413 | desc->depth++; | ||
414 | } | ||
415 | desc->istate &= ~IRQS_SUSPENDED; | ||
416 | } | ||
286 | 417 | ||
287 | switch (desc->depth) { | 418 | switch (desc->depth) { |
288 | case 0: | 419 | case 0: |
@@ -290,12 +421,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | |||
290 | WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); | 421 | WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); |
291 | break; | 422 | break; |
292 | case 1: { | 423 | case 1: { |
293 | unsigned int status = desc->status & ~IRQ_DISABLED; | 424 | if (desc->istate & IRQS_SUSPENDED) |
294 | |||
295 | if (desc->status & IRQ_SUSPENDED) | ||
296 | goto err_out; | 425 | goto err_out; |
297 | /* Prevent probing on this irq: */ | 426 | /* Prevent probing on this irq: */ |
298 | desc->status = status | IRQ_NOPROBE; | 427 | irq_settings_set_noprobe(desc); |
428 | irq_enable(desc); | ||
299 | check_irq_resend(desc, irq); | 429 | check_irq_resend(desc, irq); |
300 | /* fall-through */ | 430 | /* fall-through */ |
301 | } | 431 | } |
@@ -313,21 +443,22 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | |||
313 | * IRQ line is re-enabled. | 443 | * IRQ line is re-enabled. |
314 | * | 444 | * |
315 | * This function may be called from IRQ context only when | 445 | * This function may be called from IRQ context only when |
316 | * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! | 446 | * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! |
317 | */ | 447 | */ |
318 | void enable_irq(unsigned int irq) | 448 | void enable_irq(unsigned int irq) |
319 | { | 449 | { |
320 | struct irq_desc *desc = irq_to_desc(irq); | ||
321 | unsigned long flags; | 450 | unsigned long flags; |
451 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
322 | 452 | ||
323 | if (!desc) | 453 | if (!desc) |
324 | return; | 454 | return; |
455 | if (WARN(!desc->irq_data.chip, | ||
456 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) | ||
457 | goto out; | ||
325 | 458 | ||
326 | chip_bus_lock(irq, desc); | ||
327 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
328 | __enable_irq(desc, irq, false); | 459 | __enable_irq(desc, irq, false); |
329 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 460 | out: |
330 | chip_bus_sync_unlock(irq, desc); | 461 | irq_put_desc_busunlock(desc, flags); |
331 | } | 462 | } |
332 | EXPORT_SYMBOL(enable_irq); | 463 | EXPORT_SYMBOL(enable_irq); |
333 | 464 | ||
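Since desc->depth counts nested disables, enable and disable calls have to stay balanced; the sequence below illustrates how the counter moves and when the "Unbalanced enable" warning above would fire (dev->irq again a placeholder):

	disable_irq(dev->irq);	/* depth 0 -> 1, line disabled              */
	disable_irq(dev->irq);	/* depth 1 -> 2, still disabled             */
	enable_irq(dev->irq);	/* depth 2 -> 1, still disabled             */
	enable_irq(dev->irq);	/* depth 1 -> 0, line enabled again         */
	enable_irq(dev->irq);	/* unbalanced: hits the WARN in __enable_irq */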
@@ -336,14 +467,14 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) | |||
336 | struct irq_desc *desc = irq_to_desc(irq); | 467 | struct irq_desc *desc = irq_to_desc(irq); |
337 | int ret = -ENXIO; | 468 | int ret = -ENXIO; |
338 | 469 | ||
339 | if (desc->chip->set_wake) | 470 | if (desc->irq_data.chip->irq_set_wake) |
340 | ret = desc->chip->set_wake(irq, on); | 471 | ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); |
341 | 472 | ||
342 | return ret; | 473 | return ret; |
343 | } | 474 | } |
344 | 475 | ||
345 | /** | 476 | /** |
346 | * set_irq_wake - control irq power management wakeup | 477 | * irq_set_irq_wake - control irq power management wakeup |
347 | * @irq: interrupt to control | 478 | * @irq: interrupt to control |
348 | * @on: enable/disable power management wakeup | 479 | * @on: enable/disable power management wakeup |
349 | * | 480 | * |
@@ -354,23 +485,25 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) | |||
354 | * Wakeup mode lets this IRQ wake the system from sleep | 485 | * Wakeup mode lets this IRQ wake the system from sleep |
355 | * states like "suspend to RAM". | 486 | * states like "suspend to RAM". |
356 | */ | 487 | */ |
357 | int set_irq_wake(unsigned int irq, unsigned int on) | 488 | int irq_set_irq_wake(unsigned int irq, unsigned int on) |
358 | { | 489 | { |
359 | struct irq_desc *desc = irq_to_desc(irq); | ||
360 | unsigned long flags; | 490 | unsigned long flags; |
491 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | ||
361 | int ret = 0; | 492 | int ret = 0; |
362 | 493 | ||
494 | if (!desc) | ||
495 | return -EINVAL; | ||
496 | |||
363 | /* wakeup-capable irqs can be shared between drivers that | 497 | /* wakeup-capable irqs can be shared between drivers that |
364 | * don't need to have the same sleep mode behaviors. | 498 | * don't need to have the same sleep mode behaviors. |
365 | */ | 499 | */ |
366 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
367 | if (on) { | 500 | if (on) { |
368 | if (desc->wake_depth++ == 0) { | 501 | if (desc->wake_depth++ == 0) { |
369 | ret = set_irq_wake_real(irq, on); | 502 | ret = set_irq_wake_real(irq, on); |
370 | if (ret) | 503 | if (ret) |
371 | desc->wake_depth = 0; | 504 | desc->wake_depth = 0; |
372 | else | 505 | else |
373 | desc->status |= IRQ_WAKEUP; | 506 | irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); |
374 | } | 507 | } |
375 | } else { | 508 | } else { |
376 | if (desc->wake_depth == 0) { | 509 | if (desc->wake_depth == 0) { |
@@ -380,14 +513,13 @@ int set_irq_wake(unsigned int irq, unsigned int on) | |||
380 | if (ret) | 513 | if (ret) |
381 | desc->wake_depth = 1; | 514 | desc->wake_depth = 1; |
382 | else | 515 | else |
383 | desc->status &= ~IRQ_WAKEUP; | 516 | irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); |
384 | } | 517 | } |
385 | } | 518 | } |
386 | 519 | irq_put_desc_busunlock(desc, flags); | |
387 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
388 | return ret; | 520 | return ret; |
389 | } | 521 | } |
390 | EXPORT_SYMBOL(set_irq_wake); | 522 | EXPORT_SYMBOL(irq_set_irq_wake); |
391 | 523 | ||
392 | /* | 524 | /* |
393 | * Internal function that tells the architecture code whether a | 525 | * Internal function that tells the architecture code whether a |
@@ -396,45 +528,29 @@ EXPORT_SYMBOL(set_irq_wake); | |||
396 | */ | 528 | */ |
397 | int can_request_irq(unsigned int irq, unsigned long irqflags) | 529 | int can_request_irq(unsigned int irq, unsigned long irqflags) |
398 | { | 530 | { |
399 | struct irq_desc *desc = irq_to_desc(irq); | ||
400 | struct irqaction *action; | ||
401 | unsigned long flags; | 531 | unsigned long flags; |
532 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | ||
533 | int canrequest = 0; | ||
402 | 534 | ||
403 | if (!desc) | 535 | if (!desc) |
404 | return 0; | 536 | return 0; |
405 | 537 | ||
406 | if (desc->status & IRQ_NOREQUEST) | 538 | if (irq_settings_can_request(desc)) { |
407 | return 0; | 539 | if (desc->action) |
408 | 540 | if (irqflags & desc->action->flags & IRQF_SHARED) | |
409 | raw_spin_lock_irqsave(&desc->lock, flags); | 541 | canrequest = 1; |
410 | action = desc->action; | 542 | } |
411 | if (action) | 543 | irq_put_desc_unlock(desc, flags); |
412 | if (irqflags & action->flags & IRQF_SHARED) | 544 | return canrequest; |
413 | action = NULL; | ||
414 | |||
415 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
416 | |||
417 | return !action; | ||
418 | } | ||
419 | |||
420 | void compat_irq_chip_set_default_handler(struct irq_desc *desc) | ||
421 | { | ||
422 | /* | ||
423 | * If the architecture still has not overriden | ||
424 | * the flow handler then zap the default. This | ||
425 | * should catch incorrect flow-type setting. | ||
426 | */ | ||
427 | if (desc->handle_irq == &handle_bad_irq) | ||
428 | desc->handle_irq = NULL; | ||
429 | } | 545 | } |
430 | 546 | ||
431 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 547 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, |
432 | unsigned long flags) | 548 | unsigned long flags) |
433 | { | 549 | { |
434 | int ret; | 550 | struct irq_chip *chip = desc->irq_data.chip; |
435 | struct irq_chip *chip = desc->chip; | 551 | int ret, unmask = 0; |
436 | 552 | ||
437 | if (!chip || !chip->set_type) { | 553 | if (!chip || !chip->irq_set_type) { |
438 | /* | 554 | /* |
439 | * IRQF_TRIGGER_* but the PIC does not support multiple | 555 | * IRQF_TRIGGER_* but the PIC does not support multiple |
440 | * flow-types? | 556 | * flow-types? |
@@ -444,23 +560,41 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
444 | return 0; | 560 | return 0; |
445 | } | 561 | } |
446 | 562 | ||
447 | /* caller masked out all except trigger mode flags */ | 563 | flags &= IRQ_TYPE_SENSE_MASK; |
448 | ret = chip->set_type(irq, flags); | 564 | |
449 | 565 | if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { | |
450 | if (ret) | 566 | if (!irqd_irq_masked(&desc->irq_data)) |
451 | pr_err("setting trigger mode %d for irq %u failed (%pF)\n", | 567 | mask_irq(desc); |
452 | (int)flags, irq, chip->set_type); | 568 | if (!irqd_irq_disabled(&desc->irq_data)) |
453 | else { | 569 | unmask = 1; |
454 | if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) | ||
455 | flags |= IRQ_LEVEL; | ||
456 | /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ | ||
457 | desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); | ||
458 | desc->status |= flags; | ||
459 | |||
460 | if (chip != desc->chip) | ||
461 | irq_chip_set_defaults(desc->chip); | ||
462 | } | 570 | } |
463 | 571 | ||
572 | /* caller masked out all except trigger mode flags */ | ||
573 | ret = chip->irq_set_type(&desc->irq_data, flags); | ||
574 | |||
575 | switch (ret) { | ||
576 | case IRQ_SET_MASK_OK: | ||
577 | irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); | ||
578 | irqd_set(&desc->irq_data, flags); | ||
579 | |||
580 | case IRQ_SET_MASK_OK_NOCOPY: | ||
581 | flags = irqd_get_trigger_type(&desc->irq_data); | ||
582 | irq_settings_set_trigger_mask(desc, flags); | ||
583 | irqd_clear(&desc->irq_data, IRQD_LEVEL); | ||
584 | irq_settings_clr_level(desc); | ||
585 | if (flags & IRQ_TYPE_LEVEL_MASK) { | ||
586 | irq_settings_set_level(desc); | ||
587 | irqd_set(&desc->irq_data, IRQD_LEVEL); | ||
588 | } | ||
589 | |||
590 | ret = 0; | ||
591 | break; | ||
592 | default: | ||
593 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", | ||
594 | flags, irq, chip->irq_set_type); | ||
595 | } | ||
596 | if (unmask) | ||
597 | unmask_irq(desc); | ||
464 | return ret; | 598 | return ret; |
465 | } | 599 | } |
466 | 600 | ||
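__irq_set_trigger() is normally reached through request_irq()/setup_irq() when the caller passes IRQF_TRIGGER_* flags, or via irq_set_irq_type(). A minimal, hypothetical request that exercises this path (the handler and device pointer are invented):

	int ret = request_irq(dev->irq, my_edge_handler,
			      IRQF_TRIGGER_RISING, "my-device", dev);
	if (ret)
		pr_err("my-device: request_irq failed: %d\n", ret);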
@@ -504,10 +638,13 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
504 | * handler finished. unmask if the interrupt has not been disabled and | 638 | * handler finished. unmask if the interrupt has not been disabled and |
505 | * is marked MASKED. | 639 | * is marked MASKED. |
506 | */ | 640 | */ |
507 | static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) | 641 | static void irq_finalize_oneshot(struct irq_desc *desc, |
642 | struct irqaction *action, bool force) | ||
508 | { | 643 | { |
644 | if (!(desc->istate & IRQS_ONESHOT)) | ||
645 | return; | ||
509 | again: | 646 | again: |
510 | chip_bus_lock(irq, desc); | 647 | chip_bus_lock(desc); |
511 | raw_spin_lock_irq(&desc->lock); | 648 | raw_spin_lock_irq(&desc->lock); |
512 | 649 | ||
513 | /* | 650 | /* |
@@ -517,26 +654,42 @@ again: | |||
517 | * The thread is faster done than the hard interrupt handler | 654 | * The thread is faster done than the hard interrupt handler |
518 | * on the other CPU. If we unmask the irq line then the | 655 | * on the other CPU. If we unmask the irq line then the |
519 | * interrupt can come in again and masks the line, leaves due | 656 | * interrupt can come in again and masks the line, leaves due |
520 | * to IRQ_INPROGRESS and the irq line is masked forever. | 657 | * to IRQS_INPROGRESS and the irq line is masked forever. |
658 | * | ||
659 | * This also serializes the state of shared oneshot handlers | ||
660 | * versus "desc->threads_oneshot |= action->thread_mask;" in | ||
661 | * irq_wake_thread(). See the comment there which explains the | ||
662 | * serialization. | ||
521 | */ | 663 | */ |
522 | if (unlikely(desc->status & IRQ_INPROGRESS)) { | 664 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { |
523 | raw_spin_unlock_irq(&desc->lock); | 665 | raw_spin_unlock_irq(&desc->lock); |
524 | chip_bus_sync_unlock(irq, desc); | 666 | chip_bus_sync_unlock(desc); |
525 | cpu_relax(); | 667 | cpu_relax(); |
526 | goto again; | 668 | goto again; |
527 | } | 669 | } |
528 | 670 | ||
529 | if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { | 671 | /* |
530 | desc->status &= ~IRQ_MASKED; | 672 | * Now check again, whether the thread should run. Otherwise |
531 | desc->chip->unmask(irq); | 673 | * we would clear the threads_oneshot bit of this thread which |
532 | } | 674 | * was just set. |
675 | */ | ||
676 | if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
677 | goto out_unlock; | ||
678 | |||
679 | desc->threads_oneshot &= ~action->thread_mask; | ||
680 | |||
681 | if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && | ||
682 | irqd_irq_masked(&desc->irq_data)) | ||
683 | unmask_irq(desc); | ||
684 | |||
685 | out_unlock: | ||
533 | raw_spin_unlock_irq(&desc->lock); | 686 | raw_spin_unlock_irq(&desc->lock); |
534 | chip_bus_sync_unlock(irq, desc); | 687 | chip_bus_sync_unlock(desc); |
535 | } | 688 | } |
536 | 689 | ||
537 | #ifdef CONFIG_SMP | 690 | #ifdef CONFIG_SMP |
538 | /* | 691 | /* |
539 | * Check whether we need to change the affinity of the interrupt thread. | 692 | * Check whether we need to change the affinity of the interrupt thread. |
540 | */ | 693 | */ |
541 | static void | 694 | static void |
542 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | 695 | irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) |
@@ -556,7 +709,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | |||
556 | } | 709 | } |
557 | 710 | ||
558 | raw_spin_lock_irq(&desc->lock); | 711 | raw_spin_lock_irq(&desc->lock); |
559 | cpumask_copy(mask, desc->affinity); | 712 | cpumask_copy(mask, desc->irq_data.affinity); |
560 | raw_spin_unlock_irq(&desc->lock); | 713 | raw_spin_unlock_irq(&desc->lock); |
561 | 714 | ||
562 | set_cpus_allowed_ptr(current, mask); | 715 | set_cpus_allowed_ptr(current, mask); |
@@ -568,14 +721,57 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } | |||
568 | #endif | 721 | #endif |
569 | 722 | ||
570 | /* | 723 | /* |
724 | * Interrupts which are not explicitly requested as threaded | ||
725 | * interrupts rely on the implicit bh/preempt disable of the hard irq | ||
726 | * context. So we need to disable bh here to avoid deadlocks and other | ||
727 | * side effects. | ||
728 | */ | ||
729 | static irqreturn_t | ||
730 | irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) | ||
731 | { | ||
732 | irqreturn_t ret; | ||
733 | |||
734 | local_bh_disable(); | ||
735 | ret = action->thread_fn(action->irq, action->dev_id); | ||
736 | irq_finalize_oneshot(desc, action, false); | ||
737 | local_bh_enable(); | ||
738 | return ret; | ||
739 | } | ||
740 | |||
741 | /* | ||
742 | * Interrupts explicitly requested as threaded interrupts want to be | ||
743 | * preemptible - many of them need to sleep and wait for slow busses to | ||
744 | * complete. | ||
745 | */ | ||
746 | static irqreturn_t irq_thread_fn(struct irq_desc *desc, | ||
747 | struct irqaction *action) | ||
748 | { | ||
749 | irqreturn_t ret; | ||
750 | |||
751 | ret = action->thread_fn(action->irq, action->dev_id); | ||
752 | irq_finalize_oneshot(desc, action, false); | ||
753 | return ret; | ||
754 | } | ||
755 | |||
756 | /* | ||
571 | * Interrupt handler thread | 757 | * Interrupt handler thread |
572 | */ | 758 | */ |
573 | static int irq_thread(void *data) | 759 | static int irq_thread(void *data) |
574 | { | 760 | { |
575 | struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; | 761 | static const struct sched_param param = { |
762 | .sched_priority = MAX_USER_RT_PRIO/2, | ||
763 | }; | ||
576 | struct irqaction *action = data; | 764 | struct irqaction *action = data; |
577 | struct irq_desc *desc = irq_to_desc(action->irq); | 765 | struct irq_desc *desc = irq_to_desc(action->irq); |
578 | int wake, oneshot = desc->status & IRQ_ONESHOT; | 766 | irqreturn_t (*handler_fn)(struct irq_desc *desc, |
767 | struct irqaction *action); | ||
768 | int wake; | ||
769 | |||
770 | if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, | ||
771 | &action->thread_flags)) | ||
772 | handler_fn = irq_forced_thread_fn; | ||
773 | else | ||
774 | handler_fn = irq_thread_fn; | ||
579 | 775 | ||
580 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 776 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
581 | current->irqaction = action; | 777 | current->irqaction = action; |
@@ -587,23 +783,23 @@ static int irq_thread(void *data) | |||
587 | atomic_inc(&desc->threads_active); | 783 | atomic_inc(&desc->threads_active); |
588 | 784 | ||
589 | raw_spin_lock_irq(&desc->lock); | 785 | raw_spin_lock_irq(&desc->lock); |
590 | if (unlikely(desc->status & IRQ_DISABLED)) { | 786 | if (unlikely(irqd_irq_disabled(&desc->irq_data))) { |
591 | /* | 787 | /* |
592 | * CHECKME: We might need a dedicated | 788 | * CHECKME: We might need a dedicated |
593 | * IRQ_THREAD_PENDING flag here, which | 789 | * IRQ_THREAD_PENDING flag here, which |
594 | * retriggers the thread in check_irq_resend() | 790 | * retriggers the thread in check_irq_resend() |
595 | * but AFAICT IRQ_PENDING should be fine as it | 791 | * but AFAICT IRQS_PENDING should be fine as it |
596 | * retriggers the interrupt itself --- tglx | 792 | * retriggers the interrupt itself --- tglx |
597 | */ | 793 | */ |
598 | desc->status |= IRQ_PENDING; | 794 | desc->istate |= IRQS_PENDING; |
599 | raw_spin_unlock_irq(&desc->lock); | 795 | raw_spin_unlock_irq(&desc->lock); |
600 | } else { | 796 | } else { |
601 | raw_spin_unlock_irq(&desc->lock); | 797 | irqreturn_t action_ret; |
602 | |||
603 | action->thread_fn(action->irq, action->dev_id); | ||
604 | 798 | ||
605 | if (oneshot) | 799 | raw_spin_unlock_irq(&desc->lock); |
606 | irq_finalize_oneshot(action->irq, desc); | 800 | action_ret = handler_fn(desc, action); |
801 | if (!noirqdebug) | ||
802 | note_interrupt(action->irq, desc, action_ret); | ||
607 | } | 803 | } |
608 | 804 | ||
609 | wake = atomic_dec_and_test(&desc->threads_active); | 805 | wake = atomic_dec_and_test(&desc->threads_active); |
@@ -612,6 +808,9 @@ static int irq_thread(void *data) | |||
612 | wake_up(&desc->wait_for_threads); | 808 | wake_up(&desc->wait_for_threads); |
613 | } | 809 | } |
614 | 810 | ||
811 | /* Prevent a stale desc->threads_oneshot */ | ||
812 | irq_finalize_oneshot(desc, action, true); | ||
813 | |||
615 | /* | 814 | /* |
616 | * Clear irqaction. Otherwise exit_irq_thread() would make | 815 | * Clear irqaction. Otherwise exit_irq_thread() would make |
617 | * fuzz about an active irq thread going into nirvana. | 816 | * fuzz about an active irq thread going into nirvana. |
@@ -626,6 +825,7 @@ static int irq_thread(void *data) | |||
626 | void exit_irq_thread(void) | 825 | void exit_irq_thread(void) |
627 | { | 826 | { |
628 | struct task_struct *tsk = current; | 827 | struct task_struct *tsk = current; |
828 | struct irq_desc *desc; | ||
629 | 829 | ||
630 | if (!tsk->irqaction) | 830 | if (!tsk->irqaction) |
631 | return; | 831 | return; |
@@ -634,6 +834,14 @@ void exit_irq_thread(void) | |||
634 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | 834 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", |
635 | tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); | 835 | tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); |
636 | 836 | ||
837 | desc = irq_to_desc(tsk->irqaction->irq); | ||
838 | |||
839 | /* | ||
840 | * Prevent a stale desc->threads_oneshot. Must be called | ||
841 | * before setting the IRQTF_DIED flag. | ||
842 | */ | ||
843 | irq_finalize_oneshot(desc, tsk->irqaction, true); | ||
844 | |||
637 | /* | 845 | /* |
638 | * Set the THREAD DIED flag to prevent further wakeups of the | 846 | * Set the THREAD DIED flag to prevent further wakeups of the |
639 | * soon to be gone threaded handler. | 847 | * soon to be gone threaded handler. |
@@ -641,6 +849,22 @@ void exit_irq_thread(void) | |||
641 | set_bit(IRQTF_DIED, &tsk->irqaction->flags); | 849 | set_bit(IRQTF_DIED, &tsk->irqaction->flags); |
642 | } | 850 | } |
643 | 851 | ||
852 | static void irq_setup_forced_threading(struct irqaction *new) | ||
853 | { | ||
854 | if (!force_irqthreads) | ||
855 | return; | ||
856 | if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) | ||
857 | return; | ||
858 | |||
859 | new->flags |= IRQF_ONESHOT; | ||
860 | |||
861 | if (!new->thread_fn) { | ||
862 | set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); | ||
863 | new->thread_fn = new->handler; | ||
864 | new->handler = irq_default_primary_handler; | ||
865 | } | ||
866 | } | ||
867 | |||
644 | /* | 868 | /* |
645 | * Internal function to register an irqaction - typically used to | 869 | * Internal function to register an irqaction - typically used to |
646 | * allocate special interrupts that are part of the architecture. | 870 | * allocate special interrupts that are part of the architecture. |
@@ -650,14 +874,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
650 | { | 874 | { |
651 | struct irqaction *old, **old_ptr; | 875 | struct irqaction *old, **old_ptr; |
652 | const char *old_name = NULL; | 876 | const char *old_name = NULL; |
653 | unsigned long flags; | 877 | unsigned long flags, thread_mask = 0; |
654 | int nested, shared = 0; | 878 | int ret, nested, shared = 0; |
655 | int ret; | 879 | cpumask_var_t mask; |
656 | 880 | ||
657 | if (!desc) | 881 | if (!desc) |
658 | return -EINVAL; | 882 | return -EINVAL; |
659 | 883 | ||
660 | if (desc->chip == &no_irq_chip) | 884 | if (desc->irq_data.chip == &no_irq_chip) |
661 | return -ENOSYS; | 885 | return -ENOSYS; |
662 | /* | 886 | /* |
663 | * Some drivers like serial.c use request_irq() heavily, | 887 | * Some drivers like serial.c use request_irq() heavily, |
@@ -676,15 +900,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
676 | rand_initialize_irq(irq); | 900 | rand_initialize_irq(irq); |
677 | } | 901 | } |
678 | 902 | ||
679 | /* Oneshot interrupts are not allowed with shared */ | ||
680 | if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) | ||
681 | return -EINVAL; | ||
682 | |||
683 | /* | 903 | /* |
684 | * Check whether the interrupt nests into another interrupt | 904 | * Check whether the interrupt nests into another interrupt |
685 | * thread. | 905 | * thread. |
686 | */ | 906 | */ |
687 | nested = desc->status & IRQ_NESTED_THREAD; | 907 | nested = irq_settings_is_nested_thread(desc); |
688 | if (nested) { | 908 | if (nested) { |
689 | if (!new->thread_fn) | 909 | if (!new->thread_fn) |
690 | return -EINVAL; | 910 | return -EINVAL; |
@@ -694,6 +914,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
694 | * dummy function which warns when called. | 914 | * dummy function which warns when called. |
695 | */ | 915 | */ |
696 | new->handler = irq_nested_primary_handler; | 916 | new->handler = irq_nested_primary_handler; |
917 | } else { | ||
918 | if (irq_settings_can_thread(desc)) | ||
919 | irq_setup_forced_threading(new); | ||
697 | } | 920 | } |
698 | 921 | ||
699 | /* | 922 | /* |
@@ -717,6 +940,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
717 | new->thread = t; | 940 | new->thread = t; |
718 | } | 941 | } |
719 | 942 | ||
943 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { | ||
944 | ret = -ENOMEM; | ||
945 | goto out_thread; | ||
946 | } | ||
947 | |||
720 | /* | 948 | /* |
721 | * The following block of code has to be executed atomically | 949 | * The following block of code has to be executed atomically |
722 | */ | 950 | */ |
@@ -728,32 +956,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
728 | * Can't share interrupts unless both agree to and are | 956 | * Can't share interrupts unless both agree to and are |
729 | * the same type (level, edge, polarity). So both flag | 957 | * the same type (level, edge, polarity). So both flag |
730 | * fields must have IRQF_SHARED set and the bits which | 958 | * fields must have IRQF_SHARED set and the bits which |
731 | * set the trigger type must match. | 959 | * set the trigger type must match. Also all must |
960 | * agree on ONESHOT. | ||
732 | */ | 961 | */ |
733 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 962 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
734 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { | 963 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || |
964 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) { | ||
735 | old_name = old->name; | 965 | old_name = old->name; |
736 | goto mismatch; | 966 | goto mismatch; |
737 | } | 967 | } |
738 | 968 | ||
739 | #if defined(CONFIG_IRQ_PER_CPU) | ||
740 | /* All handlers must agree on per-cpuness */ | 969 | /* All handlers must agree on per-cpuness */ |
741 | if ((old->flags & IRQF_PERCPU) != | 970 | if ((old->flags & IRQF_PERCPU) != |
742 | (new->flags & IRQF_PERCPU)) | 971 | (new->flags & IRQF_PERCPU)) |
743 | goto mismatch; | 972 | goto mismatch; |
744 | #endif | ||
745 | 973 | ||
746 | /* add new interrupt at end of irq queue */ | 974 | /* add new interrupt at end of irq queue */ |
747 | do { | 975 | do { |
976 | thread_mask |= old->thread_mask; | ||
748 | old_ptr = &old->next; | 977 | old_ptr = &old->next; |
749 | old = *old_ptr; | 978 | old = *old_ptr; |
750 | } while (old); | 979 | } while (old); |
751 | shared = 1; | 980 | shared = 1; |
752 | } | 981 | } |
753 | 982 | ||
754 | if (!shared) { | 983 | /* |
755 | irq_chip_set_defaults(desc->chip); | 984 | * Setup the thread mask for this irqaction. Unlikely to have |
985 | * 32 resp 64 irqs sharing one line, but who knows. | ||
986 | */ | ||
987 | if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { | ||
988 | ret = -EBUSY; | ||
989 | goto out_mask; | ||
990 | } | ||
991 | new->thread_mask = 1 << ffz(thread_mask); | ||
756 | 992 | ||
993 | if (!shared) { | ||
757 | init_waitqueue_head(&desc->wait_for_threads); | 994 | init_waitqueue_head(&desc->wait_for_threads); |
758 | 995 | ||
759 | /* Setup the type (level, edge polarity) if configured: */ | 996 | /* Setup the type (level, edge polarity) if configured: */ |
@@ -762,42 +999,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
762 | new->flags & IRQF_TRIGGER_MASK); | 999 | new->flags & IRQF_TRIGGER_MASK); |
763 | 1000 | ||
764 | if (ret) | 1001 | if (ret) |
765 | goto out_thread; | 1002 | goto out_mask; |
766 | } else | 1003 | } |
767 | compat_irq_chip_set_default_handler(desc); | 1004 | |
768 | #if defined(CONFIG_IRQ_PER_CPU) | 1005 | desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ |
769 | if (new->flags & IRQF_PERCPU) | 1006 | IRQS_ONESHOT | IRQS_WAITING); |
770 | desc->status |= IRQ_PER_CPU; | 1007 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
771 | #endif | ||
772 | 1008 | ||
773 | desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | | 1009 | if (new->flags & IRQF_PERCPU) { |
774 | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); | 1010 | irqd_set(&desc->irq_data, IRQD_PER_CPU); |
1011 | irq_settings_set_per_cpu(desc); | ||
1012 | } | ||
775 | 1013 | ||
776 | if (new->flags & IRQF_ONESHOT) | 1014 | if (new->flags & IRQF_ONESHOT) |
777 | desc->status |= IRQ_ONESHOT; | 1015 | desc->istate |= IRQS_ONESHOT; |
778 | 1016 | ||
779 | if (!(desc->status & IRQ_NOAUTOEN)) { | 1017 | if (irq_settings_can_autoenable(desc)) |
780 | desc->depth = 0; | 1018 | irq_startup(desc); |
781 | desc->status &= ~IRQ_DISABLED; | 1019 | else |
782 | desc->chip->startup(irq); | ||
783 | } else | ||
784 | /* Undo nested disables: */ | 1020 | /* Undo nested disables: */ |
785 | desc->depth = 1; | 1021 | desc->depth = 1; |
786 | 1022 | ||
787 | /* Exclude IRQ from balancing if requested */ | 1023 | /* Exclude IRQ from balancing if requested */ |
788 | if (new->flags & IRQF_NOBALANCING) | 1024 | if (new->flags & IRQF_NOBALANCING) { |
789 | desc->status |= IRQ_NO_BALANCING; | 1025 | irq_settings_set_no_balancing(desc); |
1026 | irqd_set(&desc->irq_data, IRQD_NO_BALANCING); | ||
1027 | } | ||
790 | 1028 | ||
791 | /* Set default affinity mask once everything is setup */ | 1029 | /* Set default affinity mask once everything is setup */ |
792 | setup_affinity(irq, desc); | 1030 | setup_affinity(irq, desc, mask); |
793 | 1031 | ||
794 | } else if ((new->flags & IRQF_TRIGGER_MASK) | 1032 | } else if (new->flags & IRQF_TRIGGER_MASK) { |
795 | && (new->flags & IRQF_TRIGGER_MASK) | 1033 | unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; |
796 | != (desc->status & IRQ_TYPE_SENSE_MASK)) { | 1034 | unsigned int omsk = irq_settings_get_trigger_mask(desc); |
797 | /* hope the handler works with the actual trigger mode... */ | 1035 | |
798 | pr_warning("IRQ %d uses trigger mode %d; requested %d\n", | 1036 | if (nmsk != omsk) |
799 | irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), | 1037 | /* hope the handler works with current trigger mode */ |
800 | (int)(new->flags & IRQF_TRIGGER_MASK)); | 1038 | pr_warning("IRQ %d uses trigger mode %u; requested %u\n", |
1039 | irq, nmsk, omsk); | ||
801 | } | 1040 | } |
802 | 1041 | ||
803 | new->irq = irq; | 1042 | new->irq = irq; |
@@ -811,8 +1050,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
811 | * Check whether we disabled the irq via the spurious handler | 1050 | * Check whether we disabled the irq via the spurious handler |
812 | * before. Reenable it and give it another chance. | 1051 | * before. Reenable it and give it another chance. |
813 | */ | 1052 | */ |
814 | if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { | 1053 | if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { |
815 | desc->status &= ~IRQ_SPURIOUS_DISABLED; | 1054 | desc->istate &= ~IRQS_SPURIOUS_DISABLED; |
816 | __enable_irq(desc, irq, false); | 1055 | __enable_irq(desc, irq, false); |
817 | } | 1056 | } |
818 | 1057 | ||
@@ -828,6 +1067,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
828 | register_irq_proc(irq, desc); | 1067 | register_irq_proc(irq, desc); |
829 | new->dir = NULL; | 1068 | new->dir = NULL; |
830 | register_handler_proc(irq, new); | 1069 | register_handler_proc(irq, new); |
1070 | free_cpumask_var(mask); | ||
831 | 1071 | ||
832 | return 0; | 1072 | return 0; |
833 | 1073 | ||
@@ -842,8 +1082,11 @@ mismatch: | |||
842 | #endif | 1082 | #endif |
843 | ret = -EBUSY; | 1083 | ret = -EBUSY; |
844 | 1084 | ||
845 | out_thread: | 1085 | out_mask: |
846 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1086 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
1087 | free_cpumask_var(mask); | ||
1088 | |||
1089 | out_thread: | ||
847 | if (new->thread) { | 1090 | if (new->thread) { |
848 | struct task_struct *t = new->thread; | 1091 | struct task_struct *t = new->thread; |
849 | 1092 | ||
@@ -864,9 +1107,14 @@ out_thread: | |||
864 | */ | 1107 | */ |
865 | int setup_irq(unsigned int irq, struct irqaction *act) | 1108 | int setup_irq(unsigned int irq, struct irqaction *act) |
866 | { | 1109 | { |
1110 | int retval; | ||
867 | struct irq_desc *desc = irq_to_desc(irq); | 1111 | struct irq_desc *desc = irq_to_desc(irq); |
868 | 1112 | ||
869 | return __setup_irq(irq, desc, act); | 1113 | chip_bus_lock(desc); |
1114 | retval = __setup_irq(irq, desc, act); | ||
1115 | chip_bus_sync_unlock(desc); | ||
1116 | |||
1117 | return retval; | ||
870 | } | 1118 | } |
871 | EXPORT_SYMBOL_GPL(setup_irq); | 1119 | EXPORT_SYMBOL_GPL(setup_irq); |
872 | 1120 | ||
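setup_irq() is mostly used by architecture code that registers a statically allocated irqaction very early in boot; a representative but invented timer example (MY_TIMER_IRQ is a placeholder):

	static irqreturn_t my_timer_interrupt(int irq, void *dev_id)
	{
		/* ... acknowledge the tick and run the event handler ... */
		return IRQ_HANDLED;
	}

	static struct irqaction my_timer_irqaction = {
		.handler = my_timer_interrupt,
		.flags	 = IRQF_TIMER,
		.name	 = "my-timer",
	};

	static void __init my_timer_init(void)
	{
		/* before the allocator-backed request_irq() path is convenient */
		setup_irq(MY_TIMER_IRQ, &my_timer_irqaction);
	}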
@@ -912,18 +1160,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
912 | 1160 | ||
913 | /* Currently used only by UML, might disappear one day: */ | 1161 | /* Currently used only by UML, might disappear one day: */ |
914 | #ifdef CONFIG_IRQ_RELEASE_METHOD | 1162 | #ifdef CONFIG_IRQ_RELEASE_METHOD |
915 | if (desc->chip->release) | 1163 | if (desc->irq_data.chip->release) |
916 | desc->chip->release(irq, dev_id); | 1164 | desc->irq_data.chip->release(irq, dev_id); |
917 | #endif | 1165 | #endif |
918 | 1166 | ||
919 | /* If this was the last handler, shut down the IRQ line: */ | 1167 | /* If this was the last handler, shut down the IRQ line: */ |
920 | if (!desc->action) { | 1168 | if (!desc->action) |
921 | desc->status |= IRQ_DISABLED; | 1169 | irq_shutdown(desc); |
922 | if (desc->chip->shutdown) | ||
923 | desc->chip->shutdown(irq); | ||
924 | else | ||
925 | desc->chip->disable(irq); | ||
926 | } | ||
927 | 1170 | ||
928 | #ifdef CONFIG_SMP | 1171 | #ifdef CONFIG_SMP |
929 | /* make sure affinity_hint is cleaned up */ | 1172 | /* make sure affinity_hint is cleaned up */ |
@@ -997,9 +1240,14 @@ void free_irq(unsigned int irq, void *dev_id) | |||
997 | if (!desc) | 1240 | if (!desc) |
998 | return; | 1241 | return; |
999 | 1242 | ||
1000 | chip_bus_lock(irq, desc); | 1243 | #ifdef CONFIG_SMP |
1244 | if (WARN_ON(desc->affinity_notify)) | ||
1245 | desc->affinity_notify = NULL; | ||
1246 | #endif | ||
1247 | |||
1248 | chip_bus_lock(desc); | ||
1001 | kfree(__free_irq(irq, dev_id)); | 1249 | kfree(__free_irq(irq, dev_id)); |
1002 | chip_bus_sync_unlock(irq, desc); | 1250 | chip_bus_sync_unlock(desc); |
1003 | } | 1251 | } |
1004 | EXPORT_SYMBOL(free_irq); | 1252 | EXPORT_SYMBOL(free_irq); |
1005 | 1253 | ||
@@ -1067,7 +1315,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
1067 | if (!desc) | 1315 | if (!desc) |
1068 | return -EINVAL; | 1316 | return -EINVAL; |
1069 | 1317 | ||
1070 | if (desc->status & IRQ_NOREQUEST) | 1318 | if (!irq_settings_can_request(desc)) |
1071 | return -EINVAL; | 1319 | return -EINVAL; |
1072 | 1320 | ||
1073 | if (!handler) { | 1321 | if (!handler) { |
@@ -1086,14 +1334,14 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
1086 | action->name = devname; | 1334 | action->name = devname; |
1087 | action->dev_id = dev_id; | 1335 | action->dev_id = dev_id; |
1088 | 1336 | ||
1089 | chip_bus_lock(irq, desc); | 1337 | chip_bus_lock(desc); |
1090 | retval = __setup_irq(irq, desc, action); | 1338 | retval = __setup_irq(irq, desc, action); |
1091 | chip_bus_sync_unlock(irq, desc); | 1339 | chip_bus_sync_unlock(desc); |
1092 | 1340 | ||
1093 | if (retval) | 1341 | if (retval) |
1094 | kfree(action); | 1342 | kfree(action); |
1095 | 1343 | ||
1096 | #ifdef CONFIG_DEBUG_SHIRQ | 1344 | #ifdef CONFIG_DEBUG_SHIRQ_FIXME |
1097 | if (!retval && (irqflags & IRQF_SHARED)) { | 1345 | if (!retval && (irqflags & IRQF_SHARED)) { |
1098 | /* | 1346 | /* |
1099 | * It's a shared IRQ -- the driver ought to be prepared for it | 1347 | * It's a shared IRQ -- the driver ought to be prepared for it |
@@ -1142,7 +1390,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, | |||
1142 | if (!desc) | 1390 | if (!desc) |
1143 | return -EINVAL; | 1391 | return -EINVAL; |
1144 | 1392 | ||
1145 | if (desc->status & IRQ_NESTED_THREAD) { | 1393 | if (irq_settings_is_nested_thread(desc)) { |
1146 | ret = request_threaded_irq(irq, NULL, handler, | 1394 | ret = request_threaded_irq(irq, NULL, handler, |
1147 | flags, name, dev_id); | 1395 | flags, name, dev_id); |
1148 | return !ret ? IRQC_IS_NESTED : ret; | 1396 | return !ret ? IRQC_IS_NESTED : ret; |
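
The manage.c hunks above move chip_bus_lock()/chip_bus_sync_unlock() to take the descriptor directly and route the "can this line be requested" check through irq_settings_can_request(). From a driver's point of view the entry points are unchanged; a minimal driver-side sketch of the API exercised here (the device name, IRQ number and foo_dev structure are made up for illustration) looks like:

    #include <linux/interrupt.h>
    #include <linux/module.h>

    struct foo_dev {
            int irq;
    };

    /* Hard-irq part: a real driver would read a status register here
     * and return IRQ_NONE if its device did not raise the shared line. */
    static irqreturn_t foo_quick_check(int irq, void *dev_id)
    {
            return IRQ_WAKE_THREAD;         /* defer to foo_thread_fn() */
    }

    /* Threaded part: runs in process context and may sleep. */
    static irqreturn_t foo_thread_fn(int irq, void *dev_id)
    {
            /* ... talk to the device, possibly over a slow bus ... */
            return IRQ_HANDLED;
    }

    static int foo_probe(struct foo_dev *foo)
    {
            /* __setup_irq() above now runs under chip_bus_lock(desc). */
            return request_threaded_irq(foo->irq, foo_quick_check,
                                        foo_thread_fn, IRQF_SHARED,
                                        "foo", foo);
    }

    static void foo_remove(struct foo_dev *foo)
    {
            free_irq(foo->irq, foo);        /* dev_id must match */
    }
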
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 241962280836..47420908fba0 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -4,27 +4,28 @@ | |||
4 | 4 | ||
5 | #include "internals.h" | 5 | #include "internals.h" |
6 | 6 | ||
7 | void move_masked_irq(int irq) | 7 | void irq_move_masked_irq(struct irq_data *idata) |
8 | { | 8 | { |
9 | struct irq_desc *desc = irq_to_desc(irq); | 9 | struct irq_desc *desc = irq_data_to_desc(idata); |
10 | struct irq_chip *chip = idata->chip; | ||
10 | 11 | ||
11 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 12 | if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) |
12 | return; | 13 | return; |
13 | 14 | ||
14 | /* | 15 | /* |
15 | * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. | 16 | * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. |
16 | */ | 17 | */ |
17 | if (CHECK_IRQ_PER_CPU(desc->status)) { | 18 | if (!irqd_can_balance(&desc->irq_data)) { |
18 | WARN_ON(1); | 19 | WARN_ON(1); |
19 | return; | 20 | return; |
20 | } | 21 | } |
21 | 22 | ||
22 | desc->status &= ~IRQ_MOVE_PENDING; | 23 | irqd_clr_move_pending(&desc->irq_data); |
23 | 24 | ||
24 | if (unlikely(cpumask_empty(desc->pending_mask))) | 25 | if (unlikely(cpumask_empty(desc->pending_mask))) |
25 | return; | 26 | return; |
26 | 27 | ||
27 | if (!desc->chip->set_affinity) | 28 | if (!chip->irq_set_affinity) |
28 | return; | 29 | return; |
29 | 30 | ||
30 | assert_raw_spin_locked(&desc->lock); | 31 | assert_raw_spin_locked(&desc->lock); |
@@ -34,7 +35,7 @@ void move_masked_irq(int irq) | |||
34 | * do the disable, re-program, enable sequence. | 35 | * do the disable, re-program, enable sequence. |
35 | * This is *not* particularly important for level triggered | 36 | * This is *not* particularly important for level triggered |
36 | * but in a edge trigger case, we might be setting rte | 37 | * but in a edge trigger case, we might be setting rte |
37 | * when an active trigger is comming in. This could | 38 | * when an active trigger is coming in. This could |
38 | * cause some ioapics to mal-function. | 39 | * cause some ioapics to mal-function. |
39 | * Being paranoid i guess! | 40 | * Being paranoid i guess! |
40 | * | 41 | * |
@@ -43,26 +44,34 @@ void move_masked_irq(int irq) | |||
43 | */ | 44 | */ |
44 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) | 45 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) |
45 | < nr_cpu_ids)) | 46 | < nr_cpu_ids)) |
46 | if (!desc->chip->set_affinity(irq, desc->pending_mask)) { | 47 | if (!chip->irq_set_affinity(&desc->irq_data, |
47 | cpumask_copy(desc->affinity, desc->pending_mask); | 48 | desc->pending_mask, false)) { |
49 | cpumask_copy(desc->irq_data.affinity, desc->pending_mask); | ||
48 | irq_set_thread_affinity(desc); | 50 | irq_set_thread_affinity(desc); |
49 | } | 51 | } |
50 | 52 | ||
51 | cpumask_clear(desc->pending_mask); | 53 | cpumask_clear(desc->pending_mask); |
52 | } | 54 | } |
53 | 55 | ||
54 | void move_native_irq(int irq) | 56 | void irq_move_irq(struct irq_data *idata) |
55 | { | 57 | { |
56 | struct irq_desc *desc = irq_to_desc(irq); | 58 | bool masked; |
57 | 59 | ||
58 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 60 | if (likely(!irqd_is_setaffinity_pending(idata))) |
59 | return; | 61 | return; |
60 | 62 | ||
61 | if (unlikely(desc->status & IRQ_DISABLED)) | 63 | if (unlikely(irqd_irq_disabled(idata))) |
62 | return; | 64 | return; |
63 | 65 | ||
64 | desc->chip->mask(irq); | 66 | /* |
65 | move_masked_irq(irq); | 67 | * Be careful vs. already masked interrupts. If this is a |
66 | desc->chip->unmask(irq); | 68 | * threaded interrupt with ONESHOT set, we can end up with an |
69 | * interrupt storm. | ||
70 | */ | ||
71 | masked = irqd_irq_masked(idata); | ||
72 | if (!masked) | ||
73 | idata->chip->irq_mask(idata); | ||
74 | irq_move_masked_irq(idata); | ||
75 | if (!masked) | ||
76 | idata->chip->irq_unmask(idata); | ||
67 | } | 77 | } |
68 | |||
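
The new irq_move_irq() only masks the line if it was not already masked, so a ONESHOT threaded interrupt that is deliberately being kept masked is not unmasked early when its affinity moves. The pattern, reduced to plain C purely for illustration (this is not kernel API, just the mask-bookkeeping idea):

    #include <stdbool.h>

    struct line {
            bool masked;
            bool move_pending;
    };

    static void mask(struct line *l)    { l->masked = true;  }
    static void unmask(struct line *l)  { l->masked = false; }
    static void do_move(struct line *l) { l->move_pending = false; }

    /* Only touch mask state we own: if the line was already masked
     * (e.g. ONESHOT waiting for its thread), leave it masked. */
    static void move_line(struct line *l)
    {
            bool was_masked = l->masked;

            if (!l->move_pending)
                    return;
            if (!was_masked)
                    mask(l);
            do_move(l);
            if (!was_masked)
                    unmask(l);
    }
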
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c deleted file mode 100644 index 65d3845665ac..000000000000 --- a/kernel/irq/numa_migrate.c +++ /dev/null | |||
@@ -1,120 +0,0 @@ | |||
1 | /* | ||
2 | * NUMA irq-desc migration code | ||
3 | * | ||
4 | * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to | ||
5 | * the new "home node" of the IRQ. | ||
6 | */ | ||
7 | |||
8 | #include <linux/irq.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/random.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | #include <linux/kernel_stat.h> | ||
14 | |||
15 | #include "internals.h" | ||
16 | |||
17 | static void init_copy_kstat_irqs(struct irq_desc *old_desc, | ||
18 | struct irq_desc *desc, | ||
19 | int node, int nr) | ||
20 | { | ||
21 | init_kstat_irqs(desc, node, nr); | ||
22 | |||
23 | if (desc->kstat_irqs != old_desc->kstat_irqs) | ||
24 | memcpy(desc->kstat_irqs, old_desc->kstat_irqs, | ||
25 | nr * sizeof(*desc->kstat_irqs)); | ||
26 | } | ||
27 | |||
28 | static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) | ||
29 | { | ||
30 | if (old_desc->kstat_irqs == desc->kstat_irqs) | ||
31 | return; | ||
32 | |||
33 | kfree(old_desc->kstat_irqs); | ||
34 | old_desc->kstat_irqs = NULL; | ||
35 | } | ||
36 | |||
37 | static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, | ||
38 | struct irq_desc *desc, int node) | ||
39 | { | ||
40 | memcpy(desc, old_desc, sizeof(struct irq_desc)); | ||
41 | if (!alloc_desc_masks(desc, node, false)) { | ||
42 | printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " | ||
43 | "for migration.\n", irq); | ||
44 | return false; | ||
45 | } | ||
46 | raw_spin_lock_init(&desc->lock); | ||
47 | desc->node = node; | ||
48 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
49 | init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); | ||
50 | init_copy_desc_masks(old_desc, desc); | ||
51 | arch_init_copy_chip_data(old_desc, desc, node); | ||
52 | return true; | ||
53 | } | ||
54 | |||
55 | static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) | ||
56 | { | ||
57 | free_kstat_irqs(old_desc, desc); | ||
58 | free_desc_masks(old_desc, desc); | ||
59 | arch_free_chip_data(old_desc, desc); | ||
60 | } | ||
61 | |||
62 | static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | ||
63 | int node) | ||
64 | { | ||
65 | struct irq_desc *desc; | ||
66 | unsigned int irq; | ||
67 | unsigned long flags; | ||
68 | |||
69 | irq = old_desc->irq; | ||
70 | |||
71 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | ||
72 | |||
73 | /* We have to check it to avoid races with another CPU */ | ||
74 | desc = irq_to_desc(irq); | ||
75 | |||
76 | if (desc && old_desc != desc) | ||
77 | goto out_unlock; | ||
78 | |||
79 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | ||
80 | if (!desc) { | ||
81 | printk(KERN_ERR "irq %d: can not get new irq_desc " | ||
82 | "for migration.\n", irq); | ||
83 | /* still use old one */ | ||
84 | desc = old_desc; | ||
85 | goto out_unlock; | ||
86 | } | ||
87 | if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { | ||
88 | /* still use old one */ | ||
89 | kfree(desc); | ||
90 | desc = old_desc; | ||
91 | goto out_unlock; | ||
92 | } | ||
93 | |||
94 | replace_irq_desc(irq, desc); | ||
95 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | ||
96 | |||
97 | /* free the old one */ | ||
98 | free_one_irq_desc(old_desc, desc); | ||
99 | kfree(old_desc); | ||
100 | |||
101 | return desc; | ||
102 | |||
103 | out_unlock: | ||
104 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | ||
105 | |||
106 | return desc; | ||
107 | } | ||
108 | |||
109 | struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) | ||
110 | { | ||
111 | /* those static or target node is -1, do not move them */ | ||
112 | if (desc->irq < NR_IRQS_LEGACY || node == -1) | ||
113 | return desc; | ||
114 | |||
115 | if (desc->node != node) | ||
116 | desc = __real_move_irq_desc(desc, node); | ||
117 | |||
118 | return desc; | ||
119 | } | ||
120 | |||
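
The deleted numa_migrate.c helper copied an existing irq_desc (plus kstat counters, cpumasks and chip data, under sparse_irq_lock) over to a descriptor allocated on the IRQ's new home node. A minimal sketch of the node-local allocate-and-copy pattern it was built on, shown only to make the removed code's intent concrete, not as the replacement logic:

    #include <linux/slab.h>
    #include <linux/string.h>

    /* Illustrative only: allocate a node-local copy of an object and
     * free the old one, falling back to the old copy on failure. */
    static void *move_to_node(void *old, size_t size, int node)
    {
            void *new = kzalloc_node(size, GFP_ATOMIC, node);

            if (!new)
                    return old;     /* keep using the old copy */
            memcpy(new, old, size);
            kfree(old);
            return new;
    }
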
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0d4005d85b03..f76fc00c9877 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -18,7 +18,7 @@ | |||
18 | * During system-wide suspend or hibernation device drivers need to be prevented | 18 | * During system-wide suspend or hibernation device drivers need to be prevented |
19 | * from receiving interrupts and this function is provided for this purpose. | 19 | * from receiving interrupts and this function is provided for this purpose. |
20 | * It marks all interrupt lines in use, except for the timer ones, as disabled | 20 | * It marks all interrupt lines in use, except for the timer ones, as disabled |
21 | * and sets the IRQ_SUSPENDED flag for each of them. | 21 | * and sets the IRQS_SUSPENDED flag for each of them. |
22 | */ | 22 | */ |
23 | void suspend_device_irqs(void) | 23 | void suspend_device_irqs(void) |
24 | { | 24 | { |
@@ -34,7 +34,7 @@ void suspend_device_irqs(void) | |||
34 | } | 34 | } |
35 | 35 | ||
36 | for_each_irq_desc(irq, desc) | 36 | for_each_irq_desc(irq, desc) |
37 | if (desc->status & IRQ_SUSPENDED) | 37 | if (desc->istate & IRQS_SUSPENDED) |
38 | synchronize_irq(irq); | 38 | synchronize_irq(irq); |
39 | } | 39 | } |
40 | EXPORT_SYMBOL_GPL(suspend_device_irqs); | 40 | EXPORT_SYMBOL_GPL(suspend_device_irqs); |
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); | |||
43 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() | 43 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() |
44 | * | 44 | * |
45 | * Enable all interrupt lines previously disabled by suspend_device_irqs() that | 45 | * Enable all interrupt lines previously disabled by suspend_device_irqs() that |
46 | * have the IRQ_SUSPENDED flag set. | 46 | * have the IRQS_SUSPENDED flag set. |
47 | */ | 47 | */ |
48 | void resume_device_irqs(void) | 48 | void resume_device_irqs(void) |
49 | { | 49 | { |
@@ -53,9 +53,6 @@ void resume_device_irqs(void) | |||
53 | for_each_irq_desc(irq, desc) { | 53 | for_each_irq_desc(irq, desc) { |
54 | unsigned long flags; | 54 | unsigned long flags; |
55 | 55 | ||
56 | if (!(desc->status & IRQ_SUSPENDED)) | ||
57 | continue; | ||
58 | |||
59 | raw_spin_lock_irqsave(&desc->lock, flags); | 56 | raw_spin_lock_irqsave(&desc->lock, flags); |
60 | __enable_irq(desc, irq, true); | 57 | __enable_irq(desc, irq, true); |
61 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 58 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void) | |||
71 | struct irq_desc *desc; | 68 | struct irq_desc *desc; |
72 | int irq; | 69 | int irq; |
73 | 70 | ||
74 | for_each_irq_desc(irq, desc) | 71 | for_each_irq_desc(irq, desc) { |
75 | if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) | 72 | if (irqd_is_wakeup_set(&desc->irq_data)) { |
76 | return -EBUSY; | 73 | if (desc->istate & IRQS_PENDING) |
74 | return -EBUSY; | ||
75 | continue; | ||
76 | } | ||
77 | /* | ||
78 | * Check the non wakeup interrupts whether they need | ||
79 | * to be masked before finally going into suspend | ||
80 | * state. That's for hardware which has no wakeup | ||
81 | * source configuration facility. The chip | ||
82 | * implementation indicates that with | ||
83 | * IRQCHIP_MASK_ON_SUSPEND. | ||
84 | */ | ||
85 | if (desc->istate & IRQS_SUSPENDED && | ||
86 | irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) | ||
87 | mask_irq(desc); | ||
88 | } | ||
77 | 89 | ||
78 | return 0; | 90 | return 0; |
79 | } | 91 | } |
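
check_wakeup_irqs() now masks suspended, non-wakeup lines when the chip advertises IRQCHIP_MASK_ON_SUSPEND, i.e. when the hardware has no wakeup-source configuration of its own. A hedged sketch of how an irq_chip of this era opts in (the foo_* callbacks and register accesses are hypothetical):

    #include <linux/irq.h>

    static void foo_irq_mask(struct irq_data *d)
    {
            /* write the hypothetical controller's mask register */
    }

    static void foo_irq_unmask(struct irq_data *d)
    {
            /* clear the mask bit again */
    }

    static struct irq_chip foo_irq_chip = {
            .name       = "foo",
            .irq_mask   = foo_irq_mask,
            .irq_unmask = foo_irq_unmask,
            /* no wakeup configuration in hardware: let the core mask
             * non-wakeup lines before entering suspend */
            .flags      = IRQCHIP_MASK_ON_SUSPEND,
    };
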
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 09a2ee540bd2..4bd4faa6323a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/proc_fs.h> | 11 | #include <linux/proc_fs.h> |
12 | #include <linux/seq_file.h> | 12 | #include <linux/seq_file.h> |
13 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
14 | #include <linux/kernel_stat.h> | ||
14 | 15 | ||
15 | #include "internals.h" | 16 | #include "internals.h" |
16 | 17 | ||
@@ -18,16 +19,19 @@ static struct proc_dir_entry *root_irq_dir; | |||
18 | 19 | ||
19 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
20 | 21 | ||
21 | static int irq_affinity_proc_show(struct seq_file *m, void *v) | 22 | static int show_irq_affinity(int type, struct seq_file *m, void *v) |
22 | { | 23 | { |
23 | struct irq_desc *desc = irq_to_desc((long)m->private); | 24 | struct irq_desc *desc = irq_to_desc((long)m->private); |
24 | const struct cpumask *mask = desc->affinity; | 25 | const struct cpumask *mask = desc->irq_data.affinity; |
25 | 26 | ||
26 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 27 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
27 | if (desc->status & IRQ_MOVE_PENDING) | 28 | if (irqd_is_setaffinity_pending(&desc->irq_data)) |
28 | mask = desc->pending_mask; | 29 | mask = desc->pending_mask; |
29 | #endif | 30 | #endif |
30 | seq_cpumask(m, mask); | 31 | if (type) |
32 | seq_cpumask_list(m, mask); | ||
33 | else | ||
34 | seq_cpumask(m, mask); | ||
31 | seq_putc(m, '\n'); | 35 | seq_putc(m, '\n'); |
32 | return 0; | 36 | return 0; |
33 | } | 37 | } |
@@ -58,21 +62,34 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) | |||
58 | #endif | 62 | #endif |
59 | 63 | ||
60 | int no_irq_affinity; | 64 | int no_irq_affinity; |
61 | static ssize_t irq_affinity_proc_write(struct file *file, | 65 | static int irq_affinity_proc_show(struct seq_file *m, void *v) |
66 | { | ||
67 | return show_irq_affinity(0, m, v); | ||
68 | } | ||
69 | |||
70 | static int irq_affinity_list_proc_show(struct seq_file *m, void *v) | ||
71 | { | ||
72 | return show_irq_affinity(1, m, v); | ||
73 | } | ||
74 | |||
75 | |||
76 | static ssize_t write_irq_affinity(int type, struct file *file, | ||
62 | const char __user *buffer, size_t count, loff_t *pos) | 77 | const char __user *buffer, size_t count, loff_t *pos) |
63 | { | 78 | { |
64 | unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; | 79 | unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; |
65 | cpumask_var_t new_value; | 80 | cpumask_var_t new_value; |
66 | int err; | 81 | int err; |
67 | 82 | ||
68 | if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || | 83 | if (!irq_can_set_affinity(irq) || no_irq_affinity) |
69 | irq_balancing_disabled(irq)) | ||
70 | return -EIO; | 84 | return -EIO; |
71 | 85 | ||
72 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) | 86 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) |
73 | return -ENOMEM; | 87 | return -ENOMEM; |
74 | 88 | ||
75 | err = cpumask_parse_user(buffer, count, new_value); | 89 | if (type) |
90 | err = cpumask_parselist_user(buffer, count, new_value); | ||
91 | else | ||
92 | err = cpumask_parse_user(buffer, count, new_value); | ||
76 | if (err) | 93 | if (err) |
77 | goto free_cpumask; | 94 | goto free_cpumask; |
78 | 95 | ||
@@ -89,7 +106,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, | |||
89 | if (!cpumask_intersects(new_value, cpu_online_mask)) { | 106 | if (!cpumask_intersects(new_value, cpu_online_mask)) { |
90 | /* Special case for empty set - allow the architecture | 107 | /* Special case for empty set - allow the architecture |
91 | code to set default SMP affinity. */ | 108 | code to set default SMP affinity. */ |
92 | err = irq_select_affinity_usr(irq) ? -EINVAL : count; | 109 | err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; |
93 | } else { | 110 | } else { |
94 | irq_set_affinity(irq, new_value); | 111 | irq_set_affinity(irq, new_value); |
95 | err = count; | 112 | err = count; |
@@ -100,11 +117,28 @@ free_cpumask: | |||
100 | return err; | 117 | return err; |
101 | } | 118 | } |
102 | 119 | ||
120 | static ssize_t irq_affinity_proc_write(struct file *file, | ||
121 | const char __user *buffer, size_t count, loff_t *pos) | ||
122 | { | ||
123 | return write_irq_affinity(0, file, buffer, count, pos); | ||
124 | } | ||
125 | |||
126 | static ssize_t irq_affinity_list_proc_write(struct file *file, | ||
127 | const char __user *buffer, size_t count, loff_t *pos) | ||
128 | { | ||
129 | return write_irq_affinity(1, file, buffer, count, pos); | ||
130 | } | ||
131 | |||
103 | static int irq_affinity_proc_open(struct inode *inode, struct file *file) | 132 | static int irq_affinity_proc_open(struct inode *inode, struct file *file) |
104 | { | 133 | { |
105 | return single_open(file, irq_affinity_proc_show, PDE(inode)->data); | 134 | return single_open(file, irq_affinity_proc_show, PDE(inode)->data); |
106 | } | 135 | } |
107 | 136 | ||
137 | static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) | ||
138 | { | ||
139 | return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); | ||
140 | } | ||
141 | |||
108 | static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) | 142 | static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) |
109 | { | 143 | { |
110 | return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); | 144 | return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); |
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = { | |||
125 | .release = single_release, | 159 | .release = single_release, |
126 | }; | 160 | }; |
127 | 161 | ||
162 | static const struct file_operations irq_affinity_list_proc_fops = { | ||
163 | .open = irq_affinity_list_proc_open, | ||
164 | .read = seq_read, | ||
165 | .llseek = seq_lseek, | ||
166 | .release = single_release, | ||
167 | .write = irq_affinity_list_proc_write, | ||
168 | }; | ||
169 | |||
128 | static int default_affinity_show(struct seq_file *m, void *v) | 170 | static int default_affinity_show(struct seq_file *m, void *v) |
129 | { | 171 | { |
130 | seq_cpumask(m, irq_default_affinity); | 172 | seq_cpumask(m, irq_default_affinity); |
@@ -185,7 +227,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) | |||
185 | { | 227 | { |
186 | struct irq_desc *desc = irq_to_desc((long) m->private); | 228 | struct irq_desc *desc = irq_to_desc((long) m->private); |
187 | 229 | ||
188 | seq_printf(m, "%d\n", desc->node); | 230 | seq_printf(m, "%d\n", desc->irq_data.node); |
189 | return 0; | 231 | return 0; |
190 | } | 232 | } |
191 | 233 | ||
@@ -214,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) | |||
214 | 256 | ||
215 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) | 257 | static int irq_spurious_proc_open(struct inode *inode, struct file *file) |
216 | { | 258 | { |
217 | return single_open(file, irq_spurious_proc_show, NULL); | 259 | return single_open(file, irq_spurious_proc_show, PDE(inode)->data); |
218 | } | 260 | } |
219 | 261 | ||
220 | static const struct file_operations irq_spurious_proc_fops = { | 262 | static const struct file_operations irq_spurious_proc_fops = { |
@@ -269,7 +311,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
269 | { | 311 | { |
270 | char name [MAX_NAMELEN]; | 312 | char name [MAX_NAMELEN]; |
271 | 313 | ||
272 | if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) | 314 | if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) |
273 | return; | 315 | return; |
274 | 316 | ||
275 | memset(name, 0, MAX_NAMELEN); | 317 | memset(name, 0, MAX_NAMELEN); |
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
289 | proc_create_data("affinity_hint", 0400, desc->dir, | 331 | proc_create_data("affinity_hint", 0400, desc->dir, |
290 | &irq_affinity_hint_proc_fops, (void *)(long)irq); | 332 | &irq_affinity_hint_proc_fops, (void *)(long)irq); |
291 | 333 | ||
334 | /* create /proc/irq/<irq>/smp_affinity_list */ | ||
335 | proc_create_data("smp_affinity_list", 0600, desc->dir, | ||
336 | &irq_affinity_list_proc_fops, (void *)(long)irq); | ||
337 | |||
292 | proc_create_data("node", 0444, desc->dir, | 338 | proc_create_data("node", 0444, desc->dir, |
293 | &irq_node_proc_fops, (void *)(long)irq); | 339 | &irq_node_proc_fops, (void *)(long)irq); |
294 | #endif | 340 | #endif |
@@ -297,6 +343,25 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
297 | &irq_spurious_proc_fops, (void *)(long)irq); | 343 | &irq_spurious_proc_fops, (void *)(long)irq); |
298 | } | 344 | } |
299 | 345 | ||
346 | void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) | ||
347 | { | ||
348 | char name [MAX_NAMELEN]; | ||
349 | |||
350 | if (!root_irq_dir || !desc->dir) | ||
351 | return; | ||
352 | #ifdef CONFIG_SMP | ||
353 | remove_proc_entry("smp_affinity", desc->dir); | ||
354 | remove_proc_entry("affinity_hint", desc->dir); | ||
355 | remove_proc_entry("smp_affinity_list", desc->dir); | ||
356 | remove_proc_entry("node", desc->dir); | ||
357 | #endif | ||
358 | remove_proc_entry("spurious", desc->dir); | ||
359 | |||
360 | memset(name, 0, MAX_NAMELEN); | ||
361 | sprintf(name, "%u", irq); | ||
362 | remove_proc_entry(name, root_irq_dir); | ||
363 | } | ||
364 | |||
300 | #undef MAX_NAMELEN | 365 | #undef MAX_NAMELEN |
301 | 366 | ||
302 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) | 367 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) |
@@ -339,3 +404,83 @@ void init_irq_proc(void) | |||
339 | } | 404 | } |
340 | } | 405 | } |
341 | 406 | ||
407 | #ifdef CONFIG_GENERIC_IRQ_SHOW | ||
408 | |||
409 | int __weak arch_show_interrupts(struct seq_file *p, int prec) | ||
410 | { | ||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | #ifndef ACTUAL_NR_IRQS | ||
415 | # define ACTUAL_NR_IRQS nr_irqs | ||
416 | #endif | ||
417 | |||
418 | int show_interrupts(struct seq_file *p, void *v) | ||
419 | { | ||
420 | static int prec; | ||
421 | |||
422 | unsigned long flags, any_count = 0; | ||
423 | int i = *(loff_t *) v, j; | ||
424 | struct irqaction *action; | ||
425 | struct irq_desc *desc; | ||
426 | |||
427 | if (i > ACTUAL_NR_IRQS) | ||
428 | return 0; | ||
429 | |||
430 | if (i == ACTUAL_NR_IRQS) | ||
431 | return arch_show_interrupts(p, prec); | ||
432 | |||
433 | /* print header and calculate the width of the first column */ | ||
434 | if (i == 0) { | ||
435 | for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) | ||
436 | j *= 10; | ||
437 | |||
438 | seq_printf(p, "%*s", prec + 8, ""); | ||
439 | for_each_online_cpu(j) | ||
440 | seq_printf(p, "CPU%-8d", j); | ||
441 | seq_putc(p, '\n'); | ||
442 | } | ||
443 | |||
444 | desc = irq_to_desc(i); | ||
445 | if (!desc) | ||
446 | return 0; | ||
447 | |||
448 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
449 | for_each_online_cpu(j) | ||
450 | any_count |= kstat_irqs_cpu(i, j); | ||
451 | action = desc->action; | ||
452 | if (!action && !any_count) | ||
453 | goto out; | ||
454 | |||
455 | seq_printf(p, "%*d: ", prec, i); | ||
456 | for_each_online_cpu(j) | ||
457 | seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); | ||
458 | |||
459 | if (desc->irq_data.chip) { | ||
460 | if (desc->irq_data.chip->irq_print_chip) | ||
461 | desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); | ||
462 | else if (desc->irq_data.chip->name) | ||
463 | seq_printf(p, " %8s", desc->irq_data.chip->name); | ||
464 | else | ||
465 | seq_printf(p, " %8s", "-"); | ||
466 | } else { | ||
467 | seq_printf(p, " %8s", "None"); | ||
468 | } | ||
469 | #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL | ||
470 | seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); | ||
471 | #endif | ||
472 | if (desc->name) | ||
473 | seq_printf(p, "-%-8s", desc->name); | ||
474 | |||
475 | if (action) { | ||
476 | seq_printf(p, " %s", action->name); | ||
477 | while ((action = action->next) != NULL) | ||
478 | seq_printf(p, ", %s", action->name); | ||
479 | } | ||
480 | |||
481 | seq_putc(p, '\n'); | ||
482 | out: | ||
483 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
484 | return 0; | ||
485 | } | ||
486 | #endif | ||
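
Besides the generic show_interrupts() implementation, proc.c now exposes /proc/irq/<irq>/smp_affinity_list, which accepts a human-readable CPU list instead of the hex mask used by smp_affinity. A small userspace example of driving it (the IRQ number and CPU list are made up; root is required):

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical IRQ 30: restrict it to CPUs 0-3 and 8. */
            FILE *f = fopen("/proc/irq/30/smp_affinity_list", "w");

            if (!f) {
                    perror("smp_affinity_list");
                    return 1;
            }
            fputs("0-3,8\n", f);
            return fclose(f) ? 1 : 0;
    }

Reading the same file back returns the list format, while the existing smp_affinity file keeps the hex-mask format.
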
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 090c3763f3a2..14dd5761e8c9 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -23,7 +23,7 @@ | |||
23 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 23 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
24 | 24 | ||
25 | /* Bitmap to handle software resend of interrupts: */ | 25 | /* Bitmap to handle software resend of interrupts: */ |
26 | static DECLARE_BITMAP(irqs_resend, NR_IRQS); | 26 | static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); |
27 | 27 | ||
28 | /* | 28 | /* |
29 | * Run software resends of IRQ's | 29 | * Run software resends of IRQ's |
@@ -55,22 +55,21 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); | |||
55 | */ | 55 | */ |
56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) | 56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) |
57 | { | 57 | { |
58 | unsigned int status = desc->status; | ||
59 | |||
60 | /* | ||
61 | * Make sure the interrupt is enabled, before resending it: | ||
62 | */ | ||
63 | desc->chip->enable(irq); | ||
64 | |||
65 | /* | 58 | /* |
66 | * We do not resend level type interrupts. Level type | 59 | * We do not resend level type interrupts. Level type |
67 | * interrupts are resent by hardware when they are still | 60 | * interrupts are resent by hardware when they are still |
68 | * active. | 61 | * active. |
69 | */ | 62 | */ |
70 | if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | 63 | if (irq_settings_is_level(desc)) |
71 | desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; | 64 | return; |
65 | if (desc->istate & IRQS_REPLAY) | ||
66 | return; | ||
67 | if (desc->istate & IRQS_PENDING) { | ||
68 | desc->istate &= ~IRQS_PENDING; | ||
69 | desc->istate |= IRQS_REPLAY; | ||
72 | 70 | ||
73 | if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { | 71 | if (!desc->irq_data.chip->irq_retrigger || |
72 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { | ||
74 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 73 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
75 | /* Set it pending and activate the softirq: */ | 74 | /* Set it pending and activate the softirq: */ |
76 | set_bit(irq, irqs_resend); | 75 | set_bit(irq, irqs_resend); |
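
check_irq_resend() now works off the IRQS_PENDING/IRQS_REPLAY bits in desc->istate instead of the old status word, and falls back to the software resend tasklet only when the chip has no irq_retrigger callback. The flag discipline, reduced to standalone C purely as an illustration:

    #include <stdbool.h>

    enum { PENDING = 1, REPLAY = 2, LEVEL = 4 };

    /* Returns true if a retrigger should be attempted. */
    static bool should_resend(unsigned int *state)
    {
            if (*state & LEVEL)        /* hardware re-raises level IRQs */
                    return false;
            if (*state & REPLAY)       /* a resend is already in flight */
                    return false;
            if (!(*state & PENDING))
                    return false;

            *state &= ~PENDING;        /* consume the pending event */
            *state |= REPLAY;          /* ...and remember we replayed it */
            return true;
    }
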
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h new file mode 100644 index 000000000000..f1667833d444 --- /dev/null +++ b/kernel/irq/settings.h | |||
@@ -0,0 +1,142 @@ | |||
1 | /* | ||
2 | * Internal header to deal with irq_desc->status which will be renamed | ||
3 | * to irq_desc->settings. | ||
4 | */ | ||
5 | enum { | ||
6 | _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, | ||
7 | _IRQ_PER_CPU = IRQ_PER_CPU, | ||
8 | _IRQ_LEVEL = IRQ_LEVEL, | ||
9 | _IRQ_NOPROBE = IRQ_NOPROBE, | ||
10 | _IRQ_NOREQUEST = IRQ_NOREQUEST, | ||
11 | _IRQ_NOTHREAD = IRQ_NOTHREAD, | ||
12 | _IRQ_NOAUTOEN = IRQ_NOAUTOEN, | ||
13 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, | ||
14 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, | ||
15 | _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, | ||
16 | _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, | ||
17 | }; | ||
18 | |||
19 | #define IRQ_PER_CPU GOT_YOU_MORON | ||
20 | #define IRQ_NO_BALANCING GOT_YOU_MORON | ||
21 | #define IRQ_LEVEL GOT_YOU_MORON | ||
22 | #define IRQ_NOPROBE GOT_YOU_MORON | ||
23 | #define IRQ_NOREQUEST GOT_YOU_MORON | ||
24 | #define IRQ_NOTHREAD GOT_YOU_MORON | ||
25 | #define IRQ_NOAUTOEN GOT_YOU_MORON | ||
26 | #define IRQ_NESTED_THREAD GOT_YOU_MORON | ||
27 | #undef IRQF_MODIFY_MASK | ||
28 | #define IRQF_MODIFY_MASK GOT_YOU_MORON | ||
29 | |||
30 | static inline void | ||
31 | irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) | ||
32 | { | ||
33 | desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK); | ||
34 | desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); | ||
35 | } | ||
36 | |||
37 | static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) | ||
38 | { | ||
39 | return desc->status_use_accessors & _IRQ_PER_CPU; | ||
40 | } | ||
41 | |||
42 | static inline void irq_settings_set_per_cpu(struct irq_desc *desc) | ||
43 | { | ||
44 | desc->status_use_accessors |= _IRQ_PER_CPU; | ||
45 | } | ||
46 | |||
47 | static inline void irq_settings_set_no_balancing(struct irq_desc *desc) | ||
48 | { | ||
49 | desc->status_use_accessors |= _IRQ_NO_BALANCING; | ||
50 | } | ||
51 | |||
52 | static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) | ||
53 | { | ||
54 | return desc->status_use_accessors & _IRQ_NO_BALANCING; | ||
55 | } | ||
56 | |||
57 | static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) | ||
58 | { | ||
59 | return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; | ||
60 | } | ||
61 | |||
62 | static inline void | ||
63 | irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) | ||
64 | { | ||
65 | desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK; | ||
66 | desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK; | ||
67 | } | ||
68 | |||
69 | static inline bool irq_settings_is_level(struct irq_desc *desc) | ||
70 | { | ||
71 | return desc->status_use_accessors & _IRQ_LEVEL; | ||
72 | } | ||
73 | |||
74 | static inline void irq_settings_clr_level(struct irq_desc *desc) | ||
75 | { | ||
76 | desc->status_use_accessors &= ~_IRQ_LEVEL; | ||
77 | } | ||
78 | |||
79 | static inline void irq_settings_set_level(struct irq_desc *desc) | ||
80 | { | ||
81 | desc->status_use_accessors |= _IRQ_LEVEL; | ||
82 | } | ||
83 | |||
84 | static inline bool irq_settings_can_request(struct irq_desc *desc) | ||
85 | { | ||
86 | return !(desc->status_use_accessors & _IRQ_NOREQUEST); | ||
87 | } | ||
88 | |||
89 | static inline void irq_settings_clr_norequest(struct irq_desc *desc) | ||
90 | { | ||
91 | desc->status_use_accessors &= ~_IRQ_NOREQUEST; | ||
92 | } | ||
93 | |||
94 | static inline void irq_settings_set_norequest(struct irq_desc *desc) | ||
95 | { | ||
96 | desc->status_use_accessors |= _IRQ_NOREQUEST; | ||
97 | } | ||
98 | |||
99 | static inline bool irq_settings_can_thread(struct irq_desc *desc) | ||
100 | { | ||
101 | return !(desc->status_use_accessors & _IRQ_NOTHREAD); | ||
102 | } | ||
103 | |||
104 | static inline void irq_settings_clr_nothread(struct irq_desc *desc) | ||
105 | { | ||
106 | desc->status_use_accessors &= ~_IRQ_NOTHREAD; | ||
107 | } | ||
108 | |||
109 | static inline void irq_settings_set_nothread(struct irq_desc *desc) | ||
110 | { | ||
111 | desc->status_use_accessors |= _IRQ_NOTHREAD; | ||
112 | } | ||
113 | |||
114 | static inline bool irq_settings_can_probe(struct irq_desc *desc) | ||
115 | { | ||
116 | return !(desc->status_use_accessors & _IRQ_NOPROBE); | ||
117 | } | ||
118 | |||
119 | static inline void irq_settings_clr_noprobe(struct irq_desc *desc) | ||
120 | { | ||
121 | desc->status_use_accessors &= ~_IRQ_NOPROBE; | ||
122 | } | ||
123 | |||
124 | static inline void irq_settings_set_noprobe(struct irq_desc *desc) | ||
125 | { | ||
126 | desc->status_use_accessors |= _IRQ_NOPROBE; | ||
127 | } | ||
128 | |||
129 | static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) | ||
130 | { | ||
131 | return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; | ||
132 | } | ||
133 | |||
134 | static inline bool irq_settings_can_autoenable(struct irq_desc *desc) | ||
135 | { | ||
136 | return !(desc->status_use_accessors & _IRQ_NOAUTOEN); | ||
137 | } | ||
138 | |||
139 | static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) | ||
140 | { | ||
141 | return desc->status_use_accessors & _IRQ_NESTED_THREAD; | ||
142 | } | ||
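
settings.h snapshots the public IRQ_* bits into private _IRQ_* enum values and then redefines the public names to the undefined symbol GOT_YOU_MORON, so any direct use of the raw bits outside these accessors fails to build. The same poisoning trick in self-contained form (the names are illustrative, not kernel identifiers):

    /* Public flag, normally visible to everyone. */
    #define FLAG_LEVEL 0x1

    /* Snapshot the value, then poison the public name. */
    enum { _FLAG_LEVEL = FLAG_LEVEL };
    #undef  FLAG_LEVEL
    #define FLAG_LEVEL PLEASE_USE_THE_ACCESSORS

    struct desc { unsigned int settings; };

    static inline int desc_is_level(const struct desc *d)
    {
            return d->settings & _FLAG_LEVEL;   /* fine: private copy */
    }

    /*
     * int broken(const struct desc *d)
     * {
     *         return d->settings & FLAG_LEVEL; // expands to the undefined
     * }                                        // PLEASE_USE_THE_ACCESSORS
     */
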
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89fb90ae534f..aa57d5da18c1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -14,75 +14,100 @@ | |||
14 | #include <linux/moduleparam.h> | 14 | #include <linux/moduleparam.h> |
15 | #include <linux/timer.h> | 15 | #include <linux/timer.h> |
16 | 16 | ||
17 | #include "internals.h" | ||
18 | |||
17 | static int irqfixup __read_mostly; | 19 | static int irqfixup __read_mostly; |
18 | 20 | ||
19 | #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) | 21 | #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) |
20 | static void poll_spurious_irqs(unsigned long dummy); | 22 | static void poll_spurious_irqs(unsigned long dummy); |
21 | static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); | 23 | static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); |
24 | static int irq_poll_cpu; | ||
25 | static atomic_t irq_poll_active; | ||
26 | |||
27 | /* | ||
28 | * We wait here for a poller to finish. | ||
29 | * | ||
30 | * If the poll runs on this CPU, then we yell loudly and return | ||
31 | * false. That will leave the interrupt line disabled in the worst | ||
32 | * case, but it should never happen. | ||
33 | * | ||
34 | * We wait until the poller is done and then recheck disabled and | ||
35 | * action (about to be disabled). Only if it's still active, we return | ||
36 | * true and let the handler run. | ||
37 | */ | ||
38 | bool irq_wait_for_poll(struct irq_desc *desc) | ||
39 | { | ||
40 | if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), | ||
41 | "irq poll in progress on cpu %d for irq %d\n", | ||
42 | smp_processor_id(), desc->irq_data.irq)) | ||
43 | return false; | ||
44 | |||
45 | #ifdef CONFIG_SMP | ||
46 | do { | ||
47 | raw_spin_unlock(&desc->lock); | ||
48 | while (irqd_irq_inprogress(&desc->irq_data)) | ||
49 | cpu_relax(); | ||
50 | raw_spin_lock(&desc->lock); | ||
51 | } while (irqd_irq_inprogress(&desc->irq_data)); | ||
52 | /* Might have been disabled in meantime */ | ||
53 | return !irqd_irq_disabled(&desc->irq_data) && desc->action; | ||
54 | #else | ||
55 | return false; | ||
56 | #endif | ||
57 | } | ||
58 | |||
22 | 59 | ||
23 | /* | 60 | /* |
24 | * Recovery handler for misrouted interrupts. | 61 | * Recovery handler for misrouted interrupts. |
25 | */ | 62 | */ |
26 | static int try_one_irq(int irq, struct irq_desc *desc) | 63 | static int try_one_irq(int irq, struct irq_desc *desc, bool force) |
27 | { | 64 | { |
65 | irqreturn_t ret = IRQ_NONE; | ||
28 | struct irqaction *action; | 66 | struct irqaction *action; |
29 | int ok = 0, work = 0; | ||
30 | 67 | ||
31 | raw_spin_lock(&desc->lock); | 68 | raw_spin_lock(&desc->lock); |
32 | /* Already running on another processor */ | ||
33 | if (desc->status & IRQ_INPROGRESS) { | ||
34 | /* | ||
35 | * Already running: If it is shared get the other | ||
36 | * CPU to go looking for our mystery interrupt too | ||
37 | */ | ||
38 | if (desc->action && (desc->action->flags & IRQF_SHARED)) | ||
39 | desc->status |= IRQ_PENDING; | ||
40 | raw_spin_unlock(&desc->lock); | ||
41 | return ok; | ||
42 | } | ||
43 | /* Honour the normal IRQ locking */ | ||
44 | desc->status |= IRQ_INPROGRESS; | ||
45 | action = desc->action; | ||
46 | raw_spin_unlock(&desc->lock); | ||
47 | 69 | ||
48 | while (action) { | 70 | /* PER_CPU and nested thread interrupts are never polled */ |
49 | /* Only shared IRQ handlers are safe to call */ | 71 | if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) |
50 | if (action->flags & IRQF_SHARED) { | 72 | goto out; |
51 | if (action->handler(irq, action->dev_id) == | ||
52 | IRQ_HANDLED) | ||
53 | ok = 1; | ||
54 | } | ||
55 | action = action->next; | ||
56 | } | ||
57 | local_irq_disable(); | ||
58 | /* Now clean up the flags */ | ||
59 | raw_spin_lock(&desc->lock); | ||
60 | action = desc->action; | ||
61 | 73 | ||
62 | /* | 74 | /* |
63 | * While we were looking for a fixup someone queued a real | 75 | * Do not poll disabled interrupts unless the spurious |
64 | * IRQ clashing with our walk: | 76 | * disabled poller asks explicitely. |
65 | */ | 77 | */ |
66 | while ((desc->status & IRQ_PENDING) && action) { | 78 | if (irqd_irq_disabled(&desc->irq_data) && !force) |
79 | goto out; | ||
80 | |||
81 | /* | ||
82 | * All handlers must agree on IRQF_SHARED, so we test just the | ||
83 | * first. Check for action->next as well. | ||
84 | */ | ||
85 | action = desc->action; | ||
86 | if (!action || !(action->flags & IRQF_SHARED) || | ||
87 | (action->flags & __IRQF_TIMER) || !action->next) | ||
88 | goto out; | ||
89 | |||
90 | /* Already running on another processor */ | ||
91 | if (irqd_irq_inprogress(&desc->irq_data)) { | ||
67 | /* | 92 | /* |
68 | * Perform real IRQ processing for the IRQ we deferred | 93 | * Already running: If it is shared get the other |
94 | * CPU to go looking for our mystery interrupt too | ||
69 | */ | 95 | */ |
70 | work = 1; | 96 | desc->istate |= IRQS_PENDING; |
71 | raw_spin_unlock(&desc->lock); | 97 | goto out; |
72 | handle_IRQ_event(irq, action); | ||
73 | raw_spin_lock(&desc->lock); | ||
74 | desc->status &= ~IRQ_PENDING; | ||
75 | } | 98 | } |
76 | desc->status &= ~IRQ_INPROGRESS; | ||
77 | /* | ||
78 | * If we did actual work for the real IRQ line we must let the | ||
79 | * IRQ controller clean up too | ||
80 | */ | ||
81 | if (work && desc->chip && desc->chip->end) | ||
82 | desc->chip->end(irq); | ||
83 | raw_spin_unlock(&desc->lock); | ||
84 | 99 | ||
85 | return ok; | 100 | /* Mark it poll in progress */ |
101 | desc->istate |= IRQS_POLL_INPROGRESS; | ||
102 | do { | ||
103 | if (handle_irq_event(desc) == IRQ_HANDLED) | ||
104 | ret = IRQ_HANDLED; | ||
105 | action = desc->action; | ||
106 | } while ((desc->istate & IRQS_PENDING) && action); | ||
107 | desc->istate &= ~IRQS_POLL_INPROGRESS; | ||
108 | out: | ||
109 | raw_spin_unlock(&desc->lock); | ||
110 | return ret == IRQ_HANDLED; | ||
86 | } | 111 | } |
87 | 112 | ||
88 | static int misrouted_irq(int irq) | 113 | static int misrouted_irq(int irq) |
@@ -90,6 +115,11 @@ static int misrouted_irq(int irq) | |||
90 | struct irq_desc *desc; | 115 | struct irq_desc *desc; |
91 | int i, ok = 0; | 116 | int i, ok = 0; |
92 | 117 | ||
118 | if (atomic_inc_return(&irq_poll_active) != 1) | ||
119 | goto out; | ||
120 | |||
121 | irq_poll_cpu = smp_processor_id(); | ||
122 | |||
93 | for_each_irq_desc(i, desc) { | 123 | for_each_irq_desc(i, desc) { |
94 | if (!i) | 124 | if (!i) |
95 | continue; | 125 | continue; |
@@ -97,9 +127,11 @@ static int misrouted_irq(int irq) | |||
97 | if (i == irq) /* Already tried */ | 127 | if (i == irq) /* Already tried */ |
98 | continue; | 128 | continue; |
99 | 129 | ||
100 | if (try_one_irq(i, desc)) | 130 | if (try_one_irq(i, desc, false)) |
101 | ok = 1; | 131 | ok = 1; |
102 | } | 132 | } |
133 | out: | ||
134 | atomic_dec(&irq_poll_active); | ||
103 | /* So the caller can adjust the irq error counts */ | 135 | /* So the caller can adjust the irq error counts */ |
104 | return ok; | 136 | return ok; |
105 | } | 137 | } |
@@ -109,27 +141,39 @@ static void poll_spurious_irqs(unsigned long dummy) | |||
109 | struct irq_desc *desc; | 141 | struct irq_desc *desc; |
110 | int i; | 142 | int i; |
111 | 143 | ||
144 | if (atomic_inc_return(&irq_poll_active) != 1) | ||
145 | goto out; | ||
146 | irq_poll_cpu = smp_processor_id(); | ||
147 | |||
112 | for_each_irq_desc(i, desc) { | 148 | for_each_irq_desc(i, desc) { |
113 | unsigned int status; | 149 | unsigned int state; |
114 | 150 | ||
115 | if (!i) | 151 | if (!i) |
116 | continue; | 152 | continue; |
117 | 153 | ||
118 | /* Racy but it doesn't matter */ | 154 | /* Racy but it doesn't matter */ |
119 | status = desc->status; | 155 | state = desc->istate; |
120 | barrier(); | 156 | barrier(); |
121 | if (!(status & IRQ_SPURIOUS_DISABLED)) | 157 | if (!(state & IRQS_SPURIOUS_DISABLED)) |
122 | continue; | 158 | continue; |
123 | 159 | ||
124 | local_irq_disable(); | 160 | local_irq_disable(); |
125 | try_one_irq(i, desc); | 161 | try_one_irq(i, desc, true); |
126 | local_irq_enable(); | 162 | local_irq_enable(); |
127 | } | 163 | } |
128 | 164 | out: | |
165 | atomic_dec(&irq_poll_active); | ||
129 | mod_timer(&poll_spurious_irq_timer, | 166 | mod_timer(&poll_spurious_irq_timer, |
130 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); | 167 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); |
131 | } | 168 | } |
132 | 169 | ||
170 | static inline int bad_action_ret(irqreturn_t action_ret) | ||
171 | { | ||
172 | if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) | ||
173 | return 0; | ||
174 | return 1; | ||
175 | } | ||
176 | |||
133 | /* | 177 | /* |
134 | * If 99,900 of the previous 100,000 interrupts have not been handled | 178 | * If 99,900 of the previous 100,000 interrupts have not been handled |
135 | * then assume that the IRQ is stuck in some manner. Drop a diagnostic | 179 | * then assume that the IRQ is stuck in some manner. Drop a diagnostic |
@@ -137,17 +181,15 @@ static void poll_spurious_irqs(unsigned long dummy) | |||
137 | * | 181 | * |
138 | * (The other 100-of-100,000 interrupts may have been a correctly | 182 | * (The other 100-of-100,000 interrupts may have been a correctly |
139 | * functioning device sharing an IRQ with the failing one) | 183 | * functioning device sharing an IRQ with the failing one) |
140 | * | ||
141 | * Called under desc->lock | ||
142 | */ | 184 | */ |
143 | |||
144 | static void | 185 | static void |
145 | __report_bad_irq(unsigned int irq, struct irq_desc *desc, | 186 | __report_bad_irq(unsigned int irq, struct irq_desc *desc, |
146 | irqreturn_t action_ret) | 187 | irqreturn_t action_ret) |
147 | { | 188 | { |
148 | struct irqaction *action; | 189 | struct irqaction *action; |
190 | unsigned long flags; | ||
149 | 191 | ||
150 | if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { | 192 | if (bad_action_ret(action_ret)) { |
151 | printk(KERN_ERR "irq event %d: bogus return value %x\n", | 193 | printk(KERN_ERR "irq event %d: bogus return value %x\n", |
152 | irq, action_ret); | 194 | irq, action_ret); |
153 | } else { | 195 | } else { |
@@ -157,14 +199,23 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, | |||
157 | dump_stack(); | 199 | dump_stack(); |
158 | printk(KERN_ERR "handlers:\n"); | 200 | printk(KERN_ERR "handlers:\n"); |
159 | 201 | ||
202 | /* | ||
203 | * We need to take desc->lock here. note_interrupt() is called | ||
204 | * w/o desc->lock held, but IRQ_PROGRESS set. We might race | ||
205 | * with something else removing an action. It's ok to take | ||
206 | * desc->lock here. See synchronize_irq(). | ||
207 | */ | ||
208 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
160 | action = desc->action; | 209 | action = desc->action; |
161 | while (action) { | 210 | while (action) { |
162 | printk(KERN_ERR "[<%p>]", action->handler); | 211 | printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); |
163 | print_symbol(" (%s)", | 212 | if (action->thread_fn) |
164 | (unsigned long)action->handler); | 213 | printk(KERN_CONT " threaded [<%p>] %pf", |
165 | printk("\n"); | 214 | action->thread_fn, action->thread_fn); |
215 | printk(KERN_CONT "\n"); | ||
166 | action = action->next; | 216 | action = action->next; |
167 | } | 217 | } |
218 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
168 | } | 219 | } |
169 | 220 | ||
170 | static void | 221 | static void |
@@ -216,7 +267,19 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, | |||
216 | void note_interrupt(unsigned int irq, struct irq_desc *desc, | 267 | void note_interrupt(unsigned int irq, struct irq_desc *desc, |
217 | irqreturn_t action_ret) | 268 | irqreturn_t action_ret) |
218 | { | 269 | { |
219 | if (unlikely(action_ret != IRQ_HANDLED)) { | 270 | if (desc->istate & IRQS_POLL_INPROGRESS) |
271 | return; | ||
272 | |||
273 | /* we get here again via the threaded handler */ | ||
274 | if (action_ret == IRQ_WAKE_THREAD) | ||
275 | return; | ||
276 | |||
277 | if (bad_action_ret(action_ret)) { | ||
278 | report_bad_irq(irq, desc, action_ret); | ||
279 | return; | ||
280 | } | ||
281 | |||
282 | if (unlikely(action_ret == IRQ_NONE)) { | ||
220 | /* | 283 | /* |
221 | * If we are seeing only the odd spurious IRQ caused by | 284 | * If we are seeing only the odd spurious IRQ caused by |
222 | * bus asynchronicity then don't eventually trigger an error, | 285 | * bus asynchronicity then don't eventually trigger an error, |
@@ -228,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
228 | else | 291 | else |
229 | desc->irqs_unhandled++; | 292 | desc->irqs_unhandled++; |
230 | desc->last_unhandled = jiffies; | 293 | desc->last_unhandled = jiffies; |
231 | if (unlikely(action_ret != IRQ_NONE)) | ||
232 | report_bad_irq(irq, desc, action_ret); | ||
233 | } | 294 | } |
234 | 295 | ||
235 | if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { | 296 | if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { |
@@ -252,9 +313,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
252 | * Now kill the IRQ | 313 | * Now kill the IRQ |
253 | */ | 314 | */ |
254 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); | 315 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); |
255 | desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; | 316 | desc->istate |= IRQS_SPURIOUS_DISABLED; |
256 | desc->depth++; | 317 | desc->depth++; |
257 | desc->chip->disable(irq); | 318 | irq_disable(desc); |
258 | 319 | ||
259 | mod_timer(&poll_spurious_irq_timer, | 320 | mod_timer(&poll_spurious_irq_timer, |
260 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); | 321 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); |
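
Both misrouted_irq() and poll_spurious_irqs() now guard the descriptor walk with irq_poll_active so that only one CPU polls at a time, and irq_wait_for_poll() lets a regular handler wait for that poller to finish. The single-poller gate itself, sketched with C11 atomics (illustrative only; the kernel uses its own atomic_t API):

    #include <stdatomic.h>

    static atomic_int poll_active;

    static void poll_all_lines(void)
    {
            /* Only the first CPU to bump the counter does the walk;
             * everyone else backs off immediately. */
            if (atomic_fetch_add(&poll_active, 1) + 1 != 1)
                    goto out;

            /* ... walk the descriptors and poll each one ... */
    out:
            atomic_fetch_sub(&poll_active, 1);
    }
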
diff --git a/kernel/irq_work.c b/kernel/irq_work.c new file mode 100644 index 000000000000..c58fa7da8aef --- /dev/null +++ b/kernel/irq_work.c | |||
@@ -0,0 +1,166 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
3 | * | ||
4 | * Provides a framework for enqueueing and running callbacks from hardirq | ||
5 | * context. The enqueueing is NMI-safe. | ||
6 | */ | ||
7 | |||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/irq_work.h> | ||
11 | #include <linux/hardirq.h> | ||
12 | |||
13 | /* | ||
14 | * An entry can be in one of four states: | ||
15 | * | ||
16 | * free NULL, 0 -> {claimed} : free to be used | ||
17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued | ||
18 | * pending next, 3 -> {busy} : queued, pending callback | ||
19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed | ||
20 | * | ||
21 | * We use the lower two bits of the next pointer to keep PENDING and BUSY | ||
22 | * flags. | ||
23 | */ | ||
24 | |||
25 | #define IRQ_WORK_PENDING 1UL | ||
26 | #define IRQ_WORK_BUSY 2UL | ||
27 | #define IRQ_WORK_FLAGS 3UL | ||
28 | |||
29 | static inline bool irq_work_is_set(struct irq_work *entry, int flags) | ||
30 | { | ||
31 | return (unsigned long)entry->next & flags; | ||
32 | } | ||
33 | |||
34 | static inline struct irq_work *irq_work_next(struct irq_work *entry) | ||
35 | { | ||
36 | unsigned long next = (unsigned long)entry->next; | ||
37 | next &= ~IRQ_WORK_FLAGS; | ||
38 | return (struct irq_work *)next; | ||
39 | } | ||
40 | |||
41 | static inline struct irq_work *next_flags(struct irq_work *entry, int flags) | ||
42 | { | ||
43 | unsigned long next = (unsigned long)entry; | ||
44 | next |= flags; | ||
45 | return (struct irq_work *)next; | ||
46 | } | ||
47 | |||
48 | static DEFINE_PER_CPU(struct irq_work *, irq_work_list); | ||
49 | |||
50 | /* | ||
51 | * Claim the entry so that no one else will poke at it. | ||
52 | */ | ||
53 | static bool irq_work_claim(struct irq_work *entry) | ||
54 | { | ||
55 | struct irq_work *next, *nflags; | ||
56 | |||
57 | do { | ||
58 | next = entry->next; | ||
59 | if ((unsigned long)next & IRQ_WORK_PENDING) | ||
60 | return false; | ||
61 | nflags = next_flags(next, IRQ_WORK_FLAGS); | ||
62 | } while (cmpxchg(&entry->next, next, nflags) != next); | ||
63 | |||
64 | return true; | ||
65 | } | ||
66 | |||
67 | |||
68 | void __weak arch_irq_work_raise(void) | ||
69 | { | ||
70 | /* | ||
71 | * Lame architectures will get the timer tick callback | ||
72 | */ | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Queue the entry and raise the IPI if needed. | ||
77 | */ | ||
78 | static void __irq_work_queue(struct irq_work *entry) | ||
79 | { | ||
80 | struct irq_work *next; | ||
81 | |||
82 | preempt_disable(); | ||
83 | |||
84 | do { | ||
85 | next = __this_cpu_read(irq_work_list); | ||
86 | /* Can assign non-atomic because we keep the flags set. */ | ||
87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | ||
88 | } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); | ||
89 | |||
90 | /* The list was empty, raise self-interrupt to start processing. */ | ||
91 | if (!irq_work_next(entry)) | ||
92 | arch_irq_work_raise(); | ||
93 | |||
94 | preempt_enable(); | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * Enqueue the irq_work @entry, returns true on success, failure when the | ||
99 | * @entry was already enqueued by someone else. | ||
100 | * | ||
101 | * Can be re-enqueued while the callback is still in progress. | ||
102 | */ | ||
103 | bool irq_work_queue(struct irq_work *entry) | ||
104 | { | ||
105 | if (!irq_work_claim(entry)) { | ||
106 | /* | ||
107 | * Already enqueued, can't do! | ||
108 | */ | ||
109 | return false; | ||
110 | } | ||
111 | |||
112 | __irq_work_queue(entry); | ||
113 | return true; | ||
114 | } | ||
115 | EXPORT_SYMBOL_GPL(irq_work_queue); | ||
116 | |||
117 | /* | ||
118 | * Run the irq_work entries on this cpu. Requires to be ran from hardirq | ||
119 | * context with local IRQs disabled. | ||
120 | */ | ||
121 | void irq_work_run(void) | ||
122 | { | ||
123 | struct irq_work *list; | ||
124 | |||
125 | if (this_cpu_read(irq_work_list) == NULL) | ||
126 | return; | ||
127 | |||
128 | BUG_ON(!in_irq()); | ||
129 | BUG_ON(!irqs_disabled()); | ||
130 | |||
131 | list = this_cpu_xchg(irq_work_list, NULL); | ||
132 | |||
133 | while (list != NULL) { | ||
134 | struct irq_work *entry = list; | ||
135 | |||
136 | list = irq_work_next(list); | ||
137 | |||
138 | /* | ||
139 | * Clear the PENDING bit, after this point the @entry | ||
140 | * can be re-used. | ||
141 | */ | ||
142 | entry->next = next_flags(NULL, IRQ_WORK_BUSY); | ||
143 | entry->func(entry); | ||
144 | /* | ||
145 | * Clear the BUSY bit and return to the free state if | ||
146 | * no-one else claimed it meanwhile. | ||
147 | */ | ||
148 | (void)cmpxchg(&entry->next, | ||
149 | next_flags(NULL, IRQ_WORK_BUSY), | ||
150 | NULL); | ||
151 | } | ||
152 | } | ||
153 | EXPORT_SYMBOL_GPL(irq_work_run); | ||
154 | |||
155 | /* | ||
156 | * Synchronize against the irq_work @entry, ensures the entry is not | ||
157 | * currently in use. | ||
158 | */ | ||
159 | void irq_work_sync(struct irq_work *entry) | ||
160 | { | ||
161 | WARN_ON_ONCE(irqs_disabled()); | ||
162 | |||
163 | while (irq_work_is_set(entry, IRQ_WORK_BUSY)) | ||
164 | cpu_relax(); | ||
165 | } | ||
166 | EXPORT_SYMBOL_GPL(irq_work_sync); | ||
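
irq_work.c keeps the PENDING/BUSY state in the two low bits of the ->next pointer, so claiming an entry is a single compare-and-swap and the whole queue stays NMI-safe. The encoding idea in standalone C11 form (illustrative; the kernel operates on the raw pointer with cmpxchg()):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define WORK_PENDING 1UL
    #define WORK_BUSY    2UL
    #define WORK_FLAGS   3UL

    struct work {
            _Atomic uintptr_t next;   /* next pointer | flag bits */
    };

    static struct work *work_next(uintptr_t v)
    {
            return (struct work *)(v & ~WORK_FLAGS);
    }

    /* Claim the entry: set PENDING|BUSY unless PENDING is already set. */
    static bool work_claim(struct work *w)
    {
            uintptr_t old = atomic_load(&w->next);

            do {
                    if (old & WORK_PENDING)
                            return false;     /* someone else queued it */
            } while (!atomic_compare_exchange_weak(&w->next, &old,
                                                   old | WORK_FLAGS));
            return true;
    }
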
diff --git a/kernel/jump_label.c b/kernel/jump_label.c new file mode 100644 index 000000000000..a8ce45097f3d --- /dev/null +++ b/kernel/jump_label.c | |||
@@ -0,0 +1,393 @@ | |||
1 | /* | ||
2 | * jump label support | ||
3 | * | ||
4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> | ||
5 | * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com> | ||
6 | * | ||
7 | */ | ||
8 | #include <linux/memory.h> | ||
9 | #include <linux/uaccess.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/list.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/sort.h> | ||
14 | #include <linux/err.h> | ||
15 | #include <linux/jump_label.h> | ||
16 | |||
17 | #ifdef HAVE_JUMP_LABEL | ||
18 | |||
19 | /* mutex to protect coming/going of the jump_label table */ | ||
20 | static DEFINE_MUTEX(jump_label_mutex); | ||
21 | |||
22 | void jump_label_lock(void) | ||
23 | { | ||
24 | mutex_lock(&jump_label_mutex); | ||
25 | } | ||
26 | |||
27 | void jump_label_unlock(void) | ||
28 | { | ||
29 | mutex_unlock(&jump_label_mutex); | ||
30 | } | ||
31 | |||
32 | bool jump_label_enabled(struct jump_label_key *key) | ||
33 | { | ||
34 | return !!atomic_read(&key->enabled); | ||
35 | } | ||
36 | |||
37 | static int jump_label_cmp(const void *a, const void *b) | ||
38 | { | ||
39 | const struct jump_entry *jea = a; | ||
40 | const struct jump_entry *jeb = b; | ||
41 | |||
42 | if (jea->key < jeb->key) | ||
43 | return -1; | ||
44 | |||
45 | if (jea->key > jeb->key) | ||
46 | return 1; | ||
47 | |||
48 | return 0; | ||
49 | } | ||
50 | |||
51 | static void | ||
52 | jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) | ||
53 | { | ||
54 | unsigned long size; | ||
55 | |||
56 | size = (((unsigned long)stop - (unsigned long)start) | ||
57 | / sizeof(struct jump_entry)); | ||
58 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); | ||
59 | } | ||
60 | |||
61 | static void jump_label_update(struct jump_label_key *key, int enable); | ||
62 | |||
63 | void jump_label_inc(struct jump_label_key *key) | ||
64 | { | ||
65 | if (atomic_inc_not_zero(&key->enabled)) | ||
66 | return; | ||
67 | |||
68 | jump_label_lock(); | ||
69 | if (atomic_add_return(1, &key->enabled) == 1) | ||
70 | jump_label_update(key, JUMP_LABEL_ENABLE); | ||
71 | jump_label_unlock(); | ||
72 | } | ||
73 | |||
74 | void jump_label_dec(struct jump_label_key *key) | ||
75 | { | ||
76 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) | ||
77 | return; | ||
78 | |||
79 | jump_label_update(key, JUMP_LABEL_DISABLE); | ||
80 | jump_label_unlock(); | ||
81 | } | ||
82 | |||
83 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) | ||
84 | { | ||
85 | if (entry->code <= (unsigned long)end && | ||
86 | entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) | ||
87 | return 1; | ||
88 | |||
89 | return 0; | ||
90 | } | ||
91 | |||
92 | static int __jump_label_text_reserved(struct jump_entry *iter_start, | ||
93 | struct jump_entry *iter_stop, void *start, void *end) | ||
94 | { | ||
95 | struct jump_entry *iter; | ||
96 | |||
97 | iter = iter_start; | ||
98 | while (iter < iter_stop) { | ||
99 | if (addr_conflict(iter, start, end)) | ||
100 | return 1; | ||
101 | iter++; | ||
102 | } | ||
103 | |||
104 | return 0; | ||
105 | } | ||
106 | |||
107 | static void __jump_label_update(struct jump_label_key *key, | ||
108 | struct jump_entry *entry, | ||
109 | struct jump_entry *stop, int enable) | ||
110 | { | ||
111 | for (; (entry < stop) && | ||
112 | (entry->key == (jump_label_t)(unsigned long)key); | ||
113 | entry++) { | ||
114 | /* | ||
115 | * entry->code set to 0 invalidates module init text sections | ||
116 | * kernel_text_address() verifies we are not in core kernel | ||
117 | * init code, see jump_label_invalidate_module_init(). | ||
118 | */ | ||
119 | if (entry->code && kernel_text_address(entry->code)) | ||
120 | arch_jump_label_transform(entry, enable); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Not all archs need this. | ||
126 | */ | ||
127 | void __weak arch_jump_label_text_poke_early(jump_label_t addr) | ||
128 | { | ||
129 | } | ||
130 | |||
131 | static __init int jump_label_init(void) | ||
132 | { | ||
133 | struct jump_entry *iter_start = __start___jump_table; | ||
134 | struct jump_entry *iter_stop = __stop___jump_table; | ||
135 | struct jump_label_key *key = NULL; | ||
136 | struct jump_entry *iter; | ||
137 | |||
138 | jump_label_lock(); | ||
139 | jump_label_sort_entries(iter_start, iter_stop); | ||
140 | |||
141 | for (iter = iter_start; iter < iter_stop; iter++) { | ||
142 | arch_jump_label_text_poke_early(iter->code); | ||
143 | if (iter->key == (jump_label_t)(unsigned long)key) | ||
144 | continue; | ||
145 | |||
146 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
147 | atomic_set(&key->enabled, 0); | ||
148 | key->entries = iter; | ||
149 | #ifdef CONFIG_MODULES | ||
150 | key->next = NULL; | ||
151 | #endif | ||
152 | } | ||
153 | jump_label_unlock(); | ||
154 | |||
155 | return 0; | ||
156 | } | ||
157 | early_initcall(jump_label_init); | ||
158 | |||
159 | #ifdef CONFIG_MODULES | ||
160 | |||
161 | struct jump_label_mod { | ||
162 | struct jump_label_mod *next; | ||
163 | struct jump_entry *entries; | ||
164 | struct module *mod; | ||
165 | }; | ||
166 | |||
167 | static int __jump_label_mod_text_reserved(void *start, void *end) | ||
168 | { | ||
169 | struct module *mod; | ||
170 | |||
171 | mod = __module_text_address((unsigned long)start); | ||
172 | if (!mod) | ||
173 | return 0; | ||
174 | |||
175 | WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); | ||
176 | |||
177 | return __jump_label_text_reserved(mod->jump_entries, | ||
178 | mod->jump_entries + mod->num_jump_entries, | ||
179 | start, end); | ||
180 | } | ||
181 | |||
182 | static void __jump_label_mod_update(struct jump_label_key *key, int enable) | ||
183 | { | ||
184 | struct jump_label_mod *mod = key->next; | ||
185 | |||
186 | while (mod) { | ||
187 | struct module *m = mod->mod; | ||
188 | |||
189 | __jump_label_update(key, mod->entries, | ||
190 | m->jump_entries + m->num_jump_entries, | ||
191 | enable); | ||
192 | mod = mod->next; | ||
193 | } | ||
194 | } | ||
195 | |||
196 | /** | ||
197 | * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop() | ||
198 | * @mod: module to patch | ||
199 | * | ||
200 | * Allow for run-time selection of the optimal nops. Before the module | ||
201 | * loads, patch these with arch_get_jump_label_nop(), which is specified by | ||
202 | * the arch-specific jump label code. | ||
203 | */ | ||
204 | void jump_label_apply_nops(struct module *mod) | ||
205 | { | ||
206 | struct jump_entry *iter_start = mod->jump_entries; | ||
207 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; | ||
208 | struct jump_entry *iter; | ||
209 | |||
210 | /* if the module doesn't have jump label entries, just return */ | ||
211 | if (iter_start == iter_stop) | ||
212 | return; | ||
213 | |||
214 | for (iter = iter_start; iter < iter_stop; iter++) | ||
215 | arch_jump_label_text_poke_early(iter->code); | ||
216 | } | ||
217 | |||
218 | static int jump_label_add_module(struct module *mod) | ||
219 | { | ||
220 | struct jump_entry *iter_start = mod->jump_entries; | ||
221 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; | ||
222 | struct jump_entry *iter; | ||
223 | struct jump_label_key *key = NULL; | ||
224 | struct jump_label_mod *jlm; | ||
225 | |||
226 | /* if the module doesn't have jump label entries, just return */ | ||
227 | if (iter_start == iter_stop) | ||
228 | return 0; | ||
229 | |||
230 | jump_label_sort_entries(iter_start, iter_stop); | ||
231 | |||
232 | for (iter = iter_start; iter < iter_stop; iter++) { | ||
233 | if (iter->key == (jump_label_t)(unsigned long)key) | ||
234 | continue; | ||
235 | |||
236 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
237 | |||
238 | if (__module_address(iter->key) == mod) { | ||
239 | atomic_set(&key->enabled, 0); | ||
240 | key->entries = iter; | ||
241 | key->next = NULL; | ||
242 | continue; | ||
243 | } | ||
244 | |||
245 | jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); | ||
246 | if (!jlm) | ||
247 | return -ENOMEM; | ||
248 | |||
249 | jlm->mod = mod; | ||
250 | jlm->entries = iter; | ||
251 | jlm->next = key->next; | ||
252 | key->next = jlm; | ||
253 | |||
254 | if (jump_label_enabled(key)) | ||
255 | __jump_label_update(key, iter, iter_stop, | ||
256 | JUMP_LABEL_ENABLE); | ||
257 | } | ||
258 | |||
259 | return 0; | ||
260 | } | ||
261 | |||
262 | static void jump_label_del_module(struct module *mod) | ||
263 | { | ||
264 | struct jump_entry *iter_start = mod->jump_entries; | ||
265 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; | ||
266 | struct jump_entry *iter; | ||
267 | struct jump_label_key *key = NULL; | ||
268 | struct jump_label_mod *jlm, **prev; | ||
269 | |||
270 | for (iter = iter_start; iter < iter_stop; iter++) { | ||
271 | if (iter->key == (jump_label_t)(unsigned long)key) | ||
272 | continue; | ||
273 | |||
274 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
275 | |||
276 | if (__module_address(iter->key) == mod) | ||
277 | continue; | ||
278 | |||
279 | prev = &key->next; | ||
280 | jlm = key->next; | ||
281 | |||
282 | while (jlm && jlm->mod != mod) { | ||
283 | prev = &jlm->next; | ||
284 | jlm = jlm->next; | ||
285 | } | ||
286 | |||
287 | if (jlm) { | ||
288 | *prev = jlm->next; | ||
289 | kfree(jlm); | ||
290 | } | ||
291 | } | ||
292 | } | ||
293 | |||
294 | static void jump_label_invalidate_module_init(struct module *mod) | ||
295 | { | ||
296 | struct jump_entry *iter_start = mod->jump_entries; | ||
297 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; | ||
298 | struct jump_entry *iter; | ||
299 | |||
300 | for (iter = iter_start; iter < iter_stop; iter++) { | ||
301 | if (within_module_init(iter->code, mod)) | ||
302 | iter->code = 0; | ||
303 | } | ||
304 | } | ||
305 | |||
306 | static int | ||
307 | jump_label_module_notify(struct notifier_block *self, unsigned long val, | ||
308 | void *data) | ||
309 | { | ||
310 | struct module *mod = data; | ||
311 | int ret = 0; | ||
312 | |||
313 | switch (val) { | ||
314 | case MODULE_STATE_COMING: | ||
315 | jump_label_lock(); | ||
316 | ret = jump_label_add_module(mod); | ||
317 | if (ret) | ||
318 | jump_label_del_module(mod); | ||
319 | jump_label_unlock(); | ||
320 | break; | ||
321 | case MODULE_STATE_GOING: | ||
322 | jump_label_lock(); | ||
323 | jump_label_del_module(mod); | ||
324 | jump_label_unlock(); | ||
325 | break; | ||
326 | case MODULE_STATE_LIVE: | ||
327 | jump_label_lock(); | ||
328 | jump_label_invalidate_module_init(mod); | ||
329 | jump_label_unlock(); | ||
330 | break; | ||
331 | } | ||
332 | |||
333 | return notifier_from_errno(ret); | ||
334 | } | ||
335 | |||
336 | struct notifier_block jump_label_module_nb = { | ||
337 | .notifier_call = jump_label_module_notify, | ||
338 | .priority = 1, /* higher than tracepoints */ | ||
339 | }; | ||
340 | |||
341 | static __init int jump_label_init_module(void) | ||
342 | { | ||
343 | return register_module_notifier(&jump_label_module_nb); | ||
344 | } | ||
345 | early_initcall(jump_label_init_module); | ||
346 | |||
347 | #endif /* CONFIG_MODULES */ | ||
348 | |||
349 | /** | ||
350 | * jump_label_text_reserved - check if addr range is reserved | ||
351 | * @start: start text addr | ||
352 | * @end: end text addr | ||
353 | * | ||
354 | * Checks whether the text addresses between @start and @end | ||
355 | * overlap with any of the jump label patch addresses. Code | ||
356 | * that wants to modify kernel text should first verify that | ||
357 | * it does not overlap with any of the jump label addresses. | ||
358 | * Caller must hold jump_label_mutex. | ||
359 | * | ||
360 | * returns 1 if there is an overlap, 0 otherwise | ||
361 | */ | ||
362 | int jump_label_text_reserved(void *start, void *end) | ||
363 | { | ||
364 | int ret = __jump_label_text_reserved(__start___jump_table, | ||
365 | __stop___jump_table, start, end); | ||
366 | |||
367 | if (ret) | ||
368 | return ret; | ||
369 | |||
370 | #ifdef CONFIG_MODULES | ||
371 | ret = __jump_label_mod_text_reserved(start, end); | ||
372 | #endif | ||
373 | return ret; | ||
374 | } | ||
375 | |||
376 | static void jump_label_update(struct jump_label_key *key, int enable) | ||
377 | { | ||
378 | struct jump_entry *entry = key->entries, *stop = __stop___jump_table; | ||
379 | |||
380 | #ifdef CONFIG_MODULES | ||
381 | struct module *mod = __module_address((jump_label_t)key); | ||
382 | |||
383 | __jump_label_mod_update(key, enable); | ||
384 | |||
385 | if (mod) | ||
386 | stop = mod->jump_entries + mod->num_jump_entries; | ||
387 | #endif | ||
388 | /* if there are no users, entry can be NULL */ | ||
389 | if (entry) | ||
390 | __jump_label_update(key, entry, stop, enable); | ||
391 | } | ||
392 | |||
393 | #endif | ||
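For context, a minimal consumer of the interface implemented above might look like the sketch below, assuming the Linux 3.0-era API (struct jump_label_key, static_branch(), jump_label_inc()/jump_label_dec()). The names my_feature_key, my_fast_path() and do_expensive_tracing() are illustrative only and not part of this patch.

    #include <linux/jump_label.h>

    static struct jump_label_key my_feature_key;    /* zero-initialized: disabled */

    /* do_expensive_tracing() is a hypothetical slow path. */
    extern void do_expensive_tracing(void);

    void my_fast_path(void)
    {
            /* Compiled as a NOP until the key is enabled. */
            if (static_branch(&my_feature_key))
                    do_expensive_tracing();
    }

    void my_feature_enable(void)
    {
            jump_label_inc(&my_feature_key);        /* reference-counted enable */
    }

    void my_feature_disable(void)
    {
            jump_label_dec(&my_feature_key);        /* back to NOP when the count hits 0 */
    }

The branch stays a no-op until the first jump_label_inc() drives the count from 0 to 1, at which point jump_label_update() above patches the site into a jump.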
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..079f1d39a8b8 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr) | |||
64 | if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || | 64 | if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || |
65 | arch_is_kernel_text(addr)) | 65 | arch_is_kernel_text(addr)) |
66 | return 1; | 66 | return 1; |
67 | return in_gate_area_no_task(addr); | 67 | return in_gate_area_no_mm(addr); |
68 | } | 68 | } |
69 | 69 | ||
70 | static inline int is_kernel(unsigned long addr) | 70 | static inline int is_kernel(unsigned long addr) |
71 | { | 71 | { |
72 | if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) | 72 | if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) |
73 | return 1; | 73 | return 1; |
74 | return in_gate_area_no_task(addr); | 74 | return in_gate_area_no_mm(addr); |
75 | } | 75 | } |
76 | 76 | ||
77 | static int is_ksym_addr(unsigned long addr) | 77 | static int is_ksym_addr(unsigned long addr) |
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, | |||
342 | } | 342 | } |
343 | 343 | ||
344 | /* Look up a kernel symbol and return it in a text buffer. */ | 344 | /* Look up a kernel symbol and return it in a text buffer. */ |
345 | int sprint_symbol(char *buffer, unsigned long address) | 345 | static int __sprint_symbol(char *buffer, unsigned long address, |
346 | int symbol_offset) | ||
346 | { | 347 | { |
347 | char *modname; | 348 | char *modname; |
348 | const char *name; | 349 | const char *name; |
349 | unsigned long offset, size; | 350 | unsigned long offset, size; |
350 | int len; | 351 | int len; |
351 | 352 | ||
353 | address += symbol_offset; | ||
352 | name = kallsyms_lookup(address, &size, &offset, &modname, buffer); | 354 | name = kallsyms_lookup(address, &size, &offset, &modname, buffer); |
353 | if (!name) | 355 | if (!name) |
354 | return sprintf(buffer, "0x%lx", address); | 356 | return sprintf(buffer, "0x%lx", address); |
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address) | |||
357 | strcpy(buffer, name); | 359 | strcpy(buffer, name); |
358 | len = strlen(buffer); | 360 | len = strlen(buffer); |
359 | buffer += len; | 361 | buffer += len; |
362 | offset -= symbol_offset; | ||
360 | 363 | ||
361 | if (modname) | 364 | if (modname) |
362 | len += sprintf(buffer, "+%#lx/%#lx [%s]", | 365 | len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); |
363 | offset, size, modname); | ||
364 | else | 366 | else |
365 | len += sprintf(buffer, "+%#lx/%#lx", offset, size); | 367 | len += sprintf(buffer, "+%#lx/%#lx", offset, size); |
366 | 368 | ||
367 | return len; | 369 | return len; |
368 | } | 370 | } |
371 | |||
372 | /** | ||
373 | * sprint_symbol - Look up a kernel symbol and return it in a text buffer | ||
374 | * @buffer: buffer to be stored | ||
375 | * @address: address to lookup | ||
376 | * | ||
377 | * This function looks up a kernel symbol with @address and stores its name, | ||
378 | * offset, size and module name to @buffer if possible. If no symbol was found, | ||
379 | * just saves its @address as is. | ||
380 | * | ||
381 | * This function returns the number of bytes stored in @buffer. | ||
382 | */ | ||
383 | int sprint_symbol(char *buffer, unsigned long address) | ||
384 | { | ||
385 | return __sprint_symbol(buffer, address, 0); | ||
386 | } | ||
387 | |||
369 | EXPORT_SYMBOL_GPL(sprint_symbol); | 388 | EXPORT_SYMBOL_GPL(sprint_symbol); |
370 | 389 | ||
390 | /** | ||
391 | * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer | ||
392 | * @buffer: buffer to be stored | ||
393 | * @address: address to lookup | ||
394 | * | ||
395 | * This function is for stack backtrace and does the same thing as | ||
396 | * sprint_symbol() but with modified/decreased @address. If there is a | ||
397 | * tail-call to a function marked "noreturn", gcc may optimize out the code after | ||
398 | * the call, so the stack-saved return address could point outside of the | ||
399 | * caller. This function ensures that kallsyms will find the original caller | ||
400 | * by decreasing @address. | ||
401 | * | ||
402 | * This function returns the number of bytes stored in @buffer. | ||
403 | */ | ||
404 | int sprint_backtrace(char *buffer, unsigned long address) | ||
405 | { | ||
406 | return __sprint_symbol(buffer, address, -1); | ||
407 | } | ||
408 | |||
371 | /* Look up a kernel symbol and print it to the kernel messages. */ | 409 | /* Look up a kernel symbol and print it to the kernel messages. */ |
372 | void __print_symbol(const char *fmt, unsigned long address) | 410 | void __print_symbol(const char *fmt, unsigned long address) |
373 | { | 411 | { |
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p) | |||
477 | */ | 515 | */ |
478 | type = iter->exported ? toupper(iter->type) : | 516 | type = iter->exported ? toupper(iter->type) : |
479 | tolower(iter->type); | 517 | tolower(iter->type); |
480 | seq_printf(m, "%0*lx %c %s\t[%s]\n", | 518 | seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, |
481 | (int)(2 * sizeof(void *)), | 519 | type, iter->name, iter->module_name); |
482 | iter->value, type, iter->name, iter->module_name); | ||
483 | } else | 520 | } else |
484 | seq_printf(m, "%0*lx %c %s\n", | 521 | seq_printf(m, "%pK %c %s\n", (void *)iter->value, |
485 | (int)(2 * sizeof(void *)), | 522 | iter->type, iter->name); |
486 | iter->value, iter->type, iter->name); | ||
487 | return 0; | 523 | return 0; |
488 | } | 524 | } |
489 | 525 | ||
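A rough sketch of how the two helpers touched above are typically used from other kernel code; report_addr() and its arguments are hypothetical, and KSYM_SYMBOL_LEN comes from <linux/kallsyms.h>.

    #include <linux/kernel.h>
    #include <linux/kallsyms.h>

    static void report_addr(unsigned long addr, unsigned long ret_addr)
    {
            char sym[KSYM_SYMBOL_LEN];

            /* "name+0x1a/0x240 [module]" on success, "0x..." if no symbol found */
            sprint_symbol(sym, addr);
            printk(KERN_INFO "hit %s\n", sym);

            /* For stack-saved return addresses: the -1 bias applied by
             * sprint_backtrace() keeps a tail-call to a noreturn function
             * from being attributed to the wrong caller. */
            sprint_backtrace(sym, ret_addr);
            printk(KERN_INFO "called from %s\n", sym);
    }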
diff --git a/kernel/kexec.c b/kernel/kexec.c index c0613f7d6730..8d814cbc8109 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/kmsg_dump.h> | 35 | #include <linux/kmsg_dump.h> |
36 | #include <linux/syscore_ops.h> | ||
36 | 37 | ||
37 | #include <asm/page.h> | 38 | #include <asm/page.h> |
38 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
144 | /* Initialize the list of destination pages */ | 145 | /* Initialize the list of destination pages */ |
145 | INIT_LIST_HEAD(&image->dest_pages); | 146 | INIT_LIST_HEAD(&image->dest_pages); |
146 | 147 | ||
147 | /* Initialize the list of unuseable pages */ | 148 | /* Initialize the list of unusable pages */ |
148 | INIT_LIST_HEAD(&image->unuseable_pages); | 149 | INIT_LIST_HEAD(&image->unuseable_pages); |
149 | 150 | ||
150 | /* Read in the segments */ | 151 | /* Read in the segments */ |
@@ -163,7 +164,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
163 | * just verifies it is an address we can use. | 164 | * just verifies it is an address we can use. |
164 | * | 165 | * |
165 | * Since the kernel does everything in page size chunks ensure | 166 | * Since the kernel does everything in page size chunks ensure |
166 | * the destination addreses are page aligned. Too many | 167 | * the destination addresses are page aligned. Too many |
167 | * special cases crop of when we don't do this. The most | 168 | * special cases crop of when we don't do this. The most |
168 | * insidious is getting overlapping destination addresses | 169 | * insidious is getting overlapping destination addresses |
169 | * simply because addresses are changed to page size | 170 | * simply because addresses are changed to page size |
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, | |||
454 | /* Deal with the destination pages I have inadvertently allocated. | 455 | /* Deal with the destination pages I have inadvertently allocated. |
455 | * | 456 | * |
456 | * Ideally I would convert multi-page allocations into single | 457 | * Ideally I would convert multi-page allocations into single |
457 | * page allocations, and add everyting to image->dest_pages. | 458 | * page allocations, and add everything to image->dest_pages. |
458 | * | 459 | * |
459 | * For now it is simpler to just free the pages. | 460 | * For now it is simpler to just free the pages. |
460 | */ | 461 | */ |
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image) | |||
602 | /* Walk through and free any extra destination pages I may have */ | 603 | /* Walk through and free any extra destination pages I may have */ |
603 | kimage_free_page_list(&image->dest_pages); | 604 | kimage_free_page_list(&image->dest_pages); |
604 | 605 | ||
605 | /* Walk through and free any unuseable pages I have cached */ | 606 | /* Walk through and free any unusable pages I have cached */ |
606 | kimage_free_page_list(&image->unuseable_pages); | 607 | kimage_free_page_list(&image->unuseable_pages); |
607 | 608 | ||
608 | } | 609 | } |
@@ -816,7 +817,7 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
816 | 817 | ||
817 | ptr = kmap(page); | 818 | ptr = kmap(page); |
818 | /* Start with a clear page */ | 819 | /* Start with a clear page */ |
819 | memset(ptr, 0, PAGE_SIZE); | 820 | clear_page(ptr); |
820 | ptr += maddr & ~PAGE_MASK; | 821 | ptr += maddr & ~PAGE_MASK; |
821 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); | 822 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); |
822 | if (mchunk > mbytes) | 823 | if (mchunk > mbytes) |
@@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void) | |||
1099 | return size; | 1100 | return size; |
1100 | } | 1101 | } |
1101 | 1102 | ||
1102 | static void free_reserved_phys_range(unsigned long begin, unsigned long end) | 1103 | void __weak crash_free_reserved_phys_range(unsigned long begin, |
1104 | unsigned long end) | ||
1103 | { | 1105 | { |
1104 | unsigned long addr; | 1106 | unsigned long addr; |
1105 | 1107 | ||
@@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size) | |||
1135 | start = roundup(start, PAGE_SIZE); | 1137 | start = roundup(start, PAGE_SIZE); |
1136 | end = roundup(start + new_size, PAGE_SIZE); | 1138 | end = roundup(start + new_size, PAGE_SIZE); |
1137 | 1139 | ||
1138 | free_reserved_phys_range(end, crashk_res.end); | 1140 | crash_free_reserved_phys_range(end, crashk_res.end); |
1139 | 1141 | ||
1140 | if ((start == end) && (crashk_res.parent != NULL)) | 1142 | if ((start == end) && (crashk_res.parent != NULL)) |
1141 | release_resource(&crashk_res); | 1143 | release_resource(&crashk_res); |
@@ -1529,8 +1531,7 @@ int kernel_kexec(void) | |||
1529 | if (error) | 1531 | if (error) |
1530 | goto Enable_cpus; | 1532 | goto Enable_cpus; |
1531 | local_irq_disable(); | 1533 | local_irq_disable(); |
1532 | /* Suspend system devices */ | 1534 | error = syscore_suspend(); |
1533 | error = sysdev_suspend(PMSG_FREEZE); | ||
1534 | if (error) | 1535 | if (error) |
1535 | goto Enable_irqs; | 1536 | goto Enable_irqs; |
1536 | } else | 1537 | } else |
@@ -1545,7 +1546,7 @@ int kernel_kexec(void) | |||
1545 | 1546 | ||
1546 | #ifdef CONFIG_KEXEC_JUMP | 1547 | #ifdef CONFIG_KEXEC_JUMP |
1547 | if (kexec_image->preserve_context) { | 1548 | if (kexec_image->preserve_context) { |
1548 | sysdev_resume(); | 1549 | syscore_resume(); |
1549 | Enable_irqs: | 1550 | Enable_irqs: |
1550 | local_irq_enable(); | 1551 | local_irq_enable(); |
1551 | Enable_cpus: | 1552 | Enable_cpus: |
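The last hunk above replaces sysdev_suspend()/sysdev_resume() with the syscore API on the kexec-jump path. A minimal registration against that API might look roughly as follows; the mydev_* names are hypothetical and the interface is the one declared in <linux/syscore_ops.h>.

    #include <linux/module.h>
    #include <linux/syscore_ops.h>

    static int mydev_syscore_suspend(void)
    {
            /* Runs with interrupts disabled on one CPU, e.g. from the
             * syscore_suspend() call inserted in kernel_kexec() above. */
            return 0;
    }

    static void mydev_syscore_resume(void)
    {
            /* Mirror of the suspend step. */
    }

    static struct syscore_ops mydev_syscore_ops = {
            .suspend = mydev_syscore_suspend,
            .resume  = mydev_syscore_resume,
    };

    static int __init mydev_init(void)
    {
            register_syscore_ops(&mydev_syscore_ops);
            return 0;
    }

    static void __exit mydev_exit(void)
    {
            unregister_syscore_ops(&mydev_syscore_ops);
    }

    module_init(mydev_init);
    module_exit(mydev_exit);
    MODULE_LICENSE("GPL");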
diff --git a/kernel/kmod.c b/kernel/kmod.c index 9cd0591c96a2..47613dfb7b28 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/kmod.h> | 25 | #include <linux/kmod.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/completion.h> | 27 | #include <linux/completion.h> |
28 | #include <linux/cred.h> | ||
28 | #include <linux/file.h> | 29 | #include <linux/file.h> |
29 | #include <linux/fdtable.h> | 30 | #include <linux/fdtable.h> |
30 | #include <linux/workqueue.h> | 31 | #include <linux/workqueue.h> |
@@ -43,6 +44,13 @@ extern int max_threads; | |||
43 | 44 | ||
44 | static struct workqueue_struct *khelper_wq; | 45 | static struct workqueue_struct *khelper_wq; |
45 | 46 | ||
47 | #define CAP_BSET (void *)1 | ||
48 | #define CAP_PI (void *)2 | ||
49 | |||
50 | static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; | ||
51 | static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; | ||
52 | static DEFINE_SPINLOCK(umh_sysctl_lock); | ||
53 | |||
46 | #ifdef CONFIG_MODULES | 54 | #ifdef CONFIG_MODULES |
47 | 55 | ||
48 | /* | 56 | /* |
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module); | |||
132 | static int ____call_usermodehelper(void *data) | 140 | static int ____call_usermodehelper(void *data) |
133 | { | 141 | { |
134 | struct subprocess_info *sub_info = data; | 142 | struct subprocess_info *sub_info = data; |
143 | struct cred *new; | ||
135 | int retval; | 144 | int retval; |
136 | 145 | ||
137 | spin_lock_irq(¤t->sighand->siglock); | 146 | spin_lock_irq(¤t->sighand->siglock); |
@@ -147,12 +156,27 @@ static int ____call_usermodehelper(void *data) | |||
147 | */ | 156 | */ |
148 | set_user_nice(current, 0); | 157 | set_user_nice(current, 0); |
149 | 158 | ||
159 | retval = -ENOMEM; | ||
160 | new = prepare_kernel_cred(current); | ||
161 | if (!new) | ||
162 | goto fail; | ||
163 | |||
164 | spin_lock(&umh_sysctl_lock); | ||
165 | new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); | ||
166 | new->cap_inheritable = cap_intersect(usermodehelper_inheritable, | ||
167 | new->cap_inheritable); | ||
168 | spin_unlock(&umh_sysctl_lock); | ||
169 | |||
150 | if (sub_info->init) { | 170 | if (sub_info->init) { |
151 | retval = sub_info->init(sub_info); | 171 | retval = sub_info->init(sub_info, new); |
152 | if (retval) | 172 | if (retval) { |
173 | abort_creds(new); | ||
153 | goto fail; | 174 | goto fail; |
175 | } | ||
154 | } | 176 | } |
155 | 177 | ||
178 | commit_creds(new); | ||
179 | |||
156 | retval = kernel_execve(sub_info->path, | 180 | retval = kernel_execve(sub_info->path, |
157 | (const char *const *)sub_info->argv, | 181 | (const char *const *)sub_info->argv, |
158 | (const char *const *)sub_info->envp); | 182 | (const char *const *)sub_info->envp); |
@@ -245,7 +269,6 @@ static void __call_usermodehelper(struct work_struct *work) | |||
245 | } | 269 | } |
246 | } | 270 | } |
247 | 271 | ||
248 | #ifdef CONFIG_PM_SLEEP | ||
249 | /* | 272 | /* |
250 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY | 273 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY |
251 | * (used for preventing user land processes from being created after the user | 274 | * (used for preventing user land processes from being created after the user |
@@ -301,6 +324,15 @@ void usermodehelper_enable(void) | |||
301 | usermodehelper_disabled = 0; | 324 | usermodehelper_disabled = 0; |
302 | } | 325 | } |
303 | 326 | ||
327 | /** | ||
328 | * usermodehelper_is_disabled - check if new helpers are allowed to be started | ||
329 | */ | ||
330 | bool usermodehelper_is_disabled(void) | ||
331 | { | ||
332 | return usermodehelper_disabled; | ||
333 | } | ||
334 | EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); | ||
335 | |||
304 | static void helper_lock(void) | 336 | static void helper_lock(void) |
305 | { | 337 | { |
306 | atomic_inc(&running_helpers); | 338 | atomic_inc(&running_helpers); |
@@ -312,12 +344,6 @@ static void helper_unlock(void) | |||
312 | if (atomic_dec_and_test(&running_helpers)) | 344 | if (atomic_dec_and_test(&running_helpers)) |
313 | wake_up(&running_helpers_waitq); | 345 | wake_up(&running_helpers_waitq); |
314 | } | 346 | } |
315 | #else /* CONFIG_PM_SLEEP */ | ||
316 | #define usermodehelper_disabled 0 | ||
317 | |||
318 | static inline void helper_lock(void) {} | ||
319 | static inline void helper_unlock(void) {} | ||
320 | #endif /* CONFIG_PM_SLEEP */ | ||
321 | 347 | ||
322 | /** | 348 | /** |
323 | * call_usermodehelper_setup - prepare to call a usermode helper | 349 | * call_usermodehelper_setup - prepare to call a usermode helper |
@@ -364,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); | |||
364 | * context in which call_usermodehelper_exec is called. | 390 | * context in which call_usermodehelper_exec is called. |
365 | */ | 391 | */ |
366 | void call_usermodehelper_setfns(struct subprocess_info *info, | 392 | void call_usermodehelper_setfns(struct subprocess_info *info, |
367 | int (*init)(struct subprocess_info *info), | 393 | int (*init)(struct subprocess_info *info, struct cred *new), |
368 | void (*cleanup)(struct subprocess_info *info), | 394 | void (*cleanup)(struct subprocess_info *info), |
369 | void *data) | 395 | void *data) |
370 | { | 396 | { |
@@ -418,6 +444,84 @@ unlock: | |||
418 | } | 444 | } |
419 | EXPORT_SYMBOL(call_usermodehelper_exec); | 445 | EXPORT_SYMBOL(call_usermodehelper_exec); |
420 | 446 | ||
447 | static int proc_cap_handler(struct ctl_table *table, int write, | ||
448 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
449 | { | ||
450 | struct ctl_table t; | ||
451 | unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; | ||
452 | kernel_cap_t new_cap; | ||
453 | int err, i; | ||
454 | |||
455 | if (write && (!capable(CAP_SETPCAP) || | ||
456 | !capable(CAP_SYS_MODULE))) | ||
457 | return -EPERM; | ||
458 | |||
459 | /* | ||
460 | * convert from the global kernel_cap_t to the ulong array to print to | ||
461 | * userspace if this is a read. | ||
462 | */ | ||
463 | spin_lock(&umh_sysctl_lock); | ||
464 | for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { | ||
465 | if (table->data == CAP_BSET) | ||
466 | cap_array[i] = usermodehelper_bset.cap[i]; | ||
467 | else if (table->data == CAP_PI) | ||
468 | cap_array[i] = usermodehelper_inheritable.cap[i]; | ||
469 | else | ||
470 | BUG(); | ||
471 | } | ||
472 | spin_unlock(&umh_sysctl_lock); | ||
473 | |||
474 | t = *table; | ||
475 | t.data = &cap_array; | ||
476 | |||
477 | /* | ||
478 | * actually read or write an array of ulongs from userspace. Remember | ||
479 | * these are least significant 32 bits first | ||
480 | */ | ||
481 | err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); | ||
482 | if (err < 0) | ||
483 | return err; | ||
484 | |||
485 | /* | ||
486 | * convert from the sysctl array of ulongs to the kernel_cap_t | ||
487 | * internal representation | ||
488 | */ | ||
489 | for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) | ||
490 | new_cap.cap[i] = cap_array[i]; | ||
491 | |||
492 | /* | ||
493 | * Drop everything not in the new_cap (but don't add things) | ||
494 | */ | ||
495 | spin_lock(&umh_sysctl_lock); | ||
496 | if (write) { | ||
497 | if (table->data == CAP_BSET) | ||
498 | usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); | ||
499 | if (table->data == CAP_PI) | ||
500 | usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); | ||
501 | } | ||
502 | spin_unlock(&umh_sysctl_lock); | ||
503 | |||
504 | return 0; | ||
505 | } | ||
506 | |||
507 | struct ctl_table usermodehelper_table[] = { | ||
508 | { | ||
509 | .procname = "bset", | ||
510 | .data = CAP_BSET, | ||
511 | .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), | ||
512 | .mode = 0600, | ||
513 | .proc_handler = proc_cap_handler, | ||
514 | }, | ||
515 | { | ||
516 | .procname = "inheritable", | ||
517 | .data = CAP_PI, | ||
518 | .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), | ||
519 | .mode = 0600, | ||
520 | .proc_handler = proc_cap_handler, | ||
521 | }, | ||
522 | { } | ||
523 | }; | ||
524 | |||
421 | void __init usermodehelper_init(void) | 525 | void __init usermodehelper_init(void) |
422 | { | 526 | { |
423 | khelper_wq = create_singlethread_workqueue("khelper"); | 527 | khelper_wq = create_singlethread_workqueue("khelper"); |
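With the changes above, a caller of the usermode-helper API supplies an init callback that receives (and may further restrict) the freshly prepared credentials before commit_creds() runs. A sketch under those assumptions; my_helper_init(), my_run_helper() and the /sbin/my-helper path are hypothetical.

    #include <linux/kmod.h>
    #include <linux/cred.h>
    #include <linux/errno.h>
    #include <linux/gfp.h>

    static int my_helper_init(struct subprocess_info *info, struct cred *new)
    {
            /* 'new' has already been intersected with usermodehelper_bset and
             * usermodehelper_inheritable in ____call_usermodehelper() above;
             * further tweaks (securebits, groups, ...) could go here. */
            return 0;
    }

    static int my_run_helper(void)
    {
            char *argv[] = { "/sbin/my-helper", NULL };
            char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
            struct subprocess_info *info;

            info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
            if (!info)
                    return -ENOMEM;

            call_usermodehelper_setfns(info, my_helper_init, NULL, NULL);
            return call_usermodehelper_exec(info, UMH_WAIT_PROC);
    }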
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 282035f3ae96..77981813a1e7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/memory.h> | 47 | #include <linux/memory.h> |
48 | #include <linux/ftrace.h> | 48 | #include <linux/ftrace.h> |
49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
50 | #include <linux/jump_label.h> | ||
50 | 51 | ||
51 | #include <asm-generic/sections.h> | 52 | #include <asm-generic/sections.h> |
52 | #include <asm/cacheflush.h> | 53 | #include <asm/cacheflush.h> |
@@ -73,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | |||
73 | /* NOTE: change this value only with kprobe_mutex held */ | 74 | /* NOTE: change this value only with kprobe_mutex held */ |
74 | static bool kprobes_all_disarmed; | 75 | static bool kprobes_all_disarmed; |
75 | 76 | ||
76 | static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 77 | /* This protects kprobe_table and optimizing_list */ |
78 | static DEFINE_MUTEX(kprobe_mutex); | ||
77 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 79 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
78 | static struct { | 80 | static struct { |
79 | spinlock_t lock ____cacheline_aligned_in_smp; | 81 | spinlock_t lock ____cacheline_aligned_in_smp; |
@@ -315,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) | |||
315 | /* We have preemption disabled.. so it is safe to use __ versions */ | 317 | /* We have preemption disabled.. so it is safe to use __ versions */ |
316 | static inline void set_kprobe_instance(struct kprobe *kp) | 318 | static inline void set_kprobe_instance(struct kprobe *kp) |
317 | { | 319 | { |
318 | __get_cpu_var(kprobe_instance) = kp; | 320 | __this_cpu_write(kprobe_instance, kp); |
319 | } | 321 | } |
320 | 322 | ||
321 | static inline void reset_kprobe_instance(void) | 323 | static inline void reset_kprobe_instance(void) |
322 | { | 324 | { |
323 | __get_cpu_var(kprobe_instance) = NULL; | 325 | __this_cpu_write(kprobe_instance, NULL); |
324 | } | 326 | } |
325 | 327 | ||
326 | /* | 328 | /* |
@@ -352,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p) | |||
352 | return p->pre_handler == aggr_pre_handler; | 354 | return p->pre_handler == aggr_pre_handler; |
353 | } | 355 | } |
354 | 356 | ||
357 | /* Return true(!0) if the kprobe is unused */ | ||
358 | static inline int kprobe_unused(struct kprobe *p) | ||
359 | { | ||
360 | return kprobe_aggrprobe(p) && kprobe_disabled(p) && | ||
361 | list_empty(&p->list); | ||
362 | } | ||
363 | |||
355 | /* | 364 | /* |
356 | * Keep all fields in the kprobe consistent | 365 | * Keep all fields in the kprobe consistent |
357 | */ | 366 | */ |
358 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | 367 | static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) |
359 | { | 368 | { |
360 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | 369 | memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); |
361 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | 370 | memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn)); |
362 | } | 371 | } |
363 | 372 | ||
364 | #ifdef CONFIG_OPTPROBES | 373 | #ifdef CONFIG_OPTPROBES |
@@ -382,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
382 | } | 391 | } |
383 | } | 392 | } |
384 | 393 | ||
394 | /* Free optimized instructions and optimized_kprobe */ | ||
395 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
396 | { | ||
397 | struct optimized_kprobe *op; | ||
398 | |||
399 | op = container_of(p, struct optimized_kprobe, kp); | ||
400 | arch_remove_optimized_kprobe(op); | ||
401 | arch_remove_kprobe(p); | ||
402 | kfree(op); | ||
403 | } | ||
404 | |||
385 | /* Return true(!0) if the kprobe is ready for optimization. */ | 405 | /* Return true(!0) if the kprobe is ready for optimization. */ |
386 | static inline int kprobe_optready(struct kprobe *p) | 406 | static inline int kprobe_optready(struct kprobe *p) |
387 | { | 407 | { |
@@ -395,11 +415,38 @@ static inline int kprobe_optready(struct kprobe *p) | |||
395 | return 0; | 415 | return 0; |
396 | } | 416 | } |
397 | 417 | ||
418 | /* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ | ||
419 | static inline int kprobe_disarmed(struct kprobe *p) | ||
420 | { | ||
421 | struct optimized_kprobe *op; | ||
422 | |||
423 | /* If kprobe is not aggr/opt probe, just return kprobe is disabled */ | ||
424 | if (!kprobe_aggrprobe(p)) | ||
425 | return kprobe_disabled(p); | ||
426 | |||
427 | op = container_of(p, struct optimized_kprobe, kp); | ||
428 | |||
429 | return kprobe_disabled(p) && list_empty(&op->list); | ||
430 | } | ||
431 | |||
432 | /* Return true(!0) if the probe is queued on (un)optimizing lists */ | ||
433 | static int __kprobes kprobe_queued(struct kprobe *p) | ||
434 | { | ||
435 | struct optimized_kprobe *op; | ||
436 | |||
437 | if (kprobe_aggrprobe(p)) { | ||
438 | op = container_of(p, struct optimized_kprobe, kp); | ||
439 | if (!list_empty(&op->list)) | ||
440 | return 1; | ||
441 | } | ||
442 | return 0; | ||
443 | } | ||
444 | |||
398 | /* | 445 | /* |
399 | * Return an optimized kprobe whose optimizing code replaces | 446 | * Return an optimized kprobe whose optimizing code replaces |
400 | * instructions including addr (exclude breakpoint). | 447 | * instructions including addr (exclude breakpoint). |
401 | */ | 448 | */ |
402 | struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | 449 | static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) |
403 | { | 450 | { |
404 | int i; | 451 | int i; |
405 | struct kprobe *p = NULL; | 452 | struct kprobe *p = NULL; |
@@ -420,30 +467,23 @@ struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | |||
420 | 467 | ||
421 | /* Optimization staging list, protected by kprobe_mutex */ | 468 | /* Optimization staging list, protected by kprobe_mutex */ |
422 | static LIST_HEAD(optimizing_list); | 469 | static LIST_HEAD(optimizing_list); |
470 | static LIST_HEAD(unoptimizing_list); | ||
423 | 471 | ||
424 | static void kprobe_optimizer(struct work_struct *work); | 472 | static void kprobe_optimizer(struct work_struct *work); |
425 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | 473 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); |
474 | static DECLARE_COMPLETION(optimizer_comp); | ||
426 | #define OPTIMIZE_DELAY 5 | 475 | #define OPTIMIZE_DELAY 5 |
427 | 476 | ||
428 | /* Kprobe jump optimizer */ | 477 | /* |
429 | static __kprobes void kprobe_optimizer(struct work_struct *work) | 478 | * Optimize (replace a breakpoint with a jump) kprobes listed on |
479 | * optimizing_list. | ||
480 | */ | ||
481 | static __kprobes void do_optimize_kprobes(void) | ||
430 | { | 482 | { |
431 | struct optimized_kprobe *op, *tmp; | 483 | /* Optimization is never done while disarmed */
432 | 484 | if (kprobes_all_disarmed || !kprobes_allow_optimization || | |
433 | /* Lock modules while optimizing kprobes */ | 485 | list_empty(&optimizing_list)) |
434 | mutex_lock(&module_mutex); | 486 | return; |
435 | mutex_lock(&kprobe_mutex); | ||
436 | if (kprobes_all_disarmed || !kprobes_allow_optimization) | ||
437 | goto end; | ||
438 | |||
439 | /* | ||
440 | * Wait for quiesence period to ensure all running interrupts | ||
441 | * are done. Because optprobe may modify multiple instructions | ||
442 | * there is a chance that Nth instruction is interrupted. In that | ||
443 | * case, running interrupt can return to 2nd-Nth byte of jump | ||
444 | * instruction. This wait is for avoiding it. | ||
445 | */ | ||
446 | synchronize_sched(); | ||
447 | 487 | ||
448 | /* | 488 | /* |
449 | * The optimization/unoptimization refers online_cpus via | 489 | * The optimization/unoptimization refers online_cpus via |
@@ -457,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
457 | */ | 497 | */ |
458 | get_online_cpus(); | 498 | get_online_cpus(); |
459 | mutex_lock(&text_mutex); | 499 | mutex_lock(&text_mutex); |
460 | list_for_each_entry_safe(op, tmp, &optimizing_list, list) { | 500 | arch_optimize_kprobes(&optimizing_list); |
461 | WARN_ON(kprobe_disabled(&op->kp)); | 501 | mutex_unlock(&text_mutex); |
462 | if (arch_optimize_kprobe(op) < 0) | 502 | put_online_cpus(); |
463 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 503 | } |
464 | list_del_init(&op->list); | 504 | |
505 | /* | ||
506 | * Unoptimize (replace a jump with a breakpoint and remove the breakpoint | ||
507 | * if need) kprobes listed on unoptimizing_list. | ||
508 | */ | ||
509 | static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) | ||
510 | { | ||
511 | struct optimized_kprobe *op, *tmp; | ||
512 | |||
513 | /* Unoptimization must be done anytime */ | ||
514 | if (list_empty(&unoptimizing_list)) | ||
515 | return; | ||
516 | |||
517 | /* Ditto to do_optimize_kprobes */ | ||
518 | get_online_cpus(); | ||
519 | mutex_lock(&text_mutex); | ||
520 | arch_unoptimize_kprobes(&unoptimizing_list, free_list); | ||
521 | /* Loop free_list for disarming */ | ||
522 | list_for_each_entry_safe(op, tmp, free_list, list) { | ||
523 | /* Disarm probes if marked disabled */ | ||
524 | if (kprobe_disabled(&op->kp)) | ||
525 | arch_disarm_kprobe(&op->kp); | ||
526 | if (kprobe_unused(&op->kp)) { | ||
527 | /* | ||
528 | * Remove unused probes from hash list. After waiting | ||
529 | * for synchronization, these probes are reclaimed. | ||
530 | * (reclaiming is done by do_free_cleaned_kprobes.) | ||
531 | */ | ||
532 | hlist_del_rcu(&op->kp.hlist); | ||
533 | } else | ||
534 | list_del_init(&op->list); | ||
465 | } | 535 | } |
466 | mutex_unlock(&text_mutex); | 536 | mutex_unlock(&text_mutex); |
467 | put_online_cpus(); | 537 | put_online_cpus(); |
468 | end: | 538 | } |
539 | |||
540 | /* Reclaim all kprobes on the free_list */ | ||
541 | static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) | ||
542 | { | ||
543 | struct optimized_kprobe *op, *tmp; | ||
544 | |||
545 | list_for_each_entry_safe(op, tmp, free_list, list) { | ||
546 | BUG_ON(!kprobe_unused(&op->kp)); | ||
547 | list_del_init(&op->list); | ||
548 | free_aggr_kprobe(&op->kp); | ||
549 | } | ||
550 | } | ||
551 | |||
552 | /* Start optimizer after OPTIMIZE_DELAY passed */ | ||
553 | static __kprobes void kick_kprobe_optimizer(void) | ||
554 | { | ||
555 | if (!delayed_work_pending(&optimizing_work)) | ||
556 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | ||
557 | } | ||
558 | |||
559 | /* Kprobe jump optimizer */ | ||
560 | static __kprobes void kprobe_optimizer(struct work_struct *work) | ||
561 | { | ||
562 | LIST_HEAD(free_list); | ||
563 | |||
564 | /* Lock modules while optimizing kprobes */ | ||
565 | mutex_lock(&module_mutex); | ||
566 | mutex_lock(&kprobe_mutex); | ||
567 | |||
568 | /* | ||
569 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) | ||
570 | * kprobes before waiting for the quiescence period. | ||
571 | */ | ||
572 | do_unoptimize_kprobes(&free_list); | ||
573 | |||
574 | /* | ||
575 | * Step 2: Wait for the quiescence period to ensure all running interrupts | ||
576 | * are done. Because an optprobe may modify multiple instructions, | ||
577 | * there is a chance that the Nth instruction is interrupted. In that | ||
578 | * case, the running interrupt can return to the 2nd-Nth byte of the jump | ||
579 | * instruction. This wait avoids that. | ||
580 | */ | ||
581 | synchronize_sched(); | ||
582 | |||
583 | /* Step 3: Optimize kprobes after the quiescence period */ | ||
584 | do_optimize_kprobes(); | ||
585 | |||
586 | /* Step 4: Free cleaned kprobes after the quiescence period */ | ||
587 | do_free_cleaned_kprobes(&free_list); | ||
588 | |||
469 | mutex_unlock(&kprobe_mutex); | 589 | mutex_unlock(&kprobe_mutex); |
470 | mutex_unlock(&module_mutex); | 590 | mutex_unlock(&module_mutex); |
591 | |||
592 | /* Step 5: Kick optimizer again if needed */ | ||
593 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) | ||
594 | kick_kprobe_optimizer(); | ||
595 | else | ||
596 | /* Wake up all waiters */ | ||
597 | complete_all(&optimizer_comp); | ||
598 | } | ||
599 | |||
600 | /* Wait for completing optimization and unoptimization */ | ||
601 | static __kprobes void wait_for_kprobe_optimizer(void) | ||
602 | { | ||
603 | if (delayed_work_pending(&optimizing_work)) | ||
604 | wait_for_completion(&optimizer_comp); | ||
471 | } | 605 | } |
472 | 606 | ||
473 | /* Optimize kprobe if p is ready to be optimized */ | 607 | /* Optimize kprobe if p is ready to be optimized */ |
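The reworked optimizer only ever deals with probes registered through the normal kprobes API; a pre_handler-only probe such as the sketch below is the kind of candidate that lands on optimizing_list and is patched into a jump roughly OPTIMIZE_DELAY ticks later. The my_* names and the do_fork target are illustrative assumptions.

    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/kprobes.h>

    static int my_pre(struct kprobe *p, struct pt_regs *regs)
    {
            pr_info("kprobe hit at %p\n", p->addr);
            return 0;
    }

    static struct kprobe my_kp = {
            .symbol_name = "do_fork",       /* assumed probe target */
            .pre_handler = my_pre,          /* no post/break handler: optimizable */
    };

    static int __init my_kp_init(void)
    {
            /* Queued on optimizing_list and optimized by kprobe_optimizer() */
            return register_kprobe(&my_kp);
    }

    static void __exit my_kp_exit(void)
    {
            /* An optimized probe is first queued for delayed unoptimization */
            unregister_kprobe(&my_kp);
    }

    module_init(my_kp_init);
    module_exit(my_kp_exit);
    MODULE_LICENSE("GPL");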
@@ -493,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p) | |||
493 | /* Check if it is already optimized. */ | 627 | /* Check if it is already optimized. */ |
494 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) | 628 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) |
495 | return; | 629 | return; |
496 | |||
497 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; | 630 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; |
498 | list_add(&op->list, &optimizing_list); | 631 | |
499 | if (!delayed_work_pending(&optimizing_work)) | 632 | if (!list_empty(&op->list)) |
500 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | 633 | /* This is under unoptimizing. Just dequeue the probe */ |
634 | list_del_init(&op->list); | ||
635 | else { | ||
636 | list_add(&op->list, &optimizing_list); | ||
637 | kick_kprobe_optimizer(); | ||
638 | } | ||
639 | } | ||
640 | |||
641 | /* Short cut to direct unoptimizing */ | ||
642 | static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) | ||
643 | { | ||
644 | get_online_cpus(); | ||
645 | arch_unoptimize_kprobe(op); | ||
646 | put_online_cpus(); | ||
647 | if (kprobe_disabled(&op->kp)) | ||
648 | arch_disarm_kprobe(&op->kp); | ||
501 | } | 649 | } |
502 | 650 | ||
503 | /* Unoptimize a kprobe if p is optimized */ | 651 | /* Unoptimize a kprobe if p is optimized */ |
504 | static __kprobes void unoptimize_kprobe(struct kprobe *p) | 652 | static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) |
505 | { | 653 | { |
506 | struct optimized_kprobe *op; | 654 | struct optimized_kprobe *op; |
507 | 655 | ||
508 | if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { | 656 | if (!kprobe_aggrprobe(p) || kprobe_disarmed(p)) |
509 | op = container_of(p, struct optimized_kprobe, kp); | 657 | return; /* This is not an optprobe nor optimized */ |
510 | if (!list_empty(&op->list)) | 658 | |
511 | /* Dequeue from the optimization queue */ | 659 | op = container_of(p, struct optimized_kprobe, kp); |
660 | if (!kprobe_optimized(p)) { | ||
661 | /* Unoptimized or unoptimizing case */ | ||
662 | if (force && !list_empty(&op->list)) { | ||
663 | /* | ||
664 | * Only if this is unoptimizing kprobe and forced, | ||
665 | * forcibly unoptimize it. (No need to unoptimize | ||
666 | * unoptimized kprobe again :) | ||
667 | */ | ||
512 | list_del_init(&op->list); | 668 | list_del_init(&op->list); |
513 | else | 669 | force_unoptimize_kprobe(op); |
514 | /* Replace jump with break */ | 670 | } |
515 | arch_unoptimize_kprobe(op); | 671 | return; |
516 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 672 | } |
673 | |||
674 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
675 | if (!list_empty(&op->list)) { | ||
676 | /* Dequeue from the optimization queue */ | ||
677 | list_del_init(&op->list); | ||
678 | return; | ||
679 | } | ||
680 | /* Optimized kprobe case */ | ||
681 | if (force) | ||
682 | /* Forcibly update the code: this is a special case */ | ||
683 | force_unoptimize_kprobe(op); | ||
684 | else { | ||
685 | list_add(&op->list, &unoptimizing_list); | ||
686 | kick_kprobe_optimizer(); | ||
517 | } | 687 | } |
518 | } | 688 | } |
519 | 689 | ||
690 | /* Cancel unoptimizing for reusing */ | ||
691 | static void reuse_unused_kprobe(struct kprobe *ap) | ||
692 | { | ||
693 | struct optimized_kprobe *op; | ||
694 | |||
695 | BUG_ON(!kprobe_unused(ap)); | ||
696 | /* | ||
697 | * An unused kprobe MUST be in the middle of delayed unoptimizing (meaning | ||
698 | * there is still a relative jump in place) and disabled. | ||
699 | */ | ||
700 | op = container_of(ap, struct optimized_kprobe, kp); | ||
701 | if (unlikely(list_empty(&op->list))) | ||
702 | printk(KERN_WARNING "Warning: found a stray unused " | ||
703 | "aggrprobe@%p\n", ap->addr); | ||
704 | /* Enable the probe again */ | ||
705 | ap->flags &= ~KPROBE_FLAG_DISABLED; | ||
706 | /* Optimize it again (remove from op->list) */ | ||
707 | BUG_ON(!kprobe_optready(ap)); | ||
708 | optimize_kprobe(ap); | ||
709 | } | ||
710 | |||
520 | /* Remove optimized instructions */ | 711 | /* Remove optimized instructions */ |
521 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) | 712 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) |
522 | { | 713 | { |
523 | struct optimized_kprobe *op; | 714 | struct optimized_kprobe *op; |
524 | 715 | ||
525 | op = container_of(p, struct optimized_kprobe, kp); | 716 | op = container_of(p, struct optimized_kprobe, kp); |
526 | if (!list_empty(&op->list)) { | 717 | if (!list_empty(&op->list)) |
527 | /* Dequeue from the optimization queue */ | 718 | /* Dequeue from the (un)optimization queue */ |
528 | list_del_init(&op->list); | 719 | list_del_init(&op->list); |
529 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | 720 | |
530 | } | 721 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; |
531 | /* Don't unoptimize, because the target code will be freed. */ | 722 | /* Don't touch the code, because it is already freed. */ |
532 | arch_remove_optimized_kprobe(op); | 723 | arch_remove_optimized_kprobe(op); |
533 | } | 724 | } |
534 | 725 | ||
@@ -541,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | |||
541 | arch_prepare_optimized_kprobe(op); | 732 | arch_prepare_optimized_kprobe(op); |
542 | } | 733 | } |
543 | 734 | ||
544 | /* Free optimized instructions and optimized_kprobe */ | ||
545 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
546 | { | ||
547 | struct optimized_kprobe *op; | ||
548 | |||
549 | op = container_of(p, struct optimized_kprobe, kp); | ||
550 | arch_remove_optimized_kprobe(op); | ||
551 | kfree(op); | ||
552 | } | ||
553 | |||
554 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | 735 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ |
555 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | 736 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) |
556 | { | 737 | { |
@@ -585,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | |||
585 | op = container_of(ap, struct optimized_kprobe, kp); | 766 | op = container_of(ap, struct optimized_kprobe, kp); |
586 | if (!arch_prepared_optinsn(&op->optinsn)) { | 767 | if (!arch_prepared_optinsn(&op->optinsn)) { |
587 | /* If failed to setup optimizing, fallback to kprobe */ | 768 | /* If failed to setup optimizing, fallback to kprobe */ |
588 | free_aggr_kprobe(ap); | 769 | arch_remove_optimized_kprobe(op); |
770 | kfree(op); | ||
589 | return; | 771 | return; |
590 | } | 772 | } |
591 | 773 | ||
@@ -594,6 +776,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | |||
594 | } | 776 | } |
595 | 777 | ||
596 | #ifdef CONFIG_SYSCTL | 778 | #ifdef CONFIG_SYSCTL |
779 | /* This should be called with kprobe_mutex locked */ | ||
597 | static void __kprobes optimize_all_kprobes(void) | 780 | static void __kprobes optimize_all_kprobes(void) |
598 | { | 781 | { |
599 | struct hlist_head *head; | 782 | struct hlist_head *head; |
@@ -606,17 +789,16 @@ static void __kprobes optimize_all_kprobes(void) | |||
606 | return; | 789 | return; |
607 | 790 | ||
608 | kprobes_allow_optimization = true; | 791 | kprobes_allow_optimization = true; |
609 | mutex_lock(&text_mutex); | ||
610 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 792 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
611 | head = &kprobe_table[i]; | 793 | head = &kprobe_table[i]; |
612 | hlist_for_each_entry_rcu(p, node, head, hlist) | 794 | hlist_for_each_entry_rcu(p, node, head, hlist) |
613 | if (!kprobe_disabled(p)) | 795 | if (!kprobe_disabled(p)) |
614 | optimize_kprobe(p); | 796 | optimize_kprobe(p); |
615 | } | 797 | } |
616 | mutex_unlock(&text_mutex); | ||
617 | printk(KERN_INFO "Kprobes globally optimized\n"); | 798 | printk(KERN_INFO "Kprobes globally optimized\n"); |
618 | } | 799 | } |
619 | 800 | ||
801 | /* This should be called with kprobe_mutex locked */ | ||
620 | static void __kprobes unoptimize_all_kprobes(void) | 802 | static void __kprobes unoptimize_all_kprobes(void) |
621 | { | 803 | { |
622 | struct hlist_head *head; | 804 | struct hlist_head *head; |
@@ -629,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void) | |||
629 | return; | 811 | return; |
630 | 812 | ||
631 | kprobes_allow_optimization = false; | 813 | kprobes_allow_optimization = false; |
632 | printk(KERN_INFO "Kprobes globally unoptimized\n"); | ||
633 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | ||
634 | mutex_lock(&text_mutex); | ||
635 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 814 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
636 | head = &kprobe_table[i]; | 815 | head = &kprobe_table[i]; |
637 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 816 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
638 | if (!kprobe_disabled(p)) | 817 | if (!kprobe_disabled(p)) |
639 | unoptimize_kprobe(p); | 818 | unoptimize_kprobe(p, false); |
640 | } | 819 | } |
641 | } | 820 | } |
642 | 821 | /* Wait for unoptimizing completion */ | |
643 | mutex_unlock(&text_mutex); | 822 | wait_for_kprobe_optimizer(); |
644 | put_online_cpus(); | 823 | printk(KERN_INFO "Kprobes globally unoptimized\n"); |
645 | /* Allow all currently running kprobes to complete */ | ||
646 | synchronize_sched(); | ||
647 | } | 824 | } |
648 | 825 | ||
649 | int sysctl_kprobes_optimization; | 826 | int sysctl_kprobes_optimization; |
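The sysctl_kprobes_optimization knob above (handled by proc_kprobes_optimization_handler() in the next hunk) gates optimize_all_kprobes()/unoptimize_all_kprobes(). A small userspace sketch of flipping it; the /proc/sys/debug/kprobes-optimization path is an assumption, since the ctl_table registration is outside this diff.

    /* Userspace side of the knob: flips kprobes_allow_optimization via the
     * proc handler. The /proc/sys path below is an assumption. */
    #include <stdio.h>

    static int set_kprobes_optimization(int on)
    {
            FILE *f = fopen("/proc/sys/debug/kprobes-optimization", "w");

            if (!f)
                    return -1;
            fprintf(f, "%d\n", on ? 1 : 0);
            return fclose(f);
    }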
@@ -667,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | |||
667 | } | 844 | } |
668 | #endif /* CONFIG_SYSCTL */ | 845 | #endif /* CONFIG_SYSCTL */ |
669 | 846 | ||
847 | /* Put a breakpoint for a probe. Must be called with text_mutex locked */ | ||
670 | static void __kprobes __arm_kprobe(struct kprobe *p) | 848 | static void __kprobes __arm_kprobe(struct kprobe *p) |
671 | { | 849 | { |
672 | struct kprobe *old_p; | 850 | struct kprobe *_p; |
673 | 851 | ||
674 | /* Check collision with other optimized kprobes */ | 852 | /* Check collision with other optimized kprobes */ |
675 | old_p = get_optimized_kprobe((unsigned long)p->addr); | 853 | _p = get_optimized_kprobe((unsigned long)p->addr); |
676 | if (unlikely(old_p)) | 854 | if (unlikely(_p)) |
677 | unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ | 855 | /* Fallback to unoptimized kprobe */ |
856 | unoptimize_kprobe(_p, true); | ||
678 | 857 | ||
679 | arch_arm_kprobe(p); | 858 | arch_arm_kprobe(p); |
680 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ | 859 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ |
681 | } | 860 | } |
682 | 861 | ||
683 | static void __kprobes __disarm_kprobe(struct kprobe *p) | 862 | /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ |
863 | static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) | ||
684 | { | 864 | { |
685 | struct kprobe *old_p; | 865 | struct kprobe *_p; |
686 | 866 | ||
687 | unoptimize_kprobe(p); /* Try to unoptimize */ | 867 | unoptimize_kprobe(p, false); /* Try to unoptimize */ |
688 | arch_disarm_kprobe(p); | ||
689 | 868 | ||
690 | /* If another kprobe was blocked, optimize it. */ | 869 | if (!kprobe_queued(p)) { |
691 | old_p = get_optimized_kprobe((unsigned long)p->addr); | 870 | arch_disarm_kprobe(p); |
692 | if (unlikely(old_p)) | 871 | /* If another kprobe was blocked, optimize it. */ |
693 | optimize_kprobe(old_p); | 872 | _p = get_optimized_kprobe((unsigned long)p->addr); |
873 | if (unlikely(_p) && reopt) | ||
874 | optimize_kprobe(_p); | ||
875 | } | ||
876 | /* TODO: reoptimize others after unoptimizing this probe */ | ||
694 | } | 877 | } |
695 | 878 | ||
696 | #else /* !CONFIG_OPTPROBES */ | 879 | #else /* !CONFIG_OPTPROBES */ |
697 | 880 | ||
698 | #define optimize_kprobe(p) do {} while (0) | 881 | #define optimize_kprobe(p) do {} while (0) |
699 | #define unoptimize_kprobe(p) do {} while (0) | 882 | #define unoptimize_kprobe(p, f) do {} while (0) |
700 | #define kill_optimized_kprobe(p) do {} while (0) | 883 | #define kill_optimized_kprobe(p) do {} while (0) |
701 | #define prepare_optimized_kprobe(p) do {} while (0) | 884 | #define prepare_optimized_kprobe(p) do {} while (0) |
702 | #define try_to_optimize_kprobe(p) do {} while (0) | 885 | #define try_to_optimize_kprobe(p) do {} while (0) |
703 | #define __arm_kprobe(p) arch_arm_kprobe(p) | 886 | #define __arm_kprobe(p) arch_arm_kprobe(p) |
704 | #define __disarm_kprobe(p) arch_disarm_kprobe(p) | 887 | #define __disarm_kprobe(p, o) arch_disarm_kprobe(p) |
888 | #define kprobe_disarmed(p) kprobe_disabled(p) | ||
889 | #define wait_for_kprobe_optimizer() do {} while (0) | ||
890 | |||
891 | /* There should be no unused kprobes that can be reused without optimization */ | ||
892 | static void reuse_unused_kprobe(struct kprobe *ap) | ||
893 | { | ||
894 | printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); | ||
895 | BUG_ON(kprobe_unused(ap)); | ||
896 | } | ||
705 | 897 | ||
706 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | 898 | static __kprobes void free_aggr_kprobe(struct kprobe *p) |
707 | { | 899 | { |
900 | arch_remove_kprobe(p); | ||
708 | kfree(p); | 901 | kfree(p); |
709 | } | 902 | } |
710 | 903 | ||
@@ -730,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp) | |||
730 | /* Disarm a kprobe with text_mutex */ | 923 | /* Disarm a kprobe with text_mutex */ |
731 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 924 | static void __kprobes disarm_kprobe(struct kprobe *kp) |
732 | { | 925 | { |
733 | get_online_cpus(); /* For avoiding text_mutex deadlock */ | 926 | /* Ditto */ |
734 | mutex_lock(&text_mutex); | 927 | mutex_lock(&text_mutex); |
735 | __disarm_kprobe(kp); | 928 | __disarm_kprobe(kp, true); |
736 | mutex_unlock(&text_mutex); | 929 | mutex_unlock(&text_mutex); |
737 | put_online_cpus(); | ||
738 | } | 930 | } |
739 | 931 | ||
740 | /* | 932 | /* |
@@ -773,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
773 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 965 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
774 | int trapnr) | 966 | int trapnr) |
775 | { | 967 | { |
776 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | 968 | struct kprobe *cur = __this_cpu_read(kprobe_instance); |
777 | 969 | ||
778 | /* | 970 | /* |
779 | * if we faulted "during" the execution of a user specified | 971 | * if we faulted "during" the execution of a user specified |
@@ -788,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | |||
788 | 980 | ||
789 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | 981 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) |
790 | { | 982 | { |
791 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | 983 | struct kprobe *cur = __this_cpu_read(kprobe_instance); |
792 | int ret = 0; | 984 | int ret = 0; |
793 | 985 | ||
794 | if (cur && cur->break_handler) { | 986 | if (cur && cur->break_handler) { |
@@ -831,6 +1023,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, | |||
831 | 1023 | ||
832 | void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | 1024 | void __kprobes kretprobe_hash_lock(struct task_struct *tsk, |
833 | struct hlist_head **head, unsigned long *flags) | 1025 | struct hlist_head **head, unsigned long *flags) |
1026 | __acquires(hlist_lock) | ||
834 | { | 1027 | { |
835 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 1028 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
836 | spinlock_t *hlist_lock; | 1029 | spinlock_t *hlist_lock; |
@@ -842,6 +1035,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | |||
842 | 1035 | ||
843 | static void __kprobes kretprobe_table_lock(unsigned long hash, | 1036 | static void __kprobes kretprobe_table_lock(unsigned long hash, |
844 | unsigned long *flags) | 1037 | unsigned long *flags) |
1038 | __acquires(hlist_lock) | ||
845 | { | 1039 | { |
846 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 1040 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
847 | spin_lock_irqsave(hlist_lock, *flags); | 1041 | spin_lock_irqsave(hlist_lock, *flags); |
@@ -849,6 +1043,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash, | |||
849 | 1043 | ||
850 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | 1044 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, |
851 | unsigned long *flags) | 1045 | unsigned long *flags) |
1046 | __releases(hlist_lock) | ||
852 | { | 1047 | { |
853 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 1048 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
854 | spinlock_t *hlist_lock; | 1049 | spinlock_t *hlist_lock; |
@@ -857,7 +1052,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | |||
857 | spin_unlock_irqrestore(hlist_lock, *flags); | 1052 | spin_unlock_irqrestore(hlist_lock, *flags); |
858 | } | 1053 | } |
859 | 1054 | ||
860 | void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) | 1055 | static void __kprobes kretprobe_table_unlock(unsigned long hash, |
1056 | unsigned long *flags) | ||
1057 | __releases(hlist_lock) | ||
861 | { | 1058 | { |
862 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 1059 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
863 | spin_unlock_irqrestore(hlist_lock, *flags); | 1060 | spin_unlock_irqrestore(hlist_lock, *flags); |
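The __acquires()/__releases() markers added to the kretprobe hash-lock helpers above are sparse annotations: they tell the static checker which lock a function enters or leaves holding and compile away at build time. A minimal sketch of the same pattern on an illustrative driver-local lock (demo_lock and both helper names are invented for this example):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);	/* illustrative lock, not from this patch */

/* Tell sparse that callers return from here with demo_lock held. */
static void demo_lock_acquire(unsigned long *flags)
	__acquires(demo_lock)
{
	spin_lock_irqsave(&demo_lock, *flags);
}

/* ...and that this helper releases it again. */
static void demo_lock_release(unsigned long *flags)
	__releases(demo_lock)
{
	spin_unlock_irqrestore(&demo_lock, *flags);
}

Running sparse (make C=1) then warns when acquire/release pairs are unbalanced, which is what the annotations on the kretprobe hash-lock helpers buy.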
@@ -935,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
935 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); | 1132 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); |
936 | 1133 | ||
937 | if (p->break_handler || p->post_handler) | 1134 | if (p->break_handler || p->post_handler) |
938 | unoptimize_kprobe(ap); /* Fall back to normal kprobe */ | 1135 | unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ |
939 | 1136 | ||
940 | if (p->break_handler) { | 1137 | if (p->break_handler) { |
941 | if (ap->break_handler) | 1138 | if (ap->break_handler) |
@@ -986,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
986 | * This is the second or subsequent kprobe at the address - handle | 1183 | * This is the second or subsequent kprobe at the address - handle |
987 | * the intricacies | 1184 | * the intricacies |
988 | */ | 1185 | */ |
989 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | 1186 | static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, |
990 | struct kprobe *p) | 1187 | struct kprobe *p) |
991 | { | 1188 | { |
992 | int ret = 0; | 1189 | int ret = 0; |
993 | struct kprobe *ap = old_p; | 1190 | struct kprobe *ap = orig_p; |
994 | 1191 | ||
995 | if (!kprobe_aggrprobe(old_p)) { | 1192 | if (!kprobe_aggrprobe(orig_p)) { |
996 | /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ | 1193 | /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ |
997 | ap = alloc_aggr_kprobe(old_p); | 1194 | ap = alloc_aggr_kprobe(orig_p); |
998 | if (!ap) | 1195 | if (!ap) |
999 | return -ENOMEM; | 1196 | return -ENOMEM; |
1000 | init_aggr_kprobe(ap, old_p); | 1197 | init_aggr_kprobe(ap, orig_p); |
1001 | } | 1198 | } else if (kprobe_unused(ap)) |
1199 | /* This probe is going to die. Rescue it */ | ||
1200 | reuse_unused_kprobe(ap); | ||
1002 | 1201 | ||
1003 | if (kprobe_gone(ap)) { | 1202 | if (kprobe_gone(ap)) { |
1004 | /* | 1203 | /* |
@@ -1032,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
1032 | return add_new_kprobe(ap, p); | 1231 | return add_new_kprobe(ap, p); |
1033 | } | 1232 | } |
1034 | 1233 | ||
1035 | /* Try to disable aggr_kprobe, and return 1 if succeeded.*/ | ||
1036 | static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p) | ||
1037 | { | ||
1038 | struct kprobe *kp; | ||
1039 | |||
1040 | list_for_each_entry_rcu(kp, &p->list, list) { | ||
1041 | if (!kprobe_disabled(kp)) | ||
1042 | /* | ||
1043 | * There is an active probe on the list. | ||
1044 | * We can't disable aggr_kprobe. | ||
1045 | */ | ||
1046 | return 0; | ||
1047 | } | ||
1048 | p->flags |= KPROBE_FLAG_DISABLED; | ||
1049 | return 1; | ||
1050 | } | ||
1051 | |||
1052 | static int __kprobes in_kprobes_functions(unsigned long addr) | 1234 | static int __kprobes in_kprobes_functions(unsigned long addr) |
1053 | { | 1235 | { |
1054 | struct kprobe_blackpoint *kb; | 1236 | struct kprobe_blackpoint *kb; |
@@ -1091,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) | |||
1091 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ | 1273 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ |
1092 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) | 1274 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) |
1093 | { | 1275 | { |
1094 | struct kprobe *old_p, *list_p; | 1276 | struct kprobe *ap, *list_p; |
1095 | 1277 | ||
1096 | old_p = get_kprobe(p->addr); | 1278 | ap = get_kprobe(p->addr); |
1097 | if (unlikely(!old_p)) | 1279 | if (unlikely(!ap)) |
1098 | return NULL; | 1280 | return NULL; |
1099 | 1281 | ||
1100 | if (p != old_p) { | 1282 | if (p != ap) { |
1101 | list_for_each_entry_rcu(list_p, &old_p->list, list) | 1283 | list_for_each_entry_rcu(list_p, &ap->list, list) |
1102 | if (list_p == p) | 1284 | if (list_p == p) |
1103 | /* kprobe p is a valid probe */ | 1285 | /* kprobe p is a valid probe */ |
1104 | goto valid; | 1286 | goto valid; |
1105 | return NULL; | 1287 | return NULL; |
1106 | } | 1288 | } |
1107 | valid: | 1289 | valid: |
1108 | return old_p; | 1290 | return ap; |
1109 | } | 1291 | } |
1110 | 1292 | ||
1111 | /* Return error if the kprobe is being re-registered */ | 1293 | /* Return error if the kprobe is being re-registered */ |
1112 | static inline int check_kprobe_rereg(struct kprobe *p) | 1294 | static inline int check_kprobe_rereg(struct kprobe *p) |
1113 | { | 1295 | { |
1114 | int ret = 0; | 1296 | int ret = 0; |
1115 | struct kprobe *old_p; | ||
1116 | 1297 | ||
1117 | mutex_lock(&kprobe_mutex); | 1298 | mutex_lock(&kprobe_mutex); |
1118 | old_p = __get_valid_kprobe(p); | 1299 | if (__get_valid_kprobe(p)) |
1119 | if (old_p) | ||
1120 | ret = -EINVAL; | 1300 | ret = -EINVAL; |
1121 | mutex_unlock(&kprobe_mutex); | 1301 | mutex_unlock(&kprobe_mutex); |
1302 | |||
1122 | return ret; | 1303 | return ret; |
1123 | } | 1304 | } |
1124 | 1305 | ||
@@ -1138,13 +1319,13 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1138 | if (ret) | 1319 | if (ret) |
1139 | return ret; | 1320 | return ret; |
1140 | 1321 | ||
1322 | jump_label_lock(); | ||
1141 | preempt_disable(); | 1323 | preempt_disable(); |
1142 | if (!kernel_text_address((unsigned long) p->addr) || | 1324 | if (!kernel_text_address((unsigned long) p->addr) || |
1143 | in_kprobes_functions((unsigned long) p->addr) || | 1325 | in_kprobes_functions((unsigned long) p->addr) || |
1144 | ftrace_text_reserved(p->addr, p->addr)) { | 1326 | ftrace_text_reserved(p->addr, p->addr) || |
1145 | preempt_enable(); | 1327 | jump_label_text_reserved(p->addr, p->addr)) |
1146 | return -EINVAL; | 1328 | goto fail_with_jump_label; |
1147 | } | ||
1148 | 1329 | ||
1149 | /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ | 1330 | /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ |
1150 | p->flags &= KPROBE_FLAG_DISABLED; | 1331 | p->flags &= KPROBE_FLAG_DISABLED; |
@@ -1158,10 +1339,9 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1158 | * We must hold a refcount of the probed module while updating | 1339 | * We must hold a refcount of the probed module while updating |
1159 | * its code to prohibit unexpected unloading. | 1340 | * its code to prohibit unexpected unloading. |
1160 | */ | 1341 | */ |
1161 | if (unlikely(!try_module_get(probed_mod))) { | 1342 | if (unlikely(!try_module_get(probed_mod))) |
1162 | preempt_enable(); | 1343 | goto fail_with_jump_label; |
1163 | return -EINVAL; | 1344 | |
1164 | } | ||
1165 | /* | 1345 | /* |
1166 | * If the module freed .init.text, we couldn't insert | 1346 | * If the module freed .init.text, we couldn't insert |
1167 | * kprobes in there. | 1347 | * kprobes in there. |
@@ -1169,16 +1349,18 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1169 | if (within_module_init((unsigned long)p->addr, probed_mod) && | 1349 | if (within_module_init((unsigned long)p->addr, probed_mod) && |
1170 | probed_mod->state != MODULE_STATE_COMING) { | 1350 | probed_mod->state != MODULE_STATE_COMING) { |
1171 | module_put(probed_mod); | 1351 | module_put(probed_mod); |
1172 | preempt_enable(); | 1352 | goto fail_with_jump_label; |
1173 | return -EINVAL; | ||
1174 | } | 1353 | } |
1175 | } | 1354 | } |
1176 | preempt_enable(); | 1355 | preempt_enable(); |
1356 | jump_label_unlock(); | ||
1177 | 1357 | ||
1178 | p->nmissed = 0; | 1358 | p->nmissed = 0; |
1179 | INIT_LIST_HEAD(&p->list); | 1359 | INIT_LIST_HEAD(&p->list); |
1180 | mutex_lock(&kprobe_mutex); | 1360 | mutex_lock(&kprobe_mutex); |
1181 | 1361 | ||
1362 | jump_label_lock(); /* needed to call jump_label_text_reserved() */ | ||
1363 | |||
1182 | get_online_cpus(); /* For avoiding text_mutex deadlock. */ | 1364 | get_online_cpus(); /* For avoiding text_mutex deadlock. */ |
1183 | mutex_lock(&text_mutex); | 1365 | mutex_lock(&text_mutex); |
1184 | 1366 | ||
@@ -1206,76 +1388,136 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1206 | out: | 1388 | out: |
1207 | mutex_unlock(&text_mutex); | 1389 | mutex_unlock(&text_mutex); |
1208 | put_online_cpus(); | 1390 | put_online_cpus(); |
1391 | jump_label_unlock(); | ||
1209 | mutex_unlock(&kprobe_mutex); | 1392 | mutex_unlock(&kprobe_mutex); |
1210 | 1393 | ||
1211 | if (probed_mod) | 1394 | if (probed_mod) |
1212 | module_put(probed_mod); | 1395 | module_put(probed_mod); |
1213 | 1396 | ||
1214 | return ret; | 1397 | return ret; |
1398 | |||
1399 | fail_with_jump_label: | ||
1400 | preempt_enable(); | ||
1401 | jump_label_unlock(); | ||
1402 | return -EINVAL; | ||
1215 | } | 1403 | } |
1216 | EXPORT_SYMBOL_GPL(register_kprobe); | 1404 | EXPORT_SYMBOL_GPL(register_kprobe); |
1217 | 1405 | ||
1406 | /* Check if all probes on the aggrprobe are disabled */ | ||
1407 | static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) | ||
1408 | { | ||
1409 | struct kprobe *kp; | ||
1410 | |||
1411 | list_for_each_entry_rcu(kp, &ap->list, list) | ||
1412 | if (!kprobe_disabled(kp)) | ||
1413 | /* | ||
1414 | * There is an active probe on the list. | ||
1415 | * We can't disable this ap. | ||
1416 | */ | ||
1417 | return 0; | ||
1418 | |||
1419 | return 1; | ||
1420 | } | ||
1421 | |||
1422 | /* Disable one kprobe: must be called with kprobe_mutex held */ | ||
1423 | static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) | ||
1424 | { | ||
1425 | struct kprobe *orig_p; | ||
1426 | |||
1427 | /* Get an original kprobe for return */ | ||
1428 | orig_p = __get_valid_kprobe(p); | ||
1429 | if (unlikely(orig_p == NULL)) | ||
1430 | return NULL; | ||
1431 | |||
1432 | if (!kprobe_disabled(p)) { | ||
1433 | /* Disable probe if it is a child probe */ | ||
1434 | if (p != orig_p) | ||
1435 | p->flags |= KPROBE_FLAG_DISABLED; | ||
1436 | |||
1437 | /* Try to disarm and disable this/parent probe */ | ||
1438 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { | ||
1439 | disarm_kprobe(orig_p); | ||
1440 | orig_p->flags |= KPROBE_FLAG_DISABLED; | ||
1441 | } | ||
1442 | } | ||
1443 | |||
1444 | return orig_p; | ||
1445 | } | ||
1446 | |||
1218 | /* | 1447 | /* |
1219 | * Unregister a kprobe without a scheduler synchronization. | 1448 | * Unregister a kprobe without a scheduler synchronization. |
1220 | */ | 1449 | */ |
1221 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) | 1450 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) |
1222 | { | 1451 | { |
1223 | struct kprobe *old_p, *list_p; | 1452 | struct kprobe *ap, *list_p; |
1224 | 1453 | ||
1225 | old_p = __get_valid_kprobe(p); | 1454 | /* Disable kprobe. This will disarm it if needed. */ |
1226 | if (old_p == NULL) | 1455 | ap = __disable_kprobe(p); |
1456 | if (ap == NULL) | ||
1227 | return -EINVAL; | 1457 | return -EINVAL; |
1228 | 1458 | ||
1229 | if (old_p == p || | 1459 | if (ap == p) |
1230 | (kprobe_aggrprobe(old_p) && | ||
1231 | list_is_singular(&old_p->list))) { | ||
1232 | /* | 1460 | /* |
1233 | * Only probe on the hash list. Disarm only if kprobes are | 1461 | * This probe is an independent(and non-optimized) kprobe |
1234 | * enabled and not gone - otherwise, the breakpoint would | 1462 | * (not an aggrprobe). Remove from the hash list. |
1235 | * already have been removed. We save on flushing icache. | ||
1236 | */ | 1463 | */ |
1237 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) | 1464 | goto disarmed; |
1238 | disarm_kprobe(old_p); | 1465 | |
1239 | hlist_del_rcu(&old_p->hlist); | 1466 | /* Following process expects this probe is an aggrprobe */ |
1240 | } else { | 1467 | WARN_ON(!kprobe_aggrprobe(ap)); |
1468 | |||
1469 | if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) | ||
1470 | /* | ||
1471 | * !disarmed could happen if the probe is under delayed | ||
1472 | * unoptimizing. | ||
1473 | */ | ||
1474 | goto disarmed; | ||
1475 | else { | ||
1476 | /* If disabling probe has special handlers, update aggrprobe */ | ||
1241 | if (p->break_handler && !kprobe_gone(p)) | 1477 | if (p->break_handler && !kprobe_gone(p)) |
1242 | old_p->break_handler = NULL; | 1478 | ap->break_handler = NULL; |
1243 | if (p->post_handler && !kprobe_gone(p)) { | 1479 | if (p->post_handler && !kprobe_gone(p)) { |
1244 | list_for_each_entry_rcu(list_p, &old_p->list, list) { | 1480 | list_for_each_entry_rcu(list_p, &ap->list, list) { |
1245 | if ((list_p != p) && (list_p->post_handler)) | 1481 | if ((list_p != p) && (list_p->post_handler)) |
1246 | goto noclean; | 1482 | goto noclean; |
1247 | } | 1483 | } |
1248 | old_p->post_handler = NULL; | 1484 | ap->post_handler = NULL; |
1249 | } | 1485 | } |
1250 | noclean: | 1486 | noclean: |
1487 | /* | ||
1488 | * Remove from the aggrprobe: this path will do nothing in | ||
1489 | * __unregister_kprobe_bottom(). | ||
1490 | */ | ||
1251 | list_del_rcu(&p->list); | 1491 | list_del_rcu(&p->list); |
1252 | if (!kprobe_disabled(old_p)) { | 1492 | if (!kprobe_disabled(ap) && !kprobes_all_disarmed) |
1253 | try_to_disable_aggr_kprobe(old_p); | 1493 | /* |
1254 | if (!kprobes_all_disarmed) { | 1494 | * Try to optimize this probe again, because post |
1255 | if (kprobe_disabled(old_p)) | 1495 | * handler may have been changed. |
1256 | disarm_kprobe(old_p); | 1496 | */ |
1257 | else | 1497 | optimize_kprobe(ap); |
1258 | /* Try to optimize this probe again */ | ||
1259 | optimize_kprobe(old_p); | ||
1260 | } | ||
1261 | } | ||
1262 | } | 1498 | } |
1263 | return 0; | 1499 | return 0; |
1500 | |||
1501 | disarmed: | ||
1502 | BUG_ON(!kprobe_disarmed(ap)); | ||
1503 | hlist_del_rcu(&ap->hlist); | ||
1504 | return 0; | ||
1264 | } | 1505 | } |
1265 | 1506 | ||
1266 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | 1507 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) |
1267 | { | 1508 | { |
1268 | struct kprobe *old_p; | 1509 | struct kprobe *ap; |
1269 | 1510 | ||
1270 | if (list_empty(&p->list)) | 1511 | if (list_empty(&p->list)) |
1512 | /* This is an independent kprobe */ | ||
1271 | arch_remove_kprobe(p); | 1513 | arch_remove_kprobe(p); |
1272 | else if (list_is_singular(&p->list)) { | 1514 | else if (list_is_singular(&p->list)) { |
1273 | /* "p" is the last child of an aggr_kprobe */ | 1515 | /* This is the last child of an aggrprobe */ |
1274 | old_p = list_entry(p->list.next, struct kprobe, list); | 1516 | ap = list_entry(p->list.next, struct kprobe, list); |
1275 | list_del(&p->list); | 1517 | list_del(&p->list); |
1276 | arch_remove_kprobe(old_p); | 1518 | free_aggr_kprobe(ap); |
1277 | free_aggr_kprobe(old_p); | ||
1278 | } | 1519 | } |
1520 | /* Otherwise, do nothing. */ | ||
1279 | } | 1521 | } |
1280 | 1522 | ||
1281 | int __kprobes register_kprobes(struct kprobe **kps, int num) | 1523 | int __kprobes register_kprobes(struct kprobe **kps, int num) |
@@ -1339,18 +1581,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) | |||
1339 | if (num <= 0) | 1581 | if (num <= 0) |
1340 | return -EINVAL; | 1582 | return -EINVAL; |
1341 | for (i = 0; i < num; i++) { | 1583 | for (i = 0; i < num; i++) { |
1342 | unsigned long addr; | 1584 | unsigned long addr, offset; |
1343 | jp = jps[i]; | 1585 | jp = jps[i]; |
1344 | addr = arch_deref_entry_point(jp->entry); | 1586 | addr = arch_deref_entry_point(jp->entry); |
1345 | 1587 | ||
1346 | if (!kernel_text_address(addr)) | 1588 | /* Verify probepoint is a function entry point */ |
1347 | ret = -EINVAL; | 1589 | if (kallsyms_lookup_size_offset(addr, NULL, &offset) && |
1348 | else { | 1590 | offset == 0) { |
1349 | /* Todo: Verify probepoint is a function entry point */ | ||
1350 | jp->kp.pre_handler = setjmp_pre_handler; | 1591 | jp->kp.pre_handler = setjmp_pre_handler; |
1351 | jp->kp.break_handler = longjmp_break_handler; | 1592 | jp->kp.break_handler = longjmp_break_handler; |
1352 | ret = register_kprobe(&jp->kp); | 1593 | ret = register_kprobe(&jp->kp); |
1353 | } | 1594 | } else |
1595 | ret = -EINVAL; | ||
1596 | |||
1354 | if (ret < 0) { | 1597 | if (ret < 0) { |
1355 | if (i > 0) | 1598 | if (i > 0) |
1356 | unregister_jprobes(jps, i); | 1599 | unregister_jprobes(jps, i); |
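The jprobe registration path above now consults kallsyms to make sure the probe point sits exactly at a function entry (offset 0 into its symbol) instead of merely being a kernel text address. A minimal standalone sketch of that check, with the helper name invented for illustration:

#include <linux/kallsyms.h>
#include <linux/types.h>

/* Return true only if addr is the first byte of the symbol containing it. */
static bool demo_is_function_entry(unsigned long addr)
{
	unsigned long offset = 0;

	if (!kallsyms_lookup_size_offset(addr, NULL, &offset))
		return false;		/* address is not inside any kernel symbol */

	return offset == 0;		/* entry point iff zero offset into the symbol */
}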
@@ -1592,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
1592 | int __kprobes disable_kprobe(struct kprobe *kp) | 1835 | int __kprobes disable_kprobe(struct kprobe *kp) |
1593 | { | 1836 | { |
1594 | int ret = 0; | 1837 | int ret = 0; |
1595 | struct kprobe *p; | ||
1596 | 1838 | ||
1597 | mutex_lock(&kprobe_mutex); | 1839 | mutex_lock(&kprobe_mutex); |
1598 | 1840 | ||
1599 | /* Check whether specified probe is valid. */ | 1841 | /* Disable this kprobe */ |
1600 | p = __get_valid_kprobe(kp); | 1842 | if (__disable_kprobe(kp) == NULL) |
1601 | if (unlikely(p == NULL)) { | ||
1602 | ret = -EINVAL; | 1843 | ret = -EINVAL; |
1603 | goto out; | ||
1604 | } | ||
1605 | 1844 | ||
1606 | /* If the probe is already disabled (or gone), just return */ | ||
1607 | if (kprobe_disabled(kp)) | ||
1608 | goto out; | ||
1609 | |||
1610 | kp->flags |= KPROBE_FLAG_DISABLED; | ||
1611 | if (p != kp) | ||
1612 | /* When kp != p, p is always enabled. */ | ||
1613 | try_to_disable_aggr_kprobe(p); | ||
1614 | |||
1615 | if (!kprobes_all_disarmed && kprobe_disabled(p)) | ||
1616 | disarm_kprobe(p); | ||
1617 | out: | ||
1618 | mutex_unlock(&kprobe_mutex); | 1845 | mutex_unlock(&kprobe_mutex); |
1619 | return ret; | 1846 | return ret; |
1620 | } | 1847 | } |
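With disable_kprobe() now funneled through __disable_kprobe(), a caller of the API sees the same behaviour as before: the probe stays registered but is disarmed until enable_kprobe() re-arms it. A minimal module-style sketch, using do_fork purely as an example target symbol (all demo_* names are invented):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %p\n", p->addr);
	return 0;
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",	/* example symbol only */
	.pre_handler	= demo_pre,
};

static int __init demo_init(void)
{
	int ret = register_kprobe(&demo_kp);

	if (ret < 0)
		return ret;
	disable_kprobe(&demo_kp);	/* disarm: handler no longer fires */
	enable_kprobe(&demo_kp);	/* re-arm it again */
	return 0;
}

static void __exit demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");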
@@ -1912,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void) | |||
1912 | mutex_lock(&kprobe_mutex); | 2139 | mutex_lock(&kprobe_mutex); |
1913 | 2140 | ||
1914 | /* If kprobes are already disarmed, just return */ | 2141 | /* If kprobes are already disarmed, just return */ |
1915 | if (kprobes_all_disarmed) | 2142 | if (kprobes_all_disarmed) { |
1916 | goto already_disabled; | 2143 | mutex_unlock(&kprobe_mutex); |
2144 | return; | ||
2145 | } | ||
1917 | 2146 | ||
1918 | kprobes_all_disarmed = true; | 2147 | kprobes_all_disarmed = true; |
1919 | printk(KERN_INFO "Kprobes globally disabled\n"); | 2148 | printk(KERN_INFO "Kprobes globally disabled\n"); |
1920 | 2149 | ||
1921 | /* | ||
1922 | * Here we call get_online_cpus() for avoiding text_mutex deadlock, | ||
1923 | * because disarming may also unoptimize kprobes. | ||
1924 | */ | ||
1925 | get_online_cpus(); | ||
1926 | mutex_lock(&text_mutex); | 2150 | mutex_lock(&text_mutex); |
1927 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2151 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1928 | head = &kprobe_table[i]; | 2152 | head = &kprobe_table[i]; |
1929 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2153 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
1930 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 2154 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
1931 | __disarm_kprobe(p); | 2155 | __disarm_kprobe(p, false); |
1932 | } | 2156 | } |
1933 | } | 2157 | } |
1934 | |||
1935 | mutex_unlock(&text_mutex); | 2158 | mutex_unlock(&text_mutex); |
1936 | put_online_cpus(); | ||
1937 | mutex_unlock(&kprobe_mutex); | 2159 | mutex_unlock(&kprobe_mutex); |
1938 | /* Allow all currently running kprobes to complete */ | ||
1939 | synchronize_sched(); | ||
1940 | return; | ||
1941 | 2160 | ||
1942 | already_disabled: | 2161 | /* Wait for disarming all kprobes by optimizer */ |
1943 | mutex_unlock(&kprobe_mutex); | 2162 | wait_for_kprobe_optimizer(); |
1944 | return; | ||
1945 | } | 2163 | } |
1946 | 2164 | ||
1947 | /* | 2165 | /* |
@@ -1992,6 +2210,7 @@ static ssize_t write_enabled_file_bool(struct file *file, | |||
1992 | static const struct file_operations fops_kp = { | 2210 | static const struct file_operations fops_kp = { |
1993 | .read = read_enabled_file_bool, | 2211 | .read = read_enabled_file_bool, |
1994 | .write = write_enabled_file_bool, | 2212 | .write = write_enabled_file_bool, |
2213 | .llseek = default_llseek, | ||
1995 | }; | 2214 | }; |
1996 | 2215 | ||
1997 | static int __kprobes debugfs_kprobe_init(void) | 2216 | static int __kprobes debugfs_kprobe_init(void) |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0b624e791805..3b053c04dd86 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/kexec.h> | 16 | #include <linux/kexec.h> |
17 | #include <linux/profile.h> | 17 | #include <linux/profile.h> |
18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/capability.h> | ||
19 | 20 | ||
20 | #define KERNEL_ATTR_RO(_name) \ | 21 | #define KERNEL_ATTR_RO(_name) \ |
21 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | 22 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) |
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo); | |||
131 | 132 | ||
132 | #endif /* CONFIG_KEXEC */ | 133 | #endif /* CONFIG_KEXEC */ |
133 | 134 | ||
135 | /* whether file capabilities are enabled */ | ||
136 | static ssize_t fscaps_show(struct kobject *kobj, | ||
137 | struct kobj_attribute *attr, char *buf) | ||
138 | { | ||
139 | return sprintf(buf, "%d\n", file_caps_enabled); | ||
140 | } | ||
141 | KERNEL_ATTR_RO(fscaps); | ||
142 | |||
134 | /* | 143 | /* |
135 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. | 144 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. |
136 | */ | 145 | */ |
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj; | |||
158 | EXPORT_SYMBOL_GPL(kernel_kobj); | 167 | EXPORT_SYMBOL_GPL(kernel_kobj); |
159 | 168 | ||
160 | static struct attribute * kernel_attrs[] = { | 169 | static struct attribute * kernel_attrs[] = { |
170 | &fscaps_attr.attr, | ||
161 | #if defined(CONFIG_HOTPLUG) | 171 | #if defined(CONFIG_HOTPLUG) |
162 | &uevent_seqnum_attr.attr, | 172 | &uevent_seqnum_attr.attr, |
163 | &uevent_helper_attr.attr, | 173 | &uevent_helper_attr.attr, |
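Because fscaps_attr is added to kernel_attrs on kernel_kobj, the new flag should surface as a read-only file under /sys/kernel (the exact path is inferred from the attribute name, so treat it as an assumption). A tiny userspace sketch for reading it:

#include <stdio.h>

int main(void)
{
	int enabled;
	FILE *f = fopen("/sys/kernel/fscaps", "r");	/* path assumed from the attribute name */

	if (!f) {
		perror("fscaps");
		return 1;
	}
	if (fscanf(f, "%d", &enabled) == 1)
		printf("file capabilities %s\n", enabled ? "enabled" : "disabled");
	fclose(f);
	return 0;
}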
diff --git a/kernel/kthread.c b/kernel/kthread.c index 2dc3786349d1..4ba7cccb4994 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -27,6 +27,7 @@ struct kthread_create_info | |||
27 | /* Information passed to kthread() from kthreadd. */ | 27 | /* Information passed to kthread() from kthreadd. */ |
28 | int (*threadfn)(void *data); | 28 | int (*threadfn)(void *data); |
29 | void *data; | 29 | void *data; |
30 | int node; | ||
30 | 31 | ||
31 | /* Result passed back to kthread_create() from kthreadd. */ | 32 | /* Result passed back to kthread_create() from kthreadd. */ |
32 | struct task_struct *result; | 33 | struct task_struct *result; |
@@ -98,10 +99,23 @@ static int kthread(void *_create) | |||
98 | do_exit(ret); | 99 | do_exit(ret); |
99 | } | 100 | } |
100 | 101 | ||
102 | /* called from do_fork() to get node information for the task about to be created */ | ||
103 | int tsk_fork_get_node(struct task_struct *tsk) | ||
104 | { | ||
105 | #ifdef CONFIG_NUMA | ||
106 | if (tsk == kthreadd_task) | ||
107 | return tsk->pref_node_fork; | ||
108 | #endif | ||
109 | return numa_node_id(); | ||
110 | } | ||
111 | |||
101 | static void create_kthread(struct kthread_create_info *create) | 112 | static void create_kthread(struct kthread_create_info *create) |
102 | { | 113 | { |
103 | int pid; | 114 | int pid; |
104 | 115 | ||
116 | #ifdef CONFIG_NUMA | ||
117 | current->pref_node_fork = create->node; | ||
118 | #endif | ||
105 | /* We want our own signal handler (we take no signals by default). */ | 119 | /* We want our own signal handler (we take no signals by default). */ |
106 | pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); | 120 | pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); |
107 | if (pid < 0) { | 121 | if (pid < 0) { |
@@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create) | |||
111 | } | 125 | } |
112 | 126 | ||
113 | /** | 127 | /** |
114 | * kthread_create - create a kthread. | 128 | * kthread_create_on_node - create a kthread. |
115 | * @threadfn: the function to run until signal_pending(current). | 129 | * @threadfn: the function to run until signal_pending(current). |
116 | * @data: data ptr for @threadfn. | 130 | * @data: data ptr for @threadfn. |
131 | * @node: memory node number. | ||
117 | * @namefmt: printf-style name for the thread. | 132 | * @namefmt: printf-style name for the thread. |
118 | * | 133 | * |
119 | * Description: This helper function creates and names a kernel | 134 | * Description: This helper function creates and names a kernel |
120 | * thread. The thread will be stopped: use wake_up_process() to start | 135 | * thread. The thread will be stopped: use wake_up_process() to start |
121 | * it. See also kthread_run(). | 136 | * it. See also kthread_run(). |
122 | * | 137 | * |
138 | * If thread is going to be bound on a particular cpu, give its node | ||
139 | * in @node, to get NUMA affinity for kthread stack, or else give -1. | ||
123 | * When woken, the thread will run @threadfn() with @data as its | 140 | * When woken, the thread will run @threadfn() with @data as its |
124 | * argument. @threadfn() can either call do_exit() directly if it is a | 141 | * argument. @threadfn() can either call do_exit() directly if it is a |
125 | * standalone thread for which noone will call kthread_stop(), or | 142 | * standalone thread for which no one will call kthread_stop(), or |
126 | * return when 'kthread_should_stop()' is true (which means | 143 | * return when 'kthread_should_stop()' is true (which means |
127 | * kthread_stop() has been called). The return value should be zero | 144 | * kthread_stop() has been called). The return value should be zero |
128 | * or a negative error number; it will be passed to kthread_stop(). | 145 | * or a negative error number; it will be passed to kthread_stop(). |
129 | * | 146 | * |
130 | * Returns a task_struct or ERR_PTR(-ENOMEM). | 147 | * Returns a task_struct or ERR_PTR(-ENOMEM). |
131 | */ | 148 | */ |
132 | struct task_struct *kthread_create(int (*threadfn)(void *data), | 149 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), |
133 | void *data, | 150 | void *data, |
134 | const char namefmt[], | 151 | int node, |
135 | ...) | 152 | const char namefmt[], |
153 | ...) | ||
136 | { | 154 | { |
137 | struct kthread_create_info create; | 155 | struct kthread_create_info create; |
138 | 156 | ||
139 | create.threadfn = threadfn; | 157 | create.threadfn = threadfn; |
140 | create.data = data; | 158 | create.data = data; |
159 | create.node = node; | ||
141 | init_completion(&create.done); | 160 | init_completion(&create.done); |
142 | 161 | ||
143 | spin_lock(&kthread_create_lock); | 162 | spin_lock(&kthread_create_lock); |
@@ -148,7 +167,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
148 | wait_for_completion(&create.done); | 167 | wait_for_completion(&create.done); |
149 | 168 | ||
150 | if (!IS_ERR(create.result)) { | 169 | if (!IS_ERR(create.result)) { |
151 | struct sched_param param = { .sched_priority = 0 }; | 170 | static const struct sched_param param = { .sched_priority = 0 }; |
152 | va_list args; | 171 | va_list args; |
153 | 172 | ||
154 | va_start(args, namefmt); | 173 | va_start(args, namefmt); |
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
164 | } | 183 | } |
165 | return create.result; | 184 | return create.result; |
166 | } | 185 | } |
167 | EXPORT_SYMBOL(kthread_create); | 186 | EXPORT_SYMBOL(kthread_create_on_node); |
168 | 187 | ||
169 | /** | 188 | /** |
170 | * kthread_bind - bind a just-created kthread to a cpu. | 189 | * kthread_bind - bind a just-created kthread to a cpu. |
@@ -183,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) | |||
183 | return; | 202 | return; |
184 | } | 203 | } |
185 | 204 | ||
186 | p->cpus_allowed = cpumask_of_cpu(cpu); | 205 | /* It's safe because the task is inactive. */ |
187 | p->rt.nr_cpus_allowed = 1; | 206 | do_set_cpus_allowed(p, cpumask_of(cpu)); |
188 | p->flags |= PF_THREAD_BOUND; | 207 | p->flags |= PF_THREAD_BOUND; |
189 | } | 208 | } |
190 | EXPORT_SYMBOL(kthread_bind); | 209 | EXPORT_SYMBOL(kthread_bind); |
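Callers that also bind the thread to a CPU are the intended users of the new node argument: passing the CPU's memory node lets the kthread stack be allocated with NUMA affinity. A minimal sketch of that pattern (demo_* names and the sleep loop are illustrative):

#include <linux/kthread.h>
#include <linux/topology.h>
#include <linux/err.h>

static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* idle placeholder work */
	return 0;
}

/* Create a worker for one CPU with its stack allocated on that CPU's node. */
static struct task_struct *demo_start_on(unsigned int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create_on_node(demo_thread_fn, NULL,
				     cpu_to_node(cpu), "demo/%u", cpu);
	if (IS_ERR(tsk))
		return tsk;

	kthread_bind(tsk, cpu);		/* safe: the task has not run yet */
	wake_up_process(tsk);
	return tsk;
}

Callers that do not care about placement can keep passing -1 as the node, which preserves the old kthread_create() behaviour.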
@@ -265,6 +284,17 @@ int kthreadd(void *unused) | |||
265 | return 0; | 284 | return 0; |
266 | } | 285 | } |
267 | 286 | ||
287 | void __init_kthread_worker(struct kthread_worker *worker, | ||
288 | const char *name, | ||
289 | struct lock_class_key *key) | ||
290 | { | ||
291 | spin_lock_init(&worker->lock); | ||
292 | lockdep_set_class_and_name(&worker->lock, key, name); | ||
293 | INIT_LIST_HEAD(&worker->work_list); | ||
294 | worker->task = NULL; | ||
295 | } | ||
296 | EXPORT_SYMBOL_GPL(__init_kthread_worker); | ||
297 | |||
268 | /** | 298 | /** |
269 | * kthread_worker_fn - kthread function to process kthread_worker | 299 | * kthread_worker_fn - kthread function to process kthread_worker |
270 | * @worker_ptr: pointer to initialized kthread_worker | 300 | * @worker_ptr: pointer to initialized kthread_worker |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 877fb306d415..376066e10413 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk, | |||
153 | } | 153 | } |
154 | 154 | ||
155 | /** | 155 | /** |
156 | * __account_scheduler_latency - record an occured latency | 156 | * __account_scheduler_latency - record an occurred latency |
157 | * @tsk - the task struct of the task hitting the latency | 157 | * @tsk - the task struct of the task hitting the latency |
158 | * @usecs - the duration of the latency in microseconds | 158 | * @usecs - the duration of the latency in microseconds |
159 | * @inter - 1 if the sleep was interruptible, 0 if uninterruptible | 159 | * @inter - 1 if the sleep was interruptible, 0 if uninterruptible |
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
194 | 194 | ||
195 | account_global_scheduler_latency(tsk, &lat); | 195 | account_global_scheduler_latency(tsk, &lat); |
196 | 196 | ||
197 | /* | 197 | for (i = 0; i < tsk->latency_record_count; i++) { |
198 | * short term hack; if we're > 32 we stop; future we recycle: | ||
199 | */ | ||
200 | tsk->latency_record_count++; | ||
201 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
202 | goto out_unlock; | ||
203 | |||
204 | for (i = 0; i < LT_SAVECOUNT; i++) { | ||
205 | struct latency_record *mylat; | 198 | struct latency_record *mylat; |
206 | int same = 1; | 199 | int same = 1; |
207 | 200 | ||
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
227 | } | 220 | } |
228 | } | 221 | } |
229 | 222 | ||
223 | /* | ||
224 | * short term hack; if we're > 32 we stop; future we recycle: | ||
225 | */ | ||
226 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
227 | goto out_unlock; | ||
228 | |||
230 | /* Allocated a new one: */ | 229 | /* Allocated a new one: */ |
231 | i = tsk->latency_record_count; | 230 | i = tsk->latency_record_count++; |
232 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); | 231 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); |
233 | 232 | ||
234 | out_unlock: | 233 | out_unlock: |
@@ -242,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v) | |||
242 | seq_puts(m, "Latency Top version : v0.1\n"); | 241 | seq_puts(m, "Latency Top version : v0.1\n"); |
243 | 242 | ||
244 | for (i = 0; i < MAXLR; i++) { | 243 | for (i = 0; i < MAXLR; i++) { |
245 | if (latency_record[i].backtrace[0]) { | 244 | struct latency_record *lr = &latency_record[i]; |
245 | |||
246 | if (lr->backtrace[0]) { | ||
246 | int q; | 247 | int q; |
247 | seq_printf(m, "%i %lu %lu ", | 248 | seq_printf(m, "%i %lu %lu", |
248 | latency_record[i].count, | 249 | lr->count, lr->time, lr->max); |
249 | latency_record[i].time, | ||
250 | latency_record[i].max); | ||
251 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | 250 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { |
252 | char sym[KSYM_SYMBOL_LEN]; | 251 | unsigned long bt = lr->backtrace[q]; |
253 | char *c; | 252 | if (!bt) |
254 | if (!latency_record[i].backtrace[q]) | ||
255 | break; | 253 | break; |
256 | if (latency_record[i].backtrace[q] == ULONG_MAX) | 254 | if (bt == ULONG_MAX) |
257 | break; | 255 | break; |
258 | sprint_symbol(sym, latency_record[i].backtrace[q]); | 256 | seq_printf(m, " %ps", (void *)bt); |
259 | c = strchr(sym, '+'); | ||
260 | if (c) | ||
261 | *c = 0; | ||
262 | seq_printf(m, "%s ", sym); | ||
263 | } | 257 | } |
264 | seq_printf(m, "\n"); | 258 | seq_printf(m, "\n"); |
265 | } | 259 | } |
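The rewritten lstats_show() leans on the %ps vsnprintf extension, which prints a code address as its symbol name without the +offset/size suffix, replacing the old sprint_symbol()-plus-strchr() trimming. A minimal sketch of the same idiom (the helper is invented for illustration):

#include <linux/kernel.h>
#include <linux/seq_file.h>

/* Emit one saved backtrace entry the way lstats_show() now does. */
static void demo_print_entry(struct seq_file *m, unsigned long bt)
{
	if (bt && bt != ULONG_MAX)
		seq_printf(m, " %ps", (void *)bt);	/* %ps: symbol name only; %pS would add +offset */
}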
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f2852a510232..298c9276dfdb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) | |||
490 | usage[i] = '\0'; | 490 | usage[i] = '\0'; |
491 | } | 491 | } |
492 | 492 | ||
493 | static int __print_lock_name(struct lock_class *class) | ||
494 | { | ||
495 | char str[KSYM_NAME_LEN]; | ||
496 | const char *name; | ||
497 | |||
498 | name = class->name; | ||
499 | if (!name) | ||
500 | name = __get_key_name(class->key, str); | ||
501 | |||
502 | return printk("%s", name); | ||
503 | } | ||
504 | |||
493 | static void print_lock_name(struct lock_class *class) | 505 | static void print_lock_name(struct lock_class *class) |
494 | { | 506 | { |
495 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; | 507 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; |
@@ -639,6 +651,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
639 | } | 651 | } |
640 | #endif | 652 | #endif |
641 | 653 | ||
654 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | ||
655 | debug_locks_off(); | ||
656 | printk(KERN_ERR | ||
657 | "BUG: looking up invalid subclass: %u\n", subclass); | ||
658 | printk(KERN_ERR | ||
659 | "turning off the locking correctness validator.\n"); | ||
660 | dump_stack(); | ||
661 | return NULL; | ||
662 | } | ||
663 | |||
642 | /* | 664 | /* |
643 | * Static locks do not have their class-keys yet - for them the key | 665 | * Static locks do not have their class-keys yet - for them the key |
644 | * is the lock object itself: | 666 | * is the lock object itself: |
@@ -774,7 +796,9 @@ out_unlock_set: | |||
774 | raw_local_irq_restore(flags); | 796 | raw_local_irq_restore(flags); |
775 | 797 | ||
776 | if (!subclass || force) | 798 | if (!subclass || force) |
777 | lock->class_cache = class; | 799 | lock->class_cache[0] = class; |
800 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) | ||
801 | lock->class_cache[subclass] = class; | ||
778 | 802 | ||
779 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) | 803 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) |
780 | return NULL; | 804 | return NULL; |
@@ -1041,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth) | |||
1041 | return 0; | 1065 | return 0; |
1042 | } | 1066 | } |
1043 | 1067 | ||
1068 | static void | ||
1069 | print_circular_lock_scenario(struct held_lock *src, | ||
1070 | struct held_lock *tgt, | ||
1071 | struct lock_list *prt) | ||
1072 | { | ||
1073 | struct lock_class *source = hlock_class(src); | ||
1074 | struct lock_class *target = hlock_class(tgt); | ||
1075 | struct lock_class *parent = prt->class; | ||
1076 | |||
1077 | /* | ||
1078 | * A direct locking problem where unsafe_class lock is taken | ||
1079 | * directly by safe_class lock, then all we need to show | ||
1080 | * is the deadlock scenario, as it is obvious that the | ||
1081 | * unsafe lock is taken under the safe lock. | ||
1082 | * | ||
1083 | * But if there is a chain instead, where the safe lock takes | ||
1084 | * an intermediate lock (middle_class) where this lock is | ||
1085 | * not the same as the safe lock, then the lock chain is | ||
1086 | * used to describe the problem. Otherwise we would need | ||
1087 | * to show a different CPU case for each link in the chain | ||
1088 | * from the safe_class lock to the unsafe_class lock. | ||
1089 | */ | ||
1090 | if (parent != source) { | ||
1091 | printk("Chain exists of:\n "); | ||
1092 | __print_lock_name(source); | ||
1093 | printk(" --> "); | ||
1094 | __print_lock_name(parent); | ||
1095 | printk(" --> "); | ||
1096 | __print_lock_name(target); | ||
1097 | printk("\n\n"); | ||
1098 | } | ||
1099 | |||
1100 | printk(" Possible unsafe locking scenario:\n\n"); | ||
1101 | printk(" CPU0 CPU1\n"); | ||
1102 | printk(" ---- ----\n"); | ||
1103 | printk(" lock("); | ||
1104 | __print_lock_name(target); | ||
1105 | printk(");\n"); | ||
1106 | printk(" lock("); | ||
1107 | __print_lock_name(parent); | ||
1108 | printk(");\n"); | ||
1109 | printk(" lock("); | ||
1110 | __print_lock_name(target); | ||
1111 | printk(");\n"); | ||
1112 | printk(" lock("); | ||
1113 | __print_lock_name(source); | ||
1114 | printk(");\n"); | ||
1115 | printk("\n *** DEADLOCK ***\n\n"); | ||
1116 | } | ||
1117 | |||
1044 | /* | 1118 | /* |
1045 | * When a circular dependency is detected, print the | 1119 | * When a circular dependency is detected, print the |
1046 | * header first: | 1120 | * header first: |
@@ -1084,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
1084 | { | 1158 | { |
1085 | struct task_struct *curr = current; | 1159 | struct task_struct *curr = current; |
1086 | struct lock_list *parent; | 1160 | struct lock_list *parent; |
1161 | struct lock_list *first_parent; | ||
1087 | int depth; | 1162 | int depth; |
1088 | 1163 | ||
1089 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1164 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
@@ -1097,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
1097 | print_circular_bug_header(target, depth, check_src, check_tgt); | 1172 | print_circular_bug_header(target, depth, check_src, check_tgt); |
1098 | 1173 | ||
1099 | parent = get_lock_parent(target); | 1174 | parent = get_lock_parent(target); |
1175 | first_parent = parent; | ||
1100 | 1176 | ||
1101 | while (parent) { | 1177 | while (parent) { |
1102 | print_circular_bug_entry(parent, --depth); | 1178 | print_circular_bug_entry(parent, --depth); |
@@ -1104,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
1104 | } | 1180 | } |
1105 | 1181 | ||
1106 | printk("\nother info that might help us debug this:\n\n"); | 1182 | printk("\nother info that might help us debug this:\n\n"); |
1183 | print_circular_lock_scenario(check_src, check_tgt, | ||
1184 | first_parent); | ||
1185 | |||
1107 | lockdep_print_held_locks(curr); | 1186 | lockdep_print_held_locks(curr); |
1108 | 1187 | ||
1109 | printk("\nstack backtrace:\n"); | 1188 | printk("\nstack backtrace:\n"); |
@@ -1302,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, | |||
1302 | printk("\n"); | 1381 | printk("\n"); |
1303 | 1382 | ||
1304 | if (depth == 0 && (entry != root)) { | 1383 | if (depth == 0 && (entry != root)) { |
1305 | printk("lockdep:%s bad BFS generated tree\n", __func__); | 1384 | printk("lockdep:%s bad path found in chain graph\n", __func__); |
1306 | break; | 1385 | break; |
1307 | } | 1386 | } |
1308 | 1387 | ||
@@ -1313,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf, | |||
1313 | return; | 1392 | return; |
1314 | } | 1393 | } |
1315 | 1394 | ||
1395 | static void | ||
1396 | print_irq_lock_scenario(struct lock_list *safe_entry, | ||
1397 | struct lock_list *unsafe_entry, | ||
1398 | struct lock_class *prev_class, | ||
1399 | struct lock_class *next_class) | ||
1400 | { | ||
1401 | struct lock_class *safe_class = safe_entry->class; | ||
1402 | struct lock_class *unsafe_class = unsafe_entry->class; | ||
1403 | struct lock_class *middle_class = prev_class; | ||
1404 | |||
1405 | if (middle_class == safe_class) | ||
1406 | middle_class = next_class; | ||
1407 | |||
1408 | /* | ||
1409 | * A direct locking problem where unsafe_class lock is taken | ||
1410 | * directly by safe_class lock, then all we need to show | ||
1411 | * is the deadlock scenario, as it is obvious that the | ||
1412 | * unsafe lock is taken under the safe lock. | ||
1413 | * | ||
1414 | * But if there is a chain instead, where the safe lock takes | ||
1415 | * an intermediate lock (middle_class) where this lock is | ||
1416 | * not the same as the safe lock, then the lock chain is | ||
1417 | * used to describe the problem. Otherwise we would need | ||
1418 | * to show a different CPU case for each link in the chain | ||
1419 | * from the safe_class lock to the unsafe_class lock. | ||
1420 | */ | ||
1421 | if (middle_class != unsafe_class) { | ||
1422 | printk("Chain exists of:\n "); | ||
1423 | __print_lock_name(safe_class); | ||
1424 | printk(" --> "); | ||
1425 | __print_lock_name(middle_class); | ||
1426 | printk(" --> "); | ||
1427 | __print_lock_name(unsafe_class); | ||
1428 | printk("\n\n"); | ||
1429 | } | ||
1430 | |||
1431 | printk(" Possible interrupt unsafe locking scenario:\n\n"); | ||
1432 | printk(" CPU0 CPU1\n"); | ||
1433 | printk(" ---- ----\n"); | ||
1434 | printk(" lock("); | ||
1435 | __print_lock_name(unsafe_class); | ||
1436 | printk(");\n"); | ||
1437 | printk(" local_irq_disable();\n"); | ||
1438 | printk(" lock("); | ||
1439 | __print_lock_name(safe_class); | ||
1440 | printk(");\n"); | ||
1441 | printk(" lock("); | ||
1442 | __print_lock_name(middle_class); | ||
1443 | printk(");\n"); | ||
1444 | printk(" <Interrupt>\n"); | ||
1445 | printk(" lock("); | ||
1446 | __print_lock_name(safe_class); | ||
1447 | printk(");\n"); | ||
1448 | printk("\n *** DEADLOCK ***\n\n"); | ||
1449 | } | ||
1450 | |||
1316 | static int | 1451 | static int |
1317 | print_bad_irq_dependency(struct task_struct *curr, | 1452 | print_bad_irq_dependency(struct task_struct *curr, |
1318 | struct lock_list *prev_root, | 1453 | struct lock_list *prev_root, |
@@ -1364,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1364 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); | 1499 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); |
1365 | 1500 | ||
1366 | printk("\nother info that might help us debug this:\n\n"); | 1501 | printk("\nother info that might help us debug this:\n\n"); |
1502 | print_irq_lock_scenario(backwards_entry, forwards_entry, | ||
1503 | hlock_class(prev), hlock_class(next)); | ||
1504 | |||
1367 | lockdep_print_held_locks(curr); | 1505 | lockdep_print_held_locks(curr); |
1368 | 1506 | ||
1369 | printk("\nthe dependencies between %s-irq-safe lock", irqclass); | 1507 | printk("\nthe dependencies between %s-irq-safe lock", irqclass); |
@@ -1527,6 +1665,26 @@ static inline void inc_chains(void) | |||
1527 | 1665 | ||
1528 | #endif | 1666 | #endif |
1529 | 1667 | ||
1668 | static void | ||
1669 | print_deadlock_scenario(struct held_lock *nxt, | ||
1670 | struct held_lock *prv) | ||
1671 | { | ||
1672 | struct lock_class *next = hlock_class(nxt); | ||
1673 | struct lock_class *prev = hlock_class(prv); | ||
1674 | |||
1675 | printk(" Possible unsafe locking scenario:\n\n"); | ||
1676 | printk(" CPU0\n"); | ||
1677 | printk(" ----\n"); | ||
1678 | printk(" lock("); | ||
1679 | __print_lock_name(prev); | ||
1680 | printk(");\n"); | ||
1681 | printk(" lock("); | ||
1682 | __print_lock_name(next); | ||
1683 | printk(");\n"); | ||
1684 | printk("\n *** DEADLOCK ***\n\n"); | ||
1685 | printk(" May be due to missing lock nesting notation\n\n"); | ||
1686 | } | ||
1687 | |||
1530 | static int | 1688 | static int |
1531 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | 1689 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, |
1532 | struct held_lock *next) | 1690 | struct held_lock *next) |
@@ -1545,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
1545 | print_lock(prev); | 1703 | print_lock(prev); |
1546 | 1704 | ||
1547 | printk("\nother info that might help us debug this:\n"); | 1705 | printk("\nother info that might help us debug this:\n"); |
1706 | print_deadlock_scenario(next, prev); | ||
1548 | lockdep_print_held_locks(curr); | 1707 | lockdep_print_held_locks(curr); |
1549 | 1708 | ||
1550 | printk("\nstack backtrace:\n"); | 1709 | printk("\nstack backtrace:\n"); |
@@ -1814,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
1814 | struct list_head *hash_head = chainhashentry(chain_key); | 1973 | struct list_head *hash_head = chainhashentry(chain_key); |
1815 | struct lock_chain *chain; | 1974 | struct lock_chain *chain; |
1816 | struct held_lock *hlock_curr, *hlock_next; | 1975 | struct held_lock *hlock_curr, *hlock_next; |
1817 | int i, j, n, cn; | 1976 | int i, j; |
1818 | 1977 | ||
1819 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 1978 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
1820 | return 0; | 1979 | return 0; |
@@ -1874,15 +2033,9 @@ cache_hit: | |||
1874 | } | 2033 | } |
1875 | i++; | 2034 | i++; |
1876 | chain->depth = curr->lockdep_depth + 1 - i; | 2035 | chain->depth = curr->lockdep_depth + 1 - i; |
1877 | cn = nr_chain_hlocks; | 2036 | if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { |
1878 | while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { | 2037 | chain->base = nr_chain_hlocks; |
1879 | n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); | 2038 | nr_chain_hlocks += chain->depth; |
1880 | if (n == cn) | ||
1881 | break; | ||
1882 | cn = n; | ||
1883 | } | ||
1884 | if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { | ||
1885 | chain->base = cn; | ||
1886 | for (j = 0; j < chain->depth - 1; j++, i++) { | 2039 | for (j = 0; j < chain->depth - 1; j++, i++) { |
1887 | int lock_id = curr->held_locks[i].class_idx - 1; | 2040 | int lock_id = curr->held_locks[i].class_idx - 1; |
1888 | chain_hlocks[chain->base + j] = lock_id; | 2041 | chain_hlocks[chain->base + j] = lock_id; |
@@ -1999,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr) | |||
1999 | #endif | 2152 | #endif |
2000 | } | 2153 | } |
2001 | 2154 | ||
2155 | static void | ||
2156 | print_usage_bug_scenario(struct held_lock *lock) | ||
2157 | { | ||
2158 | struct lock_class *class = hlock_class(lock); | ||
2159 | |||
2160 | printk(" Possible unsafe locking scenario:\n\n"); | ||
2161 | printk(" CPU0\n"); | ||
2162 | printk(" ----\n"); | ||
2163 | printk(" lock("); | ||
2164 | __print_lock_name(class); | ||
2165 | printk(");\n"); | ||
2166 | printk(" <Interrupt>\n"); | ||
2167 | printk(" lock("); | ||
2168 | __print_lock_name(class); | ||
2169 | printk(");\n"); | ||
2170 | printk("\n *** DEADLOCK ***\n\n"); | ||
2171 | } | ||
2172 | |||
2002 | static int | 2173 | static int |
2003 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | 2174 | print_usage_bug(struct task_struct *curr, struct held_lock *this, |
2004 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | 2175 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) |
@@ -2027,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2027 | 2198 | ||
2028 | print_irqtrace_events(curr); | 2199 | print_irqtrace_events(curr); |
2029 | printk("\nother info that might help us debug this:\n"); | 2200 | printk("\nother info that might help us debug this:\n"); |
2201 | print_usage_bug_scenario(this); | ||
2202 | |||
2030 | lockdep_print_held_locks(curr); | 2203 | lockdep_print_held_locks(curr); |
2031 | 2204 | ||
2032 | printk("\nstack backtrace:\n"); | 2205 | printk("\nstack backtrace:\n"); |
@@ -2061,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2061 | struct held_lock *this, int forwards, | 2234 | struct held_lock *this, int forwards, |
2062 | const char *irqclass) | 2235 | const char *irqclass) |
2063 | { | 2236 | { |
2237 | struct lock_list *entry = other; | ||
2238 | struct lock_list *middle = NULL; | ||
2239 | int depth; | ||
2240 | |||
2064 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2241 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
2065 | return 0; | 2242 | return 0; |
2066 | 2243 | ||
@@ -2079,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2079 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); | 2256 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); |
2080 | 2257 | ||
2081 | printk("\nother info that might help us debug this:\n"); | 2258 | printk("\nother info that might help us debug this:\n"); |
2259 | |||
2260 | /* Find a middle lock (if one exists) */ | ||
2261 | depth = get_lock_depth(other); | ||
2262 | do { | ||
2263 | if (depth == 0 && (entry != root)) { | ||
2264 | printk("lockdep:%s bad path found in chain graph\n", __func__); | ||
2265 | break; | ||
2266 | } | ||
2267 | middle = entry; | ||
2268 | entry = get_lock_parent(entry); | ||
2269 | depth--; | ||
2270 | } while (entry && entry != root && (depth >= 0)); | ||
2271 | if (forwards) | ||
2272 | print_irq_lock_scenario(root, other, | ||
2273 | middle ? middle->class : root->class, other->class); | ||
2274 | else | ||
2275 | print_irq_lock_scenario(other, root, | ||
2276 | middle ? middle->class : other->class, root->class); | ||
2277 | |||
2082 | lockdep_print_held_locks(curr); | 2278 | lockdep_print_held_locks(curr); |
2083 | 2279 | ||
2084 | printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); | 2280 | printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); |
@@ -2280,22 +2476,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) | |||
2280 | } | 2476 | } |
2281 | 2477 | ||
2282 | /* | 2478 | /* |
2283 | * Debugging helper: via this flag we know that we are in | ||
2284 | * 'early bootup code', and will warn about any invalid irqs-on event: | ||
2285 | */ | ||
2286 | static int early_boot_irqs_enabled; | ||
2287 | |||
2288 | void early_boot_irqs_off(void) | ||
2289 | { | ||
2290 | early_boot_irqs_enabled = 0; | ||
2291 | } | ||
2292 | |||
2293 | void early_boot_irqs_on(void) | ||
2294 | { | ||
2295 | early_boot_irqs_enabled = 1; | ||
2296 | } | ||
2297 | |||
2298 | /* | ||
2299 | * Hardirqs will be enabled: | 2479 | * Hardirqs will be enabled: |
2300 | */ | 2480 | */ |
2301 | void trace_hardirqs_on_caller(unsigned long ip) | 2481 | void trace_hardirqs_on_caller(unsigned long ip) |
@@ -2307,13 +2487,13 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
2307 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2487 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2308 | return; | 2488 | return; |
2309 | 2489 | ||
2310 | if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) | 2490 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) |
2311 | return; | 2491 | return; |
2312 | 2492 | ||
2313 | if (unlikely(curr->hardirqs_enabled)) { | 2493 | if (unlikely(curr->hardirqs_enabled)) { |
2314 | /* | 2494 | /* |
2315 | * Neither irq nor preemption are disabled here | 2495 | * Neither irq nor preemption are disabled here |
2316 | * so this is racy by nature but loosing one hit | 2496 | * so this is racy by nature but losing one hit |
2317 | * in a stat is not a big deal. | 2497 | * in a stat is not a big deal. |
2318 | */ | 2498 | */ |
2319 | __debug_atomic_inc(redundant_hardirqs_on); | 2499 | __debug_atomic_inc(redundant_hardirqs_on); |
@@ -2624,7 +2804,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
2624 | if (!graph_lock()) | 2804 | if (!graph_lock()) |
2625 | return 0; | 2805 | return 0; |
2626 | /* | 2806 | /* |
2627 | * Make sure we didnt race: | 2807 | * Make sure we didn't race: |
2628 | */ | 2808 | */ |
2629 | if (unlikely(hlock_class(this)->usage_mask & new_mask)) { | 2809 | if (unlikely(hlock_class(this)->usage_mask & new_mask)) { |
2630 | graph_unlock(); | 2810 | graph_unlock(); |
@@ -2679,7 +2859,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
2679 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 2859 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
2680 | struct lock_class_key *key, int subclass) | 2860 | struct lock_class_key *key, int subclass) |
2681 | { | 2861 | { |
2682 | lock->class_cache = NULL; | 2862 | int i; |
2863 | |||
2864 | for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) | ||
2865 | lock->class_cache[i] = NULL; | ||
2866 | |||
2683 | #ifdef CONFIG_LOCK_STAT | 2867 | #ifdef CONFIG_LOCK_STAT |
2684 | lock->cpu = raw_smp_processor_id(); | 2868 | lock->cpu = raw_smp_processor_id(); |
2685 | #endif | 2869 | #endif |
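lockdep_init_map() now clears one cached class per subclass (up to NR_LOCKDEP_CACHING_CLASSES) rather than a single pointer, so common nested-locking annotations also skip the class-hash lookup on later acquisitions. A small sketch of the kind of code that exercises a non-zero subclass, assuming both objects get their mutex from the same mutex_init() call site so they share a lock class (demo_* names are illustrative):

#include <linux/mutex.h>
#include <linux/lockdep.h>

struct demo_obj {
	struct mutex lock;
};

/* Lock two same-class objects in a fixed order; the second acquisition is
 * annotated with SINGLE_DEPTH_NESTING, i.e. subclass 1, which after this
 * change is served from lock->class_cache[1] instead of the hash table.
 */
static void demo_lock_pair(struct demo_obj *a, struct demo_obj *b)
{
	mutex_lock(&a->lock);
	mutex_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
}

static void demo_unlock_pair(struct demo_obj *a, struct demo_obj *b)
{
	mutex_unlock(&b->lock);
	mutex_unlock(&a->lock);
}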
@@ -2739,21 +2923,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2739 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2923 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2740 | return 0; | 2924 | return 0; |
2741 | 2925 | ||
2742 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | ||
2743 | debug_locks_off(); | ||
2744 | printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); | ||
2745 | printk("turning off the locking correctness validator.\n"); | ||
2746 | dump_stack(); | ||
2747 | return 0; | ||
2748 | } | ||
2749 | |||
2750 | if (lock->key == &__lockdep_no_validate__) | 2926 | if (lock->key == &__lockdep_no_validate__) |
2751 | check = 1; | 2927 | check = 1; |
2752 | 2928 | ||
2753 | if (!subclass) | 2929 | if (subclass < NR_LOCKDEP_CACHING_CLASSES) |
2754 | class = lock->class_cache; | 2930 | class = lock->class_cache[subclass]; |
2755 | /* | 2931 | /* |
2756 | * Not cached yet or subclass? | 2932 | * Not cached? |
2757 | */ | 2933 | */ |
2758 | if (unlikely(!class)) { | 2934 | if (unlikely(!class)) { |
2759 | class = register_lock_class(lock, subclass, 0); | 2935 | class = register_lock_class(lock, subclass, 0); |
@@ -2918,7 +3094,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) | |||
2918 | return 1; | 3094 | return 1; |
2919 | 3095 | ||
2920 | if (hlock->references) { | 3096 | if (hlock->references) { |
2921 | struct lock_class *class = lock->class_cache; | 3097 | struct lock_class *class = lock->class_cache[0]; |
2922 | 3098 | ||
2923 | if (!class) | 3099 | if (!class) |
2924 | class = look_up_lock_class(lock, 0); | 3100 | class = look_up_lock_class(lock, 0); |
@@ -3250,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock) | |||
3250 | int ret = 0; | 3426 | int ret = 0; |
3251 | 3427 | ||
3252 | if (unlikely(current->lockdep_recursion)) | 3428 | if (unlikely(current->lockdep_recursion)) |
3253 | return ret; | 3429 | return 1; /* avoid false negative lockdep_assert_held() */ |
3254 | 3430 | ||
3255 | raw_local_irq_save(flags); | 3431 | raw_local_irq_save(flags); |
3256 | check_flags(flags); | 3432 | check_flags(flags); |
@@ -3559,7 +3735,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
3559 | if (list_empty(head)) | 3735 | if (list_empty(head)) |
3560 | continue; | 3736 | continue; |
3561 | list_for_each_entry_safe(class, next, head, hash_entry) { | 3737 | list_for_each_entry_safe(class, next, head, hash_entry) { |
3562 | if (unlikely(class == lock->class_cache)) { | 3738 | int match = 0; |
3739 | |||
3740 | for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) | ||
3741 | match |= class == lock->class_cache[j]; | ||
3742 | |||
3743 | if (unlikely(match)) { | ||
3563 | if (debug_locks_off_graph_unlock()) | 3744 | if (debug_locks_off_graph_unlock()) |
3564 | WARN_ON(1); | 3745 | WARN_ON(1); |
3565 | goto out_restore; | 3746 | goto out_restore; |
@@ -3775,7 +3956,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks); | |||
3775 | * Careful: only use this function if you are sure that | 3956 | * Careful: only use this function if you are sure that |
3776 | * the task cannot run in parallel! | 3957 | * the task cannot run in parallel! |
3777 | */ | 3958 | */ |
3778 | void __debug_show_held_locks(struct task_struct *task) | 3959 | void debug_show_held_locks(struct task_struct *task) |
3779 | { | 3960 | { |
3780 | if (unlikely(!debug_locks)) { | 3961 | if (unlikely(!debug_locks)) { |
3781 | printk("INFO: lockdep is turned off.\n"); | 3962 | printk("INFO: lockdep is turned off.\n"); |
@@ -3783,12 +3964,6 @@ void __debug_show_held_locks(struct task_struct *task) | |||
3783 | } | 3964 | } |
3784 | lockdep_print_held_locks(task); | 3965 | lockdep_print_held_locks(task); |
3785 | } | 3966 | } |
3786 | EXPORT_SYMBOL_GPL(__debug_show_held_locks); | ||
3787 | |||
3788 | void debug_show_held_locks(struct task_struct *task) | ||
3789 | { | ||
3790 | __debug_show_held_locks(task); | ||
3791 | } | ||
3792 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 3967 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
3793 | 3968 | ||
3794 | void lockdep_sys_exit(void) | 3969 | void lockdep_sys_exit(void) |
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 59b76c8ce9d7..71edd2f60c02 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) | |||
225 | nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, | 225 | nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, |
226 | nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, | 226 | nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, |
227 | nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, | 227 | nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, |
228 | sum_forward_deps = 0, factor = 0; | 228 | sum_forward_deps = 0; |
229 | 229 | ||
230 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | 230 | list_for_each_entry(class, &all_lock_classes, lock_entry) { |
231 | 231 | ||
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) | |||
283 | nr_hardirq_unsafe * nr_hardirq_safe + | 283 | nr_hardirq_unsafe * nr_hardirq_safe + |
284 | nr_list_entries); | 284 | nr_list_entries); |
285 | 285 | ||
286 | /* | ||
287 | * Estimated factor between direct and indirect | ||
288 | * dependencies: | ||
289 | */ | ||
290 | if (nr_list_entries) | ||
291 | factor = sum_forward_deps / nr_list_entries; | ||
292 | |||
293 | #ifdef CONFIG_PROVE_LOCKING | 286 | #ifdef CONFIG_PROVE_LOCKING |
294 | seq_printf(m, " dependency chains: %11lu [max: %lu]\n", | 287 | seq_printf(m, " dependency chains: %11lu [max: %lu]\n", |
295 | nr_lock_chains, MAX_LOCKDEP_CHAINS); | 288 | nr_lock_chains, MAX_LOCKDEP_CHAINS); |
@@ -494,7 +487,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
494 | namelen += 2; | 487 | namelen += 2; |
495 | 488 | ||
496 | for (i = 0; i < LOCKSTAT_POINTS; i++) { | 489 | for (i = 0; i < LOCKSTAT_POINTS; i++) { |
497 | char sym[KSYM_SYMBOL_LEN]; | ||
498 | char ip[32]; | 490 | char ip[32]; |
499 | 491 | ||
500 | if (class->contention_point[i] == 0) | 492 | if (class->contention_point[i] == 0) |
@@ -503,15 +495,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
503 | if (!i) | 495 | if (!i) |
504 | seq_line(m, '-', 40-namelen, namelen); | 496 | seq_line(m, '-', 40-namelen, namelen); |
505 | 497 | ||
506 | sprint_symbol(sym, class->contention_point[i]); | ||
507 | snprintf(ip, sizeof(ip), "[<%p>]", | 498 | snprintf(ip, sizeof(ip), "[<%p>]", |
508 | (void *)class->contention_point[i]); | 499 | (void *)class->contention_point[i]); |
509 | seq_printf(m, "%40s %14lu %29s %s\n", name, | 500 | seq_printf(m, "%40s %14lu %29s %pS\n", |
510 | stats->contention_point[i], | 501 | name, stats->contention_point[i], |
511 | ip, sym); | 502 | ip, (void *)class->contention_point[i]); |
512 | } | 503 | } |
513 | for (i = 0; i < LOCKSTAT_POINTS; i++) { | 504 | for (i = 0; i < LOCKSTAT_POINTS; i++) { |
514 | char sym[KSYM_SYMBOL_LEN]; | ||
515 | char ip[32]; | 505 | char ip[32]; |
516 | 506 | ||
517 | if (class->contending_point[i] == 0) | 507 | if (class->contending_point[i] == 0) |
@@ -520,12 +510,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) | |||
520 | if (!i) | 510 | if (!i) |
521 | seq_line(m, '-', 40-namelen, namelen); | 511 | seq_line(m, '-', 40-namelen, namelen); |
522 | 512 | ||
523 | sprint_symbol(sym, class->contending_point[i]); | ||
524 | snprintf(ip, sizeof(ip), "[<%p>]", | 513 | snprintf(ip, sizeof(ip), "[<%p>]", |
525 | (void *)class->contending_point[i]); | 514 | (void *)class->contending_point[i]); |
526 | seq_printf(m, "%40s %14lu %29s %s\n", name, | 515 | seq_printf(m, "%40s %14lu %29s %pS\n", |
527 | stats->contending_point[i], | 516 | name, stats->contending_point[i], |
528 | ip, sym); | 517 | ip, (void *)class->contending_point[i]); |
529 | } | 518 | } |
530 | if (i) { | 519 | if (i) { |
531 | seq_puts(m, "\n"); | 520 | seq_puts(m, "\n"); |
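
The sprint_symbol() calls and their KSYM_SYMBOL_LEN stack buffers are gone: the %pS printk extension resolves an address to symbol+offset at format time. For example:

#include <linux/kernel.h>

static void report_contention(unsigned long addr)
{
        /* prints something like: "contention at [<ffffffff8113c0d0>] __mutex_lock_slowpath+0x30/0x90" */
        pr_info("contention at [<%p>] %pS\n", (void *)addr, (void *)addr);
}
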
diff --git a/kernel/module.c b/kernel/module.c index ccd641991842..795bdc7f5c3f 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -55,6 +55,9 @@ | |||
55 | #include <linux/async.h> | 55 | #include <linux/async.h> |
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/kmemleak.h> | 57 | #include <linux/kmemleak.h> |
58 | #include <linux/jump_label.h> | ||
59 | #include <linux/pfn.h> | ||
60 | #include <linux/bsearch.h> | ||
58 | 61 | ||
59 | #define CREATE_TRACE_POINTS | 62 | #define CREATE_TRACE_POINTS |
60 | #include <trace/events/module.h> | 63 | #include <trace/events/module.h> |
@@ -69,6 +72,26 @@ | |||
69 | #define ARCH_SHF_SMALL 0 | 72 | #define ARCH_SHF_SMALL 0 |
70 | #endif | 73 | #endif |
71 | 74 | ||
75 | /* | ||
76 | * Modules' sections will be aligned on page boundaries | ||
77 | * to ensure complete separation of code and data, but | ||
78 | * only when CONFIG_DEBUG_SET_MODULE_RONX=y | ||
79 | */ | ||
80 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX | ||
81 | # define debug_align(X) ALIGN(X, PAGE_SIZE) | ||
82 | #else | ||
83 | # define debug_align(X) (X) | ||
84 | #endif | ||
85 | |||
86 | /* | ||
87 | * Given BASE and SIZE this macro calculates the number of pages the | ||
88 | * memory regions occupies | ||
89 | */ | ||
90 | #define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ | ||
91 | (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ | ||
92 | PFN_DOWN((unsigned long)BASE) + 1) \ | ||
93 | : (0UL)) | ||
94 | |||
72 | /* If this is set, the section belongs in the init part of the module */ | 95 | /* If this is set, the section belongs in the init part of the module */ |
73 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) | 96 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) |
74 | 97 | ||
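
debug_align() pads a section size up to a page boundary only when CONFIG_DEBUG_SET_MODULE_RONX is set, so text, read-only data and writable data never share a page and can be given different protections; MOD_NUMBER_OF_PAGES() counts how many pages a region actually touches. A worked example, assuming PAGE_SIZE is 4096:

/*
 * debug_align(5000)  == 8192   (rounded up to the next page boundary)
 * debug_align(4096)  == 4096   (already aligned, unchanged)
 *
 * MOD_NUMBER_OF_PAGES(base, size):
 *   base = 0x1000, size = 4096  ->  1 page  (exactly one aligned page)
 *   base = 0x1800, size = 4096  ->  2 pages (region straddles a page boundary)
 *   base = anything, size = 0   ->  0 pages
 */
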
@@ -218,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr, | |||
218 | struct module *owner, | 241 | struct module *owner, |
219 | bool (*fn)(const struct symsearch *syms, | 242 | bool (*fn)(const struct symsearch *syms, |
220 | struct module *owner, | 243 | struct module *owner, |
221 | unsigned int symnum, void *data), | 244 | void *data), |
222 | void *data) | 245 | void *data) |
223 | { | 246 | { |
224 | unsigned int i, j; | 247 | unsigned int j; |
225 | 248 | ||
226 | for (j = 0; j < arrsize; j++) { | 249 | for (j = 0; j < arrsize; j++) { |
227 | for (i = 0; i < arr[j].stop - arr[j].start; i++) | 250 | if (fn(&arr[j], owner, data)) |
228 | if (fn(&arr[j], owner, i, data)) | 251 | return true; |
229 | return true; | ||
230 | } | 252 | } |
231 | 253 | ||
232 | return false; | 254 | return false; |
233 | } | 255 | } |
234 | 256 | ||
235 | /* Returns true as soon as fn returns true, otherwise false. */ | 257 | /* Returns true as soon as fn returns true, otherwise false. */ |
236 | bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, | 258 | bool each_symbol_section(bool (*fn)(const struct symsearch *arr, |
237 | unsigned int symnum, void *data), void *data) | 259 | struct module *owner, |
260 | void *data), | ||
261 | void *data) | ||
238 | { | 262 | { |
239 | struct module *mod; | 263 | struct module *mod; |
240 | static const struct symsearch arr[] = { | 264 | static const struct symsearch arr[] = { |
@@ -287,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, | |||
287 | } | 311 | } |
288 | return false; | 312 | return false; |
289 | } | 313 | } |
290 | EXPORT_SYMBOL_GPL(each_symbol); | 314 | EXPORT_SYMBOL_GPL(each_symbol_section); |
291 | 315 | ||
292 | struct find_symbol_arg { | 316 | struct find_symbol_arg { |
293 | /* Input */ | 317 | /* Input */ |
@@ -301,15 +325,12 @@ struct find_symbol_arg { | |||
301 | const struct kernel_symbol *sym; | 325 | const struct kernel_symbol *sym; |
302 | }; | 326 | }; |
303 | 327 | ||
304 | static bool find_symbol_in_section(const struct symsearch *syms, | 328 | static bool check_symbol(const struct symsearch *syms, |
305 | struct module *owner, | 329 | struct module *owner, |
306 | unsigned int symnum, void *data) | 330 | unsigned int symnum, void *data) |
307 | { | 331 | { |
308 | struct find_symbol_arg *fsa = data; | 332 | struct find_symbol_arg *fsa = data; |
309 | 333 | ||
310 | if (strcmp(syms->start[symnum].name, fsa->name) != 0) | ||
311 | return false; | ||
312 | |||
313 | if (!fsa->gplok) { | 334 | if (!fsa->gplok) { |
314 | if (syms->licence == GPL_ONLY) | 335 | if (syms->licence == GPL_ONLY) |
315 | return false; | 336 | return false; |
@@ -343,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms, | |||
343 | return true; | 364 | return true; |
344 | } | 365 | } |
345 | 366 | ||
367 | static int cmp_name(const void *va, const void *vb) | ||
368 | { | ||
369 | const char *a; | ||
370 | const struct kernel_symbol *b; | ||
371 | a = va; b = vb; | ||
372 | return strcmp(a, b->name); | ||
373 | } | ||
374 | |||
375 | static bool find_symbol_in_section(const struct symsearch *syms, | ||
376 | struct module *owner, | ||
377 | void *data) | ||
378 | { | ||
379 | struct find_symbol_arg *fsa = data; | ||
380 | struct kernel_symbol *sym; | ||
381 | |||
382 | sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, | ||
383 | sizeof(struct kernel_symbol), cmp_name); | ||
384 | |||
385 | if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data)) | ||
386 | return true; | ||
387 | |||
388 | return false; | ||
389 | } | ||
390 | |||
346 | /* Find a symbol and return it, along with, (optional) crc and | 391 | /* Find a symbol and return it, along with, (optional) crc and |
347 | * (optional) module which owns it. Needs preempt disabled or module_mutex. */ | 392 | * (optional) module which owns it. Needs preempt disabled or module_mutex. */ |
348 | const struct kernel_symbol *find_symbol(const char *name, | 393 | const struct kernel_symbol *find_symbol(const char *name, |
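
With each_symbol_section() handing whole sections to the callback, find_symbol_in_section() can binary-search the exported-symbol table via bsearch() and cmp_name() instead of comparing every entry; this assumes the __ksymtab sections are sorted by name at build time. The same string-key-versus-struct-element comparator pattern, as a standalone userspace illustration of the bsearch() calling convention (libc's bsearch matches the kernel's lib/bsearch.c here):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sym { const char *name; unsigned long addr; };

/* key is a bare string, each element is a struct sym, same shape as cmp_name() above */
static int cmp_name(const void *key, const void *elem)
{
        return strcmp(key, ((const struct sym *)elem)->name);
}

int main(void)
{
        /* the table must already be sorted by name for bsearch() to be valid */
        struct sym tab[] = {
                { "bar", 0x1000 }, { "baz", 0x2000 }, { "foo", 0x3000 },
        };
        struct sym *s = bsearch("foo", tab, sizeof(tab) / sizeof(tab[0]),
                                sizeof(tab[0]), cmp_name);

        if (s)
                printf("%s @ %#lx\n", s->name, s->addr);
        return 0;
}
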
@@ -357,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name, | |||
357 | fsa.gplok = gplok; | 402 | fsa.gplok = gplok; |
358 | fsa.warn = warn; | 403 | fsa.warn = warn; |
359 | 404 | ||
360 | if (each_symbol(find_symbol_in_section, &fsa)) { | 405 | if (each_symbol_section(find_symbol_in_section, &fsa)) { |
361 | if (owner) | 406 | if (owner) |
362 | *owner = fsa.owner; | 407 | *owner = fsa.owner; |
363 | if (crc) | 408 | if (crc) |
@@ -787,7 +832,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
787 | wait_for_zero_refcount(mod); | 832 | wait_for_zero_refcount(mod); |
788 | 833 | ||
789 | mutex_unlock(&module_mutex); | 834 | mutex_unlock(&module_mutex); |
790 | /* Final destruction now noone is using it. */ | 835 | /* Final destruction now no one is using it. */ |
791 | if (mod->exit != NULL) | 836 | if (mod->exit != NULL) |
792 | mod->exit(); | 837 | mod->exit(); |
793 | blocking_notifier_call_chain(&module_notify_list, | 838 | blocking_notifier_call_chain(&module_notify_list, |
@@ -1146,7 +1191,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, | |||
1146 | { | 1191 | { |
1147 | struct module_sect_attr *sattr = | 1192 | struct module_sect_attr *sattr = |
1148 | container_of(mattr, struct module_sect_attr, mattr); | 1193 | container_of(mattr, struct module_sect_attr, mattr); |
1149 | return sprintf(buf, "0x%lx\n", sattr->address); | 1194 | return sprintf(buf, "0x%pK\n", (void *)sattr->address); |
1150 | } | 1195 | } |
1151 | 1196 | ||
1152 | static void free_sect_attrs(struct module_sect_attrs *sect_attrs) | 1197 | static void free_sect_attrs(struct module_sect_attrs *sect_attrs) |
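
%pK differs from %p in that it honours the kptr_restrict sysctl, so the section addresses exported through sysfs (and, further below, the core address in /proc/modules) stop leaking the module layout to unprivileged users. Sketch of the behaviour, reusing 'mod' from the surrounding code:

/*
 * sysctl kernel.kptr_restrict decides what %pK prints:
 *   0 - behaves like %p (real address)
 *   1 - real address only for sufficiently privileged readers, zeros otherwise
 *   2 - always zeros
 */
pr_info("module core at 0x%pK\n", mod->module_core);
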
@@ -1541,6 +1586,117 @@ static int __unlink_module(void *_mod) | |||
1541 | return 0; | 1586 | return 0; |
1542 | } | 1587 | } |
1543 | 1588 | ||
1589 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX | ||
1590 | /* | ||
1591 | * LKM RO/NX protection: protect module's text/ro-data | ||
1592 | * from modification and any data from execution. | ||
1593 | */ | ||
1594 | void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) | ||
1595 | { | ||
1596 | unsigned long begin_pfn = PFN_DOWN((unsigned long)start); | ||
1597 | unsigned long end_pfn = PFN_DOWN((unsigned long)end); | ||
1598 | |||
1599 | if (end_pfn > begin_pfn) | ||
1600 | set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); | ||
1601 | } | ||
1602 | |||
1603 | static void set_section_ro_nx(void *base, | ||
1604 | unsigned long text_size, | ||
1605 | unsigned long ro_size, | ||
1606 | unsigned long total_size) | ||
1607 | { | ||
1608 | /* begin and end PFNs of the current subsection */ | ||
1609 | unsigned long begin_pfn; | ||
1610 | unsigned long end_pfn; | ||
1611 | |||
1612 | /* | ||
1613 | * Set RO for module text and RO-data: | ||
1614 | * - Always protect first page. | ||
1615 | * - Do not protect last partial page. | ||
1616 | */ | ||
1617 | if (ro_size > 0) | ||
1618 | set_page_attributes(base, base + ro_size, set_memory_ro); | ||
1619 | |||
1620 | /* | ||
1621 | * Set NX permissions for module data: | ||
1622 | * - Do not protect first partial page. | ||
1623 | * - Always protect last page. | ||
1624 | */ | ||
1625 | if (total_size > text_size) { | ||
1626 | begin_pfn = PFN_UP((unsigned long)base + text_size); | ||
1627 | end_pfn = PFN_UP((unsigned long)base + total_size); | ||
1628 | if (end_pfn > begin_pfn) | ||
1629 | set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); | ||
1630 | } | ||
1631 | } | ||
1632 | |||
1633 | static void unset_module_core_ro_nx(struct module *mod) | ||
1634 | { | ||
1635 | set_page_attributes(mod->module_core + mod->core_text_size, | ||
1636 | mod->module_core + mod->core_size, | ||
1637 | set_memory_x); | ||
1638 | set_page_attributes(mod->module_core, | ||
1639 | mod->module_core + mod->core_ro_size, | ||
1640 | set_memory_rw); | ||
1641 | } | ||
1642 | |||
1643 | static void unset_module_init_ro_nx(struct module *mod) | ||
1644 | { | ||
1645 | set_page_attributes(mod->module_init + mod->init_text_size, | ||
1646 | mod->module_init + mod->init_size, | ||
1647 | set_memory_x); | ||
1648 | set_page_attributes(mod->module_init, | ||
1649 | mod->module_init + mod->init_ro_size, | ||
1650 | set_memory_rw); | ||
1651 | } | ||
1652 | |||
1653 | /* Iterate through all modules and set each module's text as RW */ | ||
1654 | void set_all_modules_text_rw(void) | ||
1655 | { | ||
1656 | struct module *mod; | ||
1657 | |||
1658 | mutex_lock(&module_mutex); | ||
1659 | list_for_each_entry_rcu(mod, &modules, list) { | ||
1660 | if ((mod->module_core) && (mod->core_text_size)) { | ||
1661 | set_page_attributes(mod->module_core, | ||
1662 | mod->module_core + mod->core_text_size, | ||
1663 | set_memory_rw); | ||
1664 | } | ||
1665 | if ((mod->module_init) && (mod->init_text_size)) { | ||
1666 | set_page_attributes(mod->module_init, | ||
1667 | mod->module_init + mod->init_text_size, | ||
1668 | set_memory_rw); | ||
1669 | } | ||
1670 | } | ||
1671 | mutex_unlock(&module_mutex); | ||
1672 | } | ||
1673 | |||
1674 | /* Iterate through all modules and set each module's text as RO */ | ||
1675 | void set_all_modules_text_ro(void) | ||
1676 | { | ||
1677 | struct module *mod; | ||
1678 | |||
1679 | mutex_lock(&module_mutex); | ||
1680 | list_for_each_entry_rcu(mod, &modules, list) { | ||
1681 | if ((mod->module_core) && (mod->core_text_size)) { | ||
1682 | set_page_attributes(mod->module_core, | ||
1683 | mod->module_core + mod->core_text_size, | ||
1684 | set_memory_ro); | ||
1685 | } | ||
1686 | if ((mod->module_init) && (mod->init_text_size)) { | ||
1687 | set_page_attributes(mod->module_init, | ||
1688 | mod->module_init + mod->init_text_size, | ||
1689 | set_memory_ro); | ||
1690 | } | ||
1691 | } | ||
1692 | mutex_unlock(&module_mutex); | ||
1693 | } | ||
1694 | #else | ||
1695 | static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } | ||
1696 | static void unset_module_core_ro_nx(struct module *mod) { } | ||
1697 | static void unset_module_init_ro_nx(struct module *mod) { } | ||
1698 | #endif | ||
1699 | |||
1544 | /* Free a module, remove from lists, etc. */ | 1700 | /* Free a module, remove from lists, etc. */ |
1545 | static void free_module(struct module *mod) | 1701 | static void free_module(struct module *mod) |
1546 | { | 1702 | { |
@@ -1565,6 +1721,7 @@ static void free_module(struct module *mod) | |||
1565 | destroy_params(mod->kp, mod->num_kp); | 1721 | destroy_params(mod->kp, mod->num_kp); |
1566 | 1722 | ||
1567 | /* This may be NULL, but that's OK */ | 1723 | /* This may be NULL, but that's OK */ |
1724 | unset_module_init_ro_nx(mod); | ||
1568 | module_free(mod, mod->module_init); | 1725 | module_free(mod, mod->module_init); |
1569 | kfree(mod->args); | 1726 | kfree(mod->args); |
1570 | percpu_modfree(mod); | 1727 | percpu_modfree(mod); |
@@ -1573,6 +1730,7 @@ static void free_module(struct module *mod) | |||
1573 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1730 | lockdep_free_key_range(mod->module_core, mod->core_size); |
1574 | 1731 | ||
1575 | /* Finally, free the core (containing the module structure) */ | 1732 | /* Finally, free the core (containing the module structure) */ |
1733 | unset_module_core_ro_nx(mod); | ||
1576 | module_free(mod, mod->module_core); | 1734 | module_free(mod, mod->module_core); |
1577 | 1735 | ||
1578 | #ifdef CONFIG_MPU | 1736 | #ifdef CONFIG_MPU |
@@ -1776,8 +1934,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1776 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); | 1934 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); |
1777 | DEBUGP("\t%s\n", name); | 1935 | DEBUGP("\t%s\n", name); |
1778 | } | 1936 | } |
1779 | if (m == 0) | 1937 | switch (m) { |
1938 | case 0: /* executable */ | ||
1939 | mod->core_size = debug_align(mod->core_size); | ||
1780 | mod->core_text_size = mod->core_size; | 1940 | mod->core_text_size = mod->core_size; |
1941 | break; | ||
1942 | case 1: /* RO: text and ro-data */ | ||
1943 | mod->core_size = debug_align(mod->core_size); | ||
1944 | mod->core_ro_size = mod->core_size; | ||
1945 | break; | ||
1946 | case 3: /* whole core */ | ||
1947 | mod->core_size = debug_align(mod->core_size); | ||
1948 | break; | ||
1949 | } | ||
1781 | } | 1950 | } |
1782 | 1951 | ||
1783 | DEBUGP("Init section allocation order:\n"); | 1952 | DEBUGP("Init section allocation order:\n"); |
@@ -1795,8 +1964,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1795 | | INIT_OFFSET_MASK); | 1964 | | INIT_OFFSET_MASK); |
1796 | DEBUGP("\t%s\n", sname); | 1965 | DEBUGP("\t%s\n", sname); |
1797 | } | 1966 | } |
1798 | if (m == 0) | 1967 | switch (m) { |
1968 | case 0: /* executable */ | ||
1969 | mod->init_size = debug_align(mod->init_size); | ||
1799 | mod->init_text_size = mod->init_size; | 1970 | mod->init_text_size = mod->init_size; |
1971 | break; | ||
1972 | case 1: /* RO: text and ro-data */ | ||
1973 | mod->init_size = debug_align(mod->init_size); | ||
1974 | mod->init_ro_size = mod->init_size; | ||
1975 | break; | ||
1976 | case 3: /* whole init */ | ||
1977 | mod->init_size = debug_align(mod->init_size); | ||
1978 | break; | ||
1979 | } | ||
1800 | } | 1980 | } |
1801 | } | 1981 | } |
1802 | 1982 | ||
@@ -1875,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name, | |||
1875 | const struct kernel_symbol *start, | 2055 | const struct kernel_symbol *start, |
1876 | const struct kernel_symbol *stop) | 2056 | const struct kernel_symbol *stop) |
1877 | { | 2057 | { |
1878 | const struct kernel_symbol *ks = start; | 2058 | return bsearch(name, start, stop - start, |
1879 | for (; ks < stop; ks++) | 2059 | sizeof(struct kernel_symbol), cmp_name); |
1880 | if (strcmp(ks->name, name) == 0) | ||
1881 | return ks; | ||
1882 | return NULL; | ||
1883 | } | 2060 | } |
1884 | 2061 | ||
1885 | static int is_exported(const char *name, unsigned long value, | 2062 | static int is_exported(const char *name, unsigned long value, |
@@ -2036,7 +2213,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info) | |||
2036 | { | 2213 | { |
2037 | } | 2214 | } |
2038 | 2215 | ||
2039 | static void add_kallsyms(struct module *mod, struct load_info *info) | 2216 | static void add_kallsyms(struct module *mod, const struct load_info *info) |
2040 | { | 2217 | { |
2041 | } | 2218 | } |
2042 | #endif /* CONFIG_KALLSYMS */ | 2219 | #endif /* CONFIG_KALLSYMS */ |
@@ -2305,9 +2482,14 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
2305 | #endif | 2482 | #endif |
2306 | 2483 | ||
2307 | #ifdef CONFIG_TRACEPOINTS | 2484 | #ifdef CONFIG_TRACEPOINTS |
2308 | mod->tracepoints = section_objs(info, "__tracepoints", | 2485 | mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", |
2309 | sizeof(*mod->tracepoints), | 2486 | sizeof(*mod->tracepoints_ptrs), |
2310 | &mod->num_tracepoints); | 2487 | &mod->num_tracepoints); |
2488 | #endif | ||
2489 | #ifdef HAVE_JUMP_LABEL | ||
2490 | mod->jump_entries = section_objs(info, "__jump_table", | ||
2491 | sizeof(*mod->jump_entries), | ||
2492 | &mod->num_jump_entries); | ||
2311 | #endif | 2493 | #endif |
2312 | #ifdef CONFIG_EVENT_TRACING | 2494 | #ifdef CONFIG_EVENT_TRACING |
2313 | mod->trace_events = section_objs(info, "_ftrace_events", | 2495 | mod->trace_events = section_objs(info, "_ftrace_events", |
@@ -2320,6 +2502,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
2320 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * | 2502 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * |
2321 | mod->num_trace_events, GFP_KERNEL); | 2503 | mod->num_trace_events, GFP_KERNEL); |
2322 | #endif | 2504 | #endif |
2505 | #ifdef CONFIG_TRACING | ||
2506 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | ||
2507 | sizeof(*mod->trace_bprintk_fmt_start), | ||
2508 | &mod->num_trace_bprintk_fmt); | ||
2509 | /* | ||
2510 | * This section contains pointers to allocated objects in the trace | ||
2511 | * code and not scanning it leads to false positives. | ||
2512 | */ | ||
2513 | kmemleak_scan_area(mod->trace_bprintk_fmt_start, | ||
2514 | sizeof(*mod->trace_bprintk_fmt_start) * | ||
2515 | mod->num_trace_bprintk_fmt, GFP_KERNEL); | ||
2516 | #endif | ||
2323 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD | 2517 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD |
2324 | /* sechdrs[0].sh_size is always zero */ | 2518 | /* sechdrs[0].sh_size is always zero */ |
2325 | mod->ftrace_callsites = section_objs(info, "__mcount_loc", | 2519 | mod->ftrace_callsites = section_objs(info, "__mcount_loc", |
@@ -2605,7 +2799,7 @@ static struct module *load_module(void __user *umod, | |||
2605 | mod->state = MODULE_STATE_COMING; | 2799 | mod->state = MODULE_STATE_COMING; |
2606 | 2800 | ||
2607 | /* Now sew it into the lists so we can get lockdep and oops | 2801 | /* Now sew it into the lists so we can get lockdep and oops |
2608 | * info during argument parsing. Noone should access us, since | 2802 | * info during argument parsing. No one should access us, since |
2609 | * strong_try_module_get() will fail. | 2803 | * strong_try_module_get() will fail. |
2610 | * lockdep/oops can run asynchronous, so use the RCU list insertion | 2804 | * lockdep/oops can run asynchronous, so use the RCU list insertion |
2611 | * function to insert in a way safe to concurrent readers. | 2805 | * function to insert in a way safe to concurrent readers. |
@@ -2618,7 +2812,7 @@ static struct module *load_module(void __user *umod, | |||
2618 | } | 2812 | } |
2619 | 2813 | ||
2620 | /* This has to be done once we're sure module name is unique. */ | 2814 | /* This has to be done once we're sure module name is unique. */ |
2621 | if (!mod->taints) | 2815 | if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) |
2622 | dynamic_debug_setup(info.debug, info.num_debug); | 2816 | dynamic_debug_setup(info.debug, info.num_debug); |
2623 | 2817 | ||
2624 | /* Find duplicate symbols */ | 2818 | /* Find duplicate symbols */ |
@@ -2655,7 +2849,7 @@ static struct module *load_module(void __user *umod, | |||
2655 | module_bug_cleanup(mod); | 2849 | module_bug_cleanup(mod); |
2656 | 2850 | ||
2657 | ddebug: | 2851 | ddebug: |
2658 | if (!mod->taints) | 2852 | if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) |
2659 | dynamic_debug_remove(info.debug); | 2853 | dynamic_debug_remove(info.debug); |
2660 | unlock: | 2854 | unlock: |
2661 | mutex_unlock(&module_mutex); | 2855 | mutex_unlock(&module_mutex); |
@@ -2704,6 +2898,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
2704 | blocking_notifier_call_chain(&module_notify_list, | 2898 | blocking_notifier_call_chain(&module_notify_list, |
2705 | MODULE_STATE_COMING, mod); | 2899 | MODULE_STATE_COMING, mod); |
2706 | 2900 | ||
2901 | /* Set RO and NX regions for core */ | ||
2902 | set_section_ro_nx(mod->module_core, | ||
2903 | mod->core_text_size, | ||
2904 | mod->core_ro_size, | ||
2905 | mod->core_size); | ||
2906 | |||
2907 | /* Set RO and NX regions for init */ | ||
2908 | set_section_ro_nx(mod->module_init, | ||
2909 | mod->init_text_size, | ||
2910 | mod->init_ro_size, | ||
2911 | mod->init_size); | ||
2912 | |||
2707 | do_mod_ctors(mod); | 2913 | do_mod_ctors(mod); |
2708 | /* Start the module */ | 2914 | /* Start the module */ |
2709 | if (mod->init != NULL) | 2915 | if (mod->init != NULL) |
@@ -2747,9 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
2747 | mod->symtab = mod->core_symtab; | 2953 | mod->symtab = mod->core_symtab; |
2748 | mod->strtab = mod->core_strtab; | 2954 | mod->strtab = mod->core_strtab; |
2749 | #endif | 2955 | #endif |
2956 | unset_module_init_ro_nx(mod); | ||
2750 | module_free(mod, mod->module_init); | 2957 | module_free(mod, mod->module_init); |
2751 | mod->module_init = NULL; | 2958 | mod->module_init = NULL; |
2752 | mod->init_size = 0; | 2959 | mod->init_size = 0; |
2960 | mod->init_ro_size = 0; | ||
2753 | mod->init_text_size = 0; | 2961 | mod->init_text_size = 0; |
2754 | mutex_unlock(&module_mutex); | 2962 | mutex_unlock(&module_mutex); |
2755 | 2963 | ||
@@ -2786,7 +2994,7 @@ static const char *get_ksymbol(struct module *mod, | |||
2786 | else | 2994 | else |
2787 | nextval = (unsigned long)mod->module_core+mod->core_text_size; | 2995 | nextval = (unsigned long)mod->module_core+mod->core_text_size; |
2788 | 2996 | ||
2789 | /* Scan for closest preceeding symbol, and next symbol. (ELF | 2997 | /* Scan for closest preceding symbol, and next symbol. (ELF |
2790 | starts real symbols at 1). */ | 2998 | starts real symbols at 1). */ |
2791 | for (i = 1; i < mod->num_symtab; i++) { | 2999 | for (i = 1; i < mod->num_symtab; i++) { |
2792 | if (mod->symtab[i].st_shndx == SHN_UNDEF) | 3000 | if (mod->symtab[i].st_shndx == SHN_UNDEF) |
@@ -3039,7 +3247,7 @@ static int m_show(struct seq_file *m, void *p) | |||
3039 | mod->state == MODULE_STATE_COMING ? "Loading": | 3247 | mod->state == MODULE_STATE_COMING ? "Loading": |
3040 | "Live"); | 3248 | "Live"); |
3041 | /* Used by oprofile and other similar tools. */ | 3249 | /* Used by oprofile and other similar tools. */ |
3042 | seq_printf(m, " 0x%p", mod->module_core); | 3250 | seq_printf(m, " 0x%pK", mod->module_core); |
3043 | 3251 | ||
3044 | /* Taints info */ | 3252 | /* Taints info */ |
3045 | if (mod->taints) | 3253 | if (mod->taints) |
@@ -3208,7 +3416,7 @@ void module_layout(struct module *mod, | |||
3208 | struct modversion_info *ver, | 3416 | struct modversion_info *ver, |
3209 | struct kernel_param *kp, | 3417 | struct kernel_param *kp, |
3210 | struct kernel_symbol *ks, | 3418 | struct kernel_symbol *ks, |
3211 | struct tracepoint *tp) | 3419 | struct tracepoint * const *tp) |
3212 | { | 3420 | { |
3213 | } | 3421 | } |
3214 | EXPORT_SYMBOL(module_layout); | 3422 | EXPORT_SYMBOL(module_layout); |
@@ -3222,8 +3430,8 @@ void module_update_tracepoints(void) | |||
3222 | mutex_lock(&module_mutex); | 3430 | mutex_lock(&module_mutex); |
3223 | list_for_each_entry(mod, &modules, list) | 3431 | list_for_each_entry(mod, &modules, list) |
3224 | if (!mod->taints) | 3432 | if (!mod->taints) |
3225 | tracepoint_update_probe_range(mod->tracepoints, | 3433 | tracepoint_update_probe_range(mod->tracepoints_ptrs, |
3226 | mod->tracepoints + mod->num_tracepoints); | 3434 | mod->tracepoints_ptrs + mod->num_tracepoints); |
3227 | mutex_unlock(&module_mutex); | 3435 | mutex_unlock(&module_mutex); |
3228 | } | 3436 | } |
3229 | 3437 | ||
@@ -3247,8 +3455,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter) | |||
3247 | else if (iter_mod > iter->module) | 3455 | else if (iter_mod > iter->module) |
3248 | iter->tracepoint = NULL; | 3456 | iter->tracepoint = NULL; |
3249 | found = tracepoint_get_iter_range(&iter->tracepoint, | 3457 | found = tracepoint_get_iter_range(&iter->tracepoint, |
3250 | iter_mod->tracepoints, | 3458 | iter_mod->tracepoints_ptrs, |
3251 | iter_mod->tracepoints | 3459 | iter_mod->tracepoints_ptrs |
3252 | + iter_mod->num_tracepoints); | 3460 | + iter_mod->num_tracepoints); |
3253 | if (found) { | 3461 | if (found) { |
3254 | iter->module = iter_mod; | 3462 | iter->module = iter_mod; |
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a960b5d..73da83aff418 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) | |||
75 | return; | 75 | return; |
76 | 76 | ||
77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); | 77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
78 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); | 78 | DEBUG_LOCKS_WARN_ON(lock->owner != current); |
79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
80 | mutex_clear_owner(lock); | 80 | mutex_clear_owner(lock); |
81 | } | 81 | } |
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a16f9d..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h | |||
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, | |||
29 | 29 | ||
30 | static inline void mutex_set_owner(struct mutex *lock) | 30 | static inline void mutex_set_owner(struct mutex *lock) |
31 | { | 31 | { |
32 | lock->owner = current_thread_info(); | 32 | lock->owner = current; |
33 | } | 33 | } |
34 | 34 | ||
35 | static inline void mutex_clear_owner(struct mutex *lock) | 35 | static inline void mutex_clear_owner(struct mutex *lock) |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 200407c1502f..d607ed5dd441 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock); | |||
131 | */ | 131 | */ |
132 | static inline int __sched | 132 | static inline int __sched |
133 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | 133 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, |
134 | unsigned long ip) | 134 | struct lockdep_map *nest_lock, unsigned long ip) |
135 | { | 135 | { |
136 | struct task_struct *task = current; | 136 | struct task_struct *task = current; |
137 | struct mutex_waiter waiter; | 137 | struct mutex_waiter waiter; |
138 | unsigned long flags; | 138 | unsigned long flags; |
139 | 139 | ||
140 | preempt_disable(); | 140 | preempt_disable(); |
141 | mutex_acquire(&lock->dep_map, subclass, 0, ip); | 141 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); |
142 | 142 | ||
143 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 143 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
144 | /* | 144 | /* |
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
160 | */ | 160 | */ |
161 | 161 | ||
162 | for (;;) { | 162 | for (;;) { |
163 | struct thread_info *owner; | 163 | struct task_struct *owner; |
164 | |||
165 | /* | ||
166 | * If we own the BKL, then don't spin. The owner of | ||
167 | * the mutex might be waiting on us to release the BKL. | ||
168 | */ | ||
169 | if (unlikely(current->lock_depth >= 0)) | ||
170 | break; | ||
171 | 164 | ||
172 | /* | 165 | /* |
173 | * If there's an owner, wait for it to either | 166 | * If there's an owner, wait for it to either |
@@ -199,7 +192,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
199 | * memory barriers as we'll eventually observe the right | 192 | * memory barriers as we'll eventually observe the right |
200 | * values at the cost of a few extra spins. | 193 | * values at the cost of a few extra spins. |
201 | */ | 194 | */ |
202 | cpu_relax(); | 195 | arch_mutex_cpu_relax(); |
203 | } | 196 | } |
204 | #endif | 197 | #endif |
205 | spin_lock_mutex(&lock->wait_lock, flags); | 198 | spin_lock_mutex(&lock->wait_lock, flags); |
@@ -245,7 +238,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
245 | } | 238 | } |
246 | __set_task_state(task, state); | 239 | __set_task_state(task, state); |
247 | 240 | ||
248 | /* didnt get the lock, go to sleep: */ | 241 | /* didn't get the lock, go to sleep: */ |
249 | spin_unlock_mutex(&lock->wait_lock, flags); | 242 | spin_unlock_mutex(&lock->wait_lock, flags); |
250 | preempt_enable_no_resched(); | 243 | preempt_enable_no_resched(); |
251 | schedule(); | 244 | schedule(); |
@@ -276,16 +269,25 @@ void __sched | |||
276 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) | 269 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) |
277 | { | 270 | { |
278 | might_sleep(); | 271 | might_sleep(); |
279 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); | 272 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); |
280 | } | 273 | } |
281 | 274 | ||
282 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 275 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
283 | 276 | ||
277 | void __sched | ||
278 | _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) | ||
279 | { | ||
280 | might_sleep(); | ||
281 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); | ||
282 | } | ||
283 | |||
284 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); | ||
285 | |||
284 | int __sched | 286 | int __sched |
285 | mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) | 287 | mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) |
286 | { | 288 | { |
287 | might_sleep(); | 289 | might_sleep(); |
288 | return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); | 290 | return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); |
289 | } | 291 | } |
290 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); | 292 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); |
291 | 293 | ||
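
mutex_acquire_nest() lets a caller take an arbitrary number of mutexes of the same lock class without burning a lockdep subclass per instance, provided an outer lock that serializes them is annotated as the nest lock. Roughly, with hypothetical parent/child structures (mutex_lock_nest_lock() being the lockdep-enabled wrapper around the _mutex_lock_nest_lock() added here):

mutex_lock(&parent->mutex);                     /* serializes access to all children */
list_for_each_entry(child, &parent->children, node)
        mutex_lock_nest_lock(&child->mutex, &parent->mutex);
/* ... operate on the whole set ... */
list_for_each_entry(child, &parent->children, node)
        mutex_unlock(&child->mutex);
mutex_unlock(&parent->mutex);
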
@@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | |||
294 | { | 296 | { |
295 | might_sleep(); | 297 | might_sleep(); |
296 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, | 298 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, |
297 | subclass, _RET_IP_); | 299 | subclass, NULL, _RET_IP_); |
298 | } | 300 | } |
299 | 301 | ||
300 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | 302 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); |
@@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count) | |||
400 | { | 402 | { |
401 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 403 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
402 | 404 | ||
403 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); | 405 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); |
404 | } | 406 | } |
405 | 407 | ||
406 | static noinline int __sched | 408 | static noinline int __sched |
@@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count) | |||
408 | { | 410 | { |
409 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 411 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
410 | 412 | ||
411 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); | 413 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); |
412 | } | 414 | } |
413 | 415 | ||
414 | static noinline int __sched | 416 | static noinline int __sched |
@@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count) | |||
416 | { | 418 | { |
417 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 419 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
418 | 420 | ||
419 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); | 421 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); |
420 | } | 422 | } |
421 | #endif | 423 | #endif |
422 | 424 | ||
diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca48f94..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h | |||
@@ -19,7 +19,7 @@ | |||
19 | #ifdef CONFIG_SMP | 19 | #ifdef CONFIG_SMP |
20 | static inline void mutex_set_owner(struct mutex *lock) | 20 | static inline void mutex_set_owner(struct mutex *lock) |
21 | { | 21 | { |
22 | lock->owner = current_thread_info(); | 22 | lock->owner = current; |
23 | } | 23 | } |
24 | 24 | ||
25 | static inline void mutex_clear_owner(struct mutex *lock) | 25 | static inline void mutex_clear_owner(struct mutex *lock) |
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c deleted file mode 100644 index 2a5dfec8efe0..000000000000 --- a/kernel/ns_cgroup.c +++ /dev/null | |||
@@ -1,110 +0,0 @@ | |||
1 | /* | ||
2 | * ns_cgroup.c - namespace cgroup subsystem | ||
3 | * | ||
4 | * Copyright 2006, 2007 IBM Corp | ||
5 | */ | ||
6 | |||
7 | #include <linux/module.h> | ||
8 | #include <linux/cgroup.h> | ||
9 | #include <linux/fs.h> | ||
10 | #include <linux/proc_fs.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/nsproxy.h> | ||
13 | |||
14 | struct ns_cgroup { | ||
15 | struct cgroup_subsys_state css; | ||
16 | }; | ||
17 | |||
18 | struct cgroup_subsys ns_subsys; | ||
19 | |||
20 | static inline struct ns_cgroup *cgroup_to_ns( | ||
21 | struct cgroup *cgroup) | ||
22 | { | ||
23 | return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), | ||
24 | struct ns_cgroup, css); | ||
25 | } | ||
26 | |||
27 | int ns_cgroup_clone(struct task_struct *task, struct pid *pid) | ||
28 | { | ||
29 | char name[PROC_NUMBUF]; | ||
30 | |||
31 | snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); | ||
32 | return cgroup_clone(task, &ns_subsys, name); | ||
33 | } | ||
34 | |||
35 | /* | ||
36 | * Rules: | ||
37 | * 1. you can only enter a cgroup which is a descendant of your current | ||
38 | * cgroup | ||
39 | * 2. you can only place another process into a cgroup if | ||
40 | * a. you have CAP_SYS_ADMIN | ||
41 | * b. your cgroup is an ancestor of task's destination cgroup | ||
42 | * (hence either you are in the same cgroup as task, or in an | ||
43 | * ancestor cgroup thereof) | ||
44 | */ | ||
45 | static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, | ||
46 | struct task_struct *task, bool threadgroup) | ||
47 | { | ||
48 | if (current != task) { | ||
49 | if (!capable(CAP_SYS_ADMIN)) | ||
50 | return -EPERM; | ||
51 | |||
52 | if (!cgroup_is_descendant(new_cgroup, current)) | ||
53 | return -EPERM; | ||
54 | } | ||
55 | |||
56 | if (!cgroup_is_descendant(new_cgroup, task)) | ||
57 | return -EPERM; | ||
58 | |||
59 | if (threadgroup) { | ||
60 | struct task_struct *c; | ||
61 | rcu_read_lock(); | ||
62 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
63 | if (!cgroup_is_descendant(new_cgroup, c)) { | ||
64 | rcu_read_unlock(); | ||
65 | return -EPERM; | ||
66 | } | ||
67 | } | ||
68 | rcu_read_unlock(); | ||
69 | } | ||
70 | |||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Rules: you can only create a cgroup if | ||
76 | * 1. you are capable(CAP_SYS_ADMIN) | ||
77 | * 2. the target cgroup is a descendant of your own cgroup | ||
78 | */ | ||
79 | static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, | ||
80 | struct cgroup *cgroup) | ||
81 | { | ||
82 | struct ns_cgroup *ns_cgroup; | ||
83 | |||
84 | if (!capable(CAP_SYS_ADMIN)) | ||
85 | return ERR_PTR(-EPERM); | ||
86 | if (!cgroup_is_descendant(cgroup, current)) | ||
87 | return ERR_PTR(-EPERM); | ||
88 | |||
89 | ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); | ||
90 | if (!ns_cgroup) | ||
91 | return ERR_PTR(-ENOMEM); | ||
92 | return &ns_cgroup->css; | ||
93 | } | ||
94 | |||
95 | static void ns_destroy(struct cgroup_subsys *ss, | ||
96 | struct cgroup *cgroup) | ||
97 | { | ||
98 | struct ns_cgroup *ns_cgroup; | ||
99 | |||
100 | ns_cgroup = cgroup_to_ns(cgroup); | ||
101 | kfree(ns_cgroup); | ||
102 | } | ||
103 | |||
104 | struct cgroup_subsys ns_subsys = { | ||
105 | .name = "ns", | ||
106 | .can_attach = ns_can_attach, | ||
107 | .create = ns_create, | ||
108 | .destroy = ns_destroy, | ||
109 | .subsys_id = ns_subsys_id, | ||
110 | }; | ||
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f74e6c00e26d..d6a00f3de15d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -22,6 +22,9 @@ | |||
22 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
23 | #include <net/net_namespace.h> | 23 | #include <net/net_namespace.h> |
24 | #include <linux/ipc_namespace.h> | 24 | #include <linux/ipc_namespace.h> |
25 | #include <linux/proc_fs.h> | ||
26 | #include <linux/file.h> | ||
27 | #include <linux/syscalls.h> | ||
25 | 28 | ||
26 | static struct kmem_cache *nsproxy_cachep; | 29 | static struct kmem_cache *nsproxy_cachep; |
27 | 30 | ||
@@ -69,13 +72,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
69 | goto out_ns; | 72 | goto out_ns; |
70 | } | 73 | } |
71 | 74 | ||
72 | new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); | 75 | new_nsp->uts_ns = copy_utsname(flags, tsk); |
73 | if (IS_ERR(new_nsp->uts_ns)) { | 76 | if (IS_ERR(new_nsp->uts_ns)) { |
74 | err = PTR_ERR(new_nsp->uts_ns); | 77 | err = PTR_ERR(new_nsp->uts_ns); |
75 | goto out_uts; | 78 | goto out_uts; |
76 | } | 79 | } |
77 | 80 | ||
78 | new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); | 81 | new_nsp->ipc_ns = copy_ipcs(flags, tsk); |
79 | if (IS_ERR(new_nsp->ipc_ns)) { | 82 | if (IS_ERR(new_nsp->ipc_ns)) { |
80 | err = PTR_ERR(new_nsp->ipc_ns); | 83 | err = PTR_ERR(new_nsp->ipc_ns); |
81 | goto out_ipc; | 84 | goto out_ipc; |
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, | |||
198 | goto out; | 201 | goto out; |
199 | } | 202 | } |
200 | 203 | ||
201 | err = ns_cgroup_clone(current, task_pid(current)); | ||
202 | if (err) | ||
203 | put_nsproxy(*new_nsp); | ||
204 | |||
205 | out: | 204 | out: |
206 | return err; | 205 | return err; |
207 | } | 206 | } |
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p) | |||
233 | switch_task_namespaces(p, NULL); | 232 | switch_task_namespaces(p, NULL); |
234 | } | 233 | } |
235 | 234 | ||
235 | SYSCALL_DEFINE2(setns, int, fd, int, nstype) | ||
236 | { | ||
237 | const struct proc_ns_operations *ops; | ||
238 | struct task_struct *tsk = current; | ||
239 | struct nsproxy *new_nsproxy; | ||
240 | struct proc_inode *ei; | ||
241 | struct file *file; | ||
242 | int err; | ||
243 | |||
244 | if (!capable(CAP_SYS_ADMIN)) | ||
245 | return -EPERM; | ||
246 | |||
247 | file = proc_ns_fget(fd); | ||
248 | if (IS_ERR(file)) | ||
249 | return PTR_ERR(file); | ||
250 | |||
251 | err = -EINVAL; | ||
252 | ei = PROC_I(file->f_dentry->d_inode); | ||
253 | ops = ei->ns_ops; | ||
254 | if (nstype && (ops->type != nstype)) | ||
255 | goto out; | ||
256 | |||
257 | new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); | ||
258 | if (IS_ERR(new_nsproxy)) { | ||
259 | err = PTR_ERR(new_nsproxy); | ||
260 | goto out; | ||
261 | } | ||
262 | |||
263 | err = ops->install(new_nsproxy, ei->ns); | ||
264 | if (err) { | ||
265 | free_nsproxy(new_nsproxy); | ||
266 | goto out; | ||
267 | } | ||
268 | switch_task_namespaces(tsk, new_nsproxy); | ||
269 | out: | ||
270 | fput(file); | ||
271 | return err; | ||
272 | } | ||
273 | |||
236 | static int __init nsproxy_cache_init(void) | 274 | static int __init nsproxy_cache_init(void) |
237 | { | 275 | { |
238 | nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); | 276 | nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); |
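
The new setns() system call lets a process join an existing namespace through one of its /proc/<pid>/ns/* files; CAP_SYS_ADMIN is required, and passing 0 as nstype skips the type check. A minimal userspace sketch (error handling trimmed; a libc without the setns() wrapper would need syscall(__NR_setns, fd, nstype) instead):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        char path[64];
        int fd;

        if (argc < 3)
                return 1;

        /* join the network namespace of the pid given as argv[1] */
        snprintf(path, sizeof(path), "/proc/%s/ns/net", argv[1]);
        fd = open(path, O_RDONLY);
        if (fd < 0 || setns(fd, CLONE_NEWNET) < 0) {
                perror("setns");
                return 1;
        }
        close(fd);

        execvp(argv[2], argv + 2);      /* run the rest of the command line in there */
        perror("execvp");
        return 1;
}
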
diff --git a/kernel/padata.c b/kernel/padata.c index 751019415d23..b91941df5e63 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd) | |||
262 | /* | 262 | /* |
263 | * This cpu has to do the parallel processing of the next | 263 | * This cpu has to do the parallel processing of the next |
264 | * object. It's waiting in the cpu's parallelization queue, | 264 | * object. It's waiting in the cpu's parallelization queue, |
265 | * so exit imediately. | 265 | * so exit immediately. |
266 | */ | 266 | */ |
267 | if (PTR_ERR(padata) == -ENODATA) { | 267 | if (PTR_ERR(padata) == -ENODATA) { |
268 | del_timer(&pd->timer); | 268 | del_timer(&pd->timer); |
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd) | |||
284 | /* | 284 | /* |
285 | * The next object that needs serialization might have arrived to | 285 | * The next object that needs serialization might have arrived to |
286 | * the reorder queues in the meantime, we will be called again | 286 | * the reorder queues in the meantime, we will be called again |
287 | * from the timer function if noone else cares for it. | 287 | * from the timer function if no one else cares for it. |
288 | */ | 288 | */ |
289 | if (atomic_read(&pd->reorder_objects) | 289 | if (atomic_read(&pd->reorder_objects) |
290 | && !(pinst->flags & PADATA_RESET)) | 290 | && !(pinst->flags & PADATA_RESET)) |
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst) | |||
515 | put_online_cpus(); | 515 | put_online_cpus(); |
516 | } | 516 | } |
517 | 517 | ||
518 | /* Replace the internal control stucture with a new one. */ | 518 | /* Replace the internal control structure with a new one. */ |
519 | static void padata_replace(struct padata_instance *pinst, | 519 | static void padata_replace(struct padata_instance *pinst, |
520 | struct parallel_data *pd_new) | 520 | struct parallel_data *pd_new) |
521 | { | 521 | { |
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) | |||
768 | } | 768 | } |
769 | 769 | ||
770 | /** | 770 | /** |
771 | * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) | 771 | * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) |
772 | * padata cpumasks. | 772 | * padata cpumasks. |
773 | * | 773 | * |
774 | * @pinst: padata instance | 774 | * @pinst: padata instance |
diff --git a/kernel/panic.c b/kernel/panic.c index 4c13b1a88ebb..69231670eb95 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -34,6 +34,7 @@ static int pause_on_oops_flag; | |||
34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
35 | 35 | ||
36 | int panic_timeout; | 36 | int panic_timeout; |
37 | EXPORT_SYMBOL_GPL(panic_timeout); | ||
37 | 38 | ||
38 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | 39 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); |
39 | 40 | ||
@@ -432,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail); | |||
432 | 433 | ||
433 | core_param(panic, panic_timeout, int, 0644); | 434 | core_param(panic, panic_timeout, int, 0644); |
434 | core_param(pause_on_oops, pause_on_oops, int, 0644); | 435 | core_param(pause_on_oops, pause_on_oops, int, 0644); |
436 | |||
437 | static int __init oops_setup(char *s) | ||
438 | { | ||
439 | if (!s) | ||
440 | return -EINVAL; | ||
441 | if (!strcmp(s, "panic")) | ||
442 | panic_on_oops = 1; | ||
443 | return 0; | ||
444 | } | ||
445 | early_param("oops", oops_setup); | ||
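
In practice this just means appending "oops=panic" to the kernel command line. Combined with the existing "panic=<seconds>" parameter registered above via core_param(), an oops then forces a reboot after the given timeout instead of leaving a wounded kernel running; it is the boot-time equivalent of setting the panic_on_oops sysctl.
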
diff --git a/kernel/params.c b/kernel/params.c index 08107d181758..ed72e1330862 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -95,7 +95,7 @@ static int parse_one(char *param, | |||
95 | /* Find parameter */ | 95 | /* Find parameter */ |
96 | for (i = 0; i < num_params; i++) { | 96 | for (i = 0; i < num_params; i++) { |
97 | if (parameq(param, params[i].name)) { | 97 | if (parameq(param, params[i].name)) { |
98 | /* Noone handled NULL, so do it here. */ | 98 | /* No one handled NULL, so do it here. */ |
99 | if (!val && params[i].ops->set != param_set_bool) | 99 | if (!val && params[i].ops->set != param_set_bool) |
100 | return -EINVAL; | 100 | return -EINVAL; |
101 | DEBUGP("They are equal! Calling %p\n", | 101 | DEBUGP("They are equal! Calling %p\n", |
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp); | |||
297 | int param_set_bool(const char *val, const struct kernel_param *kp) | 297 | int param_set_bool(const char *val, const struct kernel_param *kp) |
298 | { | 298 | { |
299 | bool v; | 299 | bool v; |
300 | int ret; | ||
300 | 301 | ||
301 | /* No equals means "set"... */ | 302 | /* No equals means "set"... */ |
302 | if (!val) val = "1"; | 303 | if (!val) val = "1"; |
303 | 304 | ||
304 | /* One of =[yYnN01] */ | 305 | /* One of =[yYnN01] */ |
305 | switch (val[0]) { | 306 | ret = strtobool(val, &v); |
306 | case 'y': case 'Y': case '1': | 307 | if (ret) |
307 | v = true; | 308 | return ret; |
308 | break; | ||
309 | case 'n': case 'N': case '0': | ||
310 | v = false; | ||
311 | break; | ||
312 | default: | ||
313 | return -EINVAL; | ||
314 | } | ||
315 | 309 | ||
316 | if (kp->flags & KPARAM_ISBOOL) | 310 | if (kp->flags & KPARAM_ISBOOL) |
317 | *(bool *)kp->arg = v; | 311 | *(bool *)kp->arg = v; |
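
The open-coded switch moves into strtobool(), which accepts a first character of 'y', 'Y' or '1' for true and 'n', 'N' or '0' for false and returns -EINVAL for anything else, so existing "foo=1", "foo=n" and bare "foo" parameters keep working. Usage sketch:

#include <linux/string.h>
#include <linux/types.h>

/* val == "y"     -> *enabled = true,  returns 0
 * val == "0"     -> *enabled = false, returns 0
 * val == "maybe" -> returns -EINVAL, *enabled untouched
 * val == NULL    -> treated as "1", matching "no equals means set" above
 */
static int parse_enable(const char *val, bool *enabled)
{
        return strtobool(val ? val : "1", enabled);
}
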
@@ -719,9 +713,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) | |||
719 | params[i].ops->free(params[i].arg); | 713 | params[i].ops->free(params[i].arg); |
720 | } | 714 | } |
721 | 715 | ||
722 | static void __init kernel_add_sysfs_param(const char *name, | 716 | static struct module_kobject * __init locate_module_kobject(const char *name) |
723 | struct kernel_param *kparam, | ||
724 | unsigned int name_skip) | ||
725 | { | 717 | { |
726 | struct module_kobject *mk; | 718 | struct module_kobject *mk; |
727 | struct kobject *kobj; | 719 | struct kobject *kobj; |
@@ -729,10 +721,7 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
729 | 721 | ||
730 | kobj = kset_find_obj(module_kset, name); | 722 | kobj = kset_find_obj(module_kset, name); |
731 | if (kobj) { | 723 | if (kobj) { |
732 | /* We already have one. Remove params so we can add more. */ | ||
733 | mk = to_module_kobject(kobj); | 724 | mk = to_module_kobject(kobj); |
734 | /* We need to remove it before adding parameters. */ | ||
735 | sysfs_remove_group(&mk->kobj, &mk->mp->grp); | ||
736 | } else { | 725 | } else { |
737 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); | 726 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); |
738 | BUG_ON(!mk); | 727 | BUG_ON(!mk); |
@@ -743,15 +732,36 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
743 | "%s", name); | 732 | "%s", name); |
744 | if (err) { | 733 | if (err) { |
745 | kobject_put(&mk->kobj); | 734 | kobject_put(&mk->kobj); |
746 | printk(KERN_ERR "Module '%s' failed add to sysfs, " | 735 | printk(KERN_ERR |
747 | "error number %d\n", name, err); | 736 | "Module '%s' failed add to sysfs, error number %d\n", |
748 | printk(KERN_ERR "The system will be unstable now.\n"); | 737 | name, err); |
749 | return; | 738 | printk(KERN_ERR |
739 | "The system will be unstable now.\n"); | ||
740 | return NULL; | ||
750 | } | 741 | } |
751 | /* So that exit path is even. */ | 742 | |
743 | /* So that we hold reference in both cases. */ | ||
752 | kobject_get(&mk->kobj); | 744 | kobject_get(&mk->kobj); |
753 | } | 745 | } |
754 | 746 | ||
747 | return mk; | ||
748 | } | ||
749 | |||
750 | static void __init kernel_add_sysfs_param(const char *name, | ||
751 | struct kernel_param *kparam, | ||
752 | unsigned int name_skip) | ||
753 | { | ||
754 | struct module_kobject *mk; | ||
755 | int err; | ||
756 | |||
757 | mk = locate_module_kobject(name); | ||
758 | if (!mk) | ||
759 | return; | ||
760 | |||
761 | /* We need to remove old parameters before adding more. */ | ||
762 | if (mk->mp) | ||
763 | sysfs_remove_group(&mk->kobj, &mk->mp->grp); | ||
764 | |||
755 | /* These should not fail at boot. */ | 765 | /* These should not fail at boot. */ |
756 | err = add_sysfs_param(mk, kparam, kparam->name + name_skip); | 766 | err = add_sysfs_param(mk, kparam, kparam->name + name_skip); |
757 | BUG_ON(err); | 767 | BUG_ON(err); |
@@ -796,6 +806,35 @@ static void __init param_sysfs_builtin(void) | |||
796 | } | 806 | } |
797 | } | 807 | } |
798 | 808 | ||
809 | ssize_t __modver_version_show(struct module_attribute *mattr, | ||
810 | struct module *mod, char *buf) | ||
811 | { | ||
812 | struct module_version_attribute *vattr = | ||
813 | container_of(mattr, struct module_version_attribute, mattr); | ||
814 | |||
815 | return sprintf(buf, "%s\n", vattr->version); | ||
816 | } | ||
817 | |||
818 | extern const struct module_version_attribute *__start___modver[]; | ||
819 | extern const struct module_version_attribute *__stop___modver[]; | ||
820 | |||
821 | static void __init version_sysfs_builtin(void) | ||
822 | { | ||
823 | const struct module_version_attribute **p; | ||
824 | struct module_kobject *mk; | ||
825 | int err; | ||
826 | |||
827 | for (p = __start___modver; p < __stop___modver; p++) { | ||
828 | const struct module_version_attribute *vattr = *p; | ||
829 | |||
830 | mk = locate_module_kobject(vattr->module_name); | ||
831 | if (mk) { | ||
832 | err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); | ||
833 | kobject_uevent(&mk->kobj, KOBJ_ADD); | ||
834 | kobject_put(&mk->kobj); | ||
835 | } | ||
836 | } | ||
837 | } | ||
799 | 838 | ||
800 | /* module-related sysfs stuff */ | 839 | /* module-related sysfs stuff */ |
801 | 840 | ||
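
version_sysfs_builtin() walks the __modver section that the MODULE_VERSION() macro emits for code compiled into the kernel and creates the matching sysfs attribute, so a version is visible whether the driver is modular or built in. For a hypothetical driver:

/* in the driver source; with the driver built in, this now shows up as
 * /sys/module/<drivername>/version instead of being silently dropped */
MODULE_VERSION("2.1");
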
@@ -875,6 +914,7 @@ static int __init param_sysfs_init(void) | |||
875 | } | 914 | } |
876 | module_sysfs_initialized = 1; | 915 | module_sysfs_initialized = 1; |
877 | 916 | ||
917 | version_sysfs_builtin(); | ||
878 | param_sysfs_builtin(); | 918 | param_sysfs_builtin(); |
879 | 919 | ||
880 | return 0; | 920 | return 0; |
diff --git a/kernel/pid.c b/kernel/pid.c index d55c6fb8d087..57a8346a270e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) | |||
217 | return -1; | 217 | return -1; |
218 | } | 218 | } |
219 | 219 | ||
220 | int next_pidmap(struct pid_namespace *pid_ns, int last) | 220 | int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) |
221 | { | 221 | { |
222 | int offset; | 222 | int offset; |
223 | struct pidmap *map, *end; | 223 | struct pidmap *map, *end; |
224 | 224 | ||
225 | if (last >= PID_MAX_LIMIT) | ||
226 | return -1; | ||
227 | |||
225 | offset = (last + 1) & BITS_PER_PAGE_MASK; | 228 | offset = (last + 1) & BITS_PER_PAGE_MASK; |
226 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; | 229 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; |
227 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; | 230 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; |
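
Making 'last' unsigned and rejecting values at or above PID_MAX_LIMIT closes an out-of-bounds access: pidmap[] only has enough entries to cover PID_MAX_LIMIT pids, but callers such as the /proc readdir path can hand in an arbitrarily large cookie. Rough numbers for a typical 64-bit configuration with 4 KiB pages:

/*
 * BITS_PER_PAGE  = 8 * 4096              = 32768 pids per pidmap page
 * PIDMAP_ENTRIES = PID_MAX_LIMIT / 32768 = 128 entries at most
 *
 * without the check, last = 0x7fffffff would give
 *   map = &pid_ns->pidmap[(last + 1) / 32768];   // index 65536, far past the array
 */
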
@@ -401,7 +404,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
401 | struct task_struct *result = NULL; | 404 | struct task_struct *result = NULL; |
402 | if (pid) { | 405 | if (pid) { |
403 | struct hlist_node *first; | 406 | struct hlist_node *first; |
404 | first = rcu_dereference_check(pid->tasks[type].first, | 407 | first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), |
405 | rcu_read_lock_held() || | 408 | rcu_read_lock_held() || |
406 | lockdep_tasklist_lock_is_held()); | 409 | lockdep_tasklist_lock_is_held()); |
407 | if (first) | 410 | if (first) |
@@ -416,6 +419,7 @@ EXPORT_SYMBOL(pid_task); | |||
416 | */ | 419 | */ |
417 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 420 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
418 | { | 421 | { |
422 | rcu_lockdep_assert(rcu_read_lock_held()); | ||
419 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); | 423 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); |
420 | } | 424 | } |
421 | 425 | ||
@@ -434,6 +438,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) | |||
434 | rcu_read_unlock(); | 438 | rcu_read_unlock(); |
435 | return pid; | 439 | return pid; |
436 | } | 440 | } |
441 | EXPORT_SYMBOL_GPL(get_task_pid); | ||
437 | 442 | ||
438 | struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) | 443 | struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) |
439 | { | 444 | { |
@@ -445,6 +450,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) | |||
445 | rcu_read_unlock(); | 450 | rcu_read_unlock(); |
446 | return result; | 451 | return result; |
447 | } | 452 | } |
453 | EXPORT_SYMBOL_GPL(get_pid_task); | ||
448 | 454 | ||
449 | struct pid *find_get_pid(pid_t nr) | 455 | struct pid *find_get_pid(pid_t nr) |
450 | { | 456 | { |
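Exporting get_task_pid() and get_pid_task() lets modules convert between pid and task references without taking tasklist_lock. A minimal hedged sketch of the reference contract (the inspect() helper is made up for illustration):

    #include <linux/kernel.h>
    #include <linux/pid.h>
    #include <linux/sched.h>

    static void inspect(struct pid *pid)
    {
            struct task_struct *task = get_pid_task(pid, PIDTYPE_PID);

            if (task) {
                    pr_info("pid %d: comm=%s\n", pid_vnr(pid), task->comm);
                    put_task_struct(task);  /* get_pid_task() returned a counted reference */
            }
    }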
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a5aff94e1f0b..e9c9adc84ca6 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/err.h> | 14 | #include <linux/err.h> |
15 | #include <linux/acct.h> | 15 | #include <linux/acct.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/proc_fs.h> | ||
17 | 18 | ||
18 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 19 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
19 | 20 | ||
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p | |||
72 | { | 73 | { |
73 | struct pid_namespace *ns; | 74 | struct pid_namespace *ns; |
74 | unsigned int level = parent_pid_ns->level + 1; | 75 | unsigned int level = parent_pid_ns->level + 1; |
75 | int i; | 76 | int i, err = -ENOMEM; |
76 | 77 | ||
77 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); | 78 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); |
78 | if (ns == NULL) | 79 | if (ns == NULL) |
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p | |||
96 | for (i = 1; i < PIDMAP_ENTRIES; i++) | 97 | for (i = 1; i < PIDMAP_ENTRIES; i++) |
97 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); | 98 | atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); |
98 | 99 | ||
100 | err = pid_ns_prepare_proc(ns); | ||
101 | if (err) | ||
102 | goto out_put_parent_pid_ns; | ||
103 | |||
99 | return ns; | 104 | return ns; |
100 | 105 | ||
106 | out_put_parent_pid_ns: | ||
107 | put_pid_ns(parent_pid_ns); | ||
101 | out_free_map: | 108 | out_free_map: |
102 | kfree(ns->pidmap[0].page); | 109 | kfree(ns->pidmap[0].page); |
103 | out_free: | 110 | out_free: |
104 | kmem_cache_free(pid_ns_cachep, ns); | 111 | kmem_cache_free(pid_ns_cachep, ns); |
105 | out: | 112 | out: |
106 | return ERR_PTR(-ENOMEM); | 113 | return ERR_PTR(err); |
107 | } | 114 | } |
108 | 115 | ||
109 | static void destroy_pid_namespace(struct pid_namespace *ns) | 116 | static void destroy_pid_namespace(struct pid_namespace *ns) |
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 645e541a45f6..6824ca7d4d0c 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/string.h> | 40 | #include <linux/string.h> |
41 | #include <linux/platform_device.h> | 41 | #include <linux/platform_device.h> |
42 | #include <linux/init.h> | 42 | #include <linux/init.h> |
43 | #include <linux/kernel.h> | ||
43 | 44 | ||
44 | #include <linux/uaccess.h> | 45 | #include <linux/uaccess.h> |
45 | 46 | ||
@@ -53,11 +54,17 @@ enum pm_qos_type { | |||
53 | PM_QOS_MIN /* return the smallest value */ | 54 | PM_QOS_MIN /* return the smallest value */ |
54 | }; | 55 | }; |
55 | 56 | ||
57 | /* | ||
58 | * Note: The lockless read path depends on the CPU accessing | ||
59 | * target_value atomically. Atomic access is only guaranteed on all CPU | ||
60 | * types linux supports for 32 bit quantities | ||
61 | */ | ||
56 | struct pm_qos_object { | 62 | struct pm_qos_object { |
57 | struct plist_head requests; | 63 | struct plist_head requests; |
58 | struct blocking_notifier_head *notifiers; | 64 | struct blocking_notifier_head *notifiers; |
59 | struct miscdevice pm_qos_power_miscdev; | 65 | struct miscdevice pm_qos_power_miscdev; |
60 | char *name; | 66 | char *name; |
67 | s32 target_value; /* Do not change to 64 bit */ | ||
61 | s32 default_value; | 68 | s32 default_value; |
62 | enum pm_qos_type type; | 69 | enum pm_qos_type type; |
63 | }; | 70 | }; |
@@ -70,7 +77,8 @@ static struct pm_qos_object cpu_dma_pm_qos = { | |||
70 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), | 77 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), |
71 | .notifiers = &cpu_dma_lat_notifier, | 78 | .notifiers = &cpu_dma_lat_notifier, |
72 | .name = "cpu_dma_latency", | 79 | .name = "cpu_dma_latency", |
73 | .default_value = 2000 * USEC_PER_SEC, | 80 | .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, |
81 | .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, | ||
74 | .type = PM_QOS_MIN, | 82 | .type = PM_QOS_MIN, |
75 | }; | 83 | }; |
76 | 84 | ||
@@ -79,7 +87,8 @@ static struct pm_qos_object network_lat_pm_qos = { | |||
79 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), | 87 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), |
80 | .notifiers = &network_lat_notifier, | 88 | .notifiers = &network_lat_notifier, |
81 | .name = "network_latency", | 89 | .name = "network_latency", |
82 | .default_value = 2000 * USEC_PER_SEC, | 90 | .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, |
91 | .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, | ||
83 | .type = PM_QOS_MIN | 92 | .type = PM_QOS_MIN |
84 | }; | 93 | }; |
85 | 94 | ||
@@ -89,7 +98,8 @@ static struct pm_qos_object network_throughput_pm_qos = { | |||
89 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), | 98 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), |
90 | .notifiers = &network_throughput_notifier, | 99 | .notifiers = &network_throughput_notifier, |
91 | .name = "network_throughput", | 100 | .name = "network_throughput", |
92 | .default_value = 0, | 101 | .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, |
102 | .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, | ||
93 | .type = PM_QOS_MAX, | 103 | .type = PM_QOS_MAX, |
94 | }; | 104 | }; |
95 | 105 | ||
@@ -103,13 +113,17 @@ static struct pm_qos_object *pm_qos_array[] = { | |||
103 | 113 | ||
104 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | 114 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, |
105 | size_t count, loff_t *f_pos); | 115 | size_t count, loff_t *f_pos); |
116 | static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, | ||
117 | size_t count, loff_t *f_pos); | ||
106 | static int pm_qos_power_open(struct inode *inode, struct file *filp); | 118 | static int pm_qos_power_open(struct inode *inode, struct file *filp); |
107 | static int pm_qos_power_release(struct inode *inode, struct file *filp); | 119 | static int pm_qos_power_release(struct inode *inode, struct file *filp); |
108 | 120 | ||
109 | static const struct file_operations pm_qos_power_fops = { | 121 | static const struct file_operations pm_qos_power_fops = { |
110 | .write = pm_qos_power_write, | 122 | .write = pm_qos_power_write, |
123 | .read = pm_qos_power_read, | ||
111 | .open = pm_qos_power_open, | 124 | .open = pm_qos_power_open, |
112 | .release = pm_qos_power_release, | 125 | .release = pm_qos_power_release, |
126 | .llseek = noop_llseek, | ||
113 | }; | 127 | }; |
114 | 128 | ||
115 | /* unlocked internal variant */ | 129 | /* unlocked internal variant */ |
@@ -120,10 +134,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) | |||
120 | 134 | ||
121 | switch (o->type) { | 135 | switch (o->type) { |
122 | case PM_QOS_MIN: | 136 | case PM_QOS_MIN: |
123 | return plist_last(&o->requests)->prio; | 137 | return plist_first(&o->requests)->prio; |
124 | 138 | ||
125 | case PM_QOS_MAX: | 139 | case PM_QOS_MAX: |
126 | return plist_first(&o->requests)->prio; | 140 | return plist_last(&o->requests)->prio; |
127 | 141 | ||
128 | default: | 142 | default: |
129 | /* runtime check for not using enum */ | 143 | /* runtime check for not using enum */ |
@@ -131,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) | |||
131 | } | 145 | } |
132 | } | 146 | } |
133 | 147 | ||
148 | static inline s32 pm_qos_read_value(struct pm_qos_object *o) | ||
149 | { | ||
150 | return o->target_value; | ||
151 | } | ||
152 | |||
153 | static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) | ||
154 | { | ||
155 | o->target_value = value; | ||
156 | } | ||
157 | |||
134 | static void update_target(struct pm_qos_object *o, struct plist_node *node, | 158 | static void update_target(struct pm_qos_object *o, struct plist_node *node, |
135 | int del, int value) | 159 | int del, int value) |
136 | { | 160 | { |
@@ -155,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node, | |||
155 | plist_add(node, &o->requests); | 179 | plist_add(node, &o->requests); |
156 | } | 180 | } |
157 | curr_value = pm_qos_get_value(o); | 181 | curr_value = pm_qos_get_value(o); |
182 | pm_qos_set_value(o, curr_value); | ||
158 | spin_unlock_irqrestore(&pm_qos_lock, flags); | 183 | spin_unlock_irqrestore(&pm_qos_lock, flags); |
159 | 184 | ||
160 | if (prev_value != curr_value) | 185 | if (prev_value != curr_value) |
@@ -189,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor) | |||
189 | * pm_qos_request - returns current system wide qos expectation | 214 | * pm_qos_request - returns current system wide qos expectation |
190 | * @pm_qos_class: identification of which qos value is requested | 215 | * @pm_qos_class: identification of which qos value is requested |
191 | * | 216 | * |
192 | * This function returns the current target value in an atomic manner. | 217 | * This function returns the current target value. |
193 | */ | 218 | */ |
194 | int pm_qos_request(int pm_qos_class) | 219 | int pm_qos_request(int pm_qos_class) |
195 | { | 220 | { |
196 | unsigned long flags; | 221 | return pm_qos_read_value(pm_qos_array[pm_qos_class]); |
197 | int value; | ||
198 | |||
199 | spin_lock_irqsave(&pm_qos_lock, flags); | ||
200 | value = pm_qos_get_value(pm_qos_array[pm_qos_class]); | ||
201 | spin_unlock_irqrestore(&pm_qos_lock, flags); | ||
202 | |||
203 | return value; | ||
204 | } | 222 | } |
205 | EXPORT_SYMBOL_GPL(pm_qos_request); | 223 | EXPORT_SYMBOL_GPL(pm_qos_request); |
206 | 224 | ||
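Taken together, the target_value field, pm_qos_set_value() and the rewritten pm_qos_request() form a simple publish/consume scheme: the aggregate is recomputed and stored under pm_qos_lock, and readers rely on the aligned 32-bit load being atomic (per the comment added above). A hedged restatement of the two sides:

    /* writer, inside update_target() with pm_qos_lock held: */
    curr_value = pm_qos_get_value(o);       /* walk the plist of requests */
    pm_qos_set_value(o, curr_value);        /* publish via a single s32 store */

    /* reader, e.g. an idle governor, no lock taken: */
    s32 latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);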
@@ -375,30 +393,63 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) | |||
375 | } | 393 | } |
376 | 394 | ||
377 | 395 | ||
396 | static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, | ||
397 | size_t count, loff_t *f_pos) | ||
398 | { | ||
399 | s32 value; | ||
400 | unsigned long flags; | ||
401 | struct pm_qos_object *o; | ||
402 | struct pm_qos_request_list *pm_qos_req = filp->private_data; | ||
403 | |||
404 | if (!pm_qos_req) | ||
405 | return -EINVAL; | ||
406 | if (!pm_qos_request_active(pm_qos_req)) | ||
407 | return -EINVAL; | ||
408 | |||
409 | o = pm_qos_array[pm_qos_req->pm_qos_class]; | ||
410 | spin_lock_irqsave(&pm_qos_lock, flags); | ||
411 | value = pm_qos_get_value(o); | ||
412 | spin_unlock_irqrestore(&pm_qos_lock, flags); | ||
413 | |||
414 | return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); | ||
415 | } | ||
416 | |||
378 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | 417 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, |
379 | size_t count, loff_t *f_pos) | 418 | size_t count, loff_t *f_pos) |
380 | { | 419 | { |
381 | s32 value; | 420 | s32 value; |
382 | int x; | ||
383 | char ascii_value[11]; | ||
384 | struct pm_qos_request_list *pm_qos_req; | 421 | struct pm_qos_request_list *pm_qos_req; |
385 | 422 | ||
386 | if (count == sizeof(s32)) { | 423 | if (count == sizeof(s32)) { |
387 | if (copy_from_user(&value, buf, sizeof(s32))) | 424 | if (copy_from_user(&value, buf, sizeof(s32))) |
388 | return -EFAULT; | 425 | return -EFAULT; |
389 | } else if (count == 11) { /* len('0x12345678/0') */ | 426 | } else if (count <= 11) { /* ASCII perhaps? */ |
390 | if (copy_from_user(ascii_value, buf, 11)) | 427 | char ascii_value[11]; |
428 | unsigned long int ulval; | ||
429 | int ret; | ||
430 | |||
431 | if (copy_from_user(ascii_value, buf, count)) | ||
391 | return -EFAULT; | 432 | return -EFAULT; |
392 | if (strlen(ascii_value) != 10) | 433 | |
393 | return -EINVAL; | 434 | if (count > 10) { |
394 | x = sscanf(ascii_value, "%x", &value); | 435 | if (ascii_value[10] == '\n') |
395 | if (x != 1) | 436 | ascii_value[10] = '\0'; |
437 | else | ||
438 | return -EINVAL; | ||
439 | } else { | ||
440 | ascii_value[count] = '\0'; | ||
441 | } | ||
442 | ret = strict_strtoul(ascii_value, 16, &ulval); | ||
443 | if (ret) { | ||
444 | pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); | ||
396 | return -EINVAL; | 445 | return -EINVAL; |
397 | pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); | 446 | } |
398 | } else | 447 | value = (s32)lower_32_bits(ulval); |
448 | } else { | ||
399 | return -EINVAL; | 449 | return -EINVAL; |
450 | } | ||
400 | 451 | ||
401 | pm_qos_req = (struct pm_qos_request_list *)filp->private_data; | 452 | pm_qos_req = filp->private_data; |
402 | pm_qos_update_request(pm_qos_req, value); | 453 | pm_qos_update_request(pm_qos_req, value); |
403 | 454 | ||
404 | return count; | 455 | return count; |
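From user space the misc device keeps its old contract: writing a raw s32 or (now) an ASCII hex string of up to 10 digits adds or updates a request, which stays in force for as long as the file is held open. A hedged user-space sketch against /dev/cpu_dma_latency:

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    int main(void)
    {
            int32_t us = 10;                        /* request <= 10 usec wakeup latency */
            int fd = open("/dev/cpu_dma_latency", O_RDWR);

            if (fd < 0)
                    return 1;
            write(fd, &us, sizeof(us));             /* binary form: count == sizeof(s32) */
            /* write(fd, "a\n", 2);                    ASCII hex form, also accepted now */
            pause();                                /* the request is dropped on close() */
            return 0;
    }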
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 6842eeba5879..58f405b581e7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock) | |||
37 | if (pid == 0) | 37 | if (pid == 0) |
38 | return 0; | 38 | return 0; |
39 | 39 | ||
40 | read_lock(&tasklist_lock); | 40 | rcu_read_lock(); |
41 | p = find_task_by_vpid(pid); | 41 | p = find_task_by_vpid(pid); |
42 | if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? | 42 | if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? |
43 | same_thread_group(p, current) : thread_group_leader(p))) { | 43 | same_thread_group(p, current) : has_group_leader_pid(p))) { |
44 | error = -EINVAL; | 44 | error = -EINVAL; |
45 | } | 45 | } |
46 | read_unlock(&tasklist_lock); | 46 | rcu_read_unlock(); |
47 | 47 | ||
48 | return error; | 48 | return error; |
49 | } | 49 | } |
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p) | |||
176 | return p->utime; | 176 | return p->utime; |
177 | } | 177 | } |
178 | 178 | ||
179 | int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | 179 | static int |
180 | posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | ||
180 | { | 181 | { |
181 | int error = check_clock(which_clock); | 182 | int error = check_clock(which_clock); |
182 | if (!error) { | 183 | if (!error) { |
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | |||
194 | return error; | 195 | return error; |
195 | } | 196 | } |
196 | 197 | ||
197 | int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) | 198 | static int |
199 | posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) | ||
198 | { | 200 | { |
199 | /* | 201 | /* |
200 | * You can never reset a CPU clock, but we check for other errors | 202 | * You can never reset a CPU clock, but we check for other errors |
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
317 | } | 319 | } |
318 | 320 | ||
319 | 321 | ||
320 | int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | 322 | static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) |
321 | { | 323 | { |
322 | const pid_t pid = CPUCLOCK_PID(which_clock); | 324 | const pid_t pid = CPUCLOCK_PID(which_clock); |
323 | int error = -EINVAL; | 325 | int error = -EINVAL; |
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | |||
379 | * This is called from sys_timer_create() and do_cpu_nanosleep() with the | 381 | * This is called from sys_timer_create() and do_cpu_nanosleep() with the |
380 | * new timer already all-zeros initialized. | 382 | * new timer already all-zeros initialized. |
381 | */ | 383 | */ |
382 | int posix_cpu_timer_create(struct k_itimer *new_timer) | 384 | static int posix_cpu_timer_create(struct k_itimer *new_timer) |
383 | { | 385 | { |
384 | int ret = 0; | 386 | int ret = 0; |
385 | const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); | 387 | const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); |
@@ -390,7 +392,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
390 | 392 | ||
391 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); | 393 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); |
392 | 394 | ||
393 | read_lock(&tasklist_lock); | 395 | rcu_read_lock(); |
394 | if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { | 396 | if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { |
395 | if (pid == 0) { | 397 | if (pid == 0) { |
396 | p = current; | 398 | p = current; |
@@ -404,7 +406,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
404 | p = current->group_leader; | 406 | p = current->group_leader; |
405 | } else { | 407 | } else { |
406 | p = find_task_by_vpid(pid); | 408 | p = find_task_by_vpid(pid); |
407 | if (p && !thread_group_leader(p)) | 409 | if (p && !has_group_leader_pid(p)) |
408 | p = NULL; | 410 | p = NULL; |
409 | } | 411 | } |
410 | } | 412 | } |
@@ -414,7 +416,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
414 | } else { | 416 | } else { |
415 | ret = -EINVAL; | 417 | ret = -EINVAL; |
416 | } | 418 | } |
417 | read_unlock(&tasklist_lock); | 419 | rcu_read_unlock(); |
418 | 420 | ||
419 | return ret; | 421 | return ret; |
420 | } | 422 | } |
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
425 | * If we return TIMER_RETRY, it's necessary to release the timer's lock | 427 | * If we return TIMER_RETRY, it's necessary to release the timer's lock |
426 | * and try again. (This happens when the timer is in the middle of firing.) | 428 | * and try again. (This happens when the timer is in the middle of firing.) |
427 | */ | 429 | */ |
428 | int posix_cpu_timer_del(struct k_itimer *timer) | 430 | static int posix_cpu_timer_del(struct k_itimer *timer) |
429 | { | 431 | { |
430 | struct task_struct *p = timer->it.cpu.task; | 432 | struct task_struct *p = timer->it.cpu.task; |
431 | int ret = 0; | 433 | int ret = 0; |
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
665 | * If we return TIMER_RETRY, it's necessary to release the timer's lock | 667 | * If we return TIMER_RETRY, it's necessary to release the timer's lock |
666 | * and try again. (This happens when the timer is in the middle of firing.) | 668 | * and try again. (This happens when the timer is in the middle of firing.) |
667 | */ | 669 | */ |
668 | int posix_cpu_timer_set(struct k_itimer *timer, int flags, | 670 | static int posix_cpu_timer_set(struct k_itimer *timer, int flags, |
669 | struct itimerspec *new, struct itimerspec *old) | 671 | struct itimerspec *new, struct itimerspec *old) |
670 | { | 672 | { |
671 | struct task_struct *p = timer->it.cpu.task; | 673 | struct task_struct *p = timer->it.cpu.task; |
672 | union cpu_time_count old_expires, new_expires, old_incr, val; | 674 | union cpu_time_count old_expires, new_expires, old_incr, val; |
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
820 | return ret; | 822 | return ret; |
821 | } | 823 | } |
822 | 824 | ||
823 | void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | 825 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) |
824 | { | 826 | { |
825 | union cpu_time_count now; | 827 | union cpu_time_count now; |
826 | struct task_struct *p = timer->it.cpu.task; | 828 | struct task_struct *p = timer->it.cpu.task; |
@@ -1345,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1345 | 1347 | ||
1346 | /* | 1348 | /* |
1347 | * Now that all the timers on our list have the firing flag, | 1349 | * Now that all the timers on our list have the firing flag, |
1348 | * noone will touch their list entries but us. We'll take | 1350 | * no one will touch their list entries but us. We'll take |
1349 | * each timer's lock before clearing its firing flag, so no | 1351 | * each timer's lock before clearing its firing flag, so no |
1350 | * timer call will interfere. | 1352 | * timer call will interfere. |
1351 | */ | 1353 | */ |
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
1481 | return error; | 1483 | return error; |
1482 | } | 1484 | } |
1483 | 1485 | ||
1484 | int posix_cpu_nsleep(const clockid_t which_clock, int flags, | 1486 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block); |
1485 | struct timespec *rqtp, struct timespec __user *rmtp) | 1487 | |
1488 | static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | ||
1489 | struct timespec *rqtp, struct timespec __user *rmtp) | ||
1486 | { | 1490 | { |
1487 | struct restart_block *restart_block = | 1491 | struct restart_block *restart_block = |
1488 | &current_thread_info()->restart_block; | 1492 | &current_thread_info()->restart_block; |
1489 | struct itimerspec it; | 1493 | struct itimerspec it; |
1490 | int error; | 1494 | int error; |
1491 | 1495 | ||
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
1501 | 1505 | ||
1502 | if (error == -ERESTART_RESTARTBLOCK) { | 1506 | if (error == -ERESTART_RESTARTBLOCK) { |
1503 | 1507 | ||
1504 | if (flags & TIMER_ABSTIME) | 1508 | if (flags & TIMER_ABSTIME) |
1505 | return -ERESTARTNOHAND; | 1509 | return -ERESTARTNOHAND; |
1506 | /* | 1510 | /* |
1507 | * Report back to the user the time still remaining. | 1511 | * Report back to the user the time still remaining. |
1508 | */ | 1512 | */ |
1509 | if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | 1513 | if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) |
1510 | return -EFAULT; | 1514 | return -EFAULT; |
1511 | 1515 | ||
1512 | restart_block->fn = posix_cpu_nsleep_restart; | 1516 | restart_block->fn = posix_cpu_nsleep_restart; |
1513 | restart_block->arg0 = which_clock; | 1517 | restart_block->nanosleep.clockid = which_clock; |
1514 | restart_block->arg1 = (unsigned long) rmtp; | 1518 | restart_block->nanosleep.rmtp = rmtp; |
1515 | restart_block->arg2 = rqtp->tv_sec; | 1519 | restart_block->nanosleep.expires = timespec_to_ns(rqtp); |
1516 | restart_block->arg3 = rqtp->tv_nsec; | ||
1517 | } | 1520 | } |
1518 | return error; | 1521 | return error; |
1519 | } | 1522 | } |
1520 | 1523 | ||
1521 | long posix_cpu_nsleep_restart(struct restart_block *restart_block) | 1524 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) |
1522 | { | 1525 | { |
1523 | clockid_t which_clock = restart_block->arg0; | 1526 | clockid_t which_clock = restart_block->nanosleep.clockid; |
1524 | struct timespec __user *rmtp; | ||
1525 | struct timespec t; | 1527 | struct timespec t; |
1526 | struct itimerspec it; | 1528 | struct itimerspec it; |
1527 | int error; | 1529 | int error; |
1528 | 1530 | ||
1529 | rmtp = (struct timespec __user *) restart_block->arg1; | 1531 | t = ns_to_timespec(restart_block->nanosleep.expires); |
1530 | t.tv_sec = restart_block->arg2; | ||
1531 | t.tv_nsec = restart_block->arg3; | ||
1532 | 1532 | ||
1533 | restart_block->fn = do_no_restart_syscall; | ||
1534 | error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); | 1533 | error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); |
1535 | 1534 | ||
1536 | if (error == -ERESTART_RESTARTBLOCK) { | 1535 | if (error == -ERESTART_RESTARTBLOCK) { |
1536 | struct timespec __user *rmtp = restart_block->nanosleep.rmtp; | ||
1537 | /* | 1537 | /* |
1538 | * Report back to the user the time still remaining. | 1538 | * Report back to the user the time still remaining. |
1539 | */ | 1539 | */ |
1540 | if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | 1540 | if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) |
1541 | return -EFAULT; | 1541 | return -EFAULT; |
1542 | 1542 | ||
1543 | restart_block->fn = posix_cpu_nsleep_restart; | 1543 | restart_block->nanosleep.expires = timespec_to_ns(&t); |
1544 | restart_block->arg0 = which_clock; | ||
1545 | restart_block->arg1 = (unsigned long) rmtp; | ||
1546 | restart_block->arg2 = t.tv_sec; | ||
1547 | restart_block->arg3 = t.tv_nsec; | ||
1548 | } | 1544 | } |
1549 | return error; | 1545 | return error; |
1550 | 1546 | ||
1551 | } | 1547 | } |
1552 | 1548 | ||
1553 | |||
1554 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) | 1549 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) |
1555 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) | 1550 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) |
1556 | 1551 | ||
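The restart bookkeeping no longer abuses the generic arg0..arg3 slots: the clock id, the user rmtp pointer and the absolute deadline (as nanoseconds) go into the typed restart_block->nanosleep fields. A hedged restatement of the store/reload pairing:

    /* in posix_cpu_nsleep(), when the sleep must be restarted: */
    restart_block->nanosleep.clockid = which_clock;
    restart_block->nanosleep.rmtp    = rmtp;
    restart_block->nanosleep.expires = timespec_to_ns(rqtp);               /* store as ns */

    /* in posix_cpu_nsleep_restart(): */
    struct timespec t = ns_to_timespec(restart_block->nanosleep.expires);  /* reload */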
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer) | |||
1594 | timer->it_clock = THREAD_CLOCK; | 1589 | timer->it_clock = THREAD_CLOCK; |
1595 | return posix_cpu_timer_create(timer); | 1590 | return posix_cpu_timer_create(timer); |
1596 | } | 1591 | } |
1597 | static int thread_cpu_nsleep(const clockid_t which_clock, int flags, | 1592 | |
1598 | struct timespec *rqtp, struct timespec __user *rmtp) | 1593 | struct k_clock clock_posix_cpu = { |
1599 | { | 1594 | .clock_getres = posix_cpu_clock_getres, |
1600 | return -EINVAL; | 1595 | .clock_set = posix_cpu_clock_set, |
1601 | } | 1596 | .clock_get = posix_cpu_clock_get, |
1602 | static long thread_cpu_nsleep_restart(struct restart_block *restart_block) | 1597 | .timer_create = posix_cpu_timer_create, |
1603 | { | 1598 | .nsleep = posix_cpu_nsleep, |
1604 | return -EINVAL; | 1599 | .nsleep_restart = posix_cpu_nsleep_restart, |
1605 | } | 1600 | .timer_set = posix_cpu_timer_set, |
1601 | .timer_del = posix_cpu_timer_del, | ||
1602 | .timer_get = posix_cpu_timer_get, | ||
1603 | }; | ||
1606 | 1604 | ||
1607 | static __init int init_posix_cpu_timers(void) | 1605 | static __init int init_posix_cpu_timers(void) |
1608 | { | 1606 | { |
1609 | struct k_clock process = { | 1607 | struct k_clock process = { |
1610 | .clock_getres = process_cpu_clock_getres, | 1608 | .clock_getres = process_cpu_clock_getres, |
1611 | .clock_get = process_cpu_clock_get, | 1609 | .clock_get = process_cpu_clock_get, |
1612 | .clock_set = do_posix_clock_nosettime, | 1610 | .timer_create = process_cpu_timer_create, |
1613 | .timer_create = process_cpu_timer_create, | 1611 | .nsleep = process_cpu_nsleep, |
1614 | .nsleep = process_cpu_nsleep, | 1612 | .nsleep_restart = process_cpu_nsleep_restart, |
1615 | .nsleep_restart = process_cpu_nsleep_restart, | ||
1616 | }; | 1613 | }; |
1617 | struct k_clock thread = { | 1614 | struct k_clock thread = { |
1618 | .clock_getres = thread_cpu_clock_getres, | 1615 | .clock_getres = thread_cpu_clock_getres, |
1619 | .clock_get = thread_cpu_clock_get, | 1616 | .clock_get = thread_cpu_clock_get, |
1620 | .clock_set = do_posix_clock_nosettime, | 1617 | .timer_create = thread_cpu_timer_create, |
1621 | .timer_create = thread_cpu_timer_create, | ||
1622 | .nsleep = thread_cpu_nsleep, | ||
1623 | .nsleep_restart = thread_cpu_nsleep_restart, | ||
1624 | }; | 1618 | }; |
1625 | struct timespec ts; | 1619 | struct timespec ts; |
1626 | 1620 | ||
1627 | register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); | 1621 | posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); |
1628 | register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); | 1622 | posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); |
1629 | 1623 | ||
1630 | cputime_to_timespec(cputime_one_jiffy, &ts); | 1624 | cputime_to_timespec(cputime_one_jiffy, &ts); |
1631 | onecputick = ts.tv_nsec; | 1625 | onecputick = ts.tv_nsec; |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9ca4973f736d..4556182527f3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/init.h> | 41 | #include <linux/init.h> |
42 | #include <linux/compiler.h> | 42 | #include <linux/compiler.h> |
43 | #include <linux/idr.h> | 43 | #include <linux/idr.h> |
44 | #include <linux/posix-clock.h> | ||
44 | #include <linux/posix-timers.h> | 45 | #include <linux/posix-timers.h> |
45 | #include <linux/syscalls.h> | 46 | #include <linux/syscalls.h> |
46 | #include <linux/wait.h> | 47 | #include <linux/wait.h> |
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock); | |||
81 | #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" | 82 | #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" |
82 | #endif | 83 | #endif |
83 | 84 | ||
85 | /* | ||
86 | * parisc wants ENOTSUP instead of EOPNOTSUPP | ||
87 | */ | ||
88 | #ifndef ENOTSUP | ||
89 | # define ENANOSLEEP_NOTSUP EOPNOTSUPP | ||
90 | #else | ||
91 | # define ENANOSLEEP_NOTSUP ENOTSUP | ||
92 | #endif | ||
84 | 93 | ||
85 | /* | 94 | /* |
86 | * The timer ID is turned into a timer address by idr_find(). | 95 | * The timer ID is turned into a timer address by idr_find(). |
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock); | |||
94 | /* | 103 | /* |
95 | * CLOCKs: The POSIX standard calls for a couple of clocks and allows us | 104 | * CLOCKs: The POSIX standard calls for a couple of clocks and allows us |
96 | * to implement others. This structure defines the various | 105 | * to implement others. This structure defines the various |
97 | * clocks and allows the possibility of adding others. We | 106 | * clocks. |
98 | * provide an interface to add clocks to the table and expect | ||
99 | * the "arch" code to add at least one clock that is high | ||
100 | * resolution. Here we define the standard CLOCK_REALTIME as a | ||
101 | * 1/HZ resolution clock. | ||
102 | * | 107 | * |
103 | * RESOLUTION: Clock resolution is used to round up timer and interval | 108 | * RESOLUTION: Clock resolution is used to round up timer and interval |
104 | * times, NOT to report clock times, which are reported with as | 109 | * times, NOT to report clock times, which are reported with as |
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock); | |||
108 | * necessary code is written. The standard says we should say | 113 | * necessary code is written. The standard says we should say |
109 | * something about this issue in the documentation... | 114 | * something about this issue in the documentation... |
110 | * | 115 | * |
111 | * FUNCTIONS: The CLOCKs structure defines possible functions to handle | 116 | * FUNCTIONS: The CLOCKs structure defines possible functions to |
112 | * various clock functions. For clocks that use the standard | 117 | * handle various clock functions. |
113 | * system timer code these entries should be NULL. This will | ||
114 | * allow dispatch without the overhead of indirect function | ||
115 | * calls. CLOCKS that depend on other sources (e.g. WWV or GPS) | ||
116 | * must supply functions here, even if the function just returns | ||
117 | * ENOSYS. The standard POSIX timer management code assumes the | ||
118 | * following: 1.) The k_itimer struct (sched.h) is used for the | ||
119 | * timer. 2.) The list, it_lock, it_clock, it_id and it_pid | ||
120 | * fields are not modified by timer code. | ||
121 | * | 118 | * |
122 | * At this time all functions EXCEPT clock_nanosleep can be | 119 | * The standard POSIX timer management code assumes the |
123 | * redirected by the CLOCKS structure. Clock_nanosleep is in | 120 | * following: 1.) The k_itimer struct (sched.h) is used for |
124 | * there, but the code ignores it. | 121 | * the timer. 2.) The list, it_lock, it_clock, it_id and |
122 | * it_pid fields are not modified by timer code. | ||
125 | * | 123 | * |
126 | * Permissions: It is assumed that the clock_settime() function defined | 124 | * Permissions: It is assumed that the clock_settime() function defined |
127 | * for each clock will take care of permission checks. Some | 125 | * for each clock will take care of permission checks. Some |
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; | |||
138 | */ | 136 | */ |
139 | static int common_nsleep(const clockid_t, int flags, struct timespec *t, | 137 | static int common_nsleep(const clockid_t, int flags, struct timespec *t, |
140 | struct timespec __user *rmtp); | 138 | struct timespec __user *rmtp); |
139 | static int common_timer_create(struct k_itimer *new_timer); | ||
141 | static void common_timer_get(struct k_itimer *, struct itimerspec *); | 140 | static void common_timer_get(struct k_itimer *, struct itimerspec *); |
142 | static int common_timer_set(struct k_itimer *, int, | 141 | static int common_timer_set(struct k_itimer *, int, |
143 | struct itimerspec *, struct itimerspec *); | 142 | struct itimerspec *, struct itimerspec *); |
@@ -145,83 +144,37 @@ static int common_timer_del(struct k_itimer *timer); | |||
145 | 144 | ||
146 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); | 145 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); |
147 | 146 | ||
148 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); | 147 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); |
148 | |||
149 | #define lock_timer(tid, flags) \ | ||
150 | ({ struct k_itimer *__timr; \ | ||
151 | __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ | ||
152 | __timr; \ | ||
153 | }) | ||
149 | 154 | ||
150 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | 155 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) |
151 | { | 156 | { |
152 | spin_unlock_irqrestore(&timr->it_lock, flags); | 157 | spin_unlock_irqrestore(&timr->it_lock, flags); |
153 | } | 158 | } |
154 | 159 | ||
155 | /* | 160 | /* Get clock_realtime */ |
156 | * Call the k_clock hook function if non-null, or the default function. | 161 | static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) |
157 | */ | ||
158 | #define CLOCK_DISPATCH(clock, call, arglist) \ | ||
159 | ((clock) < 0 ? posix_cpu_##call arglist : \ | ||
160 | (posix_clocks[clock].call != NULL \ | ||
161 | ? (*posix_clocks[clock].call) arglist : common_##call arglist)) | ||
162 | |||
163 | /* | ||
164 | * Default clock hook functions when the struct k_clock passed | ||
165 | * to register_posix_clock leaves a function pointer null. | ||
166 | * | ||
167 | * The function common_CALL is the default implementation for | ||
168 | * the function pointer CALL in struct k_clock. | ||
169 | */ | ||
170 | |||
171 | static inline int common_clock_getres(const clockid_t which_clock, | ||
172 | struct timespec *tp) | ||
173 | { | ||
174 | tp->tv_sec = 0; | ||
175 | tp->tv_nsec = posix_clocks[which_clock].res; | ||
176 | return 0; | ||
177 | } | ||
178 | |||
179 | /* | ||
180 | * Get real time for posix timers | ||
181 | */ | ||
182 | static int common_clock_get(clockid_t which_clock, struct timespec *tp) | ||
183 | { | 162 | { |
184 | ktime_get_real_ts(tp); | 163 | ktime_get_real_ts(tp); |
185 | return 0; | 164 | return 0; |
186 | } | 165 | } |
187 | 166 | ||
188 | static inline int common_clock_set(const clockid_t which_clock, | 167 | /* Set clock_realtime */ |
189 | struct timespec *tp) | 168 | static int posix_clock_realtime_set(const clockid_t which_clock, |
169 | const struct timespec *tp) | ||
190 | { | 170 | { |
191 | return do_sys_settimeofday(tp, NULL); | 171 | return do_sys_settimeofday(tp, NULL); |
192 | } | 172 | } |
193 | 173 | ||
194 | static int common_timer_create(struct k_itimer *new_timer) | 174 | static int posix_clock_realtime_adj(const clockid_t which_clock, |
195 | { | 175 | struct timex *t) |
196 | hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); | ||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | static int no_timer_create(struct k_itimer *new_timer) | ||
201 | { | ||
202 | return -EOPNOTSUPP; | ||
203 | } | ||
204 | |||
205 | static int no_nsleep(const clockid_t which_clock, int flags, | ||
206 | struct timespec *tsave, struct timespec __user *rmtp) | ||
207 | { | 176 | { |
208 | return -EOPNOTSUPP; | 177 | return do_adjtimex(t); |
209 | } | ||
210 | |||
211 | /* | ||
212 | * Return nonzero if we know a priori this clockid_t value is bogus. | ||
213 | */ | ||
214 | static inline int invalid_clockid(const clockid_t which_clock) | ||
215 | { | ||
216 | if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ | ||
217 | return 0; | ||
218 | if ((unsigned) which_clock >= MAX_CLOCKS) | ||
219 | return 1; | ||
220 | if (posix_clocks[which_clock].clock_getres != NULL) | ||
221 | return 0; | ||
222 | if (posix_clocks[which_clock].res != 0) | ||
223 | return 0; | ||
224 | return 1; | ||
225 | } | 178 | } |
226 | 179 | ||
227 | /* | 180 | /* |
@@ -234,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) | |||
234 | } | 187 | } |
235 | 188 | ||
236 | /* | 189 | /* |
237 | * Get monotonic time for posix timers | 190 | * Get monotonic-raw time for posix timers |
238 | */ | 191 | */ |
239 | static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) | 192 | static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) |
240 | { | 193 | { |
@@ -261,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp | |||
261 | *tp = ktime_to_timespec(KTIME_LOW_RES); | 214 | *tp = ktime_to_timespec(KTIME_LOW_RES); |
262 | return 0; | 215 | return 0; |
263 | } | 216 | } |
217 | |||
218 | static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) | ||
219 | { | ||
220 | get_monotonic_boottime(tp); | ||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | |||
264 | /* | 225 | /* |
265 | * Initialize everything, well, just everything in Posix clocks/timers ;) | 226 | * Initialize everything, well, just everything in Posix clocks/timers ;) |
266 | */ | 227 | */ |
267 | static __init int init_posix_timers(void) | 228 | static __init int init_posix_timers(void) |
268 | { | 229 | { |
269 | struct k_clock clock_realtime = { | 230 | struct k_clock clock_realtime = { |
270 | .clock_getres = hrtimer_get_res, | 231 | .clock_getres = hrtimer_get_res, |
232 | .clock_get = posix_clock_realtime_get, | ||
233 | .clock_set = posix_clock_realtime_set, | ||
234 | .clock_adj = posix_clock_realtime_adj, | ||
235 | .nsleep = common_nsleep, | ||
236 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
237 | .timer_create = common_timer_create, | ||
238 | .timer_set = common_timer_set, | ||
239 | .timer_get = common_timer_get, | ||
240 | .timer_del = common_timer_del, | ||
271 | }; | 241 | }; |
272 | struct k_clock clock_monotonic = { | 242 | struct k_clock clock_monotonic = { |
273 | .clock_getres = hrtimer_get_res, | 243 | .clock_getres = hrtimer_get_res, |
274 | .clock_get = posix_ktime_get_ts, | 244 | .clock_get = posix_ktime_get_ts, |
275 | .clock_set = do_posix_clock_nosettime, | 245 | .nsleep = common_nsleep, |
246 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
247 | .timer_create = common_timer_create, | ||
248 | .timer_set = common_timer_set, | ||
249 | .timer_get = common_timer_get, | ||
250 | .timer_del = common_timer_del, | ||
276 | }; | 251 | }; |
277 | struct k_clock clock_monotonic_raw = { | 252 | struct k_clock clock_monotonic_raw = { |
278 | .clock_getres = hrtimer_get_res, | 253 | .clock_getres = hrtimer_get_res, |
279 | .clock_get = posix_get_monotonic_raw, | 254 | .clock_get = posix_get_monotonic_raw, |
280 | .clock_set = do_posix_clock_nosettime, | ||
281 | .timer_create = no_timer_create, | ||
282 | .nsleep = no_nsleep, | ||
283 | }; | 255 | }; |
284 | struct k_clock clock_realtime_coarse = { | 256 | struct k_clock clock_realtime_coarse = { |
285 | .clock_getres = posix_get_coarse_res, | 257 | .clock_getres = posix_get_coarse_res, |
286 | .clock_get = posix_get_realtime_coarse, | 258 | .clock_get = posix_get_realtime_coarse, |
287 | .clock_set = do_posix_clock_nosettime, | ||
288 | .timer_create = no_timer_create, | ||
289 | .nsleep = no_nsleep, | ||
290 | }; | 259 | }; |
291 | struct k_clock clock_monotonic_coarse = { | 260 | struct k_clock clock_monotonic_coarse = { |
292 | .clock_getres = posix_get_coarse_res, | 261 | .clock_getres = posix_get_coarse_res, |
293 | .clock_get = posix_get_monotonic_coarse, | 262 | .clock_get = posix_get_monotonic_coarse, |
294 | .clock_set = do_posix_clock_nosettime, | 263 | }; |
295 | .timer_create = no_timer_create, | 264 | struct k_clock clock_boottime = { |
296 | .nsleep = no_nsleep, | 265 | .clock_getres = hrtimer_get_res, |
266 | .clock_get = posix_get_boottime, | ||
267 | .nsleep = common_nsleep, | ||
268 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
269 | .timer_create = common_timer_create, | ||
270 | .timer_set = common_timer_set, | ||
271 | .timer_get = common_timer_get, | ||
272 | .timer_del = common_timer_del, | ||
297 | }; | 273 | }; |
298 | 274 | ||
299 | register_posix_clock(CLOCK_REALTIME, &clock_realtime); | 275 | posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); |
300 | register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); | 276 | posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); |
301 | register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); | 277 | posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); |
302 | register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); | 278 | posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); |
303 | register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); | 279 | posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); |
280 | posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); | ||
304 | 281 | ||
305 | posix_timers_cache = kmem_cache_create("posix_timers_cache", | 282 | posix_timers_cache = kmem_cache_create("posix_timers_cache", |
306 | sizeof (struct k_itimer), 0, SLAB_PANIC, | 283 | sizeof (struct k_itimer), 0, SLAB_PANIC, |
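With clock_boottime registered, user space gains a monotonic clock that keeps advancing across suspend. A hedged user-space check (the constant is defined locally in case the libc headers of this era do not carry it yet; link with -lrt where clock_gettime lives there):

    #include <stdio.h>
    #include <time.h>

    #ifndef CLOCK_BOOTTIME
    #define CLOCK_BOOTTIME 7                 /* value used by the kernel headers */
    #endif

    int main(void)
    {
            struct timespec ts;

            if (clock_gettime(CLOCK_BOOTTIME, &ts) != 0)
                    return 1;
            printf("since boot (incl. suspend): %ld.%09ld s\n",
                   (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }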
@@ -336,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr) | |||
336 | * restarted (i.e. we have flagged this in the sys_private entry of the | 313 | * restarted (i.e. we have flagged this in the sys_private entry of the |
337 | * info block). | 314 | * info block). |
338 | * | 315 | * |
339 | * To protect aginst the timer going away while the interrupt is queued, | 316 | * To protect against the timer going away while the interrupt is queued, |
340 | * we require that the it_requeue_pending flag be set. | 317 | * we require that the it_requeue_pending flag be set. |
341 | */ | 318 | */ |
342 | void do_schedule_next_timer(struct siginfo *info) | 319 | void do_schedule_next_timer(struct siginfo *info) |
@@ -476,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event) | |||
476 | return task_pid(rtn); | 453 | return task_pid(rtn); |
477 | } | 454 | } |
478 | 455 | ||
479 | void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) | 456 | void posix_timers_register_clock(const clockid_t clock_id, |
457 | struct k_clock *new_clock) | ||
480 | { | 458 | { |
481 | if ((unsigned) clock_id >= MAX_CLOCKS) { | 459 | if ((unsigned) clock_id >= MAX_CLOCKS) { |
482 | printk("POSIX clock register failed for clock_id %d\n", | 460 | printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", |
461 | clock_id); | ||
462 | return; | ||
463 | } | ||
464 | |||
465 | if (!new_clock->clock_get) { | ||
466 | printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", | ||
467 | clock_id); | ||
468 | return; | ||
469 | } | ||
470 | if (!new_clock->clock_getres) { | ||
471 | printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", | ||
483 | clock_id); | 472 | clock_id); |
484 | return; | 473 | return; |
485 | } | 474 | } |
486 | 475 | ||
487 | posix_clocks[clock_id] = *new_clock; | 476 | posix_clocks[clock_id] = *new_clock; |
488 | } | 477 | } |
489 | EXPORT_SYMBOL_GPL(register_posix_clock); | 478 | EXPORT_SYMBOL_GPL(posix_timers_register_clock); |
490 | 479 | ||
491 | static struct k_itimer * alloc_posix_timer(void) | 480 | static struct k_itimer * alloc_posix_timer(void) |
492 | { | 481 | { |
@@ -502,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void) | |||
502 | return tmr; | 491 | return tmr; |
503 | } | 492 | } |
504 | 493 | ||
494 | static void k_itimer_rcu_free(struct rcu_head *head) | ||
495 | { | ||
496 | struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); | ||
497 | |||
498 | kmem_cache_free(posix_timers_cache, tmr); | ||
499 | } | ||
500 | |||
505 | #define IT_ID_SET 1 | 501 | #define IT_ID_SET 1 |
506 | #define IT_ID_NOT_SET 0 | 502 | #define IT_ID_NOT_SET 0 |
507 | static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | 503 | static void release_posix_timer(struct k_itimer *tmr, int it_id_set) |
@@ -514,7 +510,24 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | |||
514 | } | 510 | } |
515 | put_pid(tmr->it_pid); | 511 | put_pid(tmr->it_pid); |
516 | sigqueue_free(tmr->sigq); | 512 | sigqueue_free(tmr->sigq); |
517 | kmem_cache_free(posix_timers_cache, tmr); | 513 | call_rcu(&tmr->it.rcu, k_itimer_rcu_free); |
514 | } | ||
515 | |||
516 | static struct k_clock *clockid_to_kclock(const clockid_t id) | ||
517 | { | ||
518 | if (id < 0) | ||
519 | return (id & CLOCKFD_MASK) == CLOCKFD ? | ||
520 | &clock_posix_dynamic : &clock_posix_cpu; | ||
521 | |||
522 | if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) | ||
523 | return NULL; | ||
524 | return &posix_clocks[id]; | ||
525 | } | ||
526 | |||
527 | static int common_timer_create(struct k_itimer *new_timer) | ||
528 | { | ||
529 | hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); | ||
530 | return 0; | ||
518 | } | 531 | } |
519 | 532 | ||
520 | /* Create a POSIX.1b interval timer. */ | 533 | /* Create a POSIX.1b interval timer. */ |
@@ -523,13 +536,16 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
523 | struct sigevent __user *, timer_event_spec, | 536 | struct sigevent __user *, timer_event_spec, |
524 | timer_t __user *, created_timer_id) | 537 | timer_t __user *, created_timer_id) |
525 | { | 538 | { |
539 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
526 | struct k_itimer *new_timer; | 540 | struct k_itimer *new_timer; |
527 | int error, new_timer_id; | 541 | int error, new_timer_id; |
528 | sigevent_t event; | 542 | sigevent_t event; |
529 | int it_id_set = IT_ID_NOT_SET; | 543 | int it_id_set = IT_ID_NOT_SET; |
530 | 544 | ||
531 | if (invalid_clockid(which_clock)) | 545 | if (!kc) |
532 | return -EINVAL; | 546 | return -EINVAL; |
547 | if (!kc->timer_create) | ||
548 | return -EOPNOTSUPP; | ||
533 | 549 | ||
534 | new_timer = alloc_posix_timer(); | 550 | new_timer = alloc_posix_timer(); |
535 | if (unlikely(!new_timer)) | 551 | if (unlikely(!new_timer)) |
@@ -591,7 +607,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
591 | goto out; | 607 | goto out; |
592 | } | 608 | } |
593 | 609 | ||
594 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); | 610 | error = kc->timer_create(new_timer); |
595 | if (error) | 611 | if (error) |
596 | goto out; | 612 | goto out; |
597 | 613 | ||
@@ -601,7 +617,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
601 | spin_unlock_irq(¤t->sighand->siglock); | 617 | spin_unlock_irq(¤t->sighand->siglock); |
602 | 618 | ||
603 | return 0; | 619 | return 0; |
604 | /* | 620 | /* |
605 | * In the case of the timer belonging to another task, after | 621 | * In the case of the timer belonging to another task, after |
606 | * the task is unlocked, the timer is owned by the other task | 622 | * the task is unlocked, the timer is owned by the other task |
607 | * and may cease to exist at any time. Don't use or modify | 623 | * and may cease to exist at any time. Don't use or modify |
@@ -619,25 +635,21 @@ out: | |||
619 | * the find to the timer lock. To avoid a dead lock, the timer id MUST | 635 | * the find to the timer lock. To avoid a dead lock, the timer id MUST |
620 | * be release with out holding the timer lock. | 636 | * be release with out holding the timer lock. |
621 | */ | 637 | */ |
622 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) | 638 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) |
623 | { | 639 | { |
624 | struct k_itimer *timr; | 640 | struct k_itimer *timr; |
625 | /* | 641 | |
626 | * Watch out here. We do a irqsave on the idr_lock and pass the | 642 | rcu_read_lock(); |
627 | * flags part over to the timer lock. Must not let interrupts in | ||
628 | * while we are moving the lock. | ||
629 | */ | ||
630 | spin_lock_irqsave(&idr_lock, *flags); | ||
631 | timr = idr_find(&posix_timers_id, (int)timer_id); | 643 | timr = idr_find(&posix_timers_id, (int)timer_id); |
632 | if (timr) { | 644 | if (timr) { |
633 | spin_lock(&timr->it_lock); | 645 | spin_lock_irqsave(&timr->it_lock, *flags); |
634 | if (timr->it_signal == current->signal) { | 646 | if (timr->it_signal == current->signal) { |
635 | spin_unlock(&idr_lock); | 647 | rcu_read_unlock(); |
636 | return timr; | 648 | return timr; |
637 | } | 649 | } |
638 | spin_unlock(&timr->it_lock); | 650 | spin_unlock_irqrestore(&timr->it_lock, *flags); |
639 | } | 651 | } |
640 | spin_unlock_irqrestore(&idr_lock, *flags); | 652 | rcu_read_unlock(); |
641 | 653 | ||
642 | return NULL; | 654 | return NULL; |
643 | } | 655 | } |
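The idr lookup can drop idr_lock because timer freeing now goes through an RCU grace period (k_itimer_rcu_free() above), so a timer found under rcu_read_lock() cannot disappear before its own lock is taken. A hedged, simplified pairing of the two sides (the it_signal ownership check is elided):

    /* free side, release_posix_timer(): */
    call_rcu(&tmr->it.rcu, k_itimer_rcu_free);          /* defer kmem_cache_free() */

    /* lookup side, __lock_timer(): */
    rcu_read_lock();
    timr = idr_find(&posix_timers_id, (int)timer_id);
    if (timr)
            spin_lock_irqsave(&timr->it_lock, *flags);  /* pin it before leaving RCU */
    rcu_read_unlock();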
@@ -703,22 +715,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) | |||
703 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | 715 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, |
704 | struct itimerspec __user *, setting) | 716 | struct itimerspec __user *, setting) |
705 | { | 717 | { |
706 | struct k_itimer *timr; | ||
707 | struct itimerspec cur_setting; | 718 | struct itimerspec cur_setting; |
719 | struct k_itimer *timr; | ||
720 | struct k_clock *kc; | ||
708 | unsigned long flags; | 721 | unsigned long flags; |
722 | int ret = 0; | ||
709 | 723 | ||
710 | timr = lock_timer(timer_id, &flags); | 724 | timr = lock_timer(timer_id, &flags); |
711 | if (!timr) | 725 | if (!timr) |
712 | return -EINVAL; | 726 | return -EINVAL; |
713 | 727 | ||
714 | CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); | 728 | kc = clockid_to_kclock(timr->it_clock); |
729 | if (WARN_ON_ONCE(!kc || !kc->timer_get)) | ||
730 | ret = -EINVAL; | ||
731 | else | ||
732 | kc->timer_get(timr, &cur_setting); | ||
715 | 733 | ||
716 | unlock_timer(timr, flags); | 734 | unlock_timer(timr, flags); |
717 | 735 | ||
718 | if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) | 736 | if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) |
719 | return -EFAULT; | 737 | return -EFAULT; |
720 | 738 | ||
721 | return 0; | 739 | return ret; |
722 | } | 740 | } |
723 | 741 | ||
724 | /* | 742 | /* |
@@ -807,6 +825,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, | |||
807 | int error = 0; | 825 | int error = 0; |
808 | unsigned long flag; | 826 | unsigned long flag; |
809 | struct itimerspec *rtn = old_setting ? &old_spec : NULL; | 827 | struct itimerspec *rtn = old_setting ? &old_spec : NULL; |
828 | struct k_clock *kc; | ||
810 | 829 | ||
811 | if (!new_setting) | 830 | if (!new_setting) |
812 | return -EINVAL; | 831 | return -EINVAL; |
@@ -822,8 +841,11 @@ retry: | |||
822 | if (!timr) | 841 | if (!timr) |
823 | return -EINVAL; | 842 | return -EINVAL; |
824 | 843 | ||
825 | error = CLOCK_DISPATCH(timr->it_clock, timer_set, | 844 | kc = clockid_to_kclock(timr->it_clock); |
826 | (timr, flags, &new_spec, rtn)); | 845 | if (WARN_ON_ONCE(!kc || !kc->timer_set)) |
846 | error = -EINVAL; | ||
847 | else | ||
848 | error = kc->timer_set(timr, flags, &new_spec, rtn); | ||
827 | 849 | ||
828 | unlock_timer(timr, flag); | 850 | unlock_timer(timr, flag); |
829 | if (error == TIMER_RETRY) { | 851 | if (error == TIMER_RETRY) { |
@@ -838,7 +860,7 @@ retry: | |||
838 | return error; | 860 | return error; |
839 | } | 861 | } |
840 | 862 | ||
841 | static inline int common_timer_del(struct k_itimer *timer) | 863 | static int common_timer_del(struct k_itimer *timer) |
842 | { | 864 | { |
843 | timer->it.real.interval.tv64 = 0; | 865 | timer->it.real.interval.tv64 = 0; |
844 | 866 | ||
@@ -849,7 +871,11 @@ static inline int common_timer_del(struct k_itimer *timer) | |||
849 | 871 | ||
850 | static inline int timer_delete_hook(struct k_itimer *timer) | 872 | static inline int timer_delete_hook(struct k_itimer *timer) |
851 | { | 873 | { |
852 | return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); | 874 | struct k_clock *kc = clockid_to_kclock(timer->it_clock); |
875 | |||
876 | if (WARN_ON_ONCE(!kc || !kc->timer_del)) | ||
877 | return -EINVAL; | ||
878 | return kc->timer_del(timer); | ||
853 | } | 879 | } |
854 | 880 | ||
855 | /* Delete a POSIX.1b interval timer. */ | 881 | /* Delete a POSIX.1b interval timer. */ |
@@ -921,69 +947,76 @@ void exit_itimers(struct signal_struct *sig) | |||
921 | } | 947 | } |
922 | } | 948 | } |
923 | 949 | ||
924 | /* Not available / possible... functions */ | ||
925 | int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) | ||
926 | { | ||
927 | return -EINVAL; | ||
928 | } | ||
929 | EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); | ||
930 | |||
931 | int do_posix_clock_nonanosleep(const clockid_t clock, int flags, | ||
932 | struct timespec *t, struct timespec __user *r) | ||
933 | { | ||
934 | #ifndef ENOTSUP | ||
935 | return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ | ||
936 | #else /* parisc does define it separately. */ | ||
937 | return -ENOTSUP; | ||
938 | #endif | ||
939 | } | ||
940 | EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); | ||
941 | |||
942 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | 950 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, |
943 | const struct timespec __user *, tp) | 951 | const struct timespec __user *, tp) |
944 | { | 952 | { |
953 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
945 | struct timespec new_tp; | 954 | struct timespec new_tp; |
946 | 955 | ||
947 | if (invalid_clockid(which_clock)) | 956 | if (!kc || !kc->clock_set) |
948 | return -EINVAL; | 957 | return -EINVAL; |
958 | |||
949 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) | 959 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) |
950 | return -EFAULT; | 960 | return -EFAULT; |
951 | 961 | ||
952 | return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); | 962 | return kc->clock_set(which_clock, &new_tp); |
953 | } | 963 | } |
954 | 964 | ||
955 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | 965 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, |
956 | struct timespec __user *,tp) | 966 | struct timespec __user *,tp) |
957 | { | 967 | { |
968 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
958 | struct timespec kernel_tp; | 969 | struct timespec kernel_tp; |
959 | int error; | 970 | int error; |
960 | 971 | ||
961 | if (invalid_clockid(which_clock)) | 972 | if (!kc) |
962 | return -EINVAL; | 973 | return -EINVAL; |
963 | error = CLOCK_DISPATCH(which_clock, clock_get, | 974 | |
964 | (which_clock, &kernel_tp)); | 975 | error = kc->clock_get(which_clock, &kernel_tp); |
976 | |||
965 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) | 977 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) |
966 | error = -EFAULT; | 978 | error = -EFAULT; |
967 | 979 | ||
968 | return error; | 980 | return error; |
981 | } | ||
982 | |||
983 | SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, | ||
984 | struct timex __user *, utx) | ||
985 | { | ||
986 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
987 | struct timex ktx; | ||
988 | int err; | ||
989 | |||
990 | if (!kc) | ||
991 | return -EINVAL; | ||
992 | if (!kc->clock_adj) | ||
993 | return -EOPNOTSUPP; | ||
969 | 994 | ||
995 | if (copy_from_user(&ktx, utx, sizeof(ktx))) | ||
996 | return -EFAULT; | ||
997 | |||
998 | err = kc->clock_adj(which_clock, &ktx); | ||
999 | |||
1000 | if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) | ||
1001 | return -EFAULT; | ||
1002 | |||
1003 | return err; | ||
970 | } | 1004 | } |
971 | 1005 | ||
972 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, | 1006 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, |
973 | struct timespec __user *, tp) | 1007 | struct timespec __user *, tp) |
974 | { | 1008 | { |
1009 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
975 | struct timespec rtn_tp; | 1010 | struct timespec rtn_tp; |
976 | int error; | 1011 | int error; |
977 | 1012 | ||
978 | if (invalid_clockid(which_clock)) | 1013 | if (!kc) |
979 | return -EINVAL; | 1014 | return -EINVAL; |
980 | 1015 | ||
981 | error = CLOCK_DISPATCH(which_clock, clock_getres, | 1016 | error = kc->clock_getres(which_clock, &rtn_tp); |
982 | (which_clock, &rtn_tp)); | ||
983 | 1017 | ||
984 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { | 1018 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) |
985 | error = -EFAULT; | 1019 | error = -EFAULT; |
986 | } | ||
987 | 1020 | ||
988 | return error; | 1021 | return error; |
989 | } | 1022 | } |
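As a usage illustration of the clock_adjtime syscall added above (an assumption about the userspace side, not part of this patch), the call can be driven through syscall(2) on kernels and headers that provide SYS_clock_adjtime; modes = 0 performs a read-only query of CLOCK_REALTIME:

#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/timex.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));
	tx.modes = 0;	/* no adjustment requested: just read the clock's state */
	/* SYS_clock_adjtime is assumed to be defined by the installed kernel headers */
	if (syscall(SYS_clock_adjtime, CLOCK_REALTIME, &tx) < 0) {
		perror("clock_adjtime");
		return 1;
	}
	printf("freq: %ld, status: 0x%x\n", tx.freq, tx.status);
	return 0;
}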
@@ -1003,10 +1036,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
1003 | const struct timespec __user *, rqtp, | 1036 | const struct timespec __user *, rqtp, |
1004 | struct timespec __user *, rmtp) | 1037 | struct timespec __user *, rmtp) |
1005 | { | 1038 | { |
1039 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
1006 | struct timespec t; | 1040 | struct timespec t; |
1007 | 1041 | ||
1008 | if (invalid_clockid(which_clock)) | 1042 | if (!kc) |
1009 | return -EINVAL; | 1043 | return -EINVAL; |
1044 | if (!kc->nsleep) | ||
1045 | return -ENANOSLEEP_NOTSUP; | ||
1010 | 1046 | ||
1011 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | 1047 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) |
1012 | return -EFAULT; | 1048 | return -EFAULT; |
@@ -1014,27 +1050,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
1014 | if (!timespec_valid(&t)) | 1050 | if (!timespec_valid(&t)) |
1015 | return -EINVAL; | 1051 | return -EINVAL; |
1016 | 1052 | ||
1017 | return CLOCK_DISPATCH(which_clock, nsleep, | 1053 | return kc->nsleep(which_clock, flags, &t, rmtp); |
1018 | (which_clock, flags, &t, rmtp)); | ||
1019 | } | ||
1020 | |||
1021 | /* | ||
1022 | * nanosleep_restart for monotonic and realtime clocks | ||
1023 | */ | ||
1024 | static int common_nsleep_restart(struct restart_block *restart_block) | ||
1025 | { | ||
1026 | return hrtimer_nanosleep_restart(restart_block); | ||
1027 | } | 1054 | } |
1028 | 1055 | ||
1029 | /* | 1056 | /* |
1030 | * This will restart clock_nanosleep. This is required only by | 1057 | * This will restart clock_nanosleep. This is required only by |
1031 | * compat_clock_nanosleep_restart for now. | 1058 | * compat_clock_nanosleep_restart for now. |
1032 | */ | 1059 | */ |
1033 | long | 1060 | long clock_nanosleep_restart(struct restart_block *restart_block) |
1034 | clock_nanosleep_restart(struct restart_block *restart_block) | ||
1035 | { | 1061 | { |
1036 | clockid_t which_clock = restart_block->arg0; | 1062 | clockid_t which_clock = restart_block->nanosleep.clockid; |
1063 | struct k_clock *kc = clockid_to_kclock(which_clock); | ||
1064 | |||
1065 | if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) | ||
1066 | return -EINVAL; | ||
1037 | 1067 | ||
1038 | return CLOCK_DISPATCH(which_clock, nsleep_restart, | 1068 | return kc->nsleep_restart(restart_block); |
1039 | (restart_block)); | ||
1040 | } | 1069 | } |
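A short userspace sketch (illustrative only, not part of this patch) of the nsleep path dispatched above; note that clock_nanosleep() reports errors via its return value rather than errno:

#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	struct timespec t;
	int err;

	clock_gettime(CLOCK_MONOTONIC, &t);
	t.tv_sec += 1;	/* absolute deadline: now + 1s */
	err = clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &t, NULL);
	if (err)
		fprintf(stderr, "clock_nanosleep: %s\n", strerror(err));
	return err ? 1 : 0;
}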
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca6066a6952e..87f4d24b55b0 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -1,128 +1,12 @@ | |||
1 | config PM | ||
2 | bool "Power Management support" | ||
3 | depends on !IA64_HP_SIM | ||
4 | ---help--- | ||
5 | "Power Management" means that parts of your computer are shut | ||
6 | off or put into a power conserving "sleep" mode if they are not | ||
7 | being used. There are two competing standards for doing this: APM | ||
8 | and ACPI. If you want to use either one, say Y here and then also | ||
9 | to the requisite support below. | ||
10 | |||
11 | Power Management is most important for battery powered laptop | ||
12 | computers; if you have a laptop, check out the Linux Laptop home | ||
13 | page on the WWW at <http://www.linux-on-laptops.com/> or | ||
14 | Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/> | ||
15 | and the Battery Powered Linux mini-HOWTO, available from | ||
16 | <http://www.tldp.org/docs.html#howto>. | ||
17 | |||
18 | Note that, even if you say N here, Linux on the x86 architecture | ||
19 | will issue the hlt instruction if nothing is to be done, thereby | ||
20 | sending the processor to sleep and saving power. | ||
21 | |||
22 | config PM_DEBUG | ||
23 | bool "Power Management Debug Support" | ||
24 | depends on PM | ||
25 | ---help--- | ||
26 | This option enables various debugging support in the Power Management | ||
27 | code. This is helpful when debugging and reporting PM bugs, like | ||
28 | suspend support. | ||
29 | |||
30 | config PM_ADVANCED_DEBUG | ||
31 | bool "Extra PM attributes in sysfs for low-level debugging/testing" | ||
32 | depends on PM_DEBUG | ||
33 | default n | ||
34 | ---help--- | ||
35 | Add extra sysfs attributes allowing one to access some Power Management | ||
36 | fields of device objects from user space. If you are not a kernel | ||
37 | developer interested in debugging/testing Power Management, say "no". | ||
38 | |||
39 | config PM_VERBOSE | ||
40 | bool "Verbose Power Management debugging" | ||
41 | depends on PM_DEBUG | ||
42 | default n | ||
43 | ---help--- | ||
44 | This option enables verbose messages from the Power Management code. | ||
45 | |||
46 | config CAN_PM_TRACE | ||
47 | def_bool y | ||
48 | depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL | ||
49 | |||
50 | config PM_TRACE | ||
51 | bool | ||
52 | help | ||
53 | This enables code to save the last PM event point across | ||
54 | reboot. The architecture needs to support this, x86 for | ||
55 | example does by saving things in the RTC, see below. | ||
56 | |||
57 | The architecture specific code must provide the extern | ||
58 | functions from <linux/resume-trace.h> as well as the | ||
59 | <asm/resume-trace.h> header with a TRACE_RESUME() macro. | ||
60 | |||
61 | The way the information is presented is architecture- | ||
62 | dependent, x86 will print the information during a | ||
63 | late_initcall. | ||
64 | |||
65 | config PM_TRACE_RTC | ||
66 | bool "Suspend/resume event tracing" | ||
67 | depends on CAN_PM_TRACE | ||
68 | depends on X86 | ||
69 | select PM_TRACE | ||
70 | default n | ||
71 | ---help--- | ||
72 | This enables some cheesy code to save the last PM event point in the | ||
73 | RTC across reboots, so that you can debug a machine that just hangs | ||
74 | during suspend (or more commonly, during resume). | ||
75 | |||
76 | To use this debugging feature you should attempt to suspend the | ||
77 | machine, reboot it and then run | ||
78 | |||
79 | dmesg -s 1000000 | grep 'hash matches' | ||
80 | |||
81 | CAUTION: this option will cause your machine's real-time clock to be | ||
82 | set to an invalid time after a resume. | ||
83 | |||
84 | config PM_SLEEP_SMP | ||
85 | bool | ||
86 | depends on SMP | ||
87 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE | ||
88 | depends on PM_SLEEP | ||
89 | select HOTPLUG_CPU | ||
90 | default y | ||
91 | |||
92 | config PM_SLEEP | ||
93 | bool | ||
94 | depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE | ||
95 | default y | ||
96 | |||
97 | config PM_SLEEP_ADVANCED_DEBUG | ||
98 | bool | ||
99 | depends on PM_ADVANCED_DEBUG | ||
100 | default n | ||
101 | |||
102 | config SUSPEND_NVS | ||
103 | bool | ||
104 | |||
105 | config SUSPEND | 1 | config SUSPEND |
106 | bool "Suspend to RAM and standby" | 2 | bool "Suspend to RAM and standby" |
107 | depends on PM && ARCH_SUSPEND_POSSIBLE | 3 | depends on ARCH_SUSPEND_POSSIBLE |
108 | select SUSPEND_NVS if HAS_IOMEM | ||
109 | default y | 4 | default y |
110 | ---help--- | 5 | ---help--- |
111 | Allow the system to enter sleep states in which main memory is | 6 | Allow the system to enter sleep states in which main memory is |
112 | powered and thus its contents are preserved, such as the | 7 | powered and thus its contents are preserved, such as the |
113 | suspend-to-RAM state (e.g. the ACPI S3 state). | 8 | suspend-to-RAM state (e.g. the ACPI S3 state). |
114 | 9 | ||
115 | config PM_TEST_SUSPEND | ||
116 | bool "Test suspend/resume and wakealarm during bootup" | ||
117 | depends on SUSPEND && PM_DEBUG && RTC_CLASS=y | ||
118 | ---help--- | ||
119 | This option will let you suspend your machine during bootup, and | ||
120 | make it wake up a few seconds later using an RTC wakeup alarm. | ||
121 | Enable this with a kernel parameter like "test_suspend=mem". | ||
122 | |||
123 | You probably want to have your system's RTC driver statically | ||
124 | linked, ensuring that it's available when this test runs. | ||
125 | |||
126 | config SUSPEND_FREEZER | 10 | config SUSPEND_FREEZER |
127 | bool "Enable freezer for suspend to RAM/standby" \ | 11 | bool "Enable freezer for suspend to RAM/standby" \ |
128 | if ARCH_WANTS_FREEZER_CONTROL || BROKEN | 12 | if ARCH_WANTS_FREEZER_CONTROL || BROKEN |
@@ -134,10 +18,15 @@ config SUSPEND_FREEZER | |||
134 | 18 | ||
135 | Turning OFF this setting is NOT recommended! If in doubt, say Y. | 19 | Turning OFF this setting is NOT recommended! If in doubt, say Y. |
136 | 20 | ||
21 | config HIBERNATE_CALLBACKS | ||
22 | bool | ||
23 | |||
137 | config HIBERNATION | 24 | config HIBERNATION |
138 | bool "Hibernation (aka 'suspend to disk')" | 25 | bool "Hibernation (aka 'suspend to disk')" |
139 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE | 26 | depends on SWAP && ARCH_HIBERNATION_POSSIBLE |
140 | select SUSPEND_NVS if HAS_IOMEM | 27 | select HIBERNATE_CALLBACKS |
28 | select LZO_COMPRESS | ||
29 | select LZO_DECOMPRESS | ||
141 | ---help--- | 30 | ---help--- |
142 | Enable the suspend to disk (STD) functionality, which is usually | 31 | Enable the suspend to disk (STD) functionality, which is usually |
143 | called "hibernation" in user interfaces. STD checkpoints the | 32 | called "hibernation" in user interfaces. STD checkpoints the |
@@ -198,6 +87,100 @@ config PM_STD_PARTITION | |||
198 | suspended image to. It will simply pick the first available swap | 87 | suspended image to. It will simply pick the first available swap |
199 | device. | 88 | device. |
200 | 89 | ||
90 | config PM_SLEEP | ||
91 | def_bool y | ||
92 | depends on SUSPEND || HIBERNATE_CALLBACKS | ||
93 | |||
94 | config PM_SLEEP_SMP | ||
95 | def_bool y | ||
96 | depends on SMP | ||
97 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE | ||
98 | depends on PM_SLEEP | ||
99 | select HOTPLUG | ||
100 | select HOTPLUG_CPU | ||
101 | |||
102 | config PM_RUNTIME | ||
103 | bool "Run-time PM core functionality" | ||
104 | depends on !IA64_HP_SIM | ||
105 | ---help--- | ||
106 | Enable functionality allowing I/O devices to be put into energy-saving | ||
107 | (low power) states at run time (or autosuspended) after a specified | ||
108 | period of inactivity and woken up in response to a hardware-generated | ||
109 | wake-up event or a driver's request. | ||
110 | |||
111 | Hardware support is generally required for this functionality to work | ||
112 | and the bus type drivers of the buses the devices are on are | ||
113 | responsible for the actual handling of the autosuspend requests and | ||
114 | wake-up events. | ||
115 | |||
116 | config PM | ||
117 | def_bool y | ||
118 | depends on PM_SLEEP || PM_RUNTIME | ||
119 | |||
120 | config PM_DEBUG | ||
121 | bool "Power Management Debug Support" | ||
122 | depends on PM | ||
123 | ---help--- | ||
124 | This option enables various debugging support in the Power Management | ||
125 | code. This is helpful when debugging and reporting PM bugs, like | ||
126 | suspend support. | ||
127 | |||
128 | config PM_ADVANCED_DEBUG | ||
129 | bool "Extra PM attributes in sysfs for low-level debugging/testing" | ||
130 | depends on PM_DEBUG | ||
131 | ---help--- | ||
132 | Add extra sysfs attributes allowing one to access some Power Management | ||
133 | fields of device objects from user space. If you are not a kernel | ||
134 | developer interested in debugging/testing Power Management, say "no". | ||
135 | |||
136 | config PM_TEST_SUSPEND | ||
137 | bool "Test suspend/resume and wakealarm during bootup" | ||
138 | depends on SUSPEND && PM_DEBUG && RTC_CLASS=y | ||
139 | ---help--- | ||
140 | This option will let you suspend your machine during bootup, and | ||
141 | make it wake up a few seconds later using an RTC wakeup alarm. | ||
142 | Enable this with a kernel parameter like "test_suspend=mem". | ||
143 | |||
144 | You probably want to have your system's RTC driver statically | ||
145 | linked, ensuring that it's available when this test runs. | ||
146 | |||
147 | config CAN_PM_TRACE | ||
148 | def_bool y | ||
149 | depends on PM_DEBUG && PM_SLEEP | ||
150 | |||
151 | config PM_TRACE | ||
152 | bool | ||
153 | help | ||
154 | This enables code to save the last PM event point across | ||
155 | reboot. The architecture needs to support this, x86 for | ||
156 | example does by saving things in the RTC, see below. | ||
157 | |||
158 | The architecture specific code must provide the extern | ||
159 | functions from <linux/resume-trace.h> as well as the | ||
160 | <asm/resume-trace.h> header with a TRACE_RESUME() macro. | ||
161 | |||
162 | The way the information is presented is architecture- | ||
163 | dependent, x86 will print the information during a | ||
164 | late_initcall. | ||
165 | |||
166 | config PM_TRACE_RTC | ||
167 | bool "Suspend/resume event tracing" | ||
168 | depends on CAN_PM_TRACE | ||
169 | depends on X86 | ||
170 | select PM_TRACE | ||
171 | ---help--- | ||
172 | This enables some cheesy code to save the last PM event point in the | ||
173 | RTC across reboots, so that you can debug a machine that just hangs | ||
174 | during suspend (or more commonly, during resume). | ||
175 | |||
176 | To use this debugging feature you should attempt to suspend the | ||
177 | machine, reboot it and then run | ||
178 | |||
179 | dmesg -s 1000000 | grep 'hash matches' | ||
180 | |||
181 | CAUTION: this option will cause your machine's real-time clock to be | ||
182 | set to an invalid time after a resume. | ||
183 | |||
201 | config APM_EMULATION | 184 | config APM_EMULATION |
202 | tristate "Advanced Power Management Emulation" | 185 | tristate "Advanced Power Management Emulation" |
203 | depends on PM && SYS_SUPPORTS_APM_EMULATION | 186 | depends on PM && SYS_SUPPORTS_APM_EMULATION |
@@ -224,21 +207,23 @@ config APM_EMULATION | |||
224 | anything, try disabling/enabling this option (or disabling/enabling | 207 | anything, try disabling/enabling this option (or disabling/enabling |
225 | APM in your BIOS). | 208 | APM in your BIOS). |
226 | 209 | ||
227 | config PM_RUNTIME | 210 | config ARCH_HAS_OPP |
228 | bool "Run-time PM core functionality" | 211 | bool |
229 | depends on PM | 212 | |
213 | config PM_OPP | ||
214 | bool "Operating Performance Point (OPP) Layer library" | ||
215 | depends on ARCH_HAS_OPP | ||
230 | ---help--- | 216 | ---help--- |
231 | Enable functionality allowing I/O devices to be put into energy-saving | 217 | SOCs have a standard set of tuples consisting of frequency and |
232 | (low power) states at run time (or autosuspended) after a specified | 218 | voltage pairs that the device will support per voltage domain. This |
233 | period of inactivity and woken up in response to a hardware-generated | 219 | is called Operating Performance Point or OPP. The actual definition |
234 | wake-up event or a driver's request. | 220 | of OPP varies over silicon within the same family of devices. |
235 | 221 | ||
236 | Hardware support is generally required for this functionality to work | 222 | OPP layer organizes the data internally using device pointers |
237 | and the bus type drivers of the buses the devices are on are | 223 | representing individual voltage domains and provides SOC |
238 | responsible for the actual handling of the autosuspend requests and | 224 | implementations with a ready-to-use framework to manage OPPs. |
239 | wake-up events. | 225 | For more information, read <file:Documentation/power/opp.txt> |
240 | 226 | ||
241 | config PM_OPS | 227 | config PM_RUNTIME_CLK |
242 | bool | 228 | def_bool y |
243 | depends on PM_SLEEP || PM_RUNTIME | 229 | depends on PM_RUNTIME && HAVE_CLK |
244 | default y | ||
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6b185d..c5ebc6a90643 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -1,7 +1,5 @@ | |||
1 | 1 | ||
2 | ifeq ($(CONFIG_PM_DEBUG),y) | 2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG |
3 | EXTRA_CFLAGS += -DDEBUG | ||
4 | endif | ||
5 | 3 | ||
6 | obj-$(CONFIG_PM) += main.o | 4 | obj-$(CONFIG_PM) += main.o |
7 | obj-$(CONFIG_PM_SLEEP) += console.o | 5 | obj-$(CONFIG_PM_SLEEP) += console.o |
@@ -10,6 +8,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o | |||
10 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 8 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
11 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ | 9 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ |
12 | block_io.o | 10 | block_io.o |
13 | obj-$(CONFIG_SUSPEND_NVS) += nvs.o | ||
14 | 11 | ||
15 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 12 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 83bbc7c02df9..d09dd10c5a5e 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c | |||
@@ -28,7 +28,7 @@ | |||
28 | static int submit(int rw, struct block_device *bdev, sector_t sector, | 28 | static int submit(int rw, struct block_device *bdev, sector_t sector, |
29 | struct page *page, struct bio **bio_chain) | 29 | struct page *page, struct bio **bio_chain) |
30 | { | 30 | { |
31 | const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; | 31 | const int bio_rw = rw | REQ_SYNC; |
32 | struct bio *bio; | 32 | struct bio *bio; |
33 | 33 | ||
34 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); | 34 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8dc31e02ae12..8f7b1db1ece1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -23,12 +23,13 @@ | |||
23 | #include <linux/cpu.h> | 23 | #include <linux/cpu.h> |
24 | #include <linux/freezer.h> | 24 | #include <linux/freezer.h> |
25 | #include <linux/gfp.h> | 25 | #include <linux/gfp.h> |
26 | #include <linux/syscore_ops.h> | ||
26 | #include <scsi/scsi_scan.h> | 27 | #include <scsi/scsi_scan.h> |
27 | #include <asm/suspend.h> | ||
28 | 28 | ||
29 | #include "power.h" | 29 | #include "power.h" |
30 | 30 | ||
31 | 31 | ||
32 | static int nocompress = 0; | ||
32 | static int noresume = 0; | 33 | static int noresume = 0; |
33 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 34 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
34 | dev_t swsusp_resume_device; | 35 | dev_t swsusp_resume_device; |
@@ -50,18 +51,17 @@ enum { | |||
50 | 51 | ||
51 | static int hibernation_mode = HIBERNATION_SHUTDOWN; | 52 | static int hibernation_mode = HIBERNATION_SHUTDOWN; |
52 | 53 | ||
53 | static struct platform_hibernation_ops *hibernation_ops; | 54 | static const struct platform_hibernation_ops *hibernation_ops; |
54 | 55 | ||
55 | /** | 56 | /** |
56 | * hibernation_set_ops - set the global hibernate operations | 57 | * hibernation_set_ops - Set the global hibernate operations. |
57 | * @ops: the hibernation operations to use in subsequent hibernation transitions | 58 | * @ops: Hibernation operations to use in subsequent hibernation transitions. |
58 | */ | 59 | */ |
59 | 60 | void hibernation_set_ops(const struct platform_hibernation_ops *ops) | |
60 | void hibernation_set_ops(struct platform_hibernation_ops *ops) | ||
61 | { | 61 | { |
62 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot | 62 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot |
63 | && ops->prepare && ops->finish && ops->enter && ops->pre_restore | 63 | && ops->prepare && ops->finish && ops->enter && ops->pre_restore |
64 | && ops->restore_cleanup)) { | 64 | && ops->restore_cleanup && ops->leave)) { |
65 | WARN_ON(1); | 65 | WARN_ON(1); |
66 | return; | 66 | return; |
67 | } | 67 | } |
@@ -113,10 +113,9 @@ static int hibernation_test(int level) { return 0; } | |||
113 | #endif /* !CONFIG_PM_DEBUG */ | 113 | #endif /* !CONFIG_PM_DEBUG */ |
114 | 114 | ||
115 | /** | 115 | /** |
116 | * platform_begin - tell the platform driver that we're starting | 116 | * platform_begin - Call platform to start hibernation. |
117 | * hibernation | 117 | * @platform_mode: Whether or not to use the platform driver. |
118 | */ | 118 | */ |
119 | |||
120 | static int platform_begin(int platform_mode) | 119 | static int platform_begin(int platform_mode) |
121 | { | 120 | { |
122 | return (platform_mode && hibernation_ops) ? | 121 | return (platform_mode && hibernation_ops) ? |
@@ -124,10 +123,9 @@ static int platform_begin(int platform_mode) | |||
124 | } | 123 | } |
125 | 124 | ||
126 | /** | 125 | /** |
127 | * platform_end - tell the platform driver that we've entered the | 126 | * platform_end - Call platform to finish transition to the working state. |
128 | * working state | 127 | * @platform_mode: Whether or not to use the platform driver. |
129 | */ | 128 | */ |
130 | |||
131 | static void platform_end(int platform_mode) | 129 | static void platform_end(int platform_mode) |
132 | { | 130 | { |
133 | if (platform_mode && hibernation_ops) | 131 | if (platform_mode && hibernation_ops) |
@@ -135,8 +133,11 @@ static void platform_end(int platform_mode) | |||
135 | } | 133 | } |
136 | 134 | ||
137 | /** | 135 | /** |
138 | * platform_pre_snapshot - prepare the machine for hibernation using the | 136 | * platform_pre_snapshot - Call platform to prepare the machine for hibernation. |
139 | * platform driver if so configured and return an error code if it fails | 137 | * @platform_mode: Whether or not to use the platform driver. |
138 | * | ||
139 | * Use the platform driver to prepare the system for creating a hibernate image, | ||
140 | * if so configured, and return an error code if that fails. | ||
140 | */ | 141 | */ |
141 | 142 | ||
142 | static int platform_pre_snapshot(int platform_mode) | 143 | static int platform_pre_snapshot(int platform_mode) |
@@ -146,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode) | |||
146 | } | 147 | } |
147 | 148 | ||
148 | /** | 149 | /** |
149 | * platform_leave - prepare the machine for switching to the normal mode | 150 | * platform_leave - Call platform to prepare a transition to the working state. |
150 | * of operation using the platform driver (called with interrupts disabled) | 151 | * @platform_mode: Whether or not to use the platform driver. |
152 | * | ||
153 | * Use the platform driver to prepare the machine for switching to the | ||
154 | * normal mode of operation. | ||
155 | * | ||
156 | * This routine is called on one CPU with interrupts disabled. | ||
151 | */ | 157 | */ |
152 | |||
153 | static void platform_leave(int platform_mode) | 158 | static void platform_leave(int platform_mode) |
154 | { | 159 | { |
155 | if (platform_mode && hibernation_ops) | 160 | if (platform_mode && hibernation_ops) |
@@ -157,10 +162,14 @@ static void platform_leave(int platform_mode) | |||
157 | } | 162 | } |
158 | 163 | ||
159 | /** | 164 | /** |
160 | * platform_finish - switch the machine to the normal mode of operation | 165 | * platform_finish - Call platform to switch the system to the working state. |
161 | * using the platform driver (must be called after platform_prepare()) | 166 | * @platform_mode: Whether or not to use the platform driver. |
167 | * | ||
168 | * Use the platform driver to switch the machine to the normal mode of | ||
169 | * operation. | ||
170 | * | ||
171 | * This routine must be called after platform_prepare(). | ||
162 | */ | 172 | */ |
163 | |||
164 | static void platform_finish(int platform_mode) | 173 | static void platform_finish(int platform_mode) |
165 | { | 174 | { |
166 | if (platform_mode && hibernation_ops) | 175 | if (platform_mode && hibernation_ops) |
@@ -168,11 +177,15 @@ static void platform_finish(int platform_mode) | |||
168 | } | 177 | } |
169 | 178 | ||
170 | /** | 179 | /** |
171 | * platform_pre_restore - prepare the platform for the restoration from a | 180 | * platform_pre_restore - Prepare for hibernate image restoration. |
172 | * hibernation image. If the restore fails after this function has been | 181 | * @platform_mode: Whether or not to use the platform driver. |
173 | * called, platform_restore_cleanup() must be called. | 182 | * |
183 | * Use the platform driver to prepare the system for resume from a hibernation | ||
184 | * image. | ||
185 | * | ||
186 | * If the restore fails after this function has been called, | ||
187 | * platform_restore_cleanup() must be called. | ||
174 | */ | 188 | */ |
175 | |||
176 | static int platform_pre_restore(int platform_mode) | 189 | static int platform_pre_restore(int platform_mode) |
177 | { | 190 | { |
178 | return (platform_mode && hibernation_ops) ? | 191 | return (platform_mode && hibernation_ops) ? |
@@ -180,12 +193,16 @@ static int platform_pre_restore(int platform_mode) | |||
180 | } | 193 | } |
181 | 194 | ||
182 | /** | 195 | /** |
183 | * platform_restore_cleanup - switch the platform to the normal mode of | 196 | * platform_restore_cleanup - Switch to the working state after failing restore. |
184 | * operation after a failing restore. If platform_pre_restore() has been | 197 | * @platform_mode: Whether or not to use the platform driver. |
185 | * called before the failing restore, this function must be called too, | 198 | * |
186 | * regardless of the result of platform_pre_restore(). | 199 | * Use the platform driver to switch the system to the normal mode of operation |
200 | * after a failing restore. | ||
201 | * | ||
202 | * If platform_pre_restore() has been called before the failing restore, this | ||
203 | * function must be called too, regardless of the result of | ||
204 | * platform_pre_restore(). | ||
187 | */ | 205 | */ |
188 | |||
189 | static void platform_restore_cleanup(int platform_mode) | 206 | static void platform_restore_cleanup(int platform_mode) |
190 | { | 207 | { |
191 | if (platform_mode && hibernation_ops) | 208 | if (platform_mode && hibernation_ops) |
@@ -193,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode) | |||
193 | } | 210 | } |
194 | 211 | ||
195 | /** | 212 | /** |
196 | * platform_recover - recover the platform from a failure to suspend | 213 | * platform_recover - Recover from a failure to suspend devices. |
197 | * devices. | 214 | * @platform_mode: Whether or not to use the platform driver. |
198 | */ | 215 | */ |
199 | |||
200 | static void platform_recover(int platform_mode) | 216 | static void platform_recover(int platform_mode) |
201 | { | 217 | { |
202 | if (platform_mode && hibernation_ops && hibernation_ops->recover) | 218 | if (platform_mode && hibernation_ops && hibernation_ops->recover) |
@@ -204,13 +220,12 @@ static void platform_recover(int platform_mode) | |||
204 | } | 220 | } |
205 | 221 | ||
206 | /** | 222 | /** |
207 | * swsusp_show_speed - print the time elapsed between two events. | 223 | * swsusp_show_speed - Print time elapsed between two events during hibernation. |
208 | * @start: Starting event. | 224 | * @start: Starting event. |
209 | * @stop: Final event. | 225 | * @stop: Final event. |
210 | * @nr_pages - number of pages processed between @start and @stop | 226 | * @nr_pages: Number of memory pages processed between @start and @stop. |
211 | * @msg - introductory message to print | 227 | * @msg: Additional diagnostic message to print. |
212 | */ | 228 | */ |
213 | |||
214 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, | 229 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, |
215 | unsigned nr_pages, char *msg) | 230 | unsigned nr_pages, char *msg) |
216 | { | 231 | { |
@@ -233,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, | |||
233 | } | 248 | } |
234 | 249 | ||
235 | /** | 250 | /** |
236 | * create_image - freeze devices that need to be frozen with interrupts | 251 | * create_image - Create a hibernation image. |
237 | * off, create the hibernation image and thaw those devices. Control | 252 | * @platform_mode: Whether or not to use the platform driver. |
238 | * reappears in this routine after a restore. | 253 | * |
254 | * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image | ||
255 | * and execute the drivers' .thaw_noirq() callbacks. | ||
256 | * | ||
257 | * Control reappears in this routine after the subsequent restore. | ||
239 | */ | 258 | */ |
240 | |||
241 | static int create_image(int platform_mode) | 259 | static int create_image(int platform_mode) |
242 | { | 260 | { |
243 | int error; | 261 | int error; |
244 | 262 | ||
245 | error = arch_prepare_suspend(); | ||
246 | if (error) | ||
247 | return error; | ||
248 | |||
249 | /* At this point, dpm_suspend_start() has been called, but *not* | ||
250 | * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. | ||
251 | * Otherwise, drivers for some devices (e.g. interrupt controllers) | ||
252 | * become desynchronized with the actual state of the hardware | ||
253 | * at resume time, and evil weirdness ensues. | ||
254 | */ | ||
255 | error = dpm_suspend_noirq(PMSG_FREEZE); | 263 | error = dpm_suspend_noirq(PMSG_FREEZE); |
256 | if (error) { | 264 | if (error) { |
257 | printk(KERN_ERR "PM: Some devices failed to power down, " | 265 | printk(KERN_ERR "PM: Some devices failed to power down, " |
@@ -270,14 +278,14 @@ static int create_image(int platform_mode) | |||
270 | 278 | ||
271 | local_irq_disable(); | 279 | local_irq_disable(); |
272 | 280 | ||
273 | error = sysdev_suspend(PMSG_FREEZE); | 281 | error = syscore_suspend(); |
274 | if (error) { | 282 | if (error) { |
275 | printk(KERN_ERR "PM: Some system devices failed to power down, " | 283 | printk(KERN_ERR "PM: Some system devices failed to power down, " |
276 | "aborting hibernation\n"); | 284 | "aborting hibernation\n"); |
277 | goto Enable_irqs; | 285 | goto Enable_irqs; |
278 | } | 286 | } |
279 | 287 | ||
280 | if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) | 288 | if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) |
281 | goto Power_up; | 289 | goto Power_up; |
282 | 290 | ||
283 | in_suspend = 1; | 291 | in_suspend = 1; |
@@ -294,10 +302,7 @@ static int create_image(int platform_mode) | |||
294 | } | 302 | } |
295 | 303 | ||
296 | Power_up: | 304 | Power_up: |
297 | sysdev_resume(); | 305 | syscore_resume(); |
298 | /* NOTE: dpm_resume_noirq() is just a resume() for devices | ||
299 | * that suspended with irqs off ... no overall powerup. | ||
300 | */ | ||
301 | 306 | ||
302 | Enable_irqs: | 307 | Enable_irqs: |
303 | local_irq_enable(); | 308 | local_irq_enable(); |
@@ -315,31 +320,32 @@ static int create_image(int platform_mode) | |||
315 | } | 320 | } |
316 | 321 | ||
317 | /** | 322 | /** |
318 | * hibernation_snapshot - quiesce devices and create the hibernation | 323 | * hibernation_snapshot - Quiesce devices and create a hibernation image. |
319 | * snapshot image. | 324 | * @platform_mode: If set, use platform driver to prepare for the transition. |
320 | * @platform_mode - if set, use the platform driver, if available, to | ||
321 | * prepare the platform firmware for the power transition. | ||
322 | * | 325 | * |
323 | * Must be called with pm_mutex held | 326 | * This routine must be called with pm_mutex held. |
324 | */ | 327 | */ |
325 | |||
326 | int hibernation_snapshot(int platform_mode) | 328 | int hibernation_snapshot(int platform_mode) |
327 | { | 329 | { |
330 | pm_message_t msg = PMSG_RECOVER; | ||
328 | int error; | 331 | int error; |
329 | gfp_t saved_mask; | ||
330 | 332 | ||
331 | error = platform_begin(platform_mode); | 333 | error = platform_begin(platform_mode); |
332 | if (error) | 334 | if (error) |
333 | goto Close; | 335 | goto Close; |
334 | 336 | ||
337 | error = dpm_prepare(PMSG_FREEZE); | ||
338 | if (error) | ||
339 | goto Complete_devices; | ||
340 | |||
335 | /* Preallocate image memory before shutting down devices. */ | 341 | /* Preallocate image memory before shutting down devices. */ |
336 | error = hibernate_preallocate_memory(); | 342 | error = hibernate_preallocate_memory(); |
337 | if (error) | 343 | if (error) |
338 | goto Close; | 344 | goto Complete_devices; |
339 | 345 | ||
340 | suspend_console(); | 346 | suspend_console(); |
341 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 347 | pm_restrict_gfp_mask(); |
342 | error = dpm_suspend_start(PMSG_FREEZE); | 348 | error = dpm_suspend(PMSG_FREEZE); |
343 | if (error) | 349 | if (error) |
344 | goto Recover_platform; | 350 | goto Recover_platform; |
345 | 351 | ||
@@ -347,17 +353,27 @@ int hibernation_snapshot(int platform_mode) | |||
347 | goto Recover_platform; | 353 | goto Recover_platform; |
348 | 354 | ||
349 | error = create_image(platform_mode); | 355 | error = create_image(platform_mode); |
350 | /* Control returns here after successful restore */ | 356 | /* |
357 | * Control returns here (1) after the image has been created or the | ||
358 | * image creation has failed and (2) after a successful restore. | ||
359 | */ | ||
351 | 360 | ||
352 | Resume_devices: | 361 | Resume_devices: |
353 | /* We may need to release the preallocated image pages here. */ | 362 | /* We may need to release the preallocated image pages here. */ |
354 | if (error || !in_suspend) | 363 | if (error || !in_suspend) |
355 | swsusp_free(); | 364 | swsusp_free(); |
356 | 365 | ||
357 | dpm_resume_end(in_suspend ? | 366 | msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; |
358 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 367 | dpm_resume(msg); |
359 | set_gfp_allowed_mask(saved_mask); | 368 | |
369 | if (error || !in_suspend) | ||
370 | pm_restore_gfp_mask(); | ||
371 | |||
360 | resume_console(); | 372 | resume_console(); |
373 | |||
374 | Complete_devices: | ||
375 | dpm_complete(msg); | ||
376 | |||
361 | Close: | 377 | Close: |
362 | platform_end(platform_mode); | 378 | platform_end(platform_mode); |
363 | return error; | 379 | return error; |
@@ -368,13 +384,14 @@ int hibernation_snapshot(int platform_mode) | |||
368 | } | 384 | } |
369 | 385 | ||
370 | /** | 386 | /** |
371 | * resume_target_kernel - prepare devices that need to be suspended with | 387 | * resume_target_kernel - Restore system state from a hibernation image. |
372 | * interrupts off, restore the contents of highmem that have not been | 388 | * @platform_mode: Whether or not to use the platform driver. |
373 | * restored yet from the image and run the low level code that will restore | 389 | * |
374 | * the remaining contents of memory and switch to the just restored target | 390 | * Execute device drivers' .freeze_noirq() callbacks, restore the contents of |
375 | * kernel. | 391 | * highmem that have not been restored yet from the image and run the low-level |
392 | * code that will restore the remaining contents of memory and switch to the | ||
393 | * just restored target kernel. | ||
376 | */ | 394 | */ |
377 | |||
378 | static int resume_target_kernel(bool platform_mode) | 395 | static int resume_target_kernel(bool platform_mode) |
379 | { | 396 | { |
380 | int error; | 397 | int error; |
@@ -396,34 +413,36 @@ static int resume_target_kernel(bool platform_mode) | |||
396 | 413 | ||
397 | local_irq_disable(); | 414 | local_irq_disable(); |
398 | 415 | ||
399 | error = sysdev_suspend(PMSG_QUIESCE); | 416 | error = syscore_suspend(); |
400 | if (error) | 417 | if (error) |
401 | goto Enable_irqs; | 418 | goto Enable_irqs; |
402 | 419 | ||
403 | /* We'll ignore saved state, but this gets preempt count (etc) right */ | ||
404 | save_processor_state(); | 420 | save_processor_state(); |
405 | error = restore_highmem(); | 421 | error = restore_highmem(); |
406 | if (!error) { | 422 | if (!error) { |
407 | error = swsusp_arch_resume(); | 423 | error = swsusp_arch_resume(); |
408 | /* | 424 | /* |
409 | * The code below is only ever reached in case of a failure. | 425 | * The code below is only ever reached in case of a failure. |
410 | * Otherwise execution continues at place where | 426 | * Otherwise, execution continues at the place where |
411 | * swsusp_arch_suspend() was called | 427 | * swsusp_arch_suspend() was called. |
412 | */ | 428 | */ |
413 | BUG_ON(!error); | 429 | BUG_ON(!error); |
414 | /* This call to restore_highmem() undos the previous one */ | 430 | /* |
431 | * This call to restore_highmem() reverts the changes made by | ||
432 | * the previous one. | ||
433 | */ | ||
415 | restore_highmem(); | 434 | restore_highmem(); |
416 | } | 435 | } |
417 | /* | 436 | /* |
418 | * The only reason why swsusp_arch_resume() can fail is memory being | 437 | * The only reason why swsusp_arch_resume() can fail is memory being |
419 | * very tight, so we have to free it as soon as we can to avoid | 438 | * very tight, so we have to free it as soon as we can to avoid |
420 | * subsequent failures | 439 | * subsequent failures. |
421 | */ | 440 | */ |
422 | swsusp_free(); | 441 | swsusp_free(); |
423 | restore_processor_state(); | 442 | restore_processor_state(); |
424 | touch_softlockup_watchdog(); | 443 | touch_softlockup_watchdog(); |
425 | 444 | ||
426 | sysdev_resume(); | 445 | syscore_resume(); |
427 | 446 | ||
428 | Enable_irqs: | 447 | Enable_irqs: |
429 | local_irq_enable(); | 448 | local_irq_enable(); |
@@ -440,42 +459,36 @@ static int resume_target_kernel(bool platform_mode) | |||
440 | } | 459 | } |
441 | 460 | ||
442 | /** | 461 | /** |
443 | * hibernation_restore - quiesce devices and restore the hibernation | 462 | * hibernation_restore - Quiesce devices and restore from a hibernation image. |
444 | * snapshot image. If successful, control returns in hibernation_snaphot() | 463 | * @platform_mode: If set, use platform driver to prepare for the transition. |
445 | * @platform_mode - if set, use the platform driver, if available, to | ||
446 | * prepare the platform firmware for the transition. | ||
447 | * | 464 | * |
448 | * Must be called with pm_mutex held | 465 | * This routine must be called with pm_mutex held. If it is successful, control |
466 | * reappears in the restored target kernel in hibernation_snapshot(). | ||
449 | */ | 467 | */ |
450 | |||
451 | int hibernation_restore(int platform_mode) | 468 | int hibernation_restore(int platform_mode) |
452 | { | 469 | { |
453 | int error; | 470 | int error; |
454 | gfp_t saved_mask; | ||
455 | 471 | ||
456 | pm_prepare_console(); | 472 | pm_prepare_console(); |
457 | suspend_console(); | 473 | suspend_console(); |
458 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 474 | pm_restrict_gfp_mask(); |
459 | error = dpm_suspend_start(PMSG_QUIESCE); | 475 | error = dpm_suspend_start(PMSG_QUIESCE); |
460 | if (!error) { | 476 | if (!error) { |
461 | error = resume_target_kernel(platform_mode); | 477 | error = resume_target_kernel(platform_mode); |
462 | dpm_resume_end(PMSG_RECOVER); | 478 | dpm_resume_end(PMSG_RECOVER); |
463 | } | 479 | } |
464 | set_gfp_allowed_mask(saved_mask); | 480 | pm_restore_gfp_mask(); |
465 | resume_console(); | 481 | resume_console(); |
466 | pm_restore_console(); | 482 | pm_restore_console(); |
467 | return error; | 483 | return error; |
468 | } | 484 | } |
469 | 485 | ||
470 | /** | 486 | /** |
471 | * hibernation_platform_enter - enter the hibernation state using the | 487 | * hibernation_platform_enter - Power off the system using the platform driver. |
472 | * platform driver (if available) | ||
473 | */ | 488 | */ |
474 | |||
475 | int hibernation_platform_enter(void) | 489 | int hibernation_platform_enter(void) |
476 | { | 490 | { |
477 | int error; | 491 | int error; |
478 | gfp_t saved_mask; | ||
479 | 492 | ||
480 | if (!hibernation_ops) | 493 | if (!hibernation_ops) |
481 | return -ENOSYS; | 494 | return -ENOSYS; |
@@ -491,7 +504,6 @@ int hibernation_platform_enter(void) | |||
491 | 504 | ||
492 | entering_platform_hibernation = true; | 505 | entering_platform_hibernation = true; |
493 | suspend_console(); | 506 | suspend_console(); |
494 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
495 | error = dpm_suspend_start(PMSG_HIBERNATE); | 507 | error = dpm_suspend_start(PMSG_HIBERNATE); |
496 | if (error) { | 508 | if (error) { |
497 | if (hibernation_ops->recover) | 509 | if (hibernation_ops->recover) |
@@ -512,8 +524,8 @@ int hibernation_platform_enter(void) | |||
512 | goto Platform_finish; | 524 | goto Platform_finish; |
513 | 525 | ||
514 | local_irq_disable(); | 526 | local_irq_disable(); |
515 | sysdev_suspend(PMSG_HIBERNATE); | 527 | syscore_suspend(); |
516 | if (!pm_check_wakeup_events()) { | 528 | if (pm_wakeup_pending()) { |
517 | error = -EAGAIN; | 529 | error = -EAGAIN; |
518 | goto Power_up; | 530 | goto Power_up; |
519 | } | 531 | } |
@@ -523,7 +535,7 @@ int hibernation_platform_enter(void) | |||
523 | while (1); | 535 | while (1); |
524 | 536 | ||
525 | Power_up: | 537 | Power_up: |
526 | sysdev_resume(); | 538 | syscore_resume(); |
527 | local_irq_enable(); | 539 | local_irq_enable(); |
528 | enable_nonboot_cpus(); | 540 | enable_nonboot_cpus(); |
529 | 541 | ||
@@ -535,7 +547,6 @@ int hibernation_platform_enter(void) | |||
535 | Resume_devices: | 547 | Resume_devices: |
536 | entering_platform_hibernation = false; | 548 | entering_platform_hibernation = false; |
537 | dpm_resume_end(PMSG_RESTORE); | 549 | dpm_resume_end(PMSG_RESTORE); |
538 | set_gfp_allowed_mask(saved_mask); | ||
539 | resume_console(); | 550 | resume_console(); |
540 | 551 | ||
541 | Close: | 552 | Close: |
@@ -545,12 +556,12 @@ int hibernation_platform_enter(void) | |||
545 | } | 556 | } |
546 | 557 | ||
547 | /** | 558 | /** |
548 | * power_down - Shut the machine down for hibernation. | 559 | * power_down - Shut the machine down for hibernation. |
549 | * | 560 | * |
550 | * Use the platform driver, if configured so; otherwise try | 561 | * Use the platform driver, if configured, to put the system into the sleep |
551 | * to power off or reboot. | 562 | * state corresponding to hibernation, or try to power it off or reboot, |
563 | * depending on the value of hibernation_mode. | ||
552 | */ | 564 | */ |
553 | |||
554 | static void power_down(void) | 565 | static void power_down(void) |
555 | { | 566 | { |
556 | switch (hibernation_mode) { | 567 | switch (hibernation_mode) { |
@@ -587,9 +598,8 @@ static int prepare_processes(void) | |||
587 | } | 598 | } |
588 | 599 | ||
589 | /** | 600 | /** |
590 | * hibernate - The granpappy of the built-in hibernation management | 601 | * hibernate - Carry out system hibernation, including saving the image. |
591 | */ | 602 | */ |
592 | |||
593 | int hibernate(void) | 603 | int hibernate(void) |
594 | { | 604 | { |
595 | int error; | 605 | int error; |
@@ -638,11 +648,15 @@ int hibernate(void) | |||
638 | 648 | ||
639 | if (hibernation_mode == HIBERNATION_PLATFORM) | 649 | if (hibernation_mode == HIBERNATION_PLATFORM) |
640 | flags |= SF_PLATFORM_MODE; | 650 | flags |= SF_PLATFORM_MODE; |
651 | if (nocompress) | ||
652 | flags |= SF_NOCOMPRESS_MODE; | ||
641 | pr_debug("PM: writing image.\n"); | 653 | pr_debug("PM: writing image.\n"); |
642 | error = swsusp_write(flags); | 654 | error = swsusp_write(flags); |
643 | swsusp_free(); | 655 | swsusp_free(); |
644 | if (!error) | 656 | if (!error) |
645 | power_down(); | 657 | power_down(); |
658 | in_suspend = 0; | ||
659 | pm_restore_gfp_mask(); | ||
646 | } else { | 660 | } else { |
647 | pr_debug("PM: Image restored successfully.\n"); | 661 | pr_debug("PM: Image restored successfully.\n"); |
648 | } | 662 | } |
@@ -663,17 +677,20 @@ int hibernate(void) | |||
663 | 677 | ||
664 | 678 | ||
665 | /** | 679 | /** |
666 | * software_resume - Resume from a saved image. | 680 | * software_resume - Resume from a saved hibernation image. |
681 | * | ||
682 | * This routine is called as a late initcall, when all devices have been | ||
683 | * discovered and initialized already. | ||
667 | * | 684 | * |
668 | * Called as a late_initcall (so all devices are discovered and | 685 | * The image reading code is called to see if there is a hibernation image |
669 | * initialized), we call swsusp to see if we have a saved image or not. | 686 | * available for reading. If that is the case, devices are quiesced and the |
670 | * If so, we quiesce devices, the restore the saved image. We will | 687 | * contents of memory are restored from the saved image. |
671 | * return above (in hibernate() ) if everything goes well. | ||
672 | * Otherwise, we fail gracefully and return to the normally | ||
673 | * scheduled program. | ||
674 | * | 688 | * |
689 | * If this is successful, control reappears in the restored target kernel in | ||
690 | * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine | ||
691 | * attempts to recover gracefully and make the kernel return to the normal mode | ||
692 | * of operation. | ||
675 | */ | 693 | */ |
676 | |||
677 | static int software_resume(void) | 694 | static int software_resume(void) |
678 | { | 695 | { |
679 | int error; | 696 | int error; |
@@ -705,7 +722,7 @@ static int software_resume(void) | |||
705 | goto Unlock; | 722 | goto Unlock; |
706 | } | 723 | } |
707 | 724 | ||
708 | pr_debug("PM: Checking image partition %s\n", resume_file); | 725 | pr_debug("PM: Checking hibernation image partition %s\n", resume_file); |
709 | 726 | ||
710 | /* Check if the device is there */ | 727 | /* Check if the device is there */ |
711 | swsusp_resume_device = name_to_dev_t(resume_file); | 728 | swsusp_resume_device = name_to_dev_t(resume_file); |
@@ -730,10 +747,10 @@ static int software_resume(void) | |||
730 | } | 747 | } |
731 | 748 | ||
732 | Check_image: | 749 | Check_image: |
733 | pr_debug("PM: Resume from partition %d:%d\n", | 750 | pr_debug("PM: Hibernation image partition %d:%d present\n", |
734 | MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); | 751 | MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); |
735 | 752 | ||
736 | pr_debug("PM: Checking hibernation image.\n"); | 753 | pr_debug("PM: Looking for hibernation image.\n"); |
737 | error = swsusp_check(); | 754 | error = swsusp_check(); |
738 | if (error) | 755 | if (error) |
739 | goto Unlock; | 756 | goto Unlock; |
@@ -765,14 +782,14 @@ static int software_resume(void) | |||
765 | goto Done; | 782 | goto Done; |
766 | } | 783 | } |
767 | 784 | ||
768 | pr_debug("PM: Reading hibernation image.\n"); | 785 | pr_debug("PM: Loading hibernation image.\n"); |
769 | 786 | ||
770 | error = swsusp_read(&flags); | 787 | error = swsusp_read(&flags); |
771 | swsusp_close(FMODE_READ); | 788 | swsusp_close(FMODE_READ); |
772 | if (!error) | 789 | if (!error) |
773 | hibernation_restore(flags & SF_PLATFORM_MODE); | 790 | hibernation_restore(flags & SF_PLATFORM_MODE); |
774 | 791 | ||
775 | printk(KERN_ERR "PM: Restore failed, recovering.\n"); | 792 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); |
776 | swsusp_free(); | 793 | swsusp_free(); |
777 | thaw_processes(); | 794 | thaw_processes(); |
778 | Done: | 795 | Done: |
@@ -785,7 +802,7 @@ static int software_resume(void) | |||
785 | /* For success case, the suspend path will release the lock */ | 802 | /* For success case, the suspend path will release the lock */ |
786 | Unlock: | 803 | Unlock: |
787 | mutex_unlock(&pm_mutex); | 804 | mutex_unlock(&pm_mutex); |
788 | pr_debug("PM: Resume from disk failed.\n"); | 805 | pr_debug("PM: Hibernation image not present or could not be loaded.\n"); |
789 | return error; | 806 | return error; |
790 | close_finish: | 807 | close_finish: |
791 | swsusp_close(FMODE_READ); | 808 | swsusp_close(FMODE_READ); |
@@ -803,21 +820,17 @@ static const char * const hibernation_modes[] = { | |||
803 | [HIBERNATION_TESTPROC] = "testproc", | 820 | [HIBERNATION_TESTPROC] = "testproc", |
804 | }; | 821 | }; |
805 | 822 | ||
806 | /** | 823 | /* |
807 | * disk - Control hibernation mode | 824 | * /sys/power/disk - Control hibernation mode. |
808 | * | ||
809 | * Suspend-to-disk can be handled in several ways. We have a few options | ||
810 | * for putting the system to sleep - using the platform driver (e.g. ACPI | ||
811 | * or other hibernation_ops), powering off the system or rebooting the | ||
812 | * system (for testing) as well as the two test modes. | ||
813 | * | 825 | * |
814 | * The system can support 'platform', and that is known a priori (and | 826 | * Hibernation can be handled in several ways. There are a few different ways |
815 | * encoded by the presence of hibernation_ops). However, the user may | 827 | * to put the system into the sleep state: using the platform driver (e.g. ACPI |
816 | * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the | 828 | * or other hibernation_ops), powering it off or rebooting it (for testing |
817 | * test modes, 'test' or 'testproc'. | 829 | * mostly), or using one of the two available test modes. |
818 | * | 830 | * |
819 | * show() will display what the mode is currently set to. | 831 | * The sysfs file /sys/power/disk provides an interface for selecting the |
820 | * store() will accept one of | 832 | * hibernation mode to use. Reading from this file causes the available modes |
833 | * to be printed. There are 5 modes that can be supported: | ||
821 | * | 834 | * |
822 | * 'platform' | 835 | * 'platform' |
823 | * 'shutdown' | 836 | * 'shutdown' |
@@ -825,8 +838,14 @@ static const char * const hibernation_modes[] = { | |||
825 | * 'test' | 838 | * 'test' |
826 | * 'testproc' | 839 | * 'testproc' |
827 | * | 840 | * |
828 | * It will only change to 'platform' if the system | 841 | * If a platform hibernation driver is in use, 'platform' will be supported |
829 | * supports it (as determined by having hibernation_ops). | 842 | * and will be used by default. Otherwise, 'shutdown' will be used by default. |
843 | * The selected option (i.e. the one corresponding to the current value of | ||
844 | * hibernation_mode) is enclosed by a square bracket. | ||
845 | * | ||
846 | * To select a given hibernation mode it is necessary to write the mode's | ||
847 | * string representation (as returned by reading from /sys/power/disk) back | ||
848 | * into /sys/power/disk. | ||
830 | */ | 849 | */ |
831 | 850 | ||
832 | static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | 851 | static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, |
@@ -859,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
859 | return buf-start; | 878 | return buf-start; |
860 | } | 879 | } |
861 | 880 | ||
862 | |||
863 | static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | 881 | static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, |
864 | const char *buf, size_t n) | 882 | const char *buf, size_t n) |
865 | { | 883 | { |
@@ -961,10 +979,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att | |||
961 | 979 | ||
962 | power_attr(image_size); | 980 | power_attr(image_size); |
963 | 981 | ||
982 | static ssize_t reserved_size_show(struct kobject *kobj, | ||
983 | struct kobj_attribute *attr, char *buf) | ||
984 | { | ||
985 | return sprintf(buf, "%lu\n", reserved_size); | ||
986 | } | ||
987 | |||
988 | static ssize_t reserved_size_store(struct kobject *kobj, | ||
989 | struct kobj_attribute *attr, | ||
990 | const char *buf, size_t n) | ||
991 | { | ||
992 | unsigned long size; | ||
993 | |||
994 | if (sscanf(buf, "%lu", &size) == 1) { | ||
995 | reserved_size = size; | ||
996 | return n; | ||
997 | } | ||
998 | |||
999 | return -EINVAL; | ||
1000 | } | ||
1001 | |||
1002 | power_attr(reserved_size); | ||
1003 | |||
964 | static struct attribute * g[] = { | 1004 | static struct attribute * g[] = { |
965 | &disk_attr.attr, | 1005 | &disk_attr.attr, |
966 | &resume_attr.attr, | 1006 | &resume_attr.attr, |
967 | &image_size_attr.attr, | 1007 | &image_size_attr.attr, |
1008 | &reserved_size_attr.attr, | ||
968 | NULL, | 1009 | NULL, |
969 | }; | 1010 | }; |
970 | 1011 | ||
@@ -1004,6 +1045,15 @@ static int __init resume_offset_setup(char *str) | |||
1004 | return 1; | 1045 | return 1; |
1005 | } | 1046 | } |
1006 | 1047 | ||
1048 | static int __init hibernate_setup(char *str) | ||
1049 | { | ||
1050 | if (!strncmp(str, "noresume", 8)) | ||
1051 | noresume = 1; | ||
1052 | else if (!strncmp(str, "nocompress", 10)) | ||
1053 | nocompress = 1; | ||
1054 | return 1; | ||
1055 | } | ||
1056 | |||
1007 | static int __init noresume_setup(char *str) | 1057 | static int __init noresume_setup(char *str) |
1008 | { | 1058 | { |
1009 | noresume = 1; | 1059 | noresume = 1; |
@@ -1013,3 +1063,4 @@ static int __init noresume_setup(char *str) | |||
1013 | __setup("noresume", noresume_setup); | 1063 | __setup("noresume", noresume_setup); |
1014 | __setup("resume_offset=", resume_offset_setup); | 1064 | __setup("resume_offset=", resume_offset_setup); |
1015 | __setup("resume=", resume_setup); | 1065 | __setup("resume=", resume_setup); |
1066 | __setup("hibernate=", hibernate_setup); | ||
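To illustrate the /sys/power/disk interface documented above (a hedged sketch, assuming CONFIG_HIBERNATION is enabled and sysfs is mounted at /sys), reading the file lists the supported modes with the selected one in square brackets:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/power/disk", "r");

	if (!f) {
		perror("/sys/power/disk");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "[platform] shutdown reboot test testproc" */
	fclose(f);
	return 0;
}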
diff --git a/kernel/power/main.c b/kernel/power/main.c index 62b0bc6e4983..2981af4ce7cb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -17,9 +17,6 @@ | |||
17 | 17 | ||
18 | DEFINE_MUTEX(pm_mutex); | 18 | DEFINE_MUTEX(pm_mutex); |
19 | 19 | ||
20 | unsigned int pm_flags; | ||
21 | EXPORT_SYMBOL(pm_flags); | ||
22 | |||
23 | #ifdef CONFIG_PM_SLEEP | 20 | #ifdef CONFIG_PM_SLEEP |
24 | 21 | ||
25 | /* Routines for PM-transition notifications */ | 22 | /* Routines for PM-transition notifications */ |
@@ -227,7 +224,7 @@ power_attr(state); | |||
227 | * writing to 'state'. It first should read from 'wakeup_count' and store | 224 | * writing to 'state'. It first should read from 'wakeup_count' and store |
228 | * the read value. Then, after carrying out its own preparations for the system | 225 | * the read value. Then, after carrying out its own preparations for the system |
229 | * transition to a sleep state, it should write the stored value to | 226 | * transition to a sleep state, it should write the stored value to |
230 | * 'wakeup_count'. If that fails, at least one wakeup event has occured since | 227 | * 'wakeup_count'. If that fails, at least one wakeup event has occurred since |
231 | * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it | 228 | * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it |
232 | * is allowed to write to 'state', but the transition will be aborted if there | 229 | * is allowed to write to 'state', but the transition will be aborted if there |
233 | * are any wakeup events detected after 'wakeup_count' was written to. | 230 | * are any wakeup events detected after 'wakeup_count' was written to. |
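A minimal sketch of the wakeup_count handshake described in this comment, under the assumption that it runs as root and that the final write to /sys/power/state actually starts a suspend to RAM:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char count[32];
	FILE *f;

	/* 1. Read and remember the current wakeup count. */
	f = fopen("/sys/power/wakeup_count", "r");
	if (!f || !fgets(count, sizeof(count), f)) {
		perror("read wakeup_count");
		return 1;
	}
	fclose(f);

	/* 2. Write it back; failure means a wakeup event occurred in between. */
	f = fopen("/sys/power/wakeup_count", "w");
	if (!f || fputs(count, f) < 0 || fclose(f) == EOF) {
		fprintf(stderr, "wakeup events pending, aborting suspend\n");
		return 1;
	}

	/* 3. Only now is it safe to request the transition. */
	f = fopen("/sys/power/state", "w");
	if (!f || fputs("mem\n", f) < 0 || fclose(f) == EOF) {
		perror("write state");
		return 1;
	}
	return 0;
}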
@@ -237,18 +234,18 @@ static ssize_t wakeup_count_show(struct kobject *kobj, | |||
237 | struct kobj_attribute *attr, | 234 | struct kobj_attribute *attr, |
238 | char *buf) | 235 | char *buf) |
239 | { | 236 | { |
240 | unsigned long val; | 237 | unsigned int val; |
241 | 238 | ||
242 | return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; | 239 | return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; |
243 | } | 240 | } |
244 | 241 | ||
245 | static ssize_t wakeup_count_store(struct kobject *kobj, | 242 | static ssize_t wakeup_count_store(struct kobject *kobj, |
246 | struct kobj_attribute *attr, | 243 | struct kobj_attribute *attr, |
247 | const char *buf, size_t n) | 244 | const char *buf, size_t n) |
248 | { | 245 | { |
249 | unsigned long val; | 246 | unsigned int val; |
250 | 247 | ||
251 | if (sscanf(buf, "%lu", &val) == 1) { | 248 | if (sscanf(buf, "%u", &val) == 1) { |
252 | if (pm_save_wakeup_count(val)) | 249 | if (pm_save_wakeup_count(val)) |
253 | return n; | 250 | return n; |
254 | } | 251 | } |
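The wakeup_count attribute now uses unsigned int, but the userspace protocol described in the comment above is unchanged: read the count, prepare for suspend, write the same value back, and abort if the write fails. A rough userspace sketch of that sequence (error handling simplified; treat it as illustrative, not as a reference suspend tool):

    #include <stdio.h>

    /*
     * Illustrative only: follows the protocol in the comment above.
     * Read /sys/power/wakeup_count, do userspace preparations, write the
     * value back; a failed write means a wakeup event raced with us and
     * the suspend should be aborted.
     */
    int main(void)
    {
        unsigned int count;
        FILE *f = fopen("/sys/power/wakeup_count", "r");

        if (!f || fscanf(f, "%u", &count) != 1)
            return 1;
        fclose(f);

        /* ... userspace suspend preparations would go here ... */

        f = fopen("/sys/power/wakeup_count", "w");
        if (!f)
            return 1;
        if (fprintf(f, "%u", count) < 0 || fflush(f) != 0) {
            fprintf(stderr, "wakeup event occurred, aborting suspend\n");
            fclose(f);
            return 1;
        }
        fclose(f);

        f = fopen("/sys/power/state", "w");   /* finally trigger the suspend */
        if (!f)
            return 1;
        fprintf(f, "mem\n");
        fclose(f);
        return 0;
    }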
@@ -281,12 +278,30 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
281 | } | 278 | } |
282 | 279 | ||
283 | power_attr(pm_trace); | 280 | power_attr(pm_trace); |
281 | |||
282 | static ssize_t pm_trace_dev_match_show(struct kobject *kobj, | ||
283 | struct kobj_attribute *attr, | ||
284 | char *buf) | ||
285 | { | ||
286 | return show_trace_dev_match(buf, PAGE_SIZE); | ||
287 | } | ||
288 | |||
289 | static ssize_t | ||
290 | pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
291 | const char *buf, size_t n) | ||
292 | { | ||
293 | return -EINVAL; | ||
294 | } | ||
295 | |||
296 | power_attr(pm_trace_dev_match); | ||
297 | |||
284 | #endif /* CONFIG_PM_TRACE */ | 298 | #endif /* CONFIG_PM_TRACE */ |
285 | 299 | ||
286 | static struct attribute * g[] = { | 300 | static struct attribute * g[] = { |
287 | &state_attr.attr, | 301 | &state_attr.attr, |
288 | #ifdef CONFIG_PM_TRACE | 302 | #ifdef CONFIG_PM_TRACE |
289 | &pm_trace_attr.attr, | 303 | &pm_trace_attr.attr, |
304 | &pm_trace_dev_match_attr.attr, | ||
290 | #endif | 305 | #endif |
291 | #ifdef CONFIG_PM_SLEEP | 306 | #ifdef CONFIG_PM_SLEEP |
292 | &pm_async_attr.attr, | 307 | &pm_async_attr.attr, |
@@ -308,7 +323,7 @@ EXPORT_SYMBOL_GPL(pm_wq); | |||
308 | 323 | ||
309 | static int __init pm_start_workqueue(void) | 324 | static int __init pm_start_workqueue(void) |
310 | { | 325 | { |
311 | pm_wq = create_freezeable_workqueue("pm"); | 326 | pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); |
312 | 327 | ||
313 | return pm_wq ? 0 : -ENOMEM; | 328 | return pm_wq ? 0 : -ENOMEM; |
314 | } | 329 | } |
@@ -321,6 +336,8 @@ static int __init pm_init(void) | |||
321 | int error = pm_start_workqueue(); | 336 | int error = pm_start_workqueue(); |
322 | if (error) | 337 | if (error) |
323 | return error; | 338 | return error; |
339 | hibernate_image_size_init(); | ||
340 | hibernate_reserved_size_init(); | ||
324 | power_kobj = kobject_create_and_add("power", NULL); | 341 | power_kobj = kobject_create_and_add("power", NULL); |
325 | if (!power_kobj) | 342 | if (!power_kobj) |
326 | return -ENOMEM; | 343 | return -ENOMEM; |
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 1836db60bbb6..000000000000 --- a/kernel/power/nvs.c +++ /dev/null | |||
@@ -1,136 +0,0 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory | ||
3 | * | ||
4 | * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. | ||
5 | * | ||
6 | * This file is released under the GPLv2. | ||
7 | */ | ||
8 | |||
9 | #include <linux/io.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/list.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/suspend.h> | ||
15 | |||
16 | /* | ||
17 | * Platforms, like ACPI, may want us to save some memory used by them during | ||
18 | * suspend and to restore the contents of this memory during the subsequent | ||
19 | * resume. The code below implements a mechanism allowing us to do that. | ||
20 | */ | ||
21 | |||
22 | struct nvs_page { | ||
23 | unsigned long phys_start; | ||
24 | unsigned int size; | ||
25 | void *kaddr; | ||
26 | void *data; | ||
27 | struct list_head node; | ||
28 | }; | ||
29 | |||
30 | static LIST_HEAD(nvs_list); | ||
31 | |||
32 | /** | ||
33 | * suspend_nvs_register - register platform NVS memory region to save | ||
34 | * @start - physical address of the region | ||
35 | * @size - size of the region | ||
36 | * | ||
37 | * The NVS region need not be page-aligned (both ends) and we arrange | ||
38 | * things so that the data from page-aligned addresses in this region will | ||
39 | * be copied into separate RAM pages. | ||
40 | */ | ||
41 | int suspend_nvs_register(unsigned long start, unsigned long size) | ||
42 | { | ||
43 | struct nvs_page *entry, *next; | ||
44 | |||
45 | while (size > 0) { | ||
46 | unsigned int nr_bytes; | ||
47 | |||
48 | entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); | ||
49 | if (!entry) | ||
50 | goto Error; | ||
51 | |||
52 | list_add_tail(&entry->node, &nvs_list); | ||
53 | entry->phys_start = start; | ||
54 | nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); | ||
55 | entry->size = (size < nr_bytes) ? size : nr_bytes; | ||
56 | |||
57 | start += entry->size; | ||
58 | size -= entry->size; | ||
59 | } | ||
60 | return 0; | ||
61 | |||
62 | Error: | ||
63 | list_for_each_entry_safe(entry, next, &nvs_list, node) { | ||
64 | list_del(&entry->node); | ||
65 | kfree(entry); | ||
66 | } | ||
67 | return -ENOMEM; | ||
68 | } | ||
69 | |||
70 | /** | ||
71 | * suspend_nvs_free - free data pages allocated for saving NVS regions | ||
72 | */ | ||
73 | void suspend_nvs_free(void) | ||
74 | { | ||
75 | struct nvs_page *entry; | ||
76 | |||
77 | list_for_each_entry(entry, &nvs_list, node) | ||
78 | if (entry->data) { | ||
79 | free_page((unsigned long)entry->data); | ||
80 | entry->data = NULL; | ||
81 | if (entry->kaddr) { | ||
82 | iounmap(entry->kaddr); | ||
83 | entry->kaddr = NULL; | ||
84 | } | ||
85 | } | ||
86 | } | ||
87 | |||
88 | /** | ||
89 | * suspend_nvs_alloc - allocate memory necessary for saving NVS regions | ||
90 | */ | ||
91 | int suspend_nvs_alloc(void) | ||
92 | { | ||
93 | struct nvs_page *entry; | ||
94 | |||
95 | list_for_each_entry(entry, &nvs_list, node) { | ||
96 | entry->data = (void *)__get_free_page(GFP_KERNEL); | ||
97 | if (!entry->data) { | ||
98 | suspend_nvs_free(); | ||
99 | return -ENOMEM; | ||
100 | } | ||
101 | } | ||
102 | return 0; | ||
103 | } | ||
104 | |||
105 | /** | ||
106 | * suspend_nvs_save - save NVS memory regions | ||
107 | */ | ||
108 | void suspend_nvs_save(void) | ||
109 | { | ||
110 | struct nvs_page *entry; | ||
111 | |||
112 | printk(KERN_INFO "PM: Saving platform NVS memory\n"); | ||
113 | |||
114 | list_for_each_entry(entry, &nvs_list, node) | ||
115 | if (entry->data) { | ||
116 | entry->kaddr = ioremap(entry->phys_start, entry->size); | ||
117 | memcpy(entry->data, entry->kaddr, entry->size); | ||
118 | } | ||
119 | } | ||
120 | |||
121 | /** | ||
122 | * suspend_nvs_restore - restore NVS memory regions | ||
123 | * | ||
124 | * This function is going to be called with interrupts disabled, so it | ||
125 | * cannot iounmap the virtual addresses used to access the NVS region. | ||
126 | */ | ||
127 | void suspend_nvs_restore(void) | ||
128 | { | ||
129 | struct nvs_page *entry; | ||
130 | |||
131 | printk(KERN_INFO "PM: Restoring platform NVS memory\n"); | ||
132 | |||
133 | list_for_each_entry(entry, &nvs_list, node) | ||
134 | if (entry->data) | ||
135 | memcpy(entry->kaddr, entry->data, entry->size); | ||
136 | } | ||
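The removed suspend_nvs_register() above walks the requested region in chunks that never cross a page boundary: nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK) caps each chunk at the end of the current page. A standalone sketch of just that chunking arithmetic (4 KiB pages assumed; the helper name is made up):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    /* Split [start, start + size) into chunks that never cross a page
     * boundary, mirroring the loop in the removed suspend_nvs_register(). */
    static void split_region(unsigned long start, unsigned long size)
    {
        while (size > 0) {
            unsigned long nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
            unsigned long chunk = size < nr_bytes ? size : nr_bytes;

            printf("chunk: phys=0x%lx len=%lu\n", start, chunk);
            start += chunk;
            size -= chunk;
        }
    }

    int main(void)
    {
        split_region(0x1f80, 0x2100);   /* unaligned start and length */
        return 0;
    }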
diff --git a/kernel/power/power.h b/kernel/power/power.h index 006270fe382d..9a00a0a26280 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -14,6 +14,10 @@ struct swsusp_info { | |||
14 | } __attribute__((aligned(PAGE_SIZE))); | 14 | } __attribute__((aligned(PAGE_SIZE))); |
15 | 15 | ||
16 | #ifdef CONFIG_HIBERNATION | 16 | #ifdef CONFIG_HIBERNATION |
17 | /* kernel/power/snapshot.c */ | ||
18 | extern void __init hibernate_reserved_size_init(void); | ||
19 | extern void __init hibernate_image_size_init(void); | ||
20 | |||
17 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER | 21 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER |
18 | /* Maximum size of architecture specific data in a hibernation header */ | 22 | /* Maximum size of architecture specific data in a hibernation header */ |
19 | #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) | 23 | #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) |
@@ -49,7 +53,12 @@ static inline char *check_image_kernel(struct swsusp_info *info) | |||
49 | extern int hibernation_snapshot(int platform_mode); | 53 | extern int hibernation_snapshot(int platform_mode); |
50 | extern int hibernation_restore(int platform_mode); | 54 | extern int hibernation_restore(int platform_mode); |
51 | extern int hibernation_platform_enter(void); | 55 | extern int hibernation_platform_enter(void); |
52 | #endif | 56 | |
57 | #else /* !CONFIG_HIBERNATION */ | ||
58 | |||
59 | static inline void hibernate_reserved_size_init(void) {} | ||
60 | static inline void hibernate_image_size_init(void) {} | ||
61 | #endif /* !CONFIG_HIBERNATION */ | ||
53 | 62 | ||
54 | extern int pfn_is_nosave(unsigned long); | 63 | extern int pfn_is_nosave(unsigned long); |
55 | 64 | ||
@@ -65,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \ | |||
65 | 74 | ||
66 | /* Preferred image size in bytes (default 500 MB) */ | 75 | /* Preferred image size in bytes (default 500 MB) */ |
67 | extern unsigned long image_size; | 76 | extern unsigned long image_size; |
77 | /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ | ||
78 | extern unsigned long reserved_size; | ||
68 | extern int in_suspend; | 79 | extern int in_suspend; |
69 | extern dev_t swsusp_resume_device; | 80 | extern dev_t swsusp_resume_device; |
70 | extern sector_t swsusp_resume_block; | 81 | extern sector_t swsusp_resume_block; |
@@ -134,6 +145,7 @@ extern int swsusp_swap_in_use(void); | |||
134 | * the image header. | 145 | * the image header. |
135 | */ | 146 | */ |
136 | #define SF_PLATFORM_MODE 1 | 147 | #define SF_PLATFORM_MODE 1 |
148 | #define SF_NOCOMPRESS_MODE 2 | ||
137 | 149 | ||
138 | /* kernel/power/hibernate.c */ | 150 | /* kernel/power/hibernate.c */ |
139 | extern int swsusp_check(void); | 151 | extern int swsusp_check(void); |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 028a99598f49..0cf3a27a6c9d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -22,7 +22,7 @@ | |||
22 | */ | 22 | */ |
23 | #define TIMEOUT (20 * HZ) | 23 | #define TIMEOUT (20 * HZ) |
24 | 24 | ||
25 | static inline int freezeable(struct task_struct * p) | 25 | static inline int freezable(struct task_struct * p) |
26 | { | 26 | { |
27 | if ((p == current) || | 27 | if ((p == current) || |
28 | (p->flags & PF_NOFREEZE) || | 28 | (p->flags & PF_NOFREEZE) || |
@@ -40,6 +40,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
40 | struct timeval start, end; | 40 | struct timeval start, end; |
41 | u64 elapsed_csecs64; | 41 | u64 elapsed_csecs64; |
42 | unsigned int elapsed_csecs; | 42 | unsigned int elapsed_csecs; |
43 | bool wakeup = false; | ||
43 | 44 | ||
44 | do_gettimeofday(&start); | 45 | do_gettimeofday(&start); |
45 | 46 | ||
@@ -52,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
52 | todo = 0; | 53 | todo = 0; |
53 | read_lock(&tasklist_lock); | 54 | read_lock(&tasklist_lock); |
54 | do_each_thread(g, p) { | 55 | do_each_thread(g, p) { |
55 | if (frozen(p) || !freezeable(p)) | 56 | if (frozen(p) || !freezable(p)) |
56 | continue; | 57 | continue; |
57 | 58 | ||
58 | if (!freeze_task(p, sig_only)) | 59 | if (!freeze_task(p, sig_only)) |
@@ -63,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only) | |||
63 | * perturb a task in TASK_STOPPED or TASK_TRACED. | 64 | * perturb a task in TASK_STOPPED or TASK_TRACED. |
64 | * It is "frozen enough". If the task does wake | 65 | * It is "frozen enough". If the task does wake |
65 | * up, it will immediately call try_to_freeze. | 66 | * up, it will immediately call try_to_freeze. |
67 | * | ||
68 | * Because freeze_task() goes through p's | ||
69 | * scheduler lock after setting TIF_FREEZE, it's | ||
70 | * guaranteed that either we see TASK_RUNNING or | ||
71 | * try_to_stop() after schedule() in ptrace/signal | ||
72 | * stop sees TIF_FREEZE. | ||
66 | */ | 73 | */ |
67 | if (!task_is_stopped_or_traced(p) && | 74 | if (!task_is_stopped_or_traced(p) && |
68 | !freezer_should_skip(p)) | 75 | !freezer_should_skip(p)) |
@@ -78,6 +85,11 @@ static int try_to_freeze_tasks(bool sig_only) | |||
78 | if (!todo || time_after(jiffies, end_time)) | 85 | if (!todo || time_after(jiffies, end_time)) |
79 | break; | 86 | break; |
80 | 87 | ||
88 | if (pm_wakeup_pending()) { | ||
89 | wakeup = true; | ||
90 | break; | ||
91 | } | ||
92 | |||
81 | /* | 93 | /* |
82 | * We need to retry, but first give the freezing tasks some | 94 | * We need to retry, but first give the freezing tasks some |
83 | * time to enter the refrigerator. | 95 | * time to enter the refrigerator. |
@@ -97,8 +109,9 @@ static int try_to_freeze_tasks(bool sig_only) | |||
97 | * but it cleans up leftover PF_FREEZE requests. | 109 | * but it cleans up leftover PF_FREEZE requests. |
98 | */ | 110 | */ |
99 | printk("\n"); | 111 | printk("\n"); |
100 | printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " | 112 | printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " |
101 | "(%d tasks refusing to freeze, wq_busy=%d):\n", | 113 | "(%d tasks refusing to freeze, wq_busy=%d):\n", |
114 | wakeup ? "aborted" : "failed", | ||
102 | elapsed_csecs / 100, elapsed_csecs % 100, | 115 | elapsed_csecs / 100, elapsed_csecs % 100, |
103 | todo - wq_busy, wq_busy); | 116 | todo - wq_busy, wq_busy); |
104 | 117 | ||
@@ -107,7 +120,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
107 | read_lock(&tasklist_lock); | 120 | read_lock(&tasklist_lock); |
108 | do_each_thread(g, p) { | 121 | do_each_thread(g, p) { |
109 | task_lock(p); | 122 | task_lock(p); |
110 | if (freezing(p) && !freezer_should_skip(p)) | 123 | if (!wakeup && freezing(p) && !freezer_should_skip(p)) |
111 | sched_show_task(p); | 124 | sched_show_task(p); |
112 | cancel_freezing(p); | 125 | cancel_freezing(p); |
113 | task_unlock(p); | 126 | task_unlock(p); |
@@ -154,7 +167,7 @@ static void thaw_tasks(bool nosig_only) | |||
154 | 167 | ||
155 | read_lock(&tasklist_lock); | 168 | read_lock(&tasklist_lock); |
156 | do_each_thread(g, p) { | 169 | do_each_thread(g, p) { |
157 | if (!freezeable(p)) | 170 | if (!freezable(p)) |
158 | continue; | 171 | continue; |
159 | 172 | ||
160 | if (nosig_only && should_send_signal(p)) | 173 | if (nosig_only && should_send_signal(p)) |
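The control flow added to try_to_freeze_tasks() above is a bounded retry loop with an extra early exit: keep polling until nothing is left to freeze, the deadline passes, or a wakeup is pending. A compressed userspace analogue of that loop shape (the stub functions and the 20-second figure stand in for kernel-side state and TIMEOUT; none of this is kernel code):

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    /* Stand-ins for kernel-side state; purely illustrative. */
    static int remaining = 3;
    static int tasks_left_to_freeze(void) { return remaining ? remaining-- : 0; }
    static bool wakeup_pending(void) { return false; }

    int main(void)
    {
        time_t end_time = time(NULL) + 20;   /* analogous to TIMEOUT = 20 * HZ */
        bool wakeup = false;
        int todo;

        for (;;) {
            todo = tasks_left_to_freeze();
            if (!todo || time(NULL) > end_time)
                break;
            if (wakeup_pending()) {          /* the new early exit in this patch */
                wakeup = true;
                break;
            }
            usleep(10000);                   /* give tasks time to freeze */
        }

        if (todo)
            printf("Freezing of tasks %s\n", wakeup ? "aborted" : "failed");
        else
            printf("all tasks frozen\n");
        return 0;
    }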
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d3f795f01bbc..06efa54f93d6 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -41,12 +41,29 @@ static void swsusp_set_page_forbidden(struct page *); | |||
41 | static void swsusp_unset_page_forbidden(struct page *); | 41 | static void swsusp_unset_page_forbidden(struct page *); |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * Number of bytes to reserve for memory allocations made by device drivers | ||
45 | * from their ->freeze() and ->freeze_noirq() callbacks so that they don't | ||
46 | * cause image creation to fail (tunable via /sys/power/reserved_size). | ||
47 | */ | ||
48 | unsigned long reserved_size; | ||
49 | |||
50 | void __init hibernate_reserved_size_init(void) | ||
51 | { | ||
52 | reserved_size = SPARE_PAGES * PAGE_SIZE; | ||
53 | } | ||
54 | |||
55 | /* | ||
44 | * Preferred image size in bytes (tunable via /sys/power/image_size). | 56 | * Preferred image size in bytes (tunable via /sys/power/image_size). |
45 | * When it is set to N, swsusp will do its best to ensure the image | 57 | * When it is set to N, swsusp will do its best to ensure the image |
46 | * size will not exceed N bytes, but if that is impossible, it will | 58 | * size will not exceed N bytes, but if that is impossible, it will |
47 | * try to create the smallest image possible. | 59 | * try to create the smallest image possible. |
48 | */ | 60 | */ |
49 | unsigned long image_size = 500 * 1024 * 1024; | 61 | unsigned long image_size; |
62 | |||
63 | void __init hibernate_image_size_init(void) | ||
64 | { | ||
65 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; | ||
66 | } | ||
50 | 67 | ||
51 | /* List of PBEs needed for restoring the pages that were allocated before | 68 | /* List of PBEs needed for restoring the pages that were allocated before |
52 | * the suspend and included in the suspend image, but have also been | 69 | * the suspend and included in the suspend image, but have also been |
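With this hunk the default image_size becomes two fifths of RAM, computed at boot instead of the old fixed 500 MB. A quick arithmetic check, assuming 4 KiB pages and 2 GiB of RAM (both figures are just examples):

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size = 4096;
        unsigned long totalram_pages = (2UL << 30) / page_size;  /* 2 GiB of RAM */
        unsigned long image_size = ((totalram_pages * 2) / 5) * page_size;

        /* prints 858992640 bytes, i.e. roughly 819 MiB, 2/5 of 2 GiB */
        printf("default image_size = %lu bytes (%lu MiB)\n",
               image_size, image_size >> 20);
        return 0;
    }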
@@ -979,8 +996,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
979 | src = kmap_atomic(s_page, KM_USER0); | 996 | src = kmap_atomic(s_page, KM_USER0); |
980 | dst = kmap_atomic(d_page, KM_USER1); | 997 | dst = kmap_atomic(d_page, KM_USER1); |
981 | do_copy_page(dst, src); | 998 | do_copy_page(dst, src); |
982 | kunmap_atomic(src, KM_USER0); | ||
983 | kunmap_atomic(dst, KM_USER1); | 999 | kunmap_atomic(dst, KM_USER1); |
1000 | kunmap_atomic(src, KM_USER0); | ||
984 | } else { | 1001 | } else { |
985 | if (PageHighMem(d_page)) { | 1002 | if (PageHighMem(d_page)) { |
986 | /* Page pointed to by src may contain some kernel | 1003 | /* Page pointed to by src may contain some kernel |
@@ -988,7 +1005,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
988 | */ | 1005 | */ |
989 | safe_copy_page(buffer, s_page); | 1006 | safe_copy_page(buffer, s_page); |
990 | dst = kmap_atomic(d_page, KM_USER0); | 1007 | dst = kmap_atomic(d_page, KM_USER0); |
991 | memcpy(dst, buffer, PAGE_SIZE); | 1008 | copy_page(dst, buffer); |
992 | kunmap_atomic(dst, KM_USER0); | 1009 | kunmap_atomic(dst, KM_USER0); |
993 | } else { | 1010 | } else { |
994 | safe_copy_page(page_address(d_page), s_page); | 1011 | safe_copy_page(page_address(d_page), s_page); |
@@ -1194,7 +1211,11 @@ static void free_unnecessary_pages(void) | |||
1194 | to_free_highmem = alloc_highmem - save; | 1211 | to_free_highmem = alloc_highmem - save; |
1195 | } else { | 1212 | } else { |
1196 | to_free_highmem = 0; | 1213 | to_free_highmem = 0; |
1197 | to_free_normal -= save - alloc_highmem; | 1214 | save -= alloc_highmem; |
1215 | if (to_free_normal > save) | ||
1216 | to_free_normal -= save; | ||
1217 | else | ||
1218 | to_free_normal = 0; | ||
1198 | } | 1219 | } |
1199 | 1220 | ||
1200 | memory_bm_position_reset(©_bm); | 1221 | memory_bm_position_reset(©_bm); |
@@ -1258,11 +1279,13 @@ static unsigned long minimum_image_size(unsigned long saveable) | |||
1258 | * frame in use. We also need a number of page frames to be free during | 1279 | * frame in use. We also need a number of page frames to be free during |
1259 | * hibernation for allocations made while saving the image and for device | 1280 | * hibernation for allocations made while saving the image and for device |
1260 | * drivers, in case they need to allocate memory from their hibernation | 1281 | * drivers, in case they need to allocate memory from their hibernation |
1261 | * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, | 1282 | * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough |
1262 | * respectively, both of which are rough estimates). To make this happen, we | 1283 | * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through |
1263 | * compute the total number of available page frames and allocate at least | 1284 | * /sys/power/reserved_size, respectively). To make this happen, we compute the |
1285 | * total number of available page frames and allocate at least | ||
1264 | * | 1286 | * |
1265 | * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES | 1287 | * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 |
1288 | * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) | ||
1266 | * | 1289 | * |
1267 | * of them, which corresponds to the maximum size of a hibernation image. | 1290 | * of them, which corresponds to the maximum size of a hibernation image. |
1268 | * | 1291 | * |
@@ -1317,13 +1340,16 @@ int hibernate_preallocate_memory(void) | |||
1317 | count -= totalreserve_pages; | 1340 | count -= totalreserve_pages; |
1318 | 1341 | ||
1319 | /* Compute the maximum number of saveable pages to leave in memory. */ | 1342 | /* Compute the maximum number of saveable pages to leave in memory. */ |
1320 | max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; | 1343 | max_size = (count - (size + PAGES_FOR_IO)) / 2 |
1344 | - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); | ||
1345 | /* Compute the desired number of image pages specified by image_size. */ | ||
1321 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); | 1346 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); |
1322 | if (size > max_size) | 1347 | if (size > max_size) |
1323 | size = max_size; | 1348 | size = max_size; |
1324 | /* | 1349 | /* |
1325 | * If the maximum is not less than the current number of saveable pages | 1350 | * If the desired number of image pages is at least as large as the |
1326 | * in memory, allocate page frames for the image and we're done. | 1351 | * current number of saveable pages in memory, allocate page frames for |
1352 | * the image and we're done. | ||
1327 | */ | 1353 | */ |
1328 | if (size >= saveable) { | 1354 | if (size >= saveable) { |
1329 | pages = preallocate_image_highmem(save_highmem); | 1355 | pages = preallocate_image_highmem(save_highmem); |
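The reservation term in max_size changes from a fixed 2 * SPARE_PAGES to 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE), so it now follows the /sys/power/reserved_size tunable. A small numeric sketch of the formula (every figure below is invented for illustration; PAGES_FOR_IO in particular is config-dependent):

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned long page_size = 4096;
        unsigned long count = 500000;              /* usable page frames (example) */
        unsigned long meta = 300;                  /* metadata pages (example) */
        unsigned long pages_for_io = 1024;         /* example value only */
        unsigned long reserved_size = 1024 * 1024; /* 1 MiB reserved for drivers */

        unsigned long max_size = (count - (meta + pages_for_io)) / 2
                                 - 2 * DIV_ROUND_UP(reserved_size, page_size);

        printf("max saveable pages left in memory: %lu\n", max_size);
        return 0;
    }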
@@ -1512,11 +1538,8 @@ static int | |||
1512 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | 1538 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, |
1513 | unsigned int nr_pages, unsigned int nr_highmem) | 1539 | unsigned int nr_pages, unsigned int nr_highmem) |
1514 | { | 1540 | { |
1515 | int error = 0; | ||
1516 | |||
1517 | if (nr_highmem > 0) { | 1541 | if (nr_highmem > 0) { |
1518 | error = get_highmem_buffer(PG_ANY); | 1542 | if (get_highmem_buffer(PG_ANY)) |
1519 | if (error) | ||
1520 | goto err_out; | 1543 | goto err_out; |
1521 | if (nr_highmem > alloc_highmem) { | 1544 | if (nr_highmem > alloc_highmem) { |
1522 | nr_highmem -= alloc_highmem; | 1545 | nr_highmem -= alloc_highmem; |
@@ -1539,7 +1562,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | |||
1539 | 1562 | ||
1540 | err_out: | 1563 | err_out: |
1541 | swsusp_free(); | 1564 | swsusp_free(); |
1542 | return error; | 1565 | return -ENOMEM; |
1543 | } | 1566 | } |
1544 | 1567 | ||
1545 | asmlinkage int swsusp_save(void) | 1568 | asmlinkage int swsusp_save(void) |
@@ -1680,7 +1703,7 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
1680 | memory_bm_position_reset(&orig_bm); | 1703 | memory_bm_position_reset(&orig_bm); |
1681 | memory_bm_position_reset(©_bm); | 1704 | memory_bm_position_reset(©_bm); |
1682 | } else if (handle->cur <= nr_meta_pages) { | 1705 | } else if (handle->cur <= nr_meta_pages) { |
1683 | memset(buffer, 0, PAGE_SIZE); | 1706 | clear_page(buffer); |
1684 | pack_pfns(buffer, &orig_bm); | 1707 | pack_pfns(buffer, &orig_bm); |
1685 | } else { | 1708 | } else { |
1686 | struct page *page; | 1709 | struct page *page; |
@@ -1694,7 +1717,7 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
1694 | void *kaddr; | 1717 | void *kaddr; |
1695 | 1718 | ||
1696 | kaddr = kmap_atomic(page, KM_USER0); | 1719 | kaddr = kmap_atomic(page, KM_USER0); |
1697 | memcpy(buffer, kaddr, PAGE_SIZE); | 1720 | copy_page(buffer, kaddr); |
1698 | kunmap_atomic(kaddr, KM_USER0); | 1721 | kunmap_atomic(kaddr, KM_USER0); |
1699 | handle->buffer = buffer; | 1722 | handle->buffer = buffer; |
1700 | } else { | 1723 | } else { |
@@ -1977,7 +2000,7 @@ static void copy_last_highmem_page(void) | |||
1977 | void *dst; | 2000 | void *dst; |
1978 | 2001 | ||
1979 | dst = kmap_atomic(last_highmem_page, KM_USER0); | 2002 | dst = kmap_atomic(last_highmem_page, KM_USER0); |
1980 | memcpy(dst, buffer, PAGE_SIZE); | 2003 | copy_page(dst, buffer); |
1981 | kunmap_atomic(dst, KM_USER0); | 2004 | kunmap_atomic(dst, KM_USER0); |
1982 | last_highmem_page = NULL; | 2005 | last_highmem_page = NULL; |
1983 | } | 2006 | } |
@@ -2263,11 +2286,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) | |||
2263 | 2286 | ||
2264 | kaddr1 = kmap_atomic(p1, KM_USER0); | 2287 | kaddr1 = kmap_atomic(p1, KM_USER0); |
2265 | kaddr2 = kmap_atomic(p2, KM_USER1); | 2288 | kaddr2 = kmap_atomic(p2, KM_USER1); |
2266 | memcpy(buf, kaddr1, PAGE_SIZE); | 2289 | copy_page(buf, kaddr1); |
2267 | memcpy(kaddr1, kaddr2, PAGE_SIZE); | 2290 | copy_page(kaddr1, kaddr2); |
2268 | memcpy(kaddr2, buf, PAGE_SIZE); | 2291 | copy_page(kaddr2, buf); |
2269 | kunmap_atomic(kaddr1, KM_USER0); | ||
2270 | kunmap_atomic(kaddr2, KM_USER1); | 2292 | kunmap_atomic(kaddr2, KM_USER1); |
2293 | kunmap_atomic(kaddr1, KM_USER0); | ||
2271 | } | 2294 | } |
2272 | 2295 | ||
2273 | /** | 2296 | /** |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7335952ee473..1c41ba215419 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -22,6 +22,8 @@ | |||
22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/suspend.h> | 24 | #include <linux/suspend.h> |
25 | #include <linux/syscore_ops.h> | ||
26 | #include <trace/events/power.h> | ||
25 | 27 | ||
26 | #include "power.h" | 28 | #include "power.h" |
27 | 29 | ||
@@ -30,13 +32,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = { | |||
30 | [PM_SUSPEND_MEM] = "mem", | 32 | [PM_SUSPEND_MEM] = "mem", |
31 | }; | 33 | }; |
32 | 34 | ||
33 | static struct platform_suspend_ops *suspend_ops; | 35 | static const struct platform_suspend_ops *suspend_ops; |
34 | 36 | ||
35 | /** | 37 | /** |
36 | * suspend_set_ops - Set the global suspend method table. | 38 | * suspend_set_ops - Set the global suspend method table. |
37 | * @ops: Pointer to ops structure. | 39 | * @ops: Pointer to ops structure. |
38 | */ | 40 | */ |
39 | void suspend_set_ops(struct platform_suspend_ops *ops) | 41 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
40 | { | 42 | { |
41 | mutex_lock(&pm_mutex); | 43 | mutex_lock(&pm_mutex); |
42 | suspend_ops = ops; | 44 | suspend_ops = ops; |
@@ -161,13 +163,13 @@ static int suspend_enter(suspend_state_t state) | |||
161 | arch_suspend_disable_irqs(); | 163 | arch_suspend_disable_irqs(); |
162 | BUG_ON(!irqs_disabled()); | 164 | BUG_ON(!irqs_disabled()); |
163 | 165 | ||
164 | error = sysdev_suspend(PMSG_SUSPEND); | 166 | error = syscore_suspend(); |
165 | if (!error) { | 167 | if (!error) { |
166 | if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { | 168 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { |
167 | error = suspend_ops->enter(state); | 169 | error = suspend_ops->enter(state); |
168 | events_check_enabled = false; | 170 | events_check_enabled = false; |
169 | } | 171 | } |
170 | sysdev_resume(); | 172 | syscore_resume(); |
171 | } | 173 | } |
172 | 174 | ||
173 | arch_suspend_enable_irqs(); | 175 | arch_suspend_enable_irqs(); |
@@ -197,18 +199,17 @@ static int suspend_enter(suspend_state_t state) | |||
197 | int suspend_devices_and_enter(suspend_state_t state) | 199 | int suspend_devices_and_enter(suspend_state_t state) |
198 | { | 200 | { |
199 | int error; | 201 | int error; |
200 | gfp_t saved_mask; | ||
201 | 202 | ||
202 | if (!suspend_ops) | 203 | if (!suspend_ops) |
203 | return -ENOSYS; | 204 | return -ENOSYS; |
204 | 205 | ||
206 | trace_machine_suspend(state); | ||
205 | if (suspend_ops->begin) { | 207 | if (suspend_ops->begin) { |
206 | error = suspend_ops->begin(state); | 208 | error = suspend_ops->begin(state); |
207 | if (error) | 209 | if (error) |
208 | goto Close; | 210 | goto Close; |
209 | } | 211 | } |
210 | suspend_console(); | 212 | suspend_console(); |
211 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
212 | suspend_test_start(); | 213 | suspend_test_start(); |
213 | error = dpm_suspend_start(PMSG_SUSPEND); | 214 | error = dpm_suspend_start(PMSG_SUSPEND); |
214 | if (error) { | 215 | if (error) { |
@@ -219,17 +220,17 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
219 | if (suspend_test(TEST_DEVICES)) | 220 | if (suspend_test(TEST_DEVICES)) |
220 | goto Recover_platform; | 221 | goto Recover_platform; |
221 | 222 | ||
222 | suspend_enter(state); | 223 | error = suspend_enter(state); |
223 | 224 | ||
224 | Resume_devices: | 225 | Resume_devices: |
225 | suspend_test_start(); | 226 | suspend_test_start(); |
226 | dpm_resume_end(PMSG_RESUME); | 227 | dpm_resume_end(PMSG_RESUME); |
227 | suspend_test_finish("resume devices"); | 228 | suspend_test_finish("resume devices"); |
228 | set_gfp_allowed_mask(saved_mask); | ||
229 | resume_console(); | 229 | resume_console(); |
230 | Close: | 230 | Close: |
231 | if (suspend_ops->end) | 231 | if (suspend_ops->end) |
232 | suspend_ops->end(); | 232 | suspend_ops->end(); |
233 | trace_machine_suspend(PWR_EVENT_EXIT); | ||
233 | return error; | 234 | return error; |
234 | 235 | ||
235 | Recover_platform: | 236 | Recover_platform: |
@@ -285,7 +286,9 @@ int enter_state(suspend_state_t state) | |||
285 | goto Finish; | 286 | goto Finish; |
286 | 287 | ||
287 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); | 288 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); |
289 | pm_restrict_gfp_mask(); | ||
288 | error = suspend_devices_and_enter(state); | 290 | error = suspend_devices_and_enter(state); |
291 | pm_restore_gfp_mask(); | ||
289 | 292 | ||
290 | Finish: | 293 | Finish: |
291 | pr_debug("PM: Finishing wakeup.\n"); | 294 | pr_debug("PM: Finishing wakeup.\n"); |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e6a5bdf61a37..7c97c3a0eee3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -6,6 +6,7 @@ | |||
6 | * | 6 | * |
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
9 | * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> | ||
9 | * | 10 | * |
10 | * This file is released under the GPLv2. | 11 | * This file is released under the GPLv2. |
11 | * | 12 | * |
@@ -24,10 +25,12 @@ | |||
24 | #include <linux/swapops.h> | 25 | #include <linux/swapops.h> |
25 | #include <linux/pm.h> | 26 | #include <linux/pm.h> |
26 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
28 | #include <linux/lzo.h> | ||
29 | #include <linux/vmalloc.h> | ||
27 | 30 | ||
28 | #include "power.h" | 31 | #include "power.h" |
29 | 32 | ||
30 | #define SWSUSP_SIG "S1SUSPEND" | 33 | #define HIBERNATE_SIG "S1SUSPEND" |
31 | 34 | ||
32 | /* | 35 | /* |
33 | * The swap map is a data structure used for keeping track of each page | 36 | * The swap map is a data structure used for keeping track of each page |
@@ -193,7 +196,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) | |||
193 | if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || | 196 | if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || |
194 | !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { | 197 | !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { |
195 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); | 198 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); |
196 | memcpy(swsusp_header->sig,SWSUSP_SIG, 10); | 199 | memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); |
197 | swsusp_header->image = handle->first_sector; | 200 | swsusp_header->image = handle->first_sector; |
198 | swsusp_header->flags = flags; | 201 | swsusp_header->flags = flags; |
199 | error = hib_bio_write_page(swsusp_resume_block, | 202 | error = hib_bio_write_page(swsusp_resume_block, |
@@ -221,7 +224,7 @@ static int swsusp_swap_check(void) | |||
221 | return res; | 224 | return res; |
222 | 225 | ||
223 | root_swap = res; | 226 | root_swap = res; |
224 | res = blkdev_get(hib_resume_bdev, FMODE_WRITE); | 227 | res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); |
225 | if (res) | 228 | if (res) |
226 | return res; | 229 | return res; |
227 | 230 | ||
@@ -249,7 +252,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | |||
249 | if (bio_chain) { | 252 | if (bio_chain) { |
250 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 253 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); |
251 | if (src) { | 254 | if (src) { |
252 | memcpy(src, buf, PAGE_SIZE); | 255 | copy_page(src, buf); |
253 | } else { | 256 | } else { |
254 | WARN_ON_ONCE(1); | 257 | WARN_ON_ONCE(1); |
255 | bio_chain = NULL; /* Go synchronous */ | 258 | bio_chain = NULL; /* Go synchronous */ |
@@ -323,7 +326,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
323 | error = write_page(handle->cur, handle->cur_swap, NULL); | 326 | error = write_page(handle->cur, handle->cur_swap, NULL); |
324 | if (error) | 327 | if (error) |
325 | goto out; | 328 | goto out; |
326 | memset(handle->cur, 0, PAGE_SIZE); | 329 | clear_page(handle->cur); |
327 | handle->cur_swap = offset; | 330 | handle->cur_swap = offset; |
328 | handle->k = 0; | 331 | handle->k = 0; |
329 | } | 332 | } |
@@ -357,6 +360,18 @@ static int swap_writer_finish(struct swap_map_handle *handle, | |||
357 | return error; | 360 | return error; |
358 | } | 361 | } |
359 | 362 | ||
363 | /* We need to remember how much compressed data we need to read. */ | ||
364 | #define LZO_HEADER sizeof(size_t) | ||
365 | |||
366 | /* Number of pages/bytes we'll compress at one time. */ | ||
367 | #define LZO_UNC_PAGES 32 | ||
368 | #define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) | ||
369 | |||
370 | /* Number of pages/bytes we need for compressed data (worst case). */ | ||
371 | #define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ | ||
372 | LZO_HEADER, PAGE_SIZE) | ||
373 | #define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) | ||
374 | |||
360 | /** | 375 | /** |
361 | * save_image - save the suspend image data | 376 | * save_image - save the suspend image data |
362 | */ | 377 | */ |
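The constants above size the compression buffers: 32 uncompressed pages (128 KiB with 4 KiB pages) per block, plus the worst-case LZO expansion and a size_t length header. A standalone check of the resulting LZO_CMP_PAGES (the x + x/16 + 64 + 3 bound is copied from what <linux/lzo.h> defines as lzo1x_worst_compress(); treat that exact expression as an assumption of this sketch):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* Worst-case LZO1X expansion, as assumed from <linux/lzo.h>. */
    #define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3)

    #define LZO_HEADER      sizeof(size_t)
    #define LZO_UNC_PAGES   32
    #define LZO_UNC_SIZE    (LZO_UNC_PAGES * PAGE_SIZE)
    #define LZO_CMP_PAGES   DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
                                         LZO_HEADER, PAGE_SIZE)

    int main(void)
    {
        /* 128 KiB in, at most 35 pages out (header included) on a 64-bit build */
        printf("LZO_UNC_SIZE = %lu, LZO_CMP_PAGES = %lu\n",
               (unsigned long)LZO_UNC_SIZE, (unsigned long)LZO_CMP_PAGES);
        return 0;
    }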
@@ -404,6 +419,137 @@ static int save_image(struct swap_map_handle *handle, | |||
404 | return ret; | 419 | return ret; |
405 | } | 420 | } |
406 | 421 | ||
422 | |||
423 | /** | ||
424 | * save_image_lzo - Save the suspend image data compressed with LZO. | ||
425 | * @handle: Swap map handle to use for saving the image. | ||
426 | * @snapshot: Image to read data from. | ||
427 | * @nr_to_write: Number of pages to save. | ||
428 | */ | ||
429 | static int save_image_lzo(struct swap_map_handle *handle, | ||
430 | struct snapshot_handle *snapshot, | ||
431 | unsigned int nr_to_write) | ||
432 | { | ||
433 | unsigned int m; | ||
434 | int ret = 0; | ||
435 | int nr_pages; | ||
436 | int err2; | ||
437 | struct bio *bio; | ||
438 | struct timeval start; | ||
439 | struct timeval stop; | ||
440 | size_t off, unc_len, cmp_len; | ||
441 | unsigned char *unc, *cmp, *wrk, *page; | ||
442 | |||
443 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | ||
444 | if (!page) { | ||
445 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | ||
446 | return -ENOMEM; | ||
447 | } | ||
448 | |||
449 | wrk = vmalloc(LZO1X_1_MEM_COMPRESS); | ||
450 | if (!wrk) { | ||
451 | printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); | ||
452 | free_page((unsigned long)page); | ||
453 | return -ENOMEM; | ||
454 | } | ||
455 | |||
456 | unc = vmalloc(LZO_UNC_SIZE); | ||
457 | if (!unc) { | ||
458 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | ||
459 | vfree(wrk); | ||
460 | free_page((unsigned long)page); | ||
461 | return -ENOMEM; | ||
462 | } | ||
463 | |||
464 | cmp = vmalloc(LZO_CMP_SIZE); | ||
465 | if (!cmp) { | ||
466 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | ||
467 | vfree(unc); | ||
468 | vfree(wrk); | ||
469 | free_page((unsigned long)page); | ||
470 | return -ENOMEM; | ||
471 | } | ||
472 | |||
473 | printk(KERN_INFO | ||
474 | "PM: Compressing and saving image data (%u pages) ... ", | ||
475 | nr_to_write); | ||
476 | m = nr_to_write / 100; | ||
477 | if (!m) | ||
478 | m = 1; | ||
479 | nr_pages = 0; | ||
480 | bio = NULL; | ||
481 | do_gettimeofday(&start); | ||
482 | for (;;) { | ||
483 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { | ||
484 | ret = snapshot_read_next(snapshot); | ||
485 | if (ret < 0) | ||
486 | goto out_finish; | ||
487 | |||
488 | if (!ret) | ||
489 | break; | ||
490 | |||
491 | memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); | ||
492 | |||
493 | if (!(nr_pages % m)) | ||
494 | printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); | ||
495 | nr_pages++; | ||
496 | } | ||
497 | |||
498 | if (!off) | ||
499 | break; | ||
500 | |||
501 | unc_len = off; | ||
502 | ret = lzo1x_1_compress(unc, unc_len, | ||
503 | cmp + LZO_HEADER, &cmp_len, wrk); | ||
504 | if (ret < 0) { | ||
505 | printk(KERN_ERR "PM: LZO compression failed\n"); | ||
506 | break; | ||
507 | } | ||
508 | |||
509 | if (unlikely(!cmp_len || | ||
510 | cmp_len > lzo1x_worst_compress(unc_len))) { | ||
511 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | ||
512 | ret = -1; | ||
513 | break; | ||
514 | } | ||
515 | |||
516 | *(size_t *)cmp = cmp_len; | ||
517 | |||
518 | /* | ||
519 | * Given we are writing one page at a time to disk, we copy | ||
520 | * that much from the buffer, although the last bit will likely | ||
521 | * be smaller than full page. This is OK - we saved the length | ||
522 | * of the compressed data, so any garbage at the end will be | ||
523 | * discarded when we read it. | ||
524 | */ | ||
525 | for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { | ||
526 | memcpy(page, cmp + off, PAGE_SIZE); | ||
527 | |||
528 | ret = swap_write_page(handle, page, &bio); | ||
529 | if (ret) | ||
530 | goto out_finish; | ||
531 | } | ||
532 | } | ||
533 | |||
534 | out_finish: | ||
535 | err2 = hib_wait_on_bio_chain(&bio); | ||
536 | do_gettimeofday(&stop); | ||
537 | if (!ret) | ||
538 | ret = err2; | ||
539 | if (!ret) | ||
540 | printk(KERN_CONT "\b\b\b\bdone\n"); | ||
541 | else | ||
542 | printk(KERN_CONT "\n"); | ||
543 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | ||
544 | |||
545 | vfree(cmp); | ||
546 | vfree(unc); | ||
547 | vfree(wrk); | ||
548 | free_page((unsigned long)page); | ||
549 | |||
550 | return ret; | ||
551 | } | ||
552 | |||
407 | /** | 553 | /** |
408 | * enough_swap - Make sure we have enough swap to save the image. | 554 | * enough_swap - Make sure we have enough swap to save the image. |
409 | * | 555 | * |
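save_image_lzo() above frames each compressed block as a size_t length followed by the compressed bytes, then writes whole pages, so the tail of the last page is padding that the reader later ignores because it trusts the stored length. A userspace sketch of that on-disk framing (file-based and purely illustrative, not the kernel I/O path):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PAGE_SIZE 4096UL
    #define LZO_HEADER sizeof(size_t)

    /* Write one block the way save_image_lzo() does: length header,
     * compressed payload, then padding up to the next page boundary. */
    static int write_block(FILE *out, const unsigned char *cmp, size_t cmp_len)
    {
        size_t total = LZO_HEADER + cmp_len;
        size_t padded = ((total + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
        unsigned char *buf = calloc(1, padded);

        if (!buf)
            return -1;
        memcpy(buf, &cmp_len, LZO_HEADER);
        memcpy(buf + LZO_HEADER, cmp, cmp_len);
        if (fwrite(buf, 1, padded, out) != padded) {
            free(buf);
            return -1;
        }
        free(buf);
        return 0;
    }

    int main(void)
    {
        unsigned char fake_cmp[1000] = { 0x42 };   /* pretend compressed data */
        FILE *out = fopen("image.bin", "wb");

        if (!out || write_block(out, fake_cmp, sizeof(fake_cmp)))
            return 1;
        fclose(out);
        return 0;
    }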
@@ -411,12 +557,16 @@ static int save_image(struct swap_map_handle *handle, | |||
411 | * space available from the resume partition. | 557 | * space available from the resume partition. |
412 | */ | 558 | */ |
413 | 559 | ||
414 | static int enough_swap(unsigned int nr_pages) | 560 | static int enough_swap(unsigned int nr_pages, unsigned int flags) |
415 | { | 561 | { |
416 | unsigned int free_swap = count_swap_pages(root_swap, 1); | 562 | unsigned int free_swap = count_swap_pages(root_swap, 1); |
563 | unsigned int required; | ||
417 | 564 | ||
418 | pr_debug("PM: Free swap pages: %u\n", free_swap); | 565 | pr_debug("PM: Free swap pages: %u\n", free_swap); |
419 | return free_swap > nr_pages + PAGES_FOR_IO; | 566 | |
567 | required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? | ||
568 | nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); | ||
569 | return free_swap > required; | ||
420 | } | 570 | } |
421 | 571 | ||
422 | /** | 572 | /** |
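enough_swap() now distinguishes compressed from uncompressed images: in the LZO case the requirement scales the page count by LZO_CMP_PAGES/LZO_UNC_PAGES (35/32 with 4 KiB pages) plus one, on top of PAGES_FOR_IO. A quick numeric sketch (the PAGES_FOR_IO value is an example, not taken from this diff):

    #include <stdio.h>

    int main(void)
    {
        unsigned int nr_pages = 100000;      /* image pages to write (example) */
        unsigned int pages_for_io = 1024;    /* example value only */
        unsigned int lzo_cmp_pages = 35, lzo_unc_pages = 32;

        unsigned int required = pages_for_io
                + (nr_pages * lzo_cmp_pages) / lzo_unc_pages + 1;

        /* about 110,400 swap pages needed for a 100,000 page image */
        printf("required swap pages: %u\n", required);
        return 0;
    }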
@@ -443,7 +593,7 @@ int swsusp_write(unsigned int flags) | |||
443 | printk(KERN_ERR "PM: Cannot get swap writer\n"); | 593 | printk(KERN_ERR "PM: Cannot get swap writer\n"); |
444 | return error; | 594 | return error; |
445 | } | 595 | } |
446 | if (!enough_swap(pages)) { | 596 | if (!enough_swap(pages, flags)) { |
447 | printk(KERN_ERR "PM: Not enough free swap\n"); | 597 | printk(KERN_ERR "PM: Not enough free swap\n"); |
448 | error = -ENOSPC; | 598 | error = -ENOSPC; |
449 | goto out_finish; | 599 | goto out_finish; |
@@ -458,8 +608,11 @@ int swsusp_write(unsigned int flags) | |||
458 | } | 608 | } |
459 | header = (struct swsusp_info *)data_of(snapshot); | 609 | header = (struct swsusp_info *)data_of(snapshot); |
460 | error = swap_write_page(&handle, header, NULL); | 610 | error = swap_write_page(&handle, header, NULL); |
461 | if (!error) | 611 | if (!error) { |
462 | error = save_image(&handle, &snapshot, pages - 1); | 612 | error = (flags & SF_NOCOMPRESS_MODE) ? |
613 | save_image(&handle, &snapshot, pages - 1) : | ||
614 | save_image_lzo(&handle, &snapshot, pages - 1); | ||
615 | } | ||
463 | out_finish: | 616 | out_finish: |
464 | error = swap_writer_finish(&handle, flags, error); | 617 | error = swap_writer_finish(&handle, flags, error); |
465 | return error; | 618 | return error; |
@@ -590,9 +743,152 @@ static int load_image(struct swap_map_handle *handle, | |||
590 | } | 743 | } |
591 | 744 | ||
592 | /** | 745 | /** |
746 | * load_image_lzo - Load compressed image data and decompress them with LZO. | ||
747 | * @handle: Swap map handle to use for loading data. | ||
748 | * @snapshot: Image to copy uncompressed data into. | ||
749 | * @nr_to_read: Number of pages to load. | ||
750 | */ | ||
751 | static int load_image_lzo(struct swap_map_handle *handle, | ||
752 | struct snapshot_handle *snapshot, | ||
753 | unsigned int nr_to_read) | ||
754 | { | ||
755 | unsigned int m; | ||
756 | int error = 0; | ||
757 | struct bio *bio; | ||
758 | struct timeval start; | ||
759 | struct timeval stop; | ||
760 | unsigned nr_pages; | ||
761 | size_t i, off, unc_len, cmp_len; | ||
762 | unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; | ||
763 | |||
764 | for (i = 0; i < LZO_CMP_PAGES; i++) { | ||
765 | page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | ||
766 | if (!page[i]) { | ||
767 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | ||
768 | |||
769 | while (i) | ||
770 | free_page((unsigned long)page[--i]); | ||
771 | |||
772 | return -ENOMEM; | ||
773 | } | ||
774 | } | ||
775 | |||
776 | unc = vmalloc(LZO_UNC_SIZE); | ||
777 | if (!unc) { | ||
778 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | ||
779 | |||
780 | for (i = 0; i < LZO_CMP_PAGES; i++) | ||
781 | free_page((unsigned long)page[i]); | ||
782 | |||
783 | return -ENOMEM; | ||
784 | } | ||
785 | |||
786 | cmp = vmalloc(LZO_CMP_SIZE); | ||
787 | if (!cmp) { | ||
788 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | ||
789 | |||
790 | vfree(unc); | ||
791 | for (i = 0; i < LZO_CMP_PAGES; i++) | ||
792 | free_page((unsigned long)page[i]); | ||
793 | |||
794 | return -ENOMEM; | ||
795 | } | ||
796 | |||
797 | printk(KERN_INFO | ||
798 | "PM: Loading and decompressing image data (%u pages) ... ", | ||
799 | nr_to_read); | ||
800 | m = nr_to_read / 100; | ||
801 | if (!m) | ||
802 | m = 1; | ||
803 | nr_pages = 0; | ||
804 | bio = NULL; | ||
805 | do_gettimeofday(&start); | ||
806 | |||
807 | error = snapshot_write_next(snapshot); | ||
808 | if (error <= 0) | ||
809 | goto out_finish; | ||
810 | |||
811 | for (;;) { | ||
812 | error = swap_read_page(handle, page[0], NULL); /* sync */ | ||
813 | if (error) | ||
814 | break; | ||
815 | |||
816 | cmp_len = *(size_t *)page[0]; | ||
817 | if (unlikely(!cmp_len || | ||
818 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { | ||
819 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | ||
820 | error = -1; | ||
821 | break; | ||
822 | } | ||
823 | |||
824 | for (off = PAGE_SIZE, i = 1; | ||
825 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { | ||
826 | error = swap_read_page(handle, page[i], &bio); | ||
827 | if (error) | ||
828 | goto out_finish; | ||
829 | } | ||
830 | |||
831 | error = hib_wait_on_bio_chain(&bio); /* need all data now */ | ||
832 | if (error) | ||
833 | goto out_finish; | ||
834 | |||
835 | for (off = 0, i = 0; | ||
836 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { | ||
837 | memcpy(cmp + off, page[i], PAGE_SIZE); | ||
838 | } | ||
839 | |||
840 | unc_len = LZO_UNC_SIZE; | ||
841 | error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, | ||
842 | unc, &unc_len); | ||
843 | if (error < 0) { | ||
844 | printk(KERN_ERR "PM: LZO decompression failed\n"); | ||
845 | break; | ||
846 | } | ||
847 | |||
848 | if (unlikely(!unc_len || | ||
849 | unc_len > LZO_UNC_SIZE || | ||
850 | unc_len & (PAGE_SIZE - 1))) { | ||
851 | printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); | ||
852 | error = -1; | ||
853 | break; | ||
854 | } | ||
855 | |||
856 | for (off = 0; off < unc_len; off += PAGE_SIZE) { | ||
857 | memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); | ||
858 | |||
859 | if (!(nr_pages % m)) | ||
860 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
861 | nr_pages++; | ||
862 | |||
863 | error = snapshot_write_next(snapshot); | ||
864 | if (error <= 0) | ||
865 | goto out_finish; | ||
866 | } | ||
867 | } | ||
868 | |||
869 | out_finish: | ||
870 | do_gettimeofday(&stop); | ||
871 | if (!error) { | ||
872 | printk("\b\b\b\bdone\n"); | ||
873 | snapshot_write_finalize(snapshot); | ||
874 | if (!snapshot_image_loaded(snapshot)) | ||
875 | error = -ENODATA; | ||
876 | } else | ||
877 | printk("\n"); | ||
878 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | ||
879 | |||
880 | vfree(cmp); | ||
881 | vfree(unc); | ||
882 | for (i = 0; i < LZO_CMP_PAGES; i++) | ||
883 | free_page((unsigned long)page[i]); | ||
884 | |||
885 | return error; | ||
886 | } | ||
887 | |||
888 | /** | ||
593 | * swsusp_read - read the hibernation image. | 889 | * swsusp_read - read the hibernation image. |
594 | * @flags_p: flags passed by the "frozen" kernel in the image header should | 890 | * @flags_p: flags passed by the "frozen" kernel in the image header should |
595 | * be written into this memeory location | 891 | * be written into this memory location |
596 | */ | 892 | */ |
597 | 893 | ||
598 | int swsusp_read(unsigned int *flags_p) | 894 | int swsusp_read(unsigned int *flags_p) |
@@ -612,8 +908,11 @@ int swsusp_read(unsigned int *flags_p) | |||
612 | goto end; | 908 | goto end; |
613 | if (!error) | 909 | if (!error) |
614 | error = swap_read_page(&handle, header, NULL); | 910 | error = swap_read_page(&handle, header, NULL); |
615 | if (!error) | 911 | if (!error) { |
616 | error = load_image(&handle, &snapshot, header->pages - 1); | 912 | error = (*flags_p & SF_NOCOMPRESS_MODE) ? |
913 | load_image(&handle, &snapshot, header->pages - 1) : | ||
914 | load_image_lzo(&handle, &snapshot, header->pages - 1); | ||
915 | } | ||
617 | swap_reader_finish(&handle); | 916 | swap_reader_finish(&handle); |
618 | end: | 917 | end: |
619 | if (!error) | 918 | if (!error) |
@@ -631,16 +930,17 @@ int swsusp_check(void) | |||
631 | { | 930 | { |
632 | int error; | 931 | int error; |
633 | 932 | ||
634 | hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | 933 | hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, |
934 | FMODE_READ, NULL); | ||
635 | if (!IS_ERR(hib_resume_bdev)) { | 935 | if (!IS_ERR(hib_resume_bdev)) { |
636 | set_blocksize(hib_resume_bdev, PAGE_SIZE); | 936 | set_blocksize(hib_resume_bdev, PAGE_SIZE); |
637 | memset(swsusp_header, 0, PAGE_SIZE); | 937 | clear_page(swsusp_header); |
638 | error = hib_bio_read_page(swsusp_resume_block, | 938 | error = hib_bio_read_page(swsusp_resume_block, |
639 | swsusp_header, NULL); | 939 | swsusp_header, NULL); |
640 | if (error) | 940 | if (error) |
641 | goto put; | 941 | goto put; |
642 | 942 | ||
643 | if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { | 943 | if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { |
644 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); | 944 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); |
645 | /* Reset swap signature now */ | 945 | /* Reset swap signature now */ |
646 | error = hib_bio_write_page(swsusp_resume_block, | 946 | error = hib_bio_write_page(swsusp_resume_block, |
@@ -653,13 +953,13 @@ put: | |||
653 | if (error) | 953 | if (error) |
654 | blkdev_put(hib_resume_bdev, FMODE_READ); | 954 | blkdev_put(hib_resume_bdev, FMODE_READ); |
655 | else | 955 | else |
656 | pr_debug("PM: Signature found, resuming\n"); | 956 | pr_debug("PM: Image signature found, resuming\n"); |
657 | } else { | 957 | } else { |
658 | error = PTR_ERR(hib_resume_bdev); | 958 | error = PTR_ERR(hib_resume_bdev); |
659 | } | 959 | } |
660 | 960 | ||
661 | if (error) | 961 | if (error) |
662 | pr_debug("PM: Error %d checking image file\n", error); | 962 | pr_debug("PM: Image not found (code %d)\n", error); |
663 | 963 | ||
664 | return error; | 964 | return error; |
665 | } | 965 | } |
diff --git a/kernel/power/user.c b/kernel/power/user.c index e819e17877ca..42ddbc6f0de6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
113 | if (error) | 113 | if (error) |
114 | pm_notifier_call_chain(PM_POST_RESTORE); | 114 | pm_notifier_call_chain(PM_POST_RESTORE); |
115 | } | 115 | } |
116 | if (error) | 116 | if (error) { |
117 | free_basic_memory_bitmaps(); | ||
117 | atomic_inc(&snapshot_device_available); | 118 | atomic_inc(&snapshot_device_available); |
119 | } | ||
118 | data->frozen = 0; | 120 | data->frozen = 0; |
119 | data->ready = 0; | 121 | data->ready = 0; |
120 | data->platform_support = 0; | 122 | data->platform_support = 0; |
@@ -135,9 +137,11 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
135 | free_basic_memory_bitmaps(); | 137 | free_basic_memory_bitmaps(); |
136 | data = filp->private_data; | 138 | data = filp->private_data; |
137 | free_all_swap_pages(data->swap); | 139 | free_all_swap_pages(data->swap); |
138 | if (data->frozen) | 140 | if (data->frozen) { |
141 | pm_restore_gfp_mask(); | ||
139 | thaw_processes(); | 142 | thaw_processes(); |
140 | pm_notifier_call_chain(data->mode == O_WRONLY ? | 143 | } |
144 | pm_notifier_call_chain(data->mode == O_RDONLY ? | ||
141 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 145 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
142 | atomic_inc(&snapshot_device_available); | 146 | atomic_inc(&snapshot_device_available); |
143 | 147 | ||
@@ -263,6 +267,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
263 | case SNAPSHOT_UNFREEZE: | 267 | case SNAPSHOT_UNFREEZE: |
264 | if (!data->frozen || data->ready) | 268 | if (!data->frozen || data->ready) |
265 | break; | 269 | break; |
270 | pm_restore_gfp_mask(); | ||
266 | thaw_processes(); | 271 | thaw_processes(); |
267 | usermodehelper_enable(); | 272 | usermodehelper_enable(); |
268 | data->frozen = 0; | 273 | data->frozen = 0; |
@@ -275,6 +280,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
275 | error = -EPERM; | 280 | error = -EPERM; |
276 | break; | 281 | break; |
277 | } | 282 | } |
283 | pm_restore_gfp_mask(); | ||
278 | error = hibernation_snapshot(data->platform_support); | 284 | error = hibernation_snapshot(data->platform_support); |
279 | if (!error) | 285 | if (!error) |
280 | error = put_user(in_suspend, (int __user *)arg); | 286 | error = put_user(in_suspend, (int __user *)arg); |
@@ -377,6 +383,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
377 | * PM_HIBERNATION_PREPARE | 383 | * PM_HIBERNATION_PREPARE |
378 | */ | 384 | */ |
379 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); | 385 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); |
386 | data->ready = 0; | ||
380 | break; | 387 | break; |
381 | 388 | ||
382 | case SNAPSHOT_PLATFORM_SUPPORT: | 389 | case SNAPSHOT_PLATFORM_SUPPORT: |
diff --git a/kernel/printk.c b/kernel/printk.c index 9dc8ea140426..b799a2ee96e5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/smp.h> | 31 | #include <linux/smp.h> |
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
34 | #include <linux/memblock.h> | ||
34 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
35 | #include <linux/kexec.h> | 36 | #include <linux/kexec.h> |
36 | #include <linux/kdb.h> | 37 | #include <linux/kdb.h> |
@@ -39,16 +40,11 @@ | |||
39 | #include <linux/syslog.h> | 40 | #include <linux/syslog.h> |
40 | #include <linux/cpu.h> | 41 | #include <linux/cpu.h> |
41 | #include <linux/notifier.h> | 42 | #include <linux/notifier.h> |
43 | #include <linux/rculist.h> | ||
42 | 44 | ||
43 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
44 | 46 | ||
45 | /* | 47 | /* |
46 | * for_each_console() allows you to iterate on each console | ||
47 | */ | ||
48 | #define for_each_console(con) \ | ||
49 | for (con = console_drivers; con != NULL; con = con->next) | ||
50 | |||
51 | /* | ||
52 | * Architectures can override it: | 48 | * Architectures can override it: |
53 | */ | 49 | */ |
54 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | 50 | void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) |
@@ -58,7 +54,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | |||
58 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 54 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
59 | 55 | ||
60 | /* printk's without a loglevel use this.. */ | 56 | /* printk's without a loglevel use this.. */ |
61 | #define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ | 57 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
62 | 58 | ||
63 | /* We show everything that is MORE important than this.. */ | 59 | /* We show everything that is MORE important than this.. */ |
64 | #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ | 60 | #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ |
@@ -92,7 +88,7 @@ EXPORT_SYMBOL(oops_in_progress); | |||
92 | * provides serialisation for access to the entire console | 88 | * provides serialisation for access to the entire console |
93 | * driver system. | 89 | * driver system. |
94 | */ | 90 | */ |
95 | static DECLARE_MUTEX(console_sem); | 91 | static DEFINE_SEMAPHORE(console_sem); |
96 | struct console *console_drivers; | 92 | struct console *console_drivers; |
97 | EXPORT_SYMBOL_GPL(console_drivers); | 93 | EXPORT_SYMBOL_GPL(console_drivers); |
98 | 94 | ||
@@ -109,7 +105,7 @@ static int console_locked, console_suspended; | |||
109 | /* | 105 | /* |
110 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | 106 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars |
111 | * It is also used in interesting ways to provide interlocking in | 107 | * It is also used in interesting ways to provide interlocking in |
112 | * release_console_sem(). | 108 | * console_unlock(). |
113 | */ | 109 | */ |
114 | static DEFINE_SPINLOCK(logbuf_lock); | 110 | static DEFINE_SPINLOCK(logbuf_lock); |
115 | 111 | ||
@@ -125,6 +121,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol | |||
125 | static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ | 121 | static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ |
126 | 122 | ||
127 | /* | 123 | /* |
124 | * If exclusive_console is non-NULL then only this console is to be printed to. | ||
125 | */ | ||
126 | static struct console *exclusive_console; | ||
127 | |||
128 | /* | ||
128 | * Array of consoles built from command line options (console=) | 129 | * Array of consoles built from command line options (console=) |
129 | */ | 130 | */ |
130 | struct console_cmdline | 131 | struct console_cmdline |
@@ -174,50 +175,78 @@ void log_buf_kexec_setup(void) | |||
174 | } | 175 | } |
175 | #endif | 176 | #endif |
176 | 177 | ||
178 | /* requested log_buf_len from kernel cmdline */ | ||
179 | static unsigned long __initdata new_log_buf_len; | ||
180 | |||
181 | /* save requested log_buf_len since it's too early to process it */ | ||
177 | static int __init log_buf_len_setup(char *str) | 182 | static int __init log_buf_len_setup(char *str) |
178 | { | 183 | { |
179 | unsigned size = memparse(str, &str); | 184 | unsigned size = memparse(str, &str); |
180 | unsigned long flags; | ||
181 | 185 | ||
182 | if (size) | 186 | if (size) |
183 | size = roundup_pow_of_two(size); | 187 | size = roundup_pow_of_two(size); |
184 | if (size > log_buf_len) { | 188 | if (size > log_buf_len) |
185 | unsigned start, dest_idx, offset; | 189 | new_log_buf_len = size; |
186 | char *new_log_buf; | ||
187 | 190 | ||
188 | new_log_buf = alloc_bootmem(size); | 191 | return 0; |
189 | if (!new_log_buf) { | 192 | } |
190 | printk(KERN_WARNING "log_buf_len: allocation failed\n"); | 193 | early_param("log_buf_len", log_buf_len_setup); |
191 | goto out; | ||
192 | } | ||
193 | 194 | ||
194 | spin_lock_irqsave(&logbuf_lock, flags); | 195 | void __init setup_log_buf(int early) |
195 | log_buf_len = size; | 196 | { |
196 | log_buf = new_log_buf; | 197 | unsigned long flags; |
197 | 198 | unsigned start, dest_idx, offset; | |
198 | offset = start = min(con_start, log_start); | 199 | char *new_log_buf; |
199 | dest_idx = 0; | 200 | int free; |
200 | while (start != log_end) { | 201 | |
201 | log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; | 202 | if (!new_log_buf_len) |
202 | start++; | 203 | return; |
203 | dest_idx++; | 204 | |
204 | } | 205 | if (early) { |
205 | log_start -= offset; | 206 | unsigned long mem; |
206 | con_start -= offset; | ||
207 | log_end -= offset; | ||
208 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
209 | 207 | ||
210 | printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); | 208 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); |
209 | if (mem == MEMBLOCK_ERROR) | ||
210 | return; | ||
211 | new_log_buf = __va(mem); | ||
212 | } else { | ||
213 | new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); | ||
211 | } | 214 | } |
212 | out: | ||
213 | return 1; | ||
214 | } | ||
215 | 215 | ||
216 | __setup("log_buf_len=", log_buf_len_setup); | 216 | if (unlikely(!new_log_buf)) { |
217 | pr_err("log_buf_len: %ld bytes not available\n", | ||
218 | new_log_buf_len); | ||
219 | return; | ||
220 | } | ||
221 | |||
222 | spin_lock_irqsave(&logbuf_lock, flags); | ||
223 | log_buf_len = new_log_buf_len; | ||
224 | log_buf = new_log_buf; | ||
225 | new_log_buf_len = 0; | ||
226 | free = __LOG_BUF_LEN - log_end; | ||
227 | |||
228 | offset = start = min(con_start, log_start); | ||
229 | dest_idx = 0; | ||
230 | while (start != log_end) { | ||
231 | unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); | ||
232 | |||
233 | log_buf[dest_idx] = __log_buf[log_idx_mask]; | ||
234 | start++; | ||
235 | dest_idx++; | ||
236 | } | ||
237 | log_start -= offset; | ||
238 | con_start -= offset; | ||
239 | log_end -= offset; | ||
240 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
241 | |||
242 | pr_info("log_buf_len: %d\n", log_buf_len); | ||
243 | pr_info("early log buf free: %d(%d%%)\n", | ||
244 | free, (free * 100) / __LOG_BUF_LEN); | ||
245 | } | ||
217 | 246 | ||
218 | #ifdef CONFIG_BOOT_PRINTK_DELAY | 247 | #ifdef CONFIG_BOOT_PRINTK_DELAY |
219 | 248 | ||
220 | static unsigned int boot_delay; /* msecs delay after each printk during bootup */ | 249 | static int boot_delay; /* msecs delay after each printk during bootup */ |
221 | static unsigned long long loops_per_msec; /* based on boot_delay */ | 250 | static unsigned long long loops_per_msec; /* based on boot_delay */ |
222 | 251 | ||
223 | static int __init boot_delay_setup(char *str) | 252 | static int __init boot_delay_setup(char *str) |
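The setup_log_buf() path above defers a log_buf_len= request from the command line until an allocator (memblock or bootmem) is usable, then copies the statically sized ring into the larger buffer and rebases log_start/con_start/log_end by a common offset. Below is a minimal userspace sketch of that copy-and-rebase step, assuming a power-of-two source buffer and free-running indices; the names and sizes (OLD_LEN, resize_copy) are illustrative, not the kernel's.

#include <stdio.h>

#define OLD_LEN 16u                 /* power of two, stands in for __LOG_BUF_LEN */

static char old_buf[OLD_LEN];
static unsigned log_start, con_start, log_end;   /* free-running indices */

/* Copy every byte the indices still reference into new_buf and rebase the
 * indices so that they start from 0 in the new buffer. */
static void resize_copy(char *new_buf, unsigned new_len)
{
    unsigned offset, start, dest = 0;

    offset = start = (con_start < log_start) ? con_start : log_start;
    while (start != log_end && dest < new_len) {
        new_buf[dest++] = old_buf[start & (OLD_LEN - 1)];
        start++;
    }
    log_start -= offset;
    con_start -= offset;
    log_end -= offset;
}

int main(void)
{
    char new_buf[64];
    const char *msg = "<6>hello from the ring\n";
    unsigned i;

    /* fill the small ring roughly the way emit_log_char() would */
    for (i = 0; msg[i]; i++)
        old_buf[log_end++ & (OLD_LEN - 1)] = msg[i];
    con_start = log_start = (log_end > OLD_LEN) ? log_end - OLD_LEN : 0;

    resize_copy(new_buf, sizeof(new_buf));
    printf("rebased: start=%u end=%u (%u bytes kept)\n",
           log_start, log_end, log_end - log_start);
    return 0;
}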
@@ -268,14 +297,55 @@ static inline void boot_delay_msec(void) | |||
268 | } | 297 | } |
269 | #endif | 298 | #endif |
270 | 299 | ||
300 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT | ||
301 | int dmesg_restrict = 1; | ||
302 | #else | ||
303 | int dmesg_restrict; | ||
304 | #endif | ||
305 | |||
306 | static int syslog_action_restricted(int type) | ||
307 | { | ||
308 | if (dmesg_restrict) | ||
309 | return 1; | ||
310 | /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ | ||
311 | return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; | ||
312 | } | ||
313 | |||
314 | static int check_syslog_permissions(int type, bool from_file) | ||
315 | { | ||
316 | /* | ||
317 | * If this is from /proc/kmsg and we've already opened it, then we've | ||
318 | * already done the capabilities checks at open time. | ||
319 | */ | ||
320 | if (from_file && type != SYSLOG_ACTION_OPEN) | ||
321 | return 0; | ||
322 | |||
323 | if (syslog_action_restricted(type)) { | ||
324 | if (capable(CAP_SYSLOG)) | ||
325 | return 0; | ||
326 | /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ | ||
327 | if (capable(CAP_SYS_ADMIN)) { | ||
328 | WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " | ||
329 | "but no CAP_SYSLOG (deprecated).\n"); | ||
330 | return 0; | ||
331 | } | ||
332 | return -EPERM; | ||
333 | } | ||
334 | return 0; | ||
335 | } | ||
336 | |||
271 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 337 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
272 | { | 338 | { |
273 | unsigned i, j, limit, count; | 339 | unsigned i, j, limit, count; |
274 | int do_clear = 0; | 340 | int do_clear = 0; |
275 | char c; | 341 | char c; |
276 | int error = 0; | 342 | int error; |
343 | |||
344 | error = check_syslog_permissions(type, from_file); | ||
345 | if (error) | ||
346 | goto out; | ||
277 | 347 | ||
278 | error = security_syslog(type, from_file); | 348 | error = security_syslog(type); |
279 | if (error) | 349 | if (error) |
280 | return error; | 350 | return error; |
281 | 351 | ||
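The new check_syslog_permissions() means that with dmesg_restrict set (the default under CONFIG_SECURITY_DMESG_RESTRICT), an unprivileged SYSLOG_ACTION_READ_ALL or SYSLOG_ACTION_SIZE_BUFFER request now fails with EPERM unless the caller has CAP_SYSLOG; CAP_SYS_ADMIN still works but triggers the deprecation warning. A small userspace probe of this behaviour, sketched with the glibc klogctl(3) wrapper (the action constants are spelled out because sys/klog.h does not define them):

#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

#define SYSLOG_ACTION_READ_ALL    3
#define SYSLOG_ACTION_SIZE_BUFFER 10

int main(void)
{
    int len = klogctl(SYSLOG_ACTION_SIZE_BUFFER, NULL, 0);
    char *buf;

    if (len < 0) {
        /* EPERM here: dmesg_restrict is on and we lack CAP_SYSLOG */
        perror("SYSLOG_ACTION_SIZE_BUFFER");
        return 1;
    }

    buf = malloc(len);
    if (!buf)
        return 1;

    len = klogctl(SYSLOG_ACTION_READ_ALL, buf, len);
    if (len < 0) {
        perror("SYSLOG_ACTION_READ_ALL");
        free(buf);
        return 1;
    }
    fwrite(buf, 1, len, stdout);
    free(buf);
    return 0;
}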
@@ -447,6 +517,8 @@ static void __call_console_drivers(unsigned start, unsigned end) | |||
447 | struct console *con; | 517 | struct console *con; |
448 | 518 | ||
449 | for_each_console(con) { | 519 | for_each_console(con) { |
520 | if (exclusive_console && con != exclusive_console) | ||
521 | continue; | ||
450 | if ((con->flags & CON_ENABLED) && con->write && | 522 | if ((con->flags & CON_ENABLED) && con->write && |
451 | (cpu_online(smp_processor_id()) || | 523 | (cpu_online(smp_processor_id()) || |
452 | (con->flags & CON_ANYTIME))) | 524 | (con->flags & CON_ANYTIME))) |
@@ -486,9 +558,74 @@ static void _call_console_drivers(unsigned start, | |||
486 | } | 558 | } |
487 | 559 | ||
488 | /* | 560 | /* |
561 | * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the | ||
562 | * lower 3 bit are the log level, the rest are the log facility. In case | ||
563 | * userspace passes usual userspace syslog messages to /dev/kmsg or | ||
564 | * /dev/ttyprintk, the log prefix might contain the facility. Printk needs | ||
565 | * to extract the correct log level for in-kernel processing, and not mangle | ||
566 | * the original value. | ||
567 | * | ||
568 | * If a prefix is found, the length of the prefix is returned. If 'level' is | ||
569 | * passed, it will be filled in with the log level without a possible facility | ||
570 | * value. If 'special' is passed, the special printk prefix chars are accepted | ||
571 | * and returned. If no valid header is found, 0 is returned and the passed | ||
572 | * variables are not touched. | ||
573 | */ | ||
574 | static size_t log_prefix(const char *p, unsigned int *level, char *special) | ||
575 | { | ||
576 | unsigned int lev = 0; | ||
577 | char sp = '\0'; | ||
578 | size_t len; | ||
579 | |||
580 | if (p[0] != '<' || !p[1]) | ||
581 | return 0; | ||
582 | if (p[2] == '>') { | ||
583 | /* usual single digit level number or special char */ | ||
584 | switch (p[1]) { | ||
585 | case '0' ... '7': | ||
586 | lev = p[1] - '0'; | ||
587 | break; | ||
588 | case 'c': /* KERN_CONT */ | ||
589 | case 'd': /* KERN_DEFAULT */ | ||
590 | sp = p[1]; | ||
591 | break; | ||
592 | default: | ||
593 | return 0; | ||
594 | } | ||
595 | len = 3; | ||
596 | } else { | ||
597 | /* multi digit including the level and facility number */ | ||
598 | char *endp = NULL; | ||
599 | |||
600 | if (p[1] < '0' || p[1] > '9') | ||
601 | return 0; | ||
602 | |||
603 | lev = (simple_strtoul(&p[1], &endp, 10) & 7); | ||
604 | if (endp == NULL || endp[0] != '>') | ||
605 | return 0; | ||
606 | len = (endp + 1) - p; | ||
607 | } | ||
608 | |||
609 | /* do not accept special char if not asked for */ | ||
610 | if (sp && !special) | ||
611 | return 0; | ||
612 | |||
613 | if (special) { | ||
614 | *special = sp; | ||
615 | /* return special char, do not touch level */ | ||
616 | if (sp) | ||
617 | return len; | ||
618 | } | ||
619 | |||
620 | if (level) | ||
621 | *level = lev; | ||
622 | return len; | ||
623 | } | ||
624 | |||
625 | /* | ||
489 | * Call the console drivers, asking them to write out | 626 | * Call the console drivers, asking them to write out |
490 | * log_buf[start] to log_buf[end - 1]. | 627 | * log_buf[start] to log_buf[end - 1]. |
491 | * The console_sem must be held. | 628 | * The console_lock must be held. |
492 | */ | 629 | */ |
493 | static void call_console_drivers(unsigned start, unsigned end) | 630 | static void call_console_drivers(unsigned start, unsigned end) |
494 | { | 631 | { |
@@ -500,13 +637,9 @@ static void call_console_drivers(unsigned start, unsigned end) | |||
500 | cur_index = start; | 637 | cur_index = start; |
501 | start_print = start; | 638 | start_print = start; |
502 | while (cur_index != end) { | 639 | while (cur_index != end) { |
503 | if (msg_level < 0 && ((end - cur_index) > 2) && | 640 | if (msg_level < 0 && ((end - cur_index) > 2)) { |
504 | LOG_BUF(cur_index + 0) == '<' && | 641 | /* strip log prefix */ |
505 | LOG_BUF(cur_index + 1) >= '0' && | 642 | cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); |
506 | LOG_BUF(cur_index + 1) <= '7' && | ||
507 | LOG_BUF(cur_index + 2) == '>') { | ||
508 | msg_level = LOG_BUF(cur_index + 1) - '0'; | ||
509 | cur_index += 3; | ||
510 | start_print = cur_index; | 643 | start_print = cur_index; |
511 | } | 644 | } |
512 | while (cur_index != end) { | 645 | while (cur_index != end) { |
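The log_prefix() helper added above accepts both the classic single-character headers and the multi-digit facility+level form: "<4>msg" yields level 4 and length 3, "<13>msg" (facility 1, level 5, as a userspace syslog writer would produce) yields level 13 & 7 = 5 and length 4, and "<c>"/"<d>" are reported only when a 'special' pointer is supplied. Below is a standalone re-creation of the parsing rules for experimentation, a sketch that substitutes strtoul for the kernel's simple_strtoul.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Userspace re-creation of the parsing rules, for experimentation only. */
static size_t log_prefix(const char *p, unsigned int *level, char *special)
{
    unsigned int lev = 0;
    char sp = '\0';
    size_t len;

    if (p[0] != '<' || !p[1])
        return 0;
    if (p[2] == '>') {
        if (p[1] >= '0' && p[1] <= '7')
            lev = p[1] - '0';
        else if (p[1] == 'c' || p[1] == 'd')
            sp = p[1];
        else
            return 0;
        len = 3;
    } else {
        char *endp = NULL;

        if (p[1] < '0' || p[1] > '9')
            return 0;
        lev = strtoul(&p[1], &endp, 10) & 7;
        if (endp == NULL || endp[0] != '>')
            return 0;
        len = (endp + 1) - p;
    }

    if (sp && !special)
        return 0;
    if (special) {
        *special = sp;
        if (sp)
            return len;     /* special char: level is left untouched */
    }
    if (level)
        *level = lev;
    return len;
}

int main(void)
{
    const char *tests[] = { "<4>warning", "<13>from userspace", "<c>continued", "plain" };
    size_t i;

    for (i = 0; i < 4; i++) {
        unsigned int level = 99;   /* sentinel: stays 99 when the prefix is special */
        char special = '\0';
        size_t len = log_prefix(tests[i], &level, &special);

        printf("%-22s len=%zu level=%u special=%c\n",
               tests[i], len, level, special ? special : '-');
    }
    return 0;
}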
@@ -563,7 +696,7 @@ static void zap_locks(void) | |||
563 | /* If a crash is occurring, make sure we can't deadlock */ | 696 | /* If a crash is occurring, make sure we can't deadlock */ |
564 | spin_lock_init(&logbuf_lock); | 697 | spin_lock_init(&logbuf_lock); |
565 | /* And make sure that we print immediately */ | 698 | /* And make sure that we print immediately */ |
566 | init_MUTEX(&console_sem); | 699 | sema_init(&console_sem, 1); |
567 | } | 700 | } |
568 | 701 | ||
569 | #if defined(CONFIG_PRINTK_TIME) | 702 | #if defined(CONFIG_PRINTK_TIME) |
@@ -591,11 +724,11 @@ static int have_callable_console(void) | |||
591 | * | 724 | * |
592 | * This is printk(). It can be called from any context. We want it to work. | 725 | * This is printk(). It can be called from any context. We want it to work. |
593 | * | 726 | * |
594 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | 727 | * We try to grab the console_lock. If we succeed, it's easy - we log the output and |
595 | * call the console drivers. If we fail to get the semaphore we place the output | 728 | * call the console drivers. If we fail to get the semaphore we place the output |
596 | * into the log buffer and return. The current holder of the console_sem will | 729 | * into the log buffer and return. The current holder of the console_sem will |
597 | * notice the new output in release_console_sem() and will send it to the | 730 | * notice the new output in console_unlock() and will send it to the |
598 | * consoles before releasing the semaphore. | 731 | * consoles before releasing the lock. |
599 | * | 732 | * |
600 | * One effect of this deferred printing is that code which calls printk() and | 733 | * One effect of this deferred printing is that code which calls printk() and |
601 | * then changes console_loglevel may break. This is because console_loglevel | 734 | * then changes console_loglevel may break. This is because console_loglevel |
@@ -646,18 +779,19 @@ static inline int can_use_console(unsigned int cpu) | |||
646 | /* | 779 | /* |
647 | * Try to get console ownership to actually show the kernel | 780 | * Try to get console ownership to actually show the kernel |
648 | * messages from a 'printk'. Return true (and with the | 781 | * messages from a 'printk'. Return true (and with the |
649 | * console_semaphore held, and 'console_locked' set) if it | 782 | * console_lock held, and 'console_locked' set) if it |
650 | * is successful, false otherwise. | 783 | * is successful, false otherwise. |
651 | * | 784 | * |
652 | * This gets called with the 'logbuf_lock' spinlock held and | 785 | * This gets called with the 'logbuf_lock' spinlock held and |
653 | * interrupts disabled. It should return with 'lockbuf_lock' | 786 | * interrupts disabled. It should return with 'lockbuf_lock' |
654 | * released but interrupts still disabled. | 787 | * released but interrupts still disabled. |
655 | */ | 788 | */ |
656 | static int acquire_console_semaphore_for_printk(unsigned int cpu) | 789 | static int console_trylock_for_printk(unsigned int cpu) |
790 | __releases(&logbuf_lock) | ||
657 | { | 791 | { |
658 | int retval = 0; | 792 | int retval = 0; |
659 | 793 | ||
660 | if (!try_acquire_console_sem()) { | 794 | if (console_trylock()) { |
661 | retval = 1; | 795 | retval = 1; |
662 | 796 | ||
663 | /* | 797 | /* |
@@ -703,6 +837,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
703 | unsigned long flags; | 837 | unsigned long flags; |
704 | int this_cpu; | 838 | int this_cpu; |
705 | char *p; | 839 | char *p; |
840 | size_t plen; | ||
841 | char special; | ||
706 | 842 | ||
707 | boot_delay_msec(); | 843 | boot_delay_msec(); |
708 | printk_delay(); | 844 | printk_delay(); |
@@ -746,45 +882,52 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
746 | if (trace_override && !trace_recurse) | 882 | if (trace_override && !trace_recurse) |
747 | TRACE("%s", printk_buf); | 883 | TRACE("%s", printk_buf); |
748 | 884 | ||
749 | |||
750 | p = printk_buf; | 885 | p = printk_buf; |
751 | 886 | ||
752 | /* Do we have a loglevel in the string? */ | 887 | /* Read log level and handle special printk prefix */ |
753 | if (p[0] == '<') { | 888 | plen = log_prefix(p, &current_log_level, &special); |
754 | unsigned char c = p[1]; | 889 | if (plen) { |
755 | if (c && p[2] == '>') { | 890 | p += plen; |
756 | switch (c) { | 891 | |
757 | case '0' ... '7': /* loglevel */ | 892 | switch (special) { |
758 | current_log_level = c - '0'; | 893 | case 'c': /* Strip <c> KERN_CONT, continue line */ |
759 | /* Fallthrough - make sure we're on a new line */ | 894 | plen = 0; |
760 | case 'd': /* KERN_DEFAULT */ | 895 | break; |
761 | if (!new_text_line) { | 896 | case 'd': /* Strip <d> KERN_DEFAULT, start new line */ |
762 | emit_log_char('\n'); | 897 | plen = 0; |
763 | new_text_line = 1; | 898 | default: |
764 | } | 899 | if (!new_text_line) { |
765 | /* Fallthrough - skip the loglevel */ | 900 | emit_log_char('\n'); |
766 | case 'c': /* KERN_CONT */ | 901 | new_text_line = 1; |
767 | p += 3; | ||
768 | break; | ||
769 | } | 902 | } |
770 | } | 903 | } |
771 | } | 904 | } |
772 | 905 | ||
773 | /* | 906 | /* |
774 | * Copy the output into log_buf. If the caller didn't provide | 907 | * Copy the output into log_buf. If the caller didn't provide |
775 | * appropriate log level tags, we insert them here | 908 | * the appropriate log prefix, we insert it here |
776 | */ | 909 | */ |
777 | for ( ; *p; p++) { | 910 | for (; *p; p++) { |
778 | if (new_text_line) { | 911 | if (new_text_line) { |
779 | /* Always output the token */ | ||
780 | emit_log_char('<'); | ||
781 | emit_log_char(current_log_level + '0'); | ||
782 | emit_log_char('>'); | ||
783 | printed_len += 3; | ||
784 | new_text_line = 0; | 912 | new_text_line = 0; |
785 | 913 | ||
914 | if (plen) { | ||
915 | /* Copy original log prefix */ | ||
916 | int i; | ||
917 | |||
918 | for (i = 0; i < plen; i++) | ||
919 | emit_log_char(printk_buf[i]); | ||
920 | printed_len += plen; | ||
921 | } else { | ||
922 | /* Add log prefix */ | ||
923 | emit_log_char('<'); | ||
924 | emit_log_char(current_log_level + '0'); | ||
925 | emit_log_char('>'); | ||
926 | printed_len += 3; | ||
927 | } | ||
928 | |||
786 | if (printk_time) { | 929 | if (printk_time) { |
787 | /* Follow the token with the time */ | 930 | /* Add the current time stamp */ |
788 | char tbuf[50], *tp; | 931 | char tbuf[50], *tp; |
789 | unsigned tlen; | 932 | unsigned tlen; |
790 | unsigned long long t; | 933 | unsigned long long t; |
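The net effect of this vprintk() change is that a prefix supplied by userspace is copied into log_buf verbatim instead of being rewritten as a bare level, so the syslog facility survives into the log. A hedged userspace sketch that injects a message with facility 1 (user) and level 5 (notice), i.e. prefix <13>, assuming a writable /dev/kmsg and sufficient privileges:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    /* <13> = facility 1 (user) << 3 | level 5 (notice) */
    const char *msg = "<13>example: injected via /dev/kmsg\n";
    int fd = open("/dev/kmsg", O_WRONLY);

    if (fd < 0) {
        perror("open /dev/kmsg");
        return 1;
    }
    if (write(fd, msg, strlen(msg)) < 0)
        perror("write");
    close(fd);
    return 0;
}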
@@ -816,12 +959,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
816 | * actual magic (print out buffers, wake up klogd, | 959 | * actual magic (print out buffers, wake up klogd, |
817 | * etc). | 960 | * etc). |
818 | * | 961 | * |
819 | * The acquire_console_semaphore_for_printk() function | 962 | * The console_trylock_for_printk() function |
820 | * will release 'logbuf_lock' regardless of whether it | 963 | * will release 'logbuf_lock' regardless of whether it |
821 | * actually gets the semaphore or not. | 964 | * actually gets the semaphore or not. |
822 | */ | 965 | */ |
823 | if (acquire_console_semaphore_for_printk(this_cpu)) | 966 | if (console_trylock_for_printk(this_cpu)) |
824 | release_console_sem(); | 967 | console_unlock(); |
825 | 968 | ||
826 | lockdep_on(); | 969 | lockdep_on(); |
827 | out_restore_irqs: | 970 | out_restore_irqs: |
@@ -982,7 +1125,7 @@ void suspend_console(void) | |||
982 | if (!console_suspend_enabled) | 1125 | if (!console_suspend_enabled) |
983 | return; | 1126 | return; |
984 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); | 1127 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); |
985 | acquire_console_sem(); | 1128 | console_lock(); |
986 | console_suspended = 1; | 1129 | console_suspended = 1; |
987 | up(&console_sem); | 1130 | up(&console_sem); |
988 | } | 1131 | } |
@@ -993,7 +1136,7 @@ void resume_console(void) | |||
993 | return; | 1136 | return; |
994 | down(&console_sem); | 1137 | down(&console_sem); |
995 | console_suspended = 0; | 1138 | console_suspended = 0; |
996 | release_console_sem(); | 1139 | console_unlock(); |
997 | } | 1140 | } |
998 | 1141 | ||
999 | /** | 1142 | /** |
@@ -1016,21 +1159,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, | |||
1016 | case CPU_DYING: | 1159 | case CPU_DYING: |
1017 | case CPU_DOWN_FAILED: | 1160 | case CPU_DOWN_FAILED: |
1018 | case CPU_UP_CANCELED: | 1161 | case CPU_UP_CANCELED: |
1019 | acquire_console_sem(); | 1162 | console_lock(); |
1020 | release_console_sem(); | 1163 | console_unlock(); |
1021 | } | 1164 | } |
1022 | return NOTIFY_OK; | 1165 | return NOTIFY_OK; |
1023 | } | 1166 | } |
1024 | 1167 | ||
1025 | /** | 1168 | /** |
1026 | * acquire_console_sem - lock the console system for exclusive use. | 1169 | * console_lock - lock the console system for exclusive use. |
1027 | * | 1170 | * |
1028 | * Acquires a semaphore which guarantees that the caller has | 1171 | * Acquires a lock which guarantees that the caller has |
1029 | * exclusive access to the console system and the console_drivers list. | 1172 | * exclusive access to the console system and the console_drivers list. |
1030 | * | 1173 | * |
1031 | * Can sleep, returns nothing. | 1174 | * Can sleep, returns nothing. |
1032 | */ | 1175 | */ |
1033 | void acquire_console_sem(void) | 1176 | void console_lock(void) |
1034 | { | 1177 | { |
1035 | BUG_ON(in_interrupt()); | 1178 | BUG_ON(in_interrupt()); |
1036 | down(&console_sem); | 1179 | down(&console_sem); |
@@ -1039,21 +1182,29 @@ void acquire_console_sem(void) | |||
1039 | console_locked = 1; | 1182 | console_locked = 1; |
1040 | console_may_schedule = 1; | 1183 | console_may_schedule = 1; |
1041 | } | 1184 | } |
1042 | EXPORT_SYMBOL(acquire_console_sem); | 1185 | EXPORT_SYMBOL(console_lock); |
1043 | 1186 | ||
1044 | int try_acquire_console_sem(void) | 1187 | /** |
1188 | * console_trylock - try to lock the console system for exclusive use. | ||
1189 | * | ||
1190 | * Tries to acquire a lock which guarantees that the caller has | ||
1191 | * exclusive access to the console system and the console_drivers list. | ||
1192 | * | ||
1193 | * returns 1 on success, and 0 on failure to acquire the lock. | ||
1194 | */ | ||
1195 | int console_trylock(void) | ||
1045 | { | 1196 | { |
1046 | if (down_trylock(&console_sem)) | 1197 | if (down_trylock(&console_sem)) |
1047 | return -1; | 1198 | return 0; |
1048 | if (console_suspended) { | 1199 | if (console_suspended) { |
1049 | up(&console_sem); | 1200 | up(&console_sem); |
1050 | return -1; | 1201 | return 0; |
1051 | } | 1202 | } |
1052 | console_locked = 1; | 1203 | console_locked = 1; |
1053 | console_may_schedule = 0; | 1204 | console_may_schedule = 0; |
1054 | return 0; | 1205 | return 1; |
1055 | } | 1206 | } |
1056 | EXPORT_SYMBOL(try_acquire_console_sem); | 1207 | EXPORT_SYMBOL(console_trylock); |
1057 | 1208 | ||
1058 | int is_console_locked(void) | 1209 | int is_console_locked(void) |
1059 | { | 1210 | { |
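After the rename, sleepable paths take console_lock()/console_unlock() around any walk or modification of console_drivers, and non-sleepable paths use console_trylock(), whose return convention flips here from 0/-1 to the more conventional 1-on-success/0-on-failure. A kernel-side sketch of the caller pattern follows; my_walk_consoles() and my_try_walk_consoles() are hypothetical names, not part of the kernel.

#include <linux/console.h>
#include <linux/printk.h>

/* Sketch of the caller-side pattern; the helpers are hypothetical. */
static void my_walk_consoles(void)
{
    struct console *con;

    console_lock();                 /* may sleep; excludes register/unregister */
    for_each_console(con) {
        if (con->flags & CON_ENABLED)
            pr_info("console %s%d active\n", con->name, con->index);
    }
    console_unlock();               /* replays any printk output buffered meanwhile */
}

static bool my_try_walk_consoles(void)
{
    if (!console_trylock())         /* note: now 1 == success, 0 == failure */
        return false;
    /* ... inspect console_drivers ... */
    console_unlock();
    return true;
}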
@@ -1064,38 +1215,40 @@ static DEFINE_PER_CPU(int, printk_pending); | |||
1064 | 1215 | ||
1065 | void printk_tick(void) | 1216 | void printk_tick(void) |
1066 | { | 1217 | { |
1067 | if (__get_cpu_var(printk_pending)) { | 1218 | if (__this_cpu_read(printk_pending)) { |
1068 | __get_cpu_var(printk_pending) = 0; | 1219 | __this_cpu_write(printk_pending, 0); |
1069 | wake_up_interruptible(&log_wait); | 1220 | wake_up_interruptible(&log_wait); |
1070 | } | 1221 | } |
1071 | } | 1222 | } |
1072 | 1223 | ||
1073 | int printk_needs_cpu(int cpu) | 1224 | int printk_needs_cpu(int cpu) |
1074 | { | 1225 | { |
1075 | return per_cpu(printk_pending, cpu); | 1226 | if (cpu_is_offline(cpu)) |
1227 | printk_tick(); | ||
1228 | return __this_cpu_read(printk_pending); | ||
1076 | } | 1229 | } |
1077 | 1230 | ||
1078 | void wake_up_klogd(void) | 1231 | void wake_up_klogd(void) |
1079 | { | 1232 | { |
1080 | if (!trace_override && waitqueue_active(&log_wait)) | 1233 | if (!trace_override && waitqueue_active(&log_wait)) |
1081 | __raw_get_cpu_var(printk_pending) = 1; | 1234 | this_cpu_write(printk_pending, 1); |
1082 | } | 1235 | } |
1083 | 1236 | ||
1084 | /** | 1237 | /** |
1085 | * release_console_sem - unlock the console system | 1238 | * console_unlock - unlock the console system |
1086 | * | 1239 | * |
1087 | * Releases the semaphore which the caller holds on the console system | 1240 | * Releases the console_lock which the caller holds on the console system |
1088 | * and the console driver list. | 1241 | * and the console driver list. |
1089 | * | 1242 | * |
1090 | * While the semaphore was held, console output may have been buffered | 1243 | * While the console_lock was held, console output may have been buffered |
1091 | * by printk(). If this is the case, release_console_sem() emits | 1244 | * by printk(). If this is the case, console_unlock() emits |
1092 | * the output prior to releasing the semaphore. | 1245 | * the output prior to releasing the lock. |
1093 | * | 1246 | * |
1094 | * If there is output waiting for klogd, we wake it up. | 1247 | * If there is output waiting for klogd, we wake it up. |
1095 | * | 1248 | * |
1096 | * release_console_sem() may be called from any context. | 1249 | * console_unlock() may be called from any context. |
1097 | */ | 1250 | */ |
1098 | void release_console_sem(void) | 1251 | void console_unlock(void) |
1099 | { | 1252 | { |
1100 | unsigned long flags; | 1253 | unsigned long flags; |
1101 | unsigned _con_start, _log_end; | 1254 | unsigned _con_start, _log_end; |
@@ -1123,12 +1276,17 @@ void release_console_sem(void) | |||
1123 | local_irq_restore(flags); | 1276 | local_irq_restore(flags); |
1124 | } | 1277 | } |
1125 | console_locked = 0; | 1278 | console_locked = 0; |
1279 | |||
1280 | /* Release the exclusive_console once it is used */ | ||
1281 | if (unlikely(exclusive_console)) | ||
1282 | exclusive_console = NULL; | ||
1283 | |||
1126 | up(&console_sem); | 1284 | up(&console_sem); |
1127 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1285 | spin_unlock_irqrestore(&logbuf_lock, flags); |
1128 | if (wake_klogd) | 1286 | if (wake_klogd) |
1129 | wake_up_klogd(); | 1287 | wake_up_klogd(); |
1130 | } | 1288 | } |
1131 | EXPORT_SYMBOL(release_console_sem); | 1289 | EXPORT_SYMBOL(console_unlock); |
1132 | 1290 | ||
1133 | /** | 1291 | /** |
1134 | * console_conditional_schedule - yield the CPU if required | 1292 | * console_conditional_schedule - yield the CPU if required |
@@ -1137,7 +1295,7 @@ EXPORT_SYMBOL(release_console_sem); | |||
1137 | * if this CPU should yield the CPU to another task, do | 1295 | * if this CPU should yield the CPU to another task, do |
1138 | * so here. | 1296 | * so here. |
1139 | * | 1297 | * |
1140 | * Must be called within acquire_console_sem(). | 1298 | * Must be called within console_lock(). |
1141 | */ | 1299 | */ |
1142 | void __sched console_conditional_schedule(void) | 1300 | void __sched console_conditional_schedule(void) |
1143 | { | 1301 | { |
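Earlier in this hunk, wake_up_klogd() and printk_tick() move from __(raw_)get_cpu_var() to the this_cpu accessors: the producer only sets a per-CPU flag (cheap and safe from deep printk context), and the timer tick later consumes it and performs the real wakeup. A sketch of that deferred-wakeup shape; everything except the per-CPU accessors is an illustrative name:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_pending);

static void my_request_wakeup(void)         /* callable from awkward contexts */
{
    this_cpu_write(my_pending, 1);          /* cheap, takes no locks */
}

static void my_tick(void)                   /* called later from the timer tick */
{
    if (__this_cpu_read(my_pending)) {
        __this_cpu_write(my_pending, 0);
        /* safe place to do the real work, e.g. wake_up_interruptible() */
    }
}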
@@ -1158,14 +1316,14 @@ void console_unblank(void) | |||
1158 | if (down_trylock(&console_sem) != 0) | 1316 | if (down_trylock(&console_sem) != 0) |
1159 | return; | 1317 | return; |
1160 | } else | 1318 | } else |
1161 | acquire_console_sem(); | 1319 | console_lock(); |
1162 | 1320 | ||
1163 | console_locked = 1; | 1321 | console_locked = 1; |
1164 | console_may_schedule = 0; | 1322 | console_may_schedule = 0; |
1165 | for_each_console(c) | 1323 | for_each_console(c) |
1166 | if ((c->flags & CON_ENABLED) && c->unblank) | 1324 | if ((c->flags & CON_ENABLED) && c->unblank) |
1167 | c->unblank(); | 1325 | c->unblank(); |
1168 | release_console_sem(); | 1326 | console_unlock(); |
1169 | } | 1327 | } |
1170 | 1328 | ||
1171 | /* | 1329 | /* |
@@ -1176,7 +1334,7 @@ struct tty_driver *console_device(int *index) | |||
1176 | struct console *c; | 1334 | struct console *c; |
1177 | struct tty_driver *driver = NULL; | 1335 | struct tty_driver *driver = NULL; |
1178 | 1336 | ||
1179 | acquire_console_sem(); | 1337 | console_lock(); |
1180 | for_each_console(c) { | 1338 | for_each_console(c) { |
1181 | if (!c->device) | 1339 | if (!c->device) |
1182 | continue; | 1340 | continue; |
@@ -1184,7 +1342,7 @@ struct tty_driver *console_device(int *index) | |||
1184 | if (driver) | 1342 | if (driver) |
1185 | break; | 1343 | break; |
1186 | } | 1344 | } |
1187 | release_console_sem(); | 1345 | console_unlock(); |
1188 | return driver; | 1346 | return driver; |
1189 | } | 1347 | } |
1190 | 1348 | ||
@@ -1195,20 +1353,32 @@ struct tty_driver *console_device(int *index) | |||
1195 | */ | 1353 | */ |
1196 | void console_stop(struct console *console) | 1354 | void console_stop(struct console *console) |
1197 | { | 1355 | { |
1198 | acquire_console_sem(); | 1356 | console_lock(); |
1199 | console->flags &= ~CON_ENABLED; | 1357 | console->flags &= ~CON_ENABLED; |
1200 | release_console_sem(); | 1358 | console_unlock(); |
1201 | } | 1359 | } |
1202 | EXPORT_SYMBOL(console_stop); | 1360 | EXPORT_SYMBOL(console_stop); |
1203 | 1361 | ||
1204 | void console_start(struct console *console) | 1362 | void console_start(struct console *console) |
1205 | { | 1363 | { |
1206 | acquire_console_sem(); | 1364 | console_lock(); |
1207 | console->flags |= CON_ENABLED; | 1365 | console->flags |= CON_ENABLED; |
1208 | release_console_sem(); | 1366 | console_unlock(); |
1209 | } | 1367 | } |
1210 | EXPORT_SYMBOL(console_start); | 1368 | EXPORT_SYMBOL(console_start); |
1211 | 1369 | ||
1370 | static int __read_mostly keep_bootcon; | ||
1371 | |||
1372 | static int __init keep_bootcon_setup(char *str) | ||
1373 | { | ||
1374 | keep_bootcon = 1; | ||
1375 | printk(KERN_INFO "debug: skip boot console de-registration.\n"); | ||
1376 | |||
1377 | return 0; | ||
1378 | } | ||
1379 | |||
1380 | early_param("keep_bootcon", keep_bootcon_setup); | ||
1381 | |||
1212 | /* | 1382 | /* |
1213 | * The console driver calls this routine during kernel initialization | 1383 | * The console driver calls this routine during kernel initialization |
1214 | * to register the console printing procedure with printk() and to | 1384 | * to register the console printing procedure with printk() and to |
@@ -1327,7 +1497,7 @@ void register_console(struct console *newcon) | |||
1327 | * Put this console in the list - keep the | 1497 | * Put this console in the list - keep the |
1328 | * preferred driver at the head of the list. | 1498 | * preferred driver at the head of the list. |
1329 | */ | 1499 | */ |
1330 | acquire_console_sem(); | 1500 | console_lock(); |
1331 | if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { | 1501 | if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { |
1332 | newcon->next = console_drivers; | 1502 | newcon->next = console_drivers; |
1333 | console_drivers = newcon; | 1503 | console_drivers = newcon; |
@@ -1339,14 +1509,21 @@ void register_console(struct console *newcon) | |||
1339 | } | 1509 | } |
1340 | if (newcon->flags & CON_PRINTBUFFER) { | 1510 | if (newcon->flags & CON_PRINTBUFFER) { |
1341 | /* | 1511 | /* |
1342 | * release_console_sem() will print out the buffered messages | 1512 | * console_unlock() will print out the buffered messages |
1343 | * for us. | 1513 | * for us. |
1344 | */ | 1514 | */ |
1345 | spin_lock_irqsave(&logbuf_lock, flags); | 1515 | spin_lock_irqsave(&logbuf_lock, flags); |
1346 | con_start = log_start; | 1516 | con_start = log_start; |
1347 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1517 | spin_unlock_irqrestore(&logbuf_lock, flags); |
1518 | /* | ||
1519 | * We're about to replay the log buffer. Only do this to the | ||
1520 | * just-registered console to avoid excessive message spam to | ||
1521 | * the already-registered consoles. | ||
1522 | */ | ||
1523 | exclusive_console = newcon; | ||
1348 | } | 1524 | } |
1349 | release_console_sem(); | 1525 | console_unlock(); |
1526 | console_sysfs_notify(); | ||
1350 | 1527 | ||
1351 | /* | 1528 | /* |
1352 | * By unregistering the bootconsoles after we enable the real console | 1529 | * By unregistering the bootconsoles after we enable the real console |
@@ -1355,7 +1532,9 @@ void register_console(struct console *newcon) | |||
1355 | * users know there might be something in the kernel's log buffer that | 1532 | * users know there might be something in the kernel's log buffer that |
1356 | * went to the bootconsole (that they do not see on the real console) | 1533 | * went to the bootconsole (that they do not see on the real console) |
1357 | */ | 1534 | */ |
1358 | if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { | 1535 | if (bcon && |
1536 | ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && | ||
1537 | !keep_bootcon) { | ||
1359 | /* we need to iterate through twice, to make sure we print | 1538 | /* we need to iterate through twice, to make sure we print |
1360 | * everything out, before we unregister the console(s) | 1539 | * everything out, before we unregister the console(s) |
1361 | */ | 1540 | */ |
@@ -1382,7 +1561,7 @@ int unregister_console(struct console *console) | |||
1382 | return braille_unregister_console(console); | 1561 | return braille_unregister_console(console); |
1383 | #endif | 1562 | #endif |
1384 | 1563 | ||
1385 | acquire_console_sem(); | 1564 | console_lock(); |
1386 | if (console_drivers == console) { | 1565 | if (console_drivers == console) { |
1387 | console_drivers=console->next; | 1566 | console_drivers=console->next; |
1388 | res = 0; | 1567 | res = 0; |
@@ -1404,7 +1583,8 @@ int unregister_console(struct console *console) | |||
1404 | if (console_drivers != NULL && console->flags & CON_CONSDEV) | 1583 | if (console_drivers != NULL && console->flags & CON_CONSDEV) |
1405 | console_drivers->flags |= CON_CONSDEV; | 1584 | console_drivers->flags |= CON_CONSDEV; |
1406 | 1585 | ||
1407 | release_console_sem(); | 1586 | console_unlock(); |
1587 | console_sysfs_notify(); | ||
1408 | return res; | 1588 | return res; |
1409 | } | 1589 | } |
1410 | EXPORT_SYMBOL(unregister_console); | 1590 | EXPORT_SYMBOL(unregister_console); |
@@ -1488,7 +1668,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper) | |||
1488 | /* Don't allow registering multiple times */ | 1668 | /* Don't allow registering multiple times */ |
1489 | if (!dumper->registered) { | 1669 | if (!dumper->registered) { |
1490 | dumper->registered = 1; | 1670 | dumper->registered = 1; |
1491 | list_add_tail(&dumper->list, &dump_list); | 1671 | list_add_tail_rcu(&dumper->list, &dump_list); |
1492 | err = 0; | 1672 | err = 0; |
1493 | } | 1673 | } |
1494 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1674 | spin_unlock_irqrestore(&dump_list_lock, flags); |
@@ -1512,29 +1692,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1512 | spin_lock_irqsave(&dump_list_lock, flags); | 1692 | spin_lock_irqsave(&dump_list_lock, flags); |
1513 | if (dumper->registered) { | 1693 | if (dumper->registered) { |
1514 | dumper->registered = 0; | 1694 | dumper->registered = 0; |
1515 | list_del(&dumper->list); | 1695 | list_del_rcu(&dumper->list); |
1516 | err = 0; | 1696 | err = 0; |
1517 | } | 1697 | } |
1518 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1698 | spin_unlock_irqrestore(&dump_list_lock, flags); |
1699 | synchronize_rcu(); | ||
1519 | 1700 | ||
1520 | return err; | 1701 | return err; |
1521 | } | 1702 | } |
1522 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 1703 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1523 | 1704 | ||
1524 | static const char const *kmsg_reasons[] = { | ||
1525 | [KMSG_DUMP_OOPS] = "oops", | ||
1526 | [KMSG_DUMP_PANIC] = "panic", | ||
1527 | [KMSG_DUMP_KEXEC] = "kexec", | ||
1528 | }; | ||
1529 | |||
1530 | static const char *kmsg_to_str(enum kmsg_dump_reason reason) | ||
1531 | { | ||
1532 | if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) | ||
1533 | return "unknown"; | ||
1534 | |||
1535 | return kmsg_reasons[reason]; | ||
1536 | } | ||
1537 | |||
1538 | /** | 1705 | /** |
1539 | * kmsg_dump - dump kernel log to kernel message dumpers. | 1706 | * kmsg_dump - dump kernel log to kernel message dumpers. |
1540 | * @reason: the reason (oops, panic etc) for dumping | 1707 | * @reason: the reason (oops, panic etc) for dumping |
@@ -1573,13 +1740,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1573 | l2 = chars; | 1740 | l2 = chars; |
1574 | } | 1741 | } |
1575 | 1742 | ||
1576 | if (!spin_trylock_irqsave(&dump_list_lock, flags)) { | 1743 | rcu_read_lock(); |
1577 | printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", | 1744 | list_for_each_entry_rcu(dumper, &dump_list, list) |
1578 | kmsg_to_str(reason)); | ||
1579 | return; | ||
1580 | } | ||
1581 | list_for_each_entry(dumper, &dump_list, list) | ||
1582 | dumper->dump(dumper, reason, s1, l1, s2, l2); | 1745 | dumper->dump(dumper, reason, s1, l1, s2, l2); |
1583 | spin_unlock_irqrestore(&dump_list_lock, flags); | 1746 | rcu_read_unlock(); |
1584 | } | 1747 | } |
1585 | #endif | 1748 | #endif |
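With the dumper list converted to RCU above, kmsg_dump() can walk it locklessly even from oops and panic paths, and kmsg_dump_unregister() only returns after synchronize_rcu(), so a module may free its dumper as soon as unregistration completes. A registration sketch against this kernel's kmsg_dump API; the callback body is illustrative:

#include <linux/kmsg_dump.h>
#include <linux/module.h>

/* Illustrative dumper: just report how much log text was handed to us. */
static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason,
                    const char *s1, unsigned long l1,
                    const char *s2, unsigned long l2)
{
    pr_emerg("my_dump: reason=%d, %lu+%lu bytes of log\n", reason, l1, l2);
}

static struct kmsg_dumper my_dumper = {
    .dump = my_dump,
};

static int __init my_init(void)
{
    return kmsg_dump_register(&my_dumper);
}

static void __exit my_exit(void)
{
    /* returns only after concurrent kmsg_dump() walkers are done (RCU) */
    kmsg_dump_unregister(&my_dumper);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");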
diff --git a/kernel/profile.c b/kernel/profile.c index b22a899934cc..961b389fe52f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -126,11 +126,9 @@ int __ref profile_init(void) | |||
126 | if (prof_buffer) | 126 | if (prof_buffer) |
127 | return 0; | 127 | return 0; |
128 | 128 | ||
129 | prof_buffer = vmalloc(buffer_bytes); | 129 | prof_buffer = vzalloc(buffer_bytes); |
130 | if (prof_buffer) { | 130 | if (prof_buffer) |
131 | memset(prof_buffer, 0, buffer_bytes); | ||
132 | return 0; | 131 | return 0; |
133 | } | ||
134 | 132 | ||
135 | free_cpumask_var(prof_cpu_mask); | 133 | free_cpumask_var(prof_cpu_mask); |
136 | return -ENOMEM; | 134 | return -ENOMEM; |
@@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void) | |||
305 | mutex_unlock(&profile_flip_mutex); | 303 | mutex_unlock(&profile_flip_mutex); |
306 | } | 304 | } |
307 | 305 | ||
308 | void profile_hits(int type, void *__pc, unsigned int nr_hits) | 306 | static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) |
309 | { | 307 | { |
310 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; | 308 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; |
311 | int i, j, cpu; | 309 | int i, j, cpu; |
312 | struct profile_hit *hits; | 310 | struct profile_hit *hits; |
313 | 311 | ||
314 | if (prof_on != type || !prof_buffer) | ||
315 | return; | ||
316 | pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); | 312 | pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); |
317 | i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; | 313 | i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; |
318 | secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; | 314 | secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; |
@@ -419,16 +415,20 @@ out_free: | |||
419 | #define profile_discard_flip_buffers() do { } while (0) | 415 | #define profile_discard_flip_buffers() do { } while (0) |
420 | #define profile_cpu_callback NULL | 416 | #define profile_cpu_callback NULL |
421 | 417 | ||
422 | void profile_hits(int type, void *__pc, unsigned int nr_hits) | 418 | static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) |
423 | { | 419 | { |
424 | unsigned long pc; | 420 | unsigned long pc; |
425 | |||
426 | if (prof_on != type || !prof_buffer) | ||
427 | return; | ||
428 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; | 421 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; |
429 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); | 422 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
430 | } | 423 | } |
431 | #endif /* !CONFIG_SMP */ | 424 | #endif /* !CONFIG_SMP */ |
425 | |||
426 | void profile_hits(int type, void *__pc, unsigned int nr_hits) | ||
427 | { | ||
428 | if (prof_on != type || !prof_buffer) | ||
429 | return; | ||
430 | do_profile_hits(type, __pc, nr_hits); | ||
431 | } | ||
432 | EXPORT_SYMBOL_GPL(profile_hits); | 432 | EXPORT_SYMBOL_GPL(profile_hits); |
433 | 433 | ||
434 | void profile_tick(int type) | 434 | void profile_tick(int type) |
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, | |||
555 | static const struct file_operations proc_profile_operations = { | 555 | static const struct file_operations proc_profile_operations = { |
556 | .read = read_profile, | 556 | .read = read_profile, |
557 | .write = write_profile, | 557 | .write = write_profile, |
558 | .llseek = default_llseek, | ||
558 | }; | 559 | }; |
559 | 560 | ||
560 | #ifdef CONFIG_SMP | 561 | #ifdef CONFIG_SMP |
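The profile.c change is the usual shape for removing duplicated preconditions: both the SMP and !SMP bodies become static do_profile_hits() variants and a single exported profile_hits() performs the shared prof_on/prof_buffer guard before dispatching. A generic, runnable sketch of that pattern with illustrative names:

#include <stdio.h>

static int feature_on;             /* stands in for the prof_on / prof_buffer checks */

#ifdef FAST_PATH
static void do_record(int value)
{
    printf("fast: %d\n", value);
}
#else
static void do_record(int value)
{
    printf("simple: %d\n", value);
}
#endif

/* Single entry point: the shared precondition lives here, not in each variant. */
void record(int value)
{
    if (!feature_on)
        return;
    do_record(value);
}

int main(void)
{
    record(1);                      /* dropped: feature_on == 0 */
    feature_on = 1;
    record(2);                      /* dispatched to whichever do_record() was built */
    return 0;
}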
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f34d798ef4a2..2df115790cd9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
23 | #include <linux/uaccess.h> | 23 | #include <linux/uaccess.h> |
24 | #include <linux/regset.h> | 24 | #include <linux/regset.h> |
25 | #include <linux/hw_breakpoint.h> | ||
25 | 26 | ||
26 | 27 | ||
27 | /* | 28 | /* |
@@ -37,35 +38,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) | |||
37 | child->parent = new_parent; | 38 | child->parent = new_parent; |
38 | } | 39 | } |
39 | 40 | ||
40 | /* | 41 | /** |
41 | * Turn a tracing stop into a normal stop now, since with no tracer there | 42 | * __ptrace_unlink - unlink ptracee and restore its execution state |
42 | * would be no way to wake it up with SIGCONT or SIGKILL. If there was a | 43 | * @child: ptracee to be unlinked |
43 | * signal sent that would resume the child, but didn't because it was in | ||
44 | * TASK_TRACED, resume it now. | ||
45 | * Requires that irqs be disabled. | ||
46 | */ | ||
47 | static void ptrace_untrace(struct task_struct *child) | ||
48 | { | ||
49 | spin_lock(&child->sighand->siglock); | ||
50 | if (task_is_traced(child)) { | ||
51 | /* | ||
52 | * If the group stop is completed or in progress, | ||
53 | * this thread was already counted as stopped. | ||
54 | */ | ||
55 | if (child->signal->flags & SIGNAL_STOP_STOPPED || | ||
56 | child->signal->group_stop_count) | ||
57 | __set_task_state(child, TASK_STOPPED); | ||
58 | else | ||
59 | signal_wake_up(child, 1); | ||
60 | } | ||
61 | spin_unlock(&child->sighand->siglock); | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * unptrace a task: move it back to its original parent and | ||
66 | * remove it from the ptrace list. | ||
67 | * | 44 | * |
68 | * Must be called with the tasklist lock write-held. | 45 | * Remove @child from the ptrace list, move it back to the original parent, |
46 | * and restore the execution state so that it conforms to the group stop | ||
47 | * state. | ||
48 | * | ||
49 | * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer | ||
50 | * exiting. For PTRACE_DETACH, unless the ptracee has been killed between | ||
51 | * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED. | ||
52 | * If the ptracer is exiting, the ptracee can be in any state. | ||
53 | * | ||
54 | * After detach, the ptracee should be in a state which conforms to the | ||
55 | * group stop. If the group is stopped or in the process of stopping, the | ||
56 | * ptracee should be put into TASK_STOPPED; otherwise, it should be woken | ||
57 | * up from TASK_TRACED. | ||
58 | * | ||
59 | * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED, | ||
60 | * it goes through TRACED -> RUNNING -> STOPPED transition which is similar | ||
61 | * to but in the opposite direction of what happens while attaching to a | ||
62 | * stopped task. However, in this direction, the intermediate RUNNING | ||
63 | * state is not hidden even from the current ptracer and if it immediately | ||
64 | * re-attaches and performs a WNOHANG wait(2), it may fail. | ||
65 | * | ||
66 | * CONTEXT: | ||
67 | * write_lock_irq(tasklist_lock) | ||
69 | */ | 68 | */ |
70 | void __ptrace_unlink(struct task_struct *child) | 69 | void __ptrace_unlink(struct task_struct *child) |
71 | { | 70 | { |
@@ -75,8 +74,27 @@ void __ptrace_unlink(struct task_struct *child) | |||
75 | child->parent = child->real_parent; | 74 | child->parent = child->real_parent; |
76 | list_del_init(&child->ptrace_entry); | 75 | list_del_init(&child->ptrace_entry); |
77 | 76 | ||
78 | if (task_is_traced(child)) | 77 | spin_lock(&child->sighand->siglock); |
79 | ptrace_untrace(child); | 78 | |
79 | /* | ||
80 | * Reinstate GROUP_STOP_PENDING if group stop is in effect and | ||
81 | * @child isn't dead. | ||
82 | */ | ||
83 | if (!(child->flags & PF_EXITING) && | ||
84 | (child->signal->flags & SIGNAL_STOP_STOPPED || | ||
85 | child->signal->group_stop_count)) | ||
86 | child->group_stop |= GROUP_STOP_PENDING; | ||
87 | |||
88 | /* | ||
89 | * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick | ||
90 | * @child in the butt. Note that @resume should be used iff @child | ||
91 | * is in TASK_TRACED; otherwise, we might unduly disrupt | ||
92 | * TASK_KILLABLE sleeps. | ||
93 | */ | ||
94 | if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) | ||
95 | signal_wake_up(child, task_is_traced(child)); | ||
96 | |||
97 | spin_unlock(&child->sighand->siglock); | ||
80 | } | 98 | } |
81 | 99 | ||
82 | /* | 100 | /* |
@@ -95,16 +113,14 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
95 | */ | 113 | */ |
96 | read_lock(&tasklist_lock); | 114 | read_lock(&tasklist_lock); |
97 | if ((child->ptrace & PT_PTRACED) && child->parent == current) { | 115 | if ((child->ptrace & PT_PTRACED) && child->parent == current) { |
98 | ret = 0; | ||
99 | /* | 116 | /* |
100 | * child->sighand can't be NULL, release_task() | 117 | * child->sighand can't be NULL, release_task() |
101 | * does ptrace_unlink() before __exit_signal(). | 118 | * does ptrace_unlink() before __exit_signal(). |
102 | */ | 119 | */ |
103 | spin_lock_irq(&child->sighand->siglock); | 120 | spin_lock_irq(&child->sighand->siglock); |
104 | if (task_is_stopped(child)) | 121 | WARN_ON_ONCE(task_is_stopped(child)); |
105 | child->state = TASK_TRACED; | 122 | if (task_is_traced(child) || kill) |
106 | else if (!task_is_traced(child) && !kill) | 123 | ret = 0; |
107 | ret = -ESRCH; | ||
108 | spin_unlock_irq(&child->sighand->siglock); | 124 | spin_unlock_irq(&child->sighand->siglock); |
109 | } | 125 | } |
110 | read_unlock(&tasklist_lock); | 126 | read_unlock(&tasklist_lock); |
@@ -134,21 +150,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
134 | return 0; | 150 | return 0; |
135 | rcu_read_lock(); | 151 | rcu_read_lock(); |
136 | tcred = __task_cred(task); | 152 | tcred = __task_cred(task); |
137 | if ((cred->uid != tcred->euid || | 153 | if (cred->user->user_ns == tcred->user->user_ns && |
138 | cred->uid != tcred->suid || | 154 | (cred->uid == tcred->euid && |
139 | cred->uid != tcred->uid || | 155 | cred->uid == tcred->suid && |
140 | cred->gid != tcred->egid || | 156 | cred->uid == tcred->uid && |
141 | cred->gid != tcred->sgid || | 157 | cred->gid == tcred->egid && |
142 | cred->gid != tcred->gid) && | 158 | cred->gid == tcred->sgid && |
143 | !capable(CAP_SYS_PTRACE)) { | 159 | cred->gid == tcred->gid)) |
144 | rcu_read_unlock(); | 160 | goto ok; |
145 | return -EPERM; | 161 | if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) |
146 | } | 162 | goto ok; |
163 | rcu_read_unlock(); | ||
164 | return -EPERM; | ||
165 | ok: | ||
147 | rcu_read_unlock(); | 166 | rcu_read_unlock(); |
148 | smp_rmb(); | 167 | smp_rmb(); |
149 | if (task->mm) | 168 | if (task->mm) |
150 | dumpable = get_dumpable(task->mm); | 169 | dumpable = get_dumpable(task->mm); |
151 | if (!dumpable && !capable(CAP_SYS_PTRACE)) | 170 | if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) |
152 | return -EPERM; | 171 | return -EPERM; |
153 | 172 | ||
154 | return security_ptrace_access_check(task, mode); | 173 | return security_ptrace_access_check(task, mode); |
@@ -163,8 +182,9 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
163 | return !err; | 182 | return !err; |
164 | } | 183 | } |
165 | 184 | ||
166 | int ptrace_attach(struct task_struct *task) | 185 | static int ptrace_attach(struct task_struct *task) |
167 | { | 186 | { |
187 | bool wait_trap = false; | ||
168 | int retval; | 188 | int retval; |
169 | 189 | ||
170 | audit_ptrace(task); | 190 | audit_ptrace(task); |
@@ -181,7 +201,7 @@ int ptrace_attach(struct task_struct *task) | |||
181 | * under ptrace. | 201 | * under ptrace. |
182 | */ | 202 | */ |
183 | retval = -ERESTARTNOINTR; | 203 | retval = -ERESTARTNOINTR; |
184 | if (mutex_lock_interruptible(&task->cred_guard_mutex)) | 204 | if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) |
185 | goto out; | 205 | goto out; |
186 | 206 | ||
187 | task_lock(task); | 207 | task_lock(task); |
@@ -198,18 +218,48 @@ int ptrace_attach(struct task_struct *task) | |||
198 | goto unlock_tasklist; | 218 | goto unlock_tasklist; |
199 | 219 | ||
200 | task->ptrace = PT_PTRACED; | 220 | task->ptrace = PT_PTRACED; |
201 | if (capable(CAP_SYS_PTRACE)) | 221 | if (task_ns_capable(task, CAP_SYS_PTRACE)) |
202 | task->ptrace |= PT_PTRACE_CAP; | 222 | task->ptrace |= PT_PTRACE_CAP; |
203 | 223 | ||
204 | __ptrace_link(task, current); | 224 | __ptrace_link(task, current); |
205 | send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); | 225 | send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); |
206 | 226 | ||
227 | spin_lock(&task->sighand->siglock); | ||
228 | |||
229 | /* | ||
230 | * If the task is already STOPPED, set GROUP_STOP_PENDING and | ||
231 | * TRAPPING, and kick it so that it transits to TRACED. TRAPPING | ||
232 | * will be cleared if the child completes the transition or any | ||
233 | * event which clears the group stop states happens. We'll wait | ||
234 | * for the transition to complete before returning from this | ||
235 | * function. | ||
236 | * | ||
237 | * This hides STOPPED -> RUNNING -> TRACED transition from the | ||
238 | * attaching thread but a different thread in the same group can | ||
239 | * still observe the transient RUNNING state. IOW, if another | ||
240 | * thread's WNOHANG wait(2) on the stopped tracee races against | ||
241 | * ATTACH, the wait(2) may fail due to the transient RUNNING. | ||
242 | * | ||
243 | * The following task_is_stopped() test is safe as both transitions | ||
244 | * in and out of STOPPED are protected by siglock. | ||
245 | */ | ||
246 | if (task_is_stopped(task)) { | ||
247 | task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; | ||
248 | signal_wake_up(task, 1); | ||
249 | wait_trap = true; | ||
250 | } | ||
251 | |||
252 | spin_unlock(&task->sighand->siglock); | ||
253 | |||
207 | retval = 0; | 254 | retval = 0; |
208 | unlock_tasklist: | 255 | unlock_tasklist: |
209 | write_unlock_irq(&tasklist_lock); | 256 | write_unlock_irq(&tasklist_lock); |
210 | unlock_creds: | 257 | unlock_creds: |
211 | mutex_unlock(&task->cred_guard_mutex); | 258 | mutex_unlock(&task->signal->cred_guard_mutex); |
212 | out: | 259 | out: |
260 | if (wait_trap) | ||
261 | wait_event(current->signal->wait_chldexit, | ||
262 | !(task->group_stop & GROUP_STOP_TRAPPING)); | ||
213 | return retval; | 263 | return retval; |
214 | } | 264 | } |
215 | 265 | ||
@@ -219,7 +269,7 @@ out: | |||
219 | * Performs checks and sets PT_PTRACED. | 269 | * Performs checks and sets PT_PTRACED. |
220 | * Should be used by all ptrace implementations for PTRACE_TRACEME. | 270 | * Should be used by all ptrace implementations for PTRACE_TRACEME. |
221 | */ | 271 | */ |
222 | int ptrace_traceme(void) | 272 | static int ptrace_traceme(void) |
223 | { | 273 | { |
224 | int ret = -EPERM; | 274 | int ret = -EPERM; |
225 | 275 | ||
@@ -293,7 +343,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) | |||
293 | return false; | 343 | return false; |
294 | } | 344 | } |
295 | 345 | ||
296 | int ptrace_detach(struct task_struct *child, unsigned int data) | 346 | static int ptrace_detach(struct task_struct *child, unsigned int data) |
297 | { | 347 | { |
298 | bool dead = false; | 348 | bool dead = false; |
299 | 349 | ||
@@ -312,8 +362,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
312 | if (child->ptrace) { | 362 | if (child->ptrace) { |
313 | child->exit_code = data; | 363 | child->exit_code = data; |
314 | dead = __ptrace_detach(current, child); | 364 | dead = __ptrace_detach(current, child); |
315 | if (!child->exit_state) | ||
316 | wake_up_process(child); | ||
317 | } | 365 | } |
318 | write_unlock_irq(&tasklist_lock); | 366 | write_unlock_irq(&tasklist_lock); |
319 | 367 | ||
@@ -329,6 +377,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
329 | * and reacquire the lock. | 377 | * and reacquire the lock. |
330 | */ | 378 | */ |
331 | void exit_ptrace(struct task_struct *tracer) | 379 | void exit_ptrace(struct task_struct *tracer) |
380 | __releases(&tasklist_lock) | ||
381 | __acquires(&tasklist_lock) | ||
332 | { | 382 | { |
333 | struct task_struct *p, *n; | 383 | struct task_struct *p, *n; |
334 | LIST_HEAD(ptrace_dead); | 384 | LIST_HEAD(ptrace_dead); |
@@ -402,7 +452,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds | |||
402 | return copied; | 452 | return copied; |
403 | } | 453 | } |
404 | 454 | ||
405 | static int ptrace_setoptions(struct task_struct *child, long data) | 455 | static int ptrace_setoptions(struct task_struct *child, unsigned long data) |
406 | { | 456 | { |
407 | child->ptrace &= ~PT_TRACE_MASK; | 457 | child->ptrace &= ~PT_TRACE_MASK; |
408 | 458 | ||
@@ -481,7 +531,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) | |||
481 | #define is_sysemu_singlestep(request) 0 | 531 | #define is_sysemu_singlestep(request) 0 |
482 | #endif | 532 | #endif |
483 | 533 | ||
484 | static int ptrace_resume(struct task_struct *child, long request, long data) | 534 | static int ptrace_resume(struct task_struct *child, long request, |
535 | unsigned long data) | ||
485 | { | 536 | { |
486 | if (!valid_signal(data)) | 537 | if (!valid_signal(data)) |
487 | return -EIO; | 538 | return -EIO; |
@@ -511,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request, long data) | |||
511 | } | 562 | } |
512 | 563 | ||
513 | child->exit_code = data; | 564 | child->exit_code = data; |
514 | wake_up_process(child); | 565 | wake_up_state(child, __TASK_TRACED); |
515 | 566 | ||
516 | return 0; | 567 | return 0; |
517 | } | 568 | } |
@@ -558,10 +609,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, | |||
558 | #endif | 609 | #endif |
559 | 610 | ||
560 | int ptrace_request(struct task_struct *child, long request, | 611 | int ptrace_request(struct task_struct *child, long request, |
561 | long addr, long data) | 612 | unsigned long addr, unsigned long data) |
562 | { | 613 | { |
563 | int ret = -EIO; | 614 | int ret = -EIO; |
564 | siginfo_t siginfo; | 615 | siginfo_t siginfo; |
616 | void __user *datavp = (void __user *) data; | ||
617 | unsigned long __user *datalp = datavp; | ||
565 | 618 | ||
566 | switch (request) { | 619 | switch (request) { |
567 | case PTRACE_PEEKTEXT: | 620 | case PTRACE_PEEKTEXT: |
@@ -578,19 +631,17 @@ int ptrace_request(struct task_struct *child, long request, | |||
578 | ret = ptrace_setoptions(child, data); | 631 | ret = ptrace_setoptions(child, data); |
579 | break; | 632 | break; |
580 | case PTRACE_GETEVENTMSG: | 633 | case PTRACE_GETEVENTMSG: |
581 | ret = put_user(child->ptrace_message, (unsigned long __user *) data); | 634 | ret = put_user(child->ptrace_message, datalp); |
582 | break; | 635 | break; |
583 | 636 | ||
584 | case PTRACE_GETSIGINFO: | 637 | case PTRACE_GETSIGINFO: |
585 | ret = ptrace_getsiginfo(child, &siginfo); | 638 | ret = ptrace_getsiginfo(child, &siginfo); |
586 | if (!ret) | 639 | if (!ret) |
587 | ret = copy_siginfo_to_user((siginfo_t __user *) data, | 640 | ret = copy_siginfo_to_user(datavp, &siginfo); |
588 | &siginfo); | ||
589 | break; | 641 | break; |
590 | 642 | ||
591 | case PTRACE_SETSIGINFO: | 643 | case PTRACE_SETSIGINFO: |
592 | if (copy_from_user(&siginfo, (siginfo_t __user *) data, | 644 | if (copy_from_user(&siginfo, datavp, sizeof siginfo)) |
593 | sizeof siginfo)) | ||
594 | ret = -EFAULT; | 645 | ret = -EFAULT; |
595 | else | 646 | else |
596 | ret = ptrace_setsiginfo(child, &siginfo); | 647 | ret = ptrace_setsiginfo(child, &siginfo); |
@@ -621,7 +672,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
621 | } | 672 | } |
622 | mmput(mm); | 673 | mmput(mm); |
623 | 674 | ||
624 | ret = put_user(tmp, (unsigned long __user *) data); | 675 | ret = put_user(tmp, datalp); |
625 | break; | 676 | break; |
626 | } | 677 | } |
627 | #endif | 678 | #endif |
@@ -650,7 +701,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
650 | case PTRACE_SETREGSET: | 701 | case PTRACE_SETREGSET: |
651 | { | 702 | { |
652 | struct iovec kiov; | 703 | struct iovec kiov; |
653 | struct iovec __user *uiov = (struct iovec __user *) data; | 704 | struct iovec __user *uiov = datavp; |
654 | 705 | ||
655 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) | 706 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) |
656 | return -EFAULT; | 707 | return -EFAULT; |
@@ -691,7 +742,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid) | |||
691 | #define arch_ptrace_attach(child) do { } while (0) | 742 | #define arch_ptrace_attach(child) do { } while (0) |
692 | #endif | 743 | #endif |
693 | 744 | ||
694 | SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) | 745 | SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, |
746 | unsigned long, data) | ||
695 | { | 747 | { |
696 | struct task_struct *child; | 748 | struct task_struct *child; |
697 | long ret; | 749 | long ret; |
@@ -732,7 +784,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) | |||
732 | return ret; | 784 | return ret; |
733 | } | 785 | } |
734 | 786 | ||
735 | int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) | 787 | int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, |
788 | unsigned long data) | ||
736 | { | 789 | { |
737 | unsigned long tmp; | 790 | unsigned long tmp; |
738 | int copied; | 791 | int copied; |
@@ -743,7 +796,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) | |||
743 | return put_user(tmp, (unsigned long __user *)data); | 796 | return put_user(tmp, (unsigned long __user *)data); |
744 | } | 797 | } |
745 | 798 | ||
746 | int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) | 799 | int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, |
800 | unsigned long data) | ||
747 | { | 801 | { |
748 | int copied; | 802 | int copied; |
749 | 803 | ||
@@ -870,3 +924,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
870 | return ret; | 924 | return ret; |
871 | } | 925 | } |
872 | #endif /* CONFIG_COMPAT */ | 926 | #endif /* CONFIG_COMPAT */ |
927 | |||
928 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
929 | int ptrace_get_breakpoints(struct task_struct *tsk) | ||
930 | { | ||
931 | if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) | ||
932 | return 0; | ||
933 | |||
934 | return -1; | ||
935 | } | ||
936 | |||
937 | void ptrace_put_breakpoints(struct task_struct *tsk) | ||
938 | { | ||
939 | if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) | ||
940 | flush_ptrace_hw_breakpoint(tsk); | ||
941 | } | ||
942 | #endif /* CONFIG_HAVE_HW_BREAKPOINT */ | ||
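Note: the new ptrace_get_breakpoints()/ptrace_put_breakpoints() pair wraps the per-task ptrace_bp_refcnt in the standard get/put refcount idiom: a get succeeds only while the count is still non-zero, and the final put performs the cleanup via flush_ptrace_hw_breakpoint(). Below is a stand-alone sketch of the same idiom in portable C; bp_state, bp_get(), bp_put() and the printf "cleanup" are invented for illustration and are not kernel interfaces.

#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical refcounted object mirroring the
 * atomic_inc_not_zero()/atomic_dec_and_test() usage above. */
struct bp_state {
	atomic_int refcnt;		/* 1 while the owner is alive */
};

static int bp_get(struct bp_state *s)
{
	int old = atomic_load(&s->refcnt);

	/* inc_not_zero: take a reference only if one still exists. */
	while (old != 0) {
		if (atomic_compare_exchange_weak(&s->refcnt, &old, old + 1))
			return 0;	/* got a reference */
	}
	return -1;			/* owner already gone */
}

static void bp_put(struct bp_state *s)
{
	/* dec_and_test: the last reference performs the cleanup. */
	if (atomic_fetch_sub(&s->refcnt, 1) == 1)
		printf("last reference dropped: flush breakpoint state here\n");
}

int main(void)
{
	struct bp_state s;

	atomic_init(&s.refcnt, 1);
	if (bp_get(&s) == 0)
		bp_put(&s);		/* back to the owner's reference */
	bp_put(&s);			/* final put triggers the cleanup */
	return 0;
}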
diff --git a/kernel/range.c b/kernel/range.c index 471b66acabb5..37fa9b99ad58 100644 --- a/kernel/range.c +++ b/kernel/range.c | |||
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2) | |||
119 | 119 | ||
120 | int clean_sort_range(struct range *range, int az) | 120 | int clean_sort_range(struct range *range, int az) |
121 | { | 121 | { |
122 | int i, j, k = az - 1, nr_range = 0; | 122 | int i, j, k = az - 1, nr_range = az; |
123 | 123 | ||
124 | for (i = 0; i < k; i++) { | 124 | for (i = 0; i < k; i++) { |
125 | if (range[i].end) | 125 | if (range[i].end) |
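Note: the clean_sort_range() hunk initializes nr_range to az instead of 0; as far as can be inferred from the surrounding code (which treats entries with end == 0 as empty), this keeps a completely full array, one with no empty slots to strip, from being reported as containing zero ranges. A small stand-alone sketch of that counting convention follows; struct range is simplified and count_nonempty() is an invented name, not the kernel function.

/* Count usable entries the way the fixed code does: start from the
 * array size and subtract the empty (end == 0) slots. */
struct range {
	unsigned long start;
	unsigned long end;
};

static int count_nonempty(const struct range *range, int az)
{
	int i, nr_range = az;		/* full array => az ranges */

	for (i = 0; i < az; i++)
		if (!range[i].end)
			nr_range--;	/* drop each empty slot */
	return nr_range;
}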
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4d169835fb36..7784bd216b6a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void) | |||
73 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); | 73 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); |
74 | 74 | ||
75 | /** | 75 | /** |
76 | * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? | 76 | * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? |
77 | * | 77 | * |
78 | * Check for bottom half being disabled, which covers both the | 78 | * Check for bottom half being disabled, which covers both the |
79 | * CONFIG_PROVE_RCU and not cases. Note that if someone uses | 79 | * CONFIG_PROVE_RCU and not cases. Note that if someone uses |
80 | * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) | 80 | * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) |
81 | * will show the situation. | 81 | * will show the situation. This is useful for debug checks in functions |
82 | * that require that they be called within an RCU read-side critical | ||
83 | * section. | ||
82 | * | 84 | * |
83 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. | 85 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. |
84 | */ | 86 | */ |
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void) | |||
86 | { | 88 | { |
87 | if (!debug_lockdep_rcu_enabled()) | 89 | if (!debug_lockdep_rcu_enabled()) |
88 | return 1; | 90 | return 1; |
89 | return in_softirq(); | 91 | return in_softirq() || irqs_disabled(); |
90 | } | 92 | } |
91 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | 93 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); |
92 | 94 | ||
@@ -140,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | |||
140 | * Ensure that queued callbacks are all executed. | 142 | * Ensure that queued callbacks are all executed. |
141 | * If we detect that we are nested in a RCU read-side critical | 143 | * If we detect that we are nested in a RCU read-side critical |
142 | * section, we should simply fail, otherwise we would deadlock. | 144 | * section, we should simply fail, otherwise we would deadlock. |
145 | * In !PREEMPT configurations, there is no way to tell if we are | ||
146 | * in a RCU read-side critical section or not, so we never | ||
147 | * attempt any fixup and just print a warning. | ||
143 | */ | 148 | */ |
149 | #ifndef CONFIG_PREEMPT | ||
150 | WARN_ON_ONCE(1); | ||
151 | return 0; | ||
152 | #endif | ||
144 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 153 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
145 | irqs_disabled()) { | 154 | irqs_disabled()) { |
146 | WARN_ON(1); | 155 | WARN_ON_ONCE(1); |
147 | return 0; | 156 | return 0; |
148 | } | 157 | } |
149 | rcu_barrier(); | 158 | rcu_barrier(); |
@@ -182,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | |||
182 | * Ensure that queued callbacks are all executed. | 191 | * Ensure that queued callbacks are all executed. |
183 | * If we detect that we are nested in a RCU read-side critical | 192 | * If we detect that we are nested in a RCU read-side critical |
184 | * section, we should simply fail, otherwise we would deadlock. | 193 | * section, we should simply fail, otherwise we would deadlock. |
194 | * In !PREEMPT configurations, there is no way to tell if we are | ||
195 | * in a RCU read-side critical section or not, so we never | ||
196 | * attempt any fixup and just print a warning. | ||
185 | */ | 197 | */ |
198 | #ifndef CONFIG_PREEMPT | ||
199 | WARN_ON_ONCE(1); | ||
200 | return 0; | ||
201 | #endif | ||
186 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 202 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
187 | irqs_disabled()) { | 203 | irqs_disabled()) { |
188 | WARN_ON(1); | 204 | WARN_ON_ONCE(1); |
189 | return 0; | 205 | return 0; |
190 | } | 206 | } |
191 | rcu_barrier(); | 207 | rcu_barrier(); |
@@ -212,14 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | |||
212 | * Ensure that queued callbacks are all executed. | 228 | * Ensure that queued callbacks are all executed. |
213 | * If we detect that we are nested in a RCU read-side critical | 229 | * If we detect that we are nested in a RCU read-side critical |
214 | * section, we should simply fail, otherwise we would deadlock. | 230 | * section, we should simply fail, otherwise we would deadlock. |
231 | * In !PREEMPT configurations, there is no way to tell if we are | ||
232 | * in a RCU read-side critical section or not, so we never | ||
233 | * attempt any fixup and just print a warning. | ||
215 | */ | 234 | */ |
216 | #ifndef CONFIG_PREEMPT | 235 | #ifndef CONFIG_PREEMPT |
217 | WARN_ON(1); | 236 | WARN_ON_ONCE(1); |
218 | return 0; | 237 | return 0; |
219 | #else | 238 | #endif |
220 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 239 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
221 | irqs_disabled()) { | 240 | irqs_disabled()) { |
222 | WARN_ON(1); | 241 | WARN_ON_ONCE(1); |
223 | return 0; | 242 | return 0; |
224 | } | 243 | } |
225 | rcu_barrier(); | 244 | rcu_barrier(); |
@@ -227,7 +246,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | |||
227 | rcu_barrier_bh(); | 246 | rcu_barrier_bh(); |
228 | debug_object_free(head, &rcuhead_debug_descr); | 247 | debug_object_free(head, &rcuhead_debug_descr); |
229 | return 1; | 248 | return 1; |
230 | #endif | ||
231 | default: | 249 | default: |
232 | return 0; | 250 | return 0; |
233 | } | 251 | }
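Note: the strengthened rcu_read_lock_bh_held() above now also reports "held" when interrupts are disabled, which matters mainly for lockdep-style assertions. A hedged usage sketch follows: rcu_dereference_check() is a typical consumer of such helpers, while my_dev, my_table and my_dev_table() are made-up names used only for illustration.

#include <linux/rcupdate.h>

struct my_table;

struct my_dev {
	struct my_table __rcu *table;
};

static struct my_table *my_dev_table(struct my_dev *dev)
{
	/* Legal from rcu_read_lock_bh() sections and, with the change
	 * above, also from regions that run with irqs disabled. */
	return rcu_dereference_check(dev->table, rcu_read_lock_bh_held());
}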
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 196ec02f8be0..7bbac7d0f5ab 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -35,29 +35,23 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/time.h> | 36 | #include <linux/time.h> |
37 | #include <linux/cpu.h> | 37 | #include <linux/cpu.h> |
38 | #include <linux/prefetch.h> | ||
39 | |||
40 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ | ||
41 | static struct task_struct *rcu_kthread_task; | ||
42 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); | ||
43 | static unsigned long have_rcu_kthread_work; | ||
44 | |||
45 | /* Forward declarations for rcutiny_plugin.h. */ | ||
46 | struct rcu_ctrlblk; | ||
47 | static void invoke_rcu_kthread(void); | ||
48 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); | ||
49 | static int rcu_kthread(void *arg); | ||
50 | static void __call_rcu(struct rcu_head *head, | ||
51 | void (*func)(struct rcu_head *rcu), | ||
52 | struct rcu_ctrlblk *rcp); | ||
38 | 53 | ||
39 | /* Global control variables for rcupdate callback mechanism. */ | 54 | #include "rcutiny_plugin.h" |
40 | struct rcu_ctrlblk { | ||
41 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | ||
42 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | ||
43 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | ||
44 | }; | ||
45 | |||
46 | /* Definition for rcupdate control block. */ | ||
47 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
48 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
49 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
50 | }; | ||
51 | |||
52 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
53 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
54 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
55 | }; | ||
56 | |||
57 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
58 | int rcu_scheduler_active __read_mostly; | ||
59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
60 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
61 | 55 | ||
62 | #ifdef CONFIG_NO_HZ | 56 | #ifdef CONFIG_NO_HZ |
63 | 57 | ||
@@ -86,36 +80,45 @@ void rcu_exit_nohz(void) | |||
86 | #endif /* #ifdef CONFIG_NO_HZ */ | 80 | #endif /* #ifdef CONFIG_NO_HZ */ |
87 | 81 | ||
88 | /* | 82 | /* |
89 | * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). | 83 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). |
90 | * Also disable irqs to avoid confusion due to interrupt handlers | 84 | * Also irqs are disabled to avoid confusion due to interrupt handlers |
91 | * invoking call_rcu(). | 85 | * invoking call_rcu(). |
92 | */ | 86 | */ |
93 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 87 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
94 | { | 88 | { |
95 | unsigned long flags; | ||
96 | |||
97 | local_irq_save(flags); | ||
98 | if (rcp->rcucblist != NULL && | 89 | if (rcp->rcucblist != NULL && |
99 | rcp->donetail != rcp->curtail) { | 90 | rcp->donetail != rcp->curtail) { |
100 | rcp->donetail = rcp->curtail; | 91 | rcp->donetail = rcp->curtail; |
101 | local_irq_restore(flags); | ||
102 | return 1; | 92 | return 1; |
103 | } | 93 | } |
104 | local_irq_restore(flags); | ||
105 | 94 | ||
106 | return 0; | 95 | return 0; |
107 | } | 96 | } |
108 | 97 | ||
109 | /* | 98 | /* |
99 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
100 | * or to boost readers. | ||
101 | */ | ||
102 | static void invoke_rcu_kthread(void) | ||
103 | { | ||
104 | have_rcu_kthread_work = 1; | ||
105 | wake_up(&rcu_kthread_wq); | ||
106 | } | ||
107 | |||
108 | /* | ||
110 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we | 109 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we |
111 | * are at it, given that any rcu quiescent state is also an rcu_bh | 110 | * are at it, given that any rcu quiescent state is also an rcu_bh |
112 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. | 111 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. |
113 | */ | 112 | */ |
114 | void rcu_sched_qs(int cpu) | 113 | void rcu_sched_qs(int cpu) |
115 | { | 114 | { |
115 | unsigned long flags; | ||
116 | |||
117 | local_irq_save(flags); | ||
116 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | 118 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
117 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | 119 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
118 | raise_softirq(RCU_SOFTIRQ); | 120 | invoke_rcu_kthread(); |
121 | local_irq_restore(flags); | ||
119 | } | 122 | } |
120 | 123 | ||
121 | /* | 124 | /* |
@@ -123,8 +126,12 @@ void rcu_sched_qs(int cpu) | |||
123 | */ | 126 | */ |
124 | void rcu_bh_qs(int cpu) | 127 | void rcu_bh_qs(int cpu) |
125 | { | 128 | { |
129 | unsigned long flags; | ||
130 | |||
131 | local_irq_save(flags); | ||
126 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | 132 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
127 | raise_softirq(RCU_SOFTIRQ); | 133 | invoke_rcu_kthread(); |
134 | local_irq_restore(flags); | ||
128 | } | 135 | } |
129 | 136 | ||
130 | /* | 137 | /* |
@@ -140,16 +147,18 @@ void rcu_check_callbacks(int cpu, int user) | |||
140 | rcu_sched_qs(cpu); | 147 | rcu_sched_qs(cpu); |
141 | else if (!in_softirq()) | 148 | else if (!in_softirq()) |
142 | rcu_bh_qs(cpu); | 149 | rcu_bh_qs(cpu); |
150 | rcu_preempt_check_callbacks(); | ||
143 | } | 151 | } |
144 | 152 | ||
145 | /* | 153 | /* |
146 | * Helper function for rcu_process_callbacks() that operates on the | 154 | * Invoke the RCU callbacks on the specified rcu_ctrlblk structure |
147 | * specified rcu_ctrlkblk structure. | 155 | * whose grace period has elapsed. |
148 | */ | 156 | */ |
149 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 157 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
150 | { | 158 | { |
151 | struct rcu_head *next, *list; | 159 | struct rcu_head *next, *list; |
152 | unsigned long flags; | 160 | unsigned long flags; |
161 | RCU_TRACE(int cb_count = 0); | ||
153 | 162 | ||
154 | /* If no RCU callbacks ready to invoke, just return. */ | 163 | /* If no RCU callbacks ready to invoke, just return. */ |
155 | if (&rcp->rcucblist == rcp->donetail) | 164 | if (&rcp->rcucblist == rcp->donetail) |
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
162 | *rcp->donetail = NULL; | 171 | *rcp->donetail = NULL; |
163 | if (rcp->curtail == rcp->donetail) | 172 | if (rcp->curtail == rcp->donetail) |
164 | rcp->curtail = &rcp->rcucblist; | 173 | rcp->curtail = &rcp->rcucblist; |
174 | rcu_preempt_remove_callbacks(rcp); | ||
165 | rcp->donetail = &rcp->rcucblist; | 175 | rcp->donetail = &rcp->rcucblist; |
166 | local_irq_restore(flags); | 176 | local_irq_restore(flags); |
167 | 177 | ||
@@ -170,18 +180,45 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
170 | next = list->next; | 180 | next = list->next; |
171 | prefetch(next); | 181 | prefetch(next); |
172 | debug_rcu_head_unqueue(list); | 182 | debug_rcu_head_unqueue(list); |
173 | list->func(list); | 183 | local_bh_disable(); |
184 | __rcu_reclaim(list); | ||
185 | local_bh_enable(); | ||
174 | list = next; | 186 | list = next; |
187 | RCU_TRACE(cb_count++); | ||
175 | } | 188 | } |
189 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | ||
176 | } | 190 | } |
177 | 191 | ||
178 | /* | 192 | /* |
179 | * Invoke any callbacks whose grace period has completed. | 193 | * This kthread invokes RCU callbacks whose grace periods have |
194 | * elapsed. It is awakened as needed, and takes the place of the | ||
195 | * RCU_SOFTIRQ that was used previously for this purpose. | ||
196 | * This is a kthread, but it is never stopped, at least not until | ||
197 | * the system goes down. | ||
180 | */ | 198 | */ |
181 | static void rcu_process_callbacks(struct softirq_action *unused) | 199 | static int rcu_kthread(void *arg) |
182 | { | 200 | { |
183 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 201 | unsigned long work; |
184 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 202 | unsigned long morework; |
203 | unsigned long flags; | ||
204 | |||
205 | for (;;) { | ||
206 | wait_event_interruptible(rcu_kthread_wq, | ||
207 | have_rcu_kthread_work != 0); | ||
208 | morework = rcu_boost(); | ||
209 | local_irq_save(flags); | ||
210 | work = have_rcu_kthread_work; | ||
211 | have_rcu_kthread_work = morework; | ||
212 | local_irq_restore(flags); | ||
213 | if (work) { | ||
214 | rcu_process_callbacks(&rcu_sched_ctrlblk); | ||
215 | rcu_process_callbacks(&rcu_bh_ctrlblk); | ||
216 | rcu_preempt_process_callbacks(); | ||
217 | } | ||
218 | schedule_timeout_interruptible(1); /* Leave CPU for others. */ | ||
219 | } | ||
220 | |||
221 | return 0; /* Not reached, but needed to shut gcc up. */ | ||
185 | } | 222 | } |
186 | 223 | ||
187 | /* | 224 | /* |
@@ -219,19 +256,20 @@ static void __call_rcu(struct rcu_head *head, | |||
219 | local_irq_save(flags); | 256 | local_irq_save(flags); |
220 | *rcp->curtail = head; | 257 | *rcp->curtail = head; |
221 | rcp->curtail = &head->next; | 258 | rcp->curtail = &head->next; |
259 | RCU_TRACE(rcp->qlen++); | ||
222 | local_irq_restore(flags); | 260 | local_irq_restore(flags); |
223 | } | 261 | } |
224 | 262 | ||
225 | /* | 263 | /* |
226 | * Post an RCU callback to be invoked after the end of an RCU grace | 264 | * Post an RCU callback to be invoked after the end of an RCU-sched grace |
227 | * period. But since we have but one CPU, that would be after any | 265 | * period. But since we have but one CPU, that would be after any |
228 | * quiescent state. | 266 | * quiescent state. |
229 | */ | 267 | */ |
230 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 268 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
231 | { | 269 | { |
232 | __call_rcu(head, func, &rcu_sched_ctrlblk); | 270 | __call_rcu(head, func, &rcu_sched_ctrlblk); |
233 | } | 271 | } |
234 | EXPORT_SYMBOL_GPL(call_rcu); | 272 | EXPORT_SYMBOL_GPL(call_rcu_sched); |
235 | 273 | ||
236 | /* | 274 | /* |
237 | * Post an RCU bottom-half callback to be invoked after any subsequent | 275 | * Post an RCU bottom-half callback to be invoked after any subsequent |
@@ -243,20 +281,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
243 | } | 281 | } |
244 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 282 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
245 | 283 | ||
246 | void rcu_barrier(void) | ||
247 | { | ||
248 | struct rcu_synchronize rcu; | ||
249 | |||
250 | init_rcu_head_on_stack(&rcu.head); | ||
251 | init_completion(&rcu.completion); | ||
252 | /* Will wake me after RCU finished. */ | ||
253 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
254 | /* Wait for it. */ | ||
255 | wait_for_completion(&rcu.completion); | ||
256 | destroy_rcu_head_on_stack(&rcu.head); | ||
257 | } | ||
258 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
259 | |||
260 | void rcu_barrier_bh(void) | 284 | void rcu_barrier_bh(void) |
261 | { | 285 | { |
262 | struct rcu_synchronize rcu; | 286 | struct rcu_synchronize rcu; |
@@ -285,9 +309,16 @@ void rcu_barrier_sched(void) | |||
285 | } | 309 | } |
286 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 310 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); |
287 | 311 | ||
288 | void __init rcu_init(void) | 312 | /* |
313 | * Spawn the kthread that invokes RCU callbacks. | ||
314 | */ | ||
315 | static int __init rcu_spawn_kthreads(void) | ||
289 | { | 316 | { |
290 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 317 | struct sched_param sp; |
291 | } | ||
292 | 318 | ||
293 | #include "rcutiny_plugin.h" | 319 | rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); |
320 | sp.sched_priority = RCU_BOOST_PRIO; | ||
321 | sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); | ||
322 | return 0; | ||
323 | } | ||
324 | early_initcall(rcu_spawn_kthreads); | ||
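Note: the rcutiny rework above retires RCU_SOFTIRQ in favor of rcu_kthread(): quiescent states merely set have_rcu_kthread_work and wake the kthread, which then drains the callback lists. A stand-alone sketch of that flag-and-wake worker pattern using POSIX threads follows; all names are invented, and the kernel uses a waitqueue plus local_irq_save() where this sketch uses a mutex and condition variable.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int have_work;				/* ~ have_rcu_kthread_work */

static void kick_worker(void)			/* ~ invoke_rcu_kthread() */
{
	pthread_mutex_lock(&lock);
	have_work = 1;
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)			/* ~ rcu_kthread() */
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		while (!have_work)
			pthread_cond_wait(&wake, &lock);
		have_work = 0;			/* consume the flag */
		pthread_mutex_unlock(&lock);
		puts("process callbacks");	/* ~ rcu_process_callbacks() */
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	kick_worker();
	sleep(1);				/* let the worker run once */
	return 0;
}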
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index d223a92bc742..f259c676195f 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | 2 | * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition |
3 | * Internal non-public definitions that provide either classic | 3 | * Internal non-public definitions that provide either classic |
4 | * or preemptable semantics. | 4 | * or preemptible semantics. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -17,23 +17,991 @@ | |||
17 | * along with this program; if not, write to the Free Software | 17 | * along with this program; if not, write to the Free Software |
18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
19 | * | 19 | * |
20 | * Copyright IBM Corporation, 2009 | 20 | * Copyright (c) 2010 Linaro |
21 | * | 21 | * |
22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/kthread.h> | ||
26 | #include <linux/debugfs.h> | ||
27 | #include <linux/seq_file.h> | ||
28 | |||
29 | #ifdef CONFIG_RCU_TRACE | ||
30 | #define RCU_TRACE(stmt) stmt | ||
31 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
32 | #define RCU_TRACE(stmt) | ||
33 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
34 | |||
35 | /* Global control variables for rcupdate callback mechanism. */ | ||
36 | struct rcu_ctrlblk { | ||
37 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | ||
38 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | ||
39 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | ||
40 | RCU_TRACE(long qlen); /* Number of pending CBs. */ | ||
41 | }; | ||
42 | |||
43 | /* Definition for rcupdate control block. */ | ||
44 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
45 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
46 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
47 | }; | ||
48 | |||
49 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
50 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
51 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
52 | }; | ||
53 | |||
25 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
55 | int rcu_scheduler_active __read_mostly; | ||
56 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
57 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
58 | |||
59 | #ifdef CONFIG_TINY_PREEMPT_RCU | ||
60 | |||
61 | #include <linux/delay.h> | ||
62 | |||
63 | /* Global control variables for preemptible RCU. */ | ||
64 | struct rcu_preempt_ctrlblk { | ||
65 | struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ | ||
66 | struct rcu_head **nexttail; | ||
67 | /* Tasks blocked in a preemptible RCU */ | ||
68 | /* read-side critical section while a */ ||
69 | /* preemptible-RCU grace period is in */ | ||
70 | /* progress must wait for a later grace */ | ||
71 | /* period. This pointer points to the */ | ||
72 | /* ->next pointer of the last task that */ | ||
73 | /* must wait for a later grace period, or */ | ||
74 | /* to &->rcb.rcucblist if there is no */ | ||
75 | /* such task. */ | ||
76 | struct list_head blkd_tasks; | ||
77 | /* Tasks blocked in RCU read-side critical */ | ||
78 | /* section. Tasks are placed at the head */ | ||
79 | /* of this list and age towards the tail. */ | ||
80 | struct list_head *gp_tasks; | ||
81 | /* Pointer to the first task blocking the */ | ||
82 | /* current grace period, or NULL if there */ | ||
83 | /* is no such task. */ | ||
84 | struct list_head *exp_tasks; | ||
85 | /* Pointer to first task blocking the */ | ||
86 | /* current expedited grace period, or NULL */ | ||
87 | /* if there is no such task. If there */ | ||
88 | /* is no current expedited grace period, */ | ||
89 | /* then there cannot be any such task. */ | ||
90 | #ifdef CONFIG_RCU_BOOST | ||
91 | struct list_head *boost_tasks; | ||
92 | /* Pointer to first task that needs to be */ | ||
93 | /* priority-boosted, or NULL if no priority */ | ||
94 | /* boosting is needed. If there is no */ | ||
95 | /* current or expedited grace period, there */ | ||
96 | /* can be no such task. */ | ||
97 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
98 | u8 gpnum; /* Current grace period. */ | ||
99 | u8 gpcpu; /* Last grace period blocked by the CPU. */ | ||
100 | u8 completed; /* Last grace period completed. */ | ||
101 | /* If all three are equal, RCU is idle. */ | ||
102 | #ifdef CONFIG_RCU_BOOST | ||
103 | unsigned long boost_time; /* When to start boosting (jiffies) */ | ||
104 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
105 | #ifdef CONFIG_RCU_TRACE | ||
106 | unsigned long n_grace_periods; | ||
107 | #ifdef CONFIG_RCU_BOOST | ||
108 | unsigned long n_tasks_boosted; | ||
109 | /* Total number of tasks boosted. */ | ||
110 | unsigned long n_exp_boosts; | ||
111 | /* Number of tasks boosted for expedited GP. */ | ||
112 | unsigned long n_normal_boosts; | ||
113 | /* Number of tasks boosted for normal GP. */ | ||
114 | unsigned long n_balk_blkd_tasks; | ||
115 | /* Refused to boost: no blocked tasks. */ | ||
116 | unsigned long n_balk_exp_gp_tasks; | ||
117 | /* Refused to boost: nothing blocking GP. */ | ||
118 | unsigned long n_balk_boost_tasks; | ||
119 | /* Refused to boost: already boosting. */ | ||
120 | unsigned long n_balk_notyet; | ||
121 | /* Refused to boost: not yet time. */ | ||
122 | unsigned long n_balk_nos; | ||
123 | /* Refused to boost: not sure why, though. */ | ||
124 | /* This can happen due to race conditions. */ | ||
125 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
126 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
127 | }; | ||
128 | |||
129 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | ||
130 | .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
131 | .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
132 | .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
133 | .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), | ||
134 | }; | ||
135 | |||
136 | static int rcu_preempted_readers_exp(void); | ||
137 | static void rcu_report_exp_done(void); | ||
138 | |||
139 | /* | ||
140 | * Return true if the CPU has not yet responded to the current grace period. | ||
141 | */ | ||
142 | static int rcu_cpu_blocking_cur_gp(void) | ||
143 | { | ||
144 | return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * Check for a running RCU reader. Because there is only one CPU, | ||
149 | * there can be but one running RCU reader at a time. ;-) | ||
150 | */ | ||
151 | static int rcu_preempt_running_reader(void) | ||
152 | { | ||
153 | return current->rcu_read_lock_nesting; | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * Check for preempted RCU readers blocking any grace period. | ||
158 | * If the caller needs a reliable answer, it must disable hard irqs. | ||
159 | */ | ||
160 | static int rcu_preempt_blocked_readers_any(void) | ||
161 | { | ||
162 | return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * Check for preempted RCU readers blocking the current grace period. | ||
167 | * If the caller needs a reliable answer, it must disable hard irqs. | ||
168 | */ | ||
169 | static int rcu_preempt_blocked_readers_cgp(void) | ||
170 | { | ||
171 | return rcu_preempt_ctrlblk.gp_tasks != NULL; | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * Return true if another preemptible-RCU grace period is needed. | ||
176 | */ | ||
177 | static int rcu_preempt_needs_another_gp(void) | ||
178 | { | ||
179 | return *rcu_preempt_ctrlblk.rcb.curtail != NULL; | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * Return true if a preemptible-RCU grace period is in progress. | ||
184 | * The caller must disable hardirqs. | ||
185 | */ | ||
186 | static int rcu_preempt_gp_in_progress(void) | ||
187 | { | ||
188 | return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * Advance a ->blkd_tasks-list pointer to the next entry, or ||
193 | * return NULL if at the end of the list. ||
194 | */ | ||
195 | static struct list_head *rcu_next_node_entry(struct task_struct *t) | ||
196 | { | ||
197 | struct list_head *np; | ||
198 | |||
199 | np = t->rcu_node_entry.next; | ||
200 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
201 | np = NULL; | ||
202 | return np; | ||
203 | } | ||
204 | |||
205 | #ifdef CONFIG_RCU_TRACE | ||
206 | |||
207 | #ifdef CONFIG_RCU_BOOST | ||
208 | static void rcu_initiate_boost_trace(void); | ||
209 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
210 | |||
211 | /* | ||
212 | * Dump additional statistics for TINY_PREEMPT_RCU. ||
213 | */ | ||
214 | static void show_tiny_preempt_stats(struct seq_file *m) | ||
215 | { | ||
216 | seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", | ||
217 | rcu_preempt_ctrlblk.rcb.qlen, | ||
218 | rcu_preempt_ctrlblk.n_grace_periods, | ||
219 | rcu_preempt_ctrlblk.gpnum, | ||
220 | rcu_preempt_ctrlblk.gpcpu, | ||
221 | rcu_preempt_ctrlblk.completed, | ||
222 | "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], | ||
223 | "N."[!rcu_preempt_ctrlblk.gp_tasks], | ||
224 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); | ||
225 | #ifdef CONFIG_RCU_BOOST | ||
226 | seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", | ||
227 | " ", | ||
228 | "B."[!rcu_preempt_ctrlblk.boost_tasks], | ||
229 | rcu_preempt_ctrlblk.n_tasks_boosted, | ||
230 | rcu_preempt_ctrlblk.n_exp_boosts, | ||
231 | rcu_preempt_ctrlblk.n_normal_boosts, | ||
232 | (int)(jiffies & 0xffff), | ||
233 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); | ||
234 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", | ||
235 | " balk", | ||
236 | rcu_preempt_ctrlblk.n_balk_blkd_tasks, | ||
237 | rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, | ||
238 | rcu_preempt_ctrlblk.n_balk_boost_tasks, | ||
239 | rcu_preempt_ctrlblk.n_balk_notyet, | ||
240 | rcu_preempt_ctrlblk.n_balk_nos); | ||
241 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
242 | } | ||
243 | |||
244 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
245 | |||
246 | #ifdef CONFIG_RCU_BOOST | ||
247 | |||
248 | #include "rtmutex_common.h" | ||
249 | |||
250 | /* | ||
251 | * Carry out RCU priority boosting on the task indicated by ->boost_tasks, | ||
252 | * and advance ->boost_tasks to the next task in the ->blkd_tasks list. | ||
253 | */ | ||
254 | static int rcu_boost(void) | ||
255 | { | ||
256 | unsigned long flags; | ||
257 | struct rt_mutex mtx; | ||
258 | struct task_struct *t; | ||
259 | struct list_head *tb; | ||
260 | |||
261 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
262 | rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
263 | return 0; /* Nothing to boost. */ | ||
264 | |||
265 | raw_local_irq_save(flags); | ||
266 | |||
267 | /* | ||
268 | * Recheck with irqs disabled: all tasks in need of boosting | ||
269 | * might exit their RCU read-side critical sections on their own | ||
270 | * if we are preempted just before disabling irqs. | ||
271 | */ | ||
272 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
273 | rcu_preempt_ctrlblk.exp_tasks == NULL) { | ||
274 | raw_local_irq_restore(flags); | ||
275 | return 0; | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * Preferentially boost tasks blocking expedited grace periods. | ||
280 | * This cannot starve the normal grace periods because a second | ||
281 | * expedited grace period must boost all blocked tasks, including | ||
282 | * those blocking the pre-existing normal grace period. | ||
283 | */ | ||
284 | if (rcu_preempt_ctrlblk.exp_tasks != NULL) { | ||
285 | tb = rcu_preempt_ctrlblk.exp_tasks; | ||
286 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
287 | } else { | ||
288 | tb = rcu_preempt_ctrlblk.boost_tasks; | ||
289 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
290 | } | ||
291 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | ||
292 | |||
293 | /* | ||
294 | * We boost task t by manufacturing an rt_mutex that appears to | ||
295 | * be held by task t. We leave a pointer to that rt_mutex where | ||
296 | * task t can find it, and task t will release the mutex when it | ||
297 | * exits its outermost RCU read-side critical section. Then | ||
298 | * simply acquiring this artificial rt_mutex will boost task | ||
299 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
300 | */ | ||
301 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
302 | rt_mutex_init_proxy_locked(&mtx, t); | ||
303 | t->rcu_boost_mutex = &mtx; | ||
304 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | ||
305 | raw_local_irq_restore(flags); | ||
306 | rt_mutex_lock(&mtx); | ||
307 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | ||
308 | |||
309 | return rcu_preempt_ctrlblk.boost_tasks != NULL || | ||
310 | rcu_preempt_ctrlblk.exp_tasks != NULL; | ||
311 | } | ||
312 | |||
313 | /* | ||
314 | * Check to see if it is now time to start boosting RCU readers blocking | ||
315 | * the current grace period, and, if so, tell the rcu_kthread_task to | ||
316 | * start boosting them. If there is an expedited boost in progress, | ||
317 | * we wait for it to complete. | ||
318 | * | ||
319 | * If there are no blocked readers blocking the current grace period, | ||
320 | * return 0 to let the caller know, otherwise return 1. Note that this | ||
321 | * return value is independent of whether or not boosting was done. | ||
322 | */ | ||
323 | static int rcu_initiate_boost(void) | ||
324 | { | ||
325 | if (!rcu_preempt_blocked_readers_cgp() && | ||
326 | rcu_preempt_ctrlblk.exp_tasks == NULL) { | ||
327 | RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); | ||
328 | return 0; | ||
329 | } | ||
330 | if (rcu_preempt_ctrlblk.exp_tasks != NULL || | ||
331 | (rcu_preempt_ctrlblk.gp_tasks != NULL && | ||
332 | rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
333 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { | ||
334 | if (rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
335 | rcu_preempt_ctrlblk.boost_tasks = | ||
336 | rcu_preempt_ctrlblk.gp_tasks; | ||
337 | invoke_rcu_kthread(); | ||
338 | } else | ||
339 | RCU_TRACE(rcu_initiate_boost_trace()); | ||
340 | return 1; | ||
341 | } | ||
342 | |||
343 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | ||
344 | |||
345 | /* | ||
346 | * Do priority-boost accounting for the start of a new grace period. | ||
347 | */ | ||
348 | static void rcu_preempt_boost_start_gp(void) | ||
349 | { | ||
350 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
351 | } | ||
352 | |||
353 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
354 | |||
355 | /* | ||
356 | * If there is no RCU priority boosting, we don't boost. | ||
357 | */ | ||
358 | static int rcu_boost(void) | ||
359 | { | ||
360 | return 0; | ||
361 | } | ||
362 | |||
363 | /* | ||
364 | * If there is no RCU priority boosting, we don't initiate boosting, | ||
365 | * but we do indicate whether there are blocked readers blocking the | ||
366 | * current grace period. | ||
367 | */ | ||
368 | static int rcu_initiate_boost(void) | ||
369 | { | ||
370 | return rcu_preempt_blocked_readers_cgp(); | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * If there is no RCU priority boosting, nothing to do at grace-period start. | ||
375 | */ | ||
376 | static void rcu_preempt_boost_start_gp(void) | ||
377 | { | ||
378 | } | ||
379 | |||
380 | #endif /* else #ifdef CONFIG_RCU_BOOST */ | ||
381 | |||
382 | /* | ||
383 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | ||
384 | * that this just means that the task currently running on the CPU is | ||
385 | * in a quiescent state. There might be any number of tasks blocked | ||
386 | * while in an RCU read-side critical section. | ||
387 | * | ||
388 | * Unlike the other rcu_*_qs() functions, callers to this function | ||
389 | * must disable irqs in order to protect the assignment to | ||
390 | * ->rcu_read_unlock_special. | ||
391 | * | ||
392 | * Because this is a single-CPU implementation, the only way a grace | ||
393 | * period can end is if the CPU is in a quiescent state. The reason is | ||
394 | * that a blocked preemptible-RCU reader can exit its critical section | ||
395 | * only if the CPU is running it at the time. Therefore, when the | ||
396 | * last task blocking the current grace period exits its RCU read-side | ||
397 | * critical section, neither the CPU nor blocked tasks will be stopping | ||
398 | * the current grace period. (In contrast, SMP implementations | ||
399 | * might have CPUs running in RCU read-side critical sections that | ||
400 | * block later grace periods -- but this is not possible given only | ||
401 | * one CPU.) | ||
402 | */ | ||
403 | static void rcu_preempt_cpu_qs(void) | ||
404 | { | ||
405 | /* Record both CPU and task as having responded to current GP. */ | ||
406 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; | ||
407 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | ||
408 | |||
409 | /* If there is no GP then there is nothing more to do. */ | ||
410 | if (!rcu_preempt_gp_in_progress()) | ||
411 | return; | ||
412 | /* | ||
413 | * Check up on boosting. If there are readers blocking the | ||
414 | * current grace period, leave. | ||
415 | */ | ||
416 | if (rcu_initiate_boost()) | ||
417 | return; | ||
418 | |||
419 | /* Advance callbacks. */ | ||
420 | rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; | ||
421 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; | ||
422 | rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; | ||
423 | |||
424 | /* If there are no blocked readers, next GP is done instantly. */ | ||
425 | if (!rcu_preempt_blocked_readers_any()) | ||
426 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; | ||
427 | |||
428 | /* If there are done callbacks, cause them to be invoked. */ | ||
429 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) | ||
430 | invoke_rcu_kthread(); | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * Start a new RCU grace period if warranted. Hard irqs must be disabled. | ||
435 | */ | ||
436 | static void rcu_preempt_start_gp(void) | ||
437 | { | ||
438 | if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { | ||
439 | |||
440 | /* Official start of GP. */ | ||
441 | rcu_preempt_ctrlblk.gpnum++; | ||
442 | RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); | ||
443 | |||
444 | /* Any blocked RCU readers block new GP. */ | ||
445 | if (rcu_preempt_blocked_readers_any()) | ||
446 | rcu_preempt_ctrlblk.gp_tasks = | ||
447 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
448 | |||
449 | /* Set up for RCU priority boosting. */ | ||
450 | rcu_preempt_boost_start_gp(); | ||
451 | |||
452 | /* If there is no running reader, CPU is done with GP. */ | ||
453 | if (!rcu_preempt_running_reader()) | ||
454 | rcu_preempt_cpu_qs(); | ||
455 | } | ||
456 | } | ||
457 | |||
458 | /* | ||
459 | * We have entered the scheduler, and the current task might soon be | ||
460 | * context-switched away from. If this task is in an RCU read-side | ||
461 | * critical section, we will no longer be able to rely on the CPU to | ||
462 | * record that fact, so we enqueue the task on the blkd_tasks list. | ||
463 | * If the task started after the current grace period began, as recorded | ||
464 | * by ->gpcpu, we enqueue at the beginning of the list. Otherwise, we enqueue ||
465 | * before the element referenced by ->gp_tasks (or at the tail if | ||
466 | * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. | ||
467 | * The task will dequeue itself when it exits the outermost enclosing | ||
468 | * RCU read-side critical section. Therefore, the current grace period | ||
469 | * cannot be permitted to complete until the ->gp_tasks pointer becomes | ||
470 | * NULL. | ||
471 | * | ||
472 | * Caller must disable preemption. | ||
473 | */ | ||
474 | void rcu_preempt_note_context_switch(void) | ||
475 | { | ||
476 | struct task_struct *t = current; | ||
477 | unsigned long flags; | ||
478 | |||
479 | local_irq_save(flags); /* must exclude scheduler_tick(). */ | ||
480 | if (rcu_preempt_running_reader() && | ||
481 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | ||
482 | |||
483 | /* Possibly blocking in an RCU read-side critical section. */ | ||
484 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | ||
485 | |||
486 | /* | ||
487 | * If this CPU has already checked in, then this task | ||
488 | * will hold up the next grace period rather than the | ||
489 | * current grace period. Queue the task accordingly. | ||
490 | * If the task is queued for the current grace period | ||
491 | * (i.e., this CPU has not yet passed through a quiescent | ||
492 | * state for the current grace period), then as long | ||
493 | * as that task remains queued, the current grace period | ||
494 | * cannot end. | ||
495 | */ | ||
496 | list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); | ||
497 | if (rcu_cpu_blocking_cur_gp()) | ||
498 | rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; | ||
499 | } | ||
500 | |||
501 | /* | ||
502 | * Either we were not in an RCU read-side critical section to | ||
503 | * begin with, or we have now recorded that critical section | ||
504 | * globally. Either way, we can now note a quiescent state | ||
505 | * for this CPU. Again, if we were in an RCU read-side critical | ||
506 | * section, and if that critical section was blocking the current | ||
507 | * grace period, then the fact that the task has been enqueued | ||
508 | * means that current grace period continues to be blocked. | ||
509 | */ | ||
510 | rcu_preempt_cpu_qs(); | ||
511 | local_irq_restore(flags); | ||
512 | } | ||
513 | |||
514 | /* | ||
515 | * Tiny-preemptible RCU implementation for rcu_read_lock(). | ||
516 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | ||
517 | * if we block. | ||
518 | */ | ||
519 | void __rcu_read_lock(void) | ||
520 | { | ||
521 | current->rcu_read_lock_nesting++; | ||
522 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ | ||
523 | } | ||
524 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
525 | |||
526 | /* | ||
527 | * Handle special cases during rcu_read_unlock(), such as needing to | ||
528 | * notify RCU core processing or task having blocked during the RCU | ||
529 | * read-side critical section. | ||
530 | */ | ||
531 | static void rcu_read_unlock_special(struct task_struct *t) | ||
532 | { | ||
533 | int empty; | ||
534 | int empty_exp; | ||
535 | unsigned long flags; | ||
536 | struct list_head *np; | ||
537 | int special; | ||
538 | |||
539 | /* | ||
540 | * NMI handlers cannot block and cannot safely manipulate state. | ||
541 | * They therefore cannot possibly be special, so just leave. | ||
542 | */ | ||
543 | if (in_nmi()) | ||
544 | return; | ||
545 | |||
546 | local_irq_save(flags); | ||
547 | |||
548 | /* | ||
549 | * If RCU core is waiting for this CPU to exit critical section, | ||
550 | * let it know that we have done so. | ||
551 | */ | ||
552 | special = t->rcu_read_unlock_special; | ||
553 | if (special & RCU_READ_UNLOCK_NEED_QS) | ||
554 | rcu_preempt_cpu_qs(); | ||
555 | |||
556 | /* Hardware IRQ handlers cannot block. */ | ||
557 | if (in_irq()) { | ||
558 | local_irq_restore(flags); | ||
559 | return; | ||
560 | } | ||
561 | |||
562 | /* Clean up if blocked during RCU read-side critical section. */ | ||
563 | if (special & RCU_READ_UNLOCK_BLOCKED) { | ||
564 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; | ||
565 | |||
566 | /* | ||
567 | * Remove this task from the ->blkd_tasks list and adjust | ||
568 | * any pointers that might have been referencing it. | ||
569 | */ | ||
570 | empty = !rcu_preempt_blocked_readers_cgp(); | ||
571 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | ||
572 | np = rcu_next_node_entry(t); | ||
573 | list_del_init(&t->rcu_node_entry); | ||
574 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | ||
575 | rcu_preempt_ctrlblk.gp_tasks = np; | ||
576 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | ||
577 | rcu_preempt_ctrlblk.exp_tasks = np; | ||
578 | #ifdef CONFIG_RCU_BOOST | ||
579 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) | ||
580 | rcu_preempt_ctrlblk.boost_tasks = np; | ||
581 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
582 | |||
583 | /* | ||
584 | * If this was the last task on the current list, and if | ||
585 | * we aren't waiting on the CPU, report the quiescent state | ||
586 | * and start a new grace period if needed. | ||
587 | */ | ||
588 | if (!empty && !rcu_preempt_blocked_readers_cgp()) { | ||
589 | rcu_preempt_cpu_qs(); | ||
590 | rcu_preempt_start_gp(); | ||
591 | } | ||
26 | 592 | ||
593 | /* | ||
594 | * If this was the last task on the expedited lists, | ||
595 | * then we need to wake up the waiting task. ||
596 | */ | ||
597 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
598 | rcu_report_exp_done(); | ||
599 | } | ||
600 | #ifdef CONFIG_RCU_BOOST | ||
601 | /* Unboost self if was boosted. */ | ||
602 | if (special & RCU_READ_UNLOCK_BOOSTED) { | ||
603 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; | ||
604 | rt_mutex_unlock(t->rcu_boost_mutex); | ||
605 | t->rcu_boost_mutex = NULL; | ||
606 | } | ||
607 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
608 | local_irq_restore(flags); | ||
609 | } | ||
610 | |||
611 | /* | ||
612 | * Tiny-preemptible RCU implementation for rcu_read_unlock(). | ||
613 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
614 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
615 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
616 | * in an RCU read-side critical section and other special cases. | ||
617 | */ | ||
618 | void __rcu_read_unlock(void) | ||
619 | { | ||
620 | struct task_struct *t = current; | ||
621 | |||
622 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ | ||
623 | --t->rcu_read_lock_nesting; | ||
624 | barrier(); /* decrement before load of ->rcu_read_unlock_special */ | ||
625 | if (t->rcu_read_lock_nesting == 0 && | ||
626 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
627 | rcu_read_unlock_special(t); | ||
628 | #ifdef CONFIG_PROVE_LOCKING | ||
629 | WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); | ||
630 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
631 | } | ||
632 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
633 | |||
634 | /* | ||
635 | * Check for a quiescent state from the current CPU. When a task blocks, | ||
636 | * the task is recorded in the rcu_preempt_ctrlblk structure, which is | ||
637 | * checked elsewhere. This is called from the scheduling-clock interrupt. | ||
638 | * | ||
639 | * Caller must disable hard irqs. | ||
640 | */ | ||
641 | static void rcu_preempt_check_callbacks(void) | ||
642 | { | ||
643 | struct task_struct *t = current; | ||
644 | |||
645 | if (rcu_preempt_gp_in_progress() && | ||
646 | (!rcu_preempt_running_reader() || | ||
647 | !rcu_cpu_blocking_cur_gp())) | ||
648 | rcu_preempt_cpu_qs(); | ||
649 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != | ||
650 | rcu_preempt_ctrlblk.rcb.donetail) | ||
651 | invoke_rcu_kthread(); | ||
652 | if (rcu_preempt_gp_in_progress() && | ||
653 | rcu_cpu_blocking_cur_gp() && | ||
654 | rcu_preempt_running_reader()) | ||
655 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | ||
656 | } | ||
657 | |||
658 | /* | ||
659 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to | ||
660 | * update, so this is invoked from rcu_process_callbacks() to | ||
661 | * handle that case. Of course, it is invoked for all flavors of | ||
662 | * RCU, but RCU callbacks can appear only on one of the lists, and | ||
663 | * neither ->nexttail nor ->donetail can possibly be NULL, so there | ||
664 | * is no need for an explicit check. | ||
665 | */ | ||
666 | static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | ||
667 | { | ||
668 | if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) | ||
669 | rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; | ||
670 | } | ||
671 | |||
672 | /* | ||
673 | * Process callbacks for preemptible RCU. | ||
674 | */ | ||
675 | static void rcu_preempt_process_callbacks(void) | ||
676 | { | ||
677 | rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | ||
678 | } | ||
679 | |||
680 | /* | ||
681 | * Queue a preemptible-RCU callback for invocation after a grace period. ||
682 | */ | ||
683 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
684 | { | ||
685 | unsigned long flags; | ||
686 | |||
687 | debug_rcu_head_queue(head); | ||
688 | head->func = func; | ||
689 | head->next = NULL; | ||
690 | |||
691 | local_irq_save(flags); | ||
692 | *rcu_preempt_ctrlblk.nexttail = head; | ||
693 | rcu_preempt_ctrlblk.nexttail = &head->next; | ||
694 | RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); | ||
695 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ | ||
696 | local_irq_restore(flags); | ||
697 | } | ||
698 | EXPORT_SYMBOL_GPL(call_rcu); | ||
699 | |||
700 | void rcu_barrier(void) | ||
701 | { | ||
702 | struct rcu_synchronize rcu; | ||
703 | |||
704 | init_rcu_head_on_stack(&rcu.head); | ||
705 | init_completion(&rcu.completion); | ||
706 | /* Will wake me after RCU finished. */ | ||
707 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
708 | /* Wait for it. */ | ||
709 | wait_for_completion(&rcu.completion); | ||
710 | destroy_rcu_head_on_stack(&rcu.head); | ||
711 | } | ||
712 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
713 | |||
714 | /* | ||
715 | * synchronize_rcu - wait until a grace period has elapsed. | ||
716 | * | ||
717 | * Control will return to the caller some time after a full grace | ||
718 | * period has elapsed, in other words after all currently executing RCU | ||
719 | * read-side critical sections have completed. RCU read-side critical | ||
720 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
721 | * and may be nested. | ||
722 | */ | ||
723 | void synchronize_rcu(void) | ||
724 | { | ||
725 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
726 | if (!rcu_scheduler_active) | ||
727 | return; | ||
728 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
729 | |||
730 | WARN_ON_ONCE(rcu_preempt_running_reader()); | ||
731 | if (!rcu_preempt_blocked_readers_any()) | ||
732 | return; | ||
733 | |||
734 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ | ||
735 | rcu_barrier(); | ||
736 | } | ||
737 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
738 | |||
739 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | ||
740 | static unsigned long sync_rcu_preempt_exp_count; | ||
741 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | ||
742 | |||
743 | /* | ||
744 | * Return non-zero if there are any tasks in RCU read-side critical | ||
745 | * sections blocking the current preemptible-RCU expedited grace period. | ||
746 | * If there is no preemptible-RCU expedited grace period currently in | ||
747 | * progress, returns zero unconditionally. | ||
748 | */ | ||
749 | static int rcu_preempted_readers_exp(void) | ||
750 | { | ||
751 | return rcu_preempt_ctrlblk.exp_tasks != NULL; | ||
752 | } | ||
753 | |||
754 | /* | ||
755 | * Report the exit from RCU read-side critical section for the last task | ||
756 | * that queued itself during or before the current expedited preemptible-RCU | ||
757 | * grace period. | ||
758 | */ | ||
759 | static void rcu_report_exp_done(void) | ||
760 | { | ||
761 | wake_up(&sync_rcu_preempt_exp_wq); | ||
762 | } | ||
763 | |||
764 | /* | ||
765 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea | ||
766 | * is to rely on the fact that there is but one CPU, and that it is ||
767 | * illegal for a task to invoke synchronize_rcu_expedited() while in a | ||
768 | * preemptible-RCU read-side critical section. Therefore, any such | ||
769 | * critical sections must correspond to blocked tasks, which must therefore | ||
770 | * be on the ->blkd_tasks list. So just record the current head of the | ||
771 | * list in the ->exp_tasks pointer, and wait for all tasks including and | ||
772 | * after the task pointed to by ->exp_tasks to drain. | ||
773 | */ | ||
774 | void synchronize_rcu_expedited(void) | ||
775 | { | ||
776 | unsigned long flags; | ||
777 | struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; | ||
778 | unsigned long snap; | ||
779 | |||
780 | barrier(); /* ensure prior action seen before grace period. */ | ||
781 | |||
782 | WARN_ON_ONCE(rcu_preempt_running_reader()); | ||
783 | |||
784 | /* | ||
785 | * Acquire lock so that there is only one preemptible RCU grace | ||
786 | * period in flight. Of course, if someone does the expedited | ||
787 | * grace period for us while we are acquiring the lock, just leave. | ||
788 | */ | ||
789 | snap = sync_rcu_preempt_exp_count + 1; | ||
790 | mutex_lock(&sync_rcu_preempt_exp_mutex); | ||
791 | if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) | ||
792 | goto unlock_mb_ret; /* Others did our work for us. */ | ||
793 | |||
794 | local_irq_save(flags); | ||
795 | |||
796 | /* | ||
797 | * All RCU readers have to already be on blkd_tasks because | ||
798 | * we cannot legally be executing in an RCU read-side critical | ||
799 | * section. | ||
800 | */ | ||
801 | |||
802 | /* Snapshot current head of ->blkd_tasks list. */ | ||
803 | rpcp->exp_tasks = rpcp->blkd_tasks.next; | ||
804 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) | ||
805 | rpcp->exp_tasks = NULL; | ||
806 | |||
807 | /* Wait for tail of ->blkd_tasks list to drain. */ | ||
808 | if (!rcu_preempted_readers_exp()) | ||
809 | local_irq_restore(flags); | ||
810 | else { | ||
811 | rcu_initiate_boost(); | ||
812 | local_irq_restore(flags); | ||
813 | wait_event(sync_rcu_preempt_exp_wq, | ||
814 | !rcu_preempted_readers_exp()); | ||
815 | } | ||
816 | |||
817 | /* Clean up and exit. */ | ||
818 | barrier(); /* ensure expedited GP seen before counter increment. */ | ||
819 | sync_rcu_preempt_exp_count++; | ||
820 | unlock_mb_ret: | ||
821 | mutex_unlock(&sync_rcu_preempt_exp_mutex); | ||
822 | barrier(); /* ensure subsequent action seen after grace period. */ | ||
823 | } | ||
824 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
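Note: synchronize_rcu_expedited() above snapshots sync_rcu_preempt_exp_count before taking the mutex and bails out if the counter has already moved past that snapshot: the counter must then have advanced at least twice, so a complete expedited grace period began and ended after the snapshot was taken. A stand-alone sketch of that snapshot-and-skip idiom follows; gen, op_mutex and do_expensive_op() are invented names, and a pthread mutex stands in for the kernel mutex.

#include <pthread.h>

static pthread_mutex_t op_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned long gen;		/* completed-operation counter */

static void do_expensive_op(void)
{
	unsigned long snap = gen + 1;	/* generation the counter must pass */

	pthread_mutex_lock(&op_mutex);
	if ((long)(gen - snap) > 0) {	/* wrap-safe "gen is past snap" */
		pthread_mutex_unlock(&op_mutex);
		return;			/* others did our work for us */
	}
	/* ... the operation itself would go here ... */
	gen++;				/* publish completion to later callers */
	pthread_mutex_unlock(&op_mutex);
}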
825 | |||
826 | /* | ||
827 | * Does preemptible RCU need the CPU to stay out of dynticks mode? | ||
828 | */ | ||
829 | int rcu_preempt_needs_cpu(void) | ||
830 | { | ||
831 | if (!rcu_preempt_running_reader()) | ||
832 | rcu_preempt_cpu_qs(); | ||
833 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | ||
834 | } | ||
835 | |||
836 | /* | ||
837 | * Check for a task exiting while in a preemptible-RCU read-side ||
838 | * critical section, clean up if so. No need to issue warnings, | ||
839 | * as debug_check_no_locks_held() already does this if lockdep | ||
840 | * is enabled. | ||
841 | */ | ||
842 | void exit_rcu(void) | ||
843 | { | ||
844 | struct task_struct *t = current; | ||
845 | |||
846 | if (t->rcu_read_lock_nesting == 0) | ||
847 | return; | ||
848 | t->rcu_read_lock_nesting = 1; | ||
849 | __rcu_read_unlock(); | ||
850 | } | ||
851 | |||
852 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
853 | |||
854 | #ifdef CONFIG_RCU_TRACE | ||
855 | |||
856 | /* | ||
857 | * Because preemptible RCU does not exist, it is not necessary to | ||
858 | * dump out its statistics. | ||
859 | */ | ||
860 | static void show_tiny_preempt_stats(struct seq_file *m) | ||
861 | { | ||
862 | } | ||
863 | |||
864 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
865 | |||
866 | /* | ||
867 | * Because preemptible RCU does not exist, it is never necessary to | ||
868 | * boost preempted RCU readers. | ||
869 | */ | ||
870 | static int rcu_boost(void) | ||
871 | { | ||
872 | return 0; | ||
873 | } | ||
874 | |||
875 | /* | ||
876 | * Because preemptible RCU does not exist, it never has any callbacks | ||
877 | * to check. | ||
878 | */ | ||
879 | static void rcu_preempt_check_callbacks(void) | ||
880 | { | ||
881 | } | ||
882 | |||
883 | /* | ||
884 | * Because preemptible RCU does not exist, it never has any callbacks | ||
885 | * to remove. | ||
886 | */ | ||
887 | static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | ||
888 | { | ||
889 | } | ||
890 | |||
891 | /* | ||
892 | * Because preemptible RCU does not exist, it never has any callbacks | ||
893 | * to process. | ||
894 | */ | ||
895 | static void rcu_preempt_process_callbacks(void) | ||
896 | { | ||
897 | } | ||
898 | |||
899 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
900 | |||
901 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
27 | #include <linux/kernel_stat.h> | 902 | #include <linux/kernel_stat.h> |
28 | 903 | ||
29 | /* | 904 | /* |
30 | * During boot, we forgive RCU lockdep issues. After this function is | 905 | * During boot, we forgive RCU lockdep issues. After this function is |
31 | * invoked, we start taking RCU lockdep issues seriously. | 906 | * invoked, we start taking RCU lockdep issues seriously. |
32 | */ | 907 | */ |
33 | void rcu_scheduler_starting(void) | 908 | void __init rcu_scheduler_starting(void) |
34 | { | 909 | { |
35 | WARN_ON(nr_context_switches() > 0); | 910 | WARN_ON(nr_context_switches() > 0); |
36 | rcu_scheduler_active = 1; | 911 | rcu_scheduler_active = 1; |
37 | } | 912 | } |
38 | 913 | ||
39 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 914 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
915 | |||
916 | #ifdef CONFIG_RCU_BOOST | ||
917 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
918 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
919 | #define RCU_BOOST_PRIO 1 | ||
920 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
921 | |||
922 | #ifdef CONFIG_RCU_TRACE | ||
923 | |||
924 | #ifdef CONFIG_RCU_BOOST | ||
925 | |||
926 | static void rcu_initiate_boost_trace(void) | ||
927 | { | ||
928 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) | ||
929 | rcu_preempt_ctrlblk.n_balk_blkd_tasks++; | ||
930 | else if (rcu_preempt_ctrlblk.gp_tasks == NULL && | ||
931 | rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
932 | rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; | ||
933 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) | ||
934 | rcu_preempt_ctrlblk.n_balk_boost_tasks++; | ||
935 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) | ||
936 | rcu_preempt_ctrlblk.n_balk_notyet++; | ||
937 | else | ||
938 | rcu_preempt_ctrlblk.n_balk_nos++; | ||
939 | } | ||
940 | |||
941 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
942 | |||
943 | static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | ||
944 | { | ||
945 | unsigned long flags; | ||
946 | |||
947 | raw_local_irq_save(flags); | ||
948 | rcp->qlen -= n; | ||
949 | raw_local_irq_restore(flags); | ||
950 | } | ||
951 | |||
952 | /* | ||
953 | * Dump statistics for TINY_RCU, such as they are. | ||
954 | */ | ||
955 | static int show_tiny_stats(struct seq_file *m, void *unused) | ||
956 | { | ||
957 | show_tiny_preempt_stats(m); | ||
958 | seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); | ||
959 | seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); | ||
960 | return 0; | ||
961 | } | ||
962 | |||
963 | static int show_tiny_stats_open(struct inode *inode, struct file *file) | ||
964 | { | ||
965 | return single_open(file, show_tiny_stats, NULL); | ||
966 | } | ||
967 | |||
968 | static const struct file_operations show_tiny_stats_fops = { | ||
969 | .owner = THIS_MODULE, | ||
970 | .open = show_tiny_stats_open, | ||
971 | .read = seq_read, | ||
972 | .llseek = seq_lseek, | ||
973 | .release = single_release, | ||
974 | }; | ||
975 | |||
976 | static struct dentry *rcudir; | ||
977 | |||
978 | static int __init rcutiny_trace_init(void) | ||
979 | { | ||
980 | struct dentry *retval; | ||
981 | |||
982 | rcudir = debugfs_create_dir("rcu", NULL); | ||
983 | if (!rcudir) | ||
984 | goto free_out; | ||
985 | retval = debugfs_create_file("rcudata", 0444, rcudir, | ||
986 | NULL, &show_tiny_stats_fops); | ||
987 | if (!retval) | ||
988 | goto free_out; | ||
989 | return 0; | ||
990 | free_out: | ||
991 | debugfs_remove_recursive(rcudir); | ||
992 | return 1; | ||
993 | } | ||
994 | |||
995 | static void __exit rcutiny_trace_cleanup(void) | ||
996 | { | ||
997 | debugfs_remove_recursive(rcudir); | ||
998 | } | ||
999 | |||
1000 | module_init(rcutiny_trace_init); | ||
1001 | module_exit(rcutiny_trace_cleanup); | ||
1002 | |||
1003 | MODULE_AUTHOR("Paul E. McKenney"); | ||
1004 | MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); | ||
1005 | MODULE_LICENSE("GPL"); | ||
1006 | |||
1007 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
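
Editorial aside (not part of the patch): the CONFIG_RCU_TRACE block above exposes the TINY_RCU counters through a debugfs file, "rcu/rcudata", created by rcutiny_trace_init(). As a rough illustration, a user-space reader could dump those statistics as follows, assuming debugfs is mounted at the conventional /sys/kernel/debug location:

    /* Hypothetical user-space helper: print the TINY_RCU debugfs statistics.
     * Assumes debugfs is mounted at /sys/kernel/debug and CONFIG_RCU_TRACE=y. */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            FILE *f = fopen("/sys/kernel/debug/rcu/rcudata", "r");
            char line[256];

            if (!f) {
                    perror("rcudata");      /* debugfs not mounted or tracing off */
                    return EXIT_FAILURE;
            }
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);    /* e.g. "rcu_sched: qlen: 0" */
            fclose(f);
            return EXIT_SUCCESS;
    }
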
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e2726d790b9..2e138db03382 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -64,6 +64,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ | |||
64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | ||
68 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | ||
69 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | ||
67 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | 70 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ |
68 | 71 | ||
69 | module_param(nreaders, int, 0444); | 72 | module_param(nreaders, int, 0444); |
@@ -88,6 +91,12 @@ module_param(fqs_holdoff, int, 0444); | |||
88 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 91 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
89 | module_param(fqs_stutter, int, 0444); | 92 | module_param(fqs_stutter, int, 0444); |
90 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 93 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
94 | module_param(test_boost, int, 0444); | ||
95 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | ||
96 | module_param(test_boost_interval, int, 0444); | ||
97 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
98 | module_param(test_boost_duration, int, 0444); | ||
99 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | ||
91 | module_param(torture_type, charp, 0444); | 100 | module_param(torture_type, charp, 0444); |
92 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 101 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); |
93 | 102 | ||
@@ -109,6 +118,7 @@ static struct task_struct *stats_task; | |||
109 | static struct task_struct *shuffler_task; | 118 | static struct task_struct *shuffler_task; |
110 | static struct task_struct *stutter_task; | 119 | static struct task_struct *stutter_task; |
111 | static struct task_struct *fqs_task; | 120 | static struct task_struct *fqs_task; |
121 | static struct task_struct *boost_tasks[NR_CPUS]; | ||
112 | 122 | ||
113 | #define RCU_TORTURE_PIPE_LEN 10 | 123 | #define RCU_TORTURE_PIPE_LEN 10 |
114 | 124 | ||
@@ -120,8 +130,8 @@ struct rcu_torture { | |||
120 | }; | 130 | }; |
121 | 131 | ||
122 | static LIST_HEAD(rcu_torture_freelist); | 132 | static LIST_HEAD(rcu_torture_freelist); |
123 | static struct rcu_torture *rcu_torture_current; | 133 | static struct rcu_torture __rcu *rcu_torture_current; |
124 | static long rcu_torture_current_version; | 134 | static unsigned long rcu_torture_current_version; |
125 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | 135 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
126 | static DEFINE_SPINLOCK(rcu_torture_lock); | 136 | static DEFINE_SPINLOCK(rcu_torture_lock); |
127 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | 137 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = |
@@ -134,6 +144,10 @@ static atomic_t n_rcu_torture_alloc_fail; | |||
134 | static atomic_t n_rcu_torture_free; | 144 | static atomic_t n_rcu_torture_free; |
135 | static atomic_t n_rcu_torture_mberror; | 145 | static atomic_t n_rcu_torture_mberror; |
136 | static atomic_t n_rcu_torture_error; | 146 | static atomic_t n_rcu_torture_error; |
147 | static long n_rcu_torture_boost_ktrerror; | ||
148 | static long n_rcu_torture_boost_rterror; | ||
149 | static long n_rcu_torture_boost_failure; | ||
150 | static long n_rcu_torture_boosts; | ||
137 | static long n_rcu_torture_timers; | 151 | static long n_rcu_torture_timers; |
138 | static struct list_head rcu_torture_removed; | 152 | static struct list_head rcu_torture_removed; |
139 | static cpumask_var_t shuffle_tmp_mask; | 153 | static cpumask_var_t shuffle_tmp_mask; |
@@ -147,14 +161,26 @@ static int stutter_pause_test; | |||
147 | #endif | 161 | #endif |
148 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 162 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
149 | 163 | ||
164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) | ||
165 | #define rcu_can_boost() 1 | ||
166 | #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ | ||
167 | #define rcu_can_boost() 0 | ||
168 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ | ||
169 | |||
170 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | ||
171 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | ||
172 | /* and boost task create/destroy. */ | ||
173 | |||
150 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 174 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
151 | 175 | ||
152 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ | 176 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ |
153 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ | 177 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ |
154 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ | 178 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ |
155 | static int fullstop = FULLSTOP_RMMOD; | 179 | static int fullstop = FULLSTOP_RMMOD; |
156 | DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ | 180 | /* |
157 | /* of kthreads. */ | 181 | * Protect fullstop transitions and spawning of kthreads. |
182 | */ | ||
183 | static DEFINE_MUTEX(fullstop_mutex); | ||
158 | 184 | ||
159 | /* | 185 | /* |
160 | * Detect and respond to a system shutdown. | 186 | * Detect and respond to a system shutdown. |
@@ -275,6 +301,7 @@ struct rcu_torture_ops { | |||
275 | void (*fqs)(void); | 301 | void (*fqs)(void); |
276 | int (*stats)(char *page); | 302 | int (*stats)(char *page); |
277 | int irq_capable; | 303 | int irq_capable; |
304 | int can_boost; | ||
278 | char *name; | 305 | char *name; |
279 | }; | 306 | }; |
280 | 307 | ||
@@ -303,6 +330,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) | |||
303 | mdelay(longdelay_ms); | 330 | mdelay(longdelay_ms); |
304 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) | 331 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) |
305 | udelay(shortdelay_us); | 332 | udelay(shortdelay_us); |
333 | #ifdef CONFIG_PREEMPT | ||
334 | if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) | ||
335 | preempt_schedule(); /* No QS if preempt_disable() in effect */ | ||
336 | #endif | ||
306 | } | 337 | } |
307 | 338 | ||
308 | static void rcu_torture_read_unlock(int idx) __releases(RCU) | 339 | static void rcu_torture_read_unlock(int idx) __releases(RCU) |
@@ -360,6 +391,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
360 | .fqs = rcu_force_quiescent_state, | 391 | .fqs = rcu_force_quiescent_state, |
361 | .stats = NULL, | 392 | .stats = NULL, |
362 | .irq_capable = 1, | 393 | .irq_capable = 1, |
394 | .can_boost = rcu_can_boost(), | ||
363 | .name = "rcu" | 395 | .name = "rcu" |
364 | }; | 396 | }; |
365 | 397 | ||
@@ -402,6 +434,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
402 | .fqs = rcu_force_quiescent_state, | 434 | .fqs = rcu_force_quiescent_state, |
403 | .stats = NULL, | 435 | .stats = NULL, |
404 | .irq_capable = 1, | 436 | .irq_capable = 1, |
437 | .can_boost = rcu_can_boost(), | ||
405 | .name = "rcu_sync" | 438 | .name = "rcu_sync" |
406 | }; | 439 | }; |
407 | 440 | ||
@@ -418,6 +451,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
418 | .fqs = rcu_force_quiescent_state, | 451 | .fqs = rcu_force_quiescent_state, |
419 | .stats = NULL, | 452 | .stats = NULL, |
420 | .irq_capable = 1, | 453 | .irq_capable = 1, |
454 | .can_boost = rcu_can_boost(), | ||
421 | .name = "rcu_expedited" | 455 | .name = "rcu_expedited" |
422 | }; | 456 | }; |
423 | 457 | ||
@@ -536,6 +570,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) | |||
536 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); | 570 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); |
537 | if (!delay) | 571 | if (!delay) |
538 | schedule_timeout_interruptible(longdelay); | 572 | schedule_timeout_interruptible(longdelay); |
573 | else | ||
574 | rcu_read_delay(rrsp); | ||
539 | } | 575 | } |
540 | 576 | ||
541 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) | 577 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) |
@@ -676,6 +712,112 @@ static struct rcu_torture_ops sched_expedited_ops = { | |||
676 | }; | 712 | }; |
677 | 713 | ||
678 | /* | 714 | /* |
715 | * RCU torture priority-boost testing. Runs one real-time thread per | ||
716 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | ||
717 | * spinning waiting for them to be invoked. If a given callback takes | ||
718 | * too long to be invoked, we assume that priority inversion has occurred. | ||
719 | */ | ||
720 | |||
721 | struct rcu_boost_inflight { | ||
722 | struct rcu_head rcu; | ||
723 | int inflight; | ||
724 | }; | ||
725 | |||
726 | static void rcu_torture_boost_cb(struct rcu_head *head) | ||
727 | { | ||
728 | struct rcu_boost_inflight *rbip = | ||
729 | container_of(head, struct rcu_boost_inflight, rcu); | ||
730 | |||
731 | smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ | ||
732 | rbip->inflight = 0; | ||
733 | } | ||
734 | |||
735 | static int rcu_torture_boost(void *arg) | ||
736 | { | ||
737 | unsigned long call_rcu_time; | ||
738 | unsigned long endtime; | ||
739 | unsigned long oldstarttime; | ||
740 | struct rcu_boost_inflight rbi = { .inflight = 0 }; | ||
741 | struct sched_param sp; | ||
742 | |||
743 | VERBOSE_PRINTK_STRING("rcu_torture_boost started"); | ||
744 | |||
745 | /* Set real-time priority. */ | ||
746 | sp.sched_priority = 1; | ||
747 | if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { | ||
748 | VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); | ||
749 | n_rcu_torture_boost_rterror++; | ||
750 | } | ||
751 | |||
752 | init_rcu_head_on_stack(&rbi.rcu); | ||
753 | /* Each pass through the following loop does one boost-test cycle. */ | ||
754 | do { | ||
755 | /* Wait for the next test interval. */ | ||
756 | oldstarttime = boost_starttime; | ||
757 | while (jiffies - oldstarttime > ULONG_MAX / 2) { | ||
758 | schedule_timeout_uninterruptible(1); | ||
759 | rcu_stutter_wait("rcu_torture_boost"); | ||
760 | if (kthread_should_stop() || | ||
761 | fullstop != FULLSTOP_DONTSTOP) | ||
762 | goto checkwait; | ||
763 | } | ||
764 | |||
765 | /* Do one boost-test interval. */ | ||
766 | endtime = oldstarttime + test_boost_duration * HZ; | ||
767 | call_rcu_time = jiffies; | ||
768 | while (jiffies - endtime > ULONG_MAX / 2) { | ||
769 | /* If we don't have a callback in flight, post one. */ | ||
770 | if (!rbi.inflight) { | ||
771 | smp_mb(); /* RCU core before ->inflight = 1. */ | ||
772 | rbi.inflight = 1; | ||
773 | call_rcu(&rbi.rcu, rcu_torture_boost_cb); | ||
774 | if (jiffies - call_rcu_time > | ||
775 | test_boost_duration * HZ - HZ / 2) { | ||
776 | VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); | ||
777 | n_rcu_torture_boost_failure++; | ||
778 | } | ||
779 | call_rcu_time = jiffies; | ||
780 | } | ||
781 | cond_resched(); | ||
782 | rcu_stutter_wait("rcu_torture_boost"); | ||
783 | if (kthread_should_stop() || | ||
784 | fullstop != FULLSTOP_DONTSTOP) | ||
785 | goto checkwait; | ||
786 | } | ||
787 | |||
788 | /* | ||
789 | * Set the start time of the next test interval. | ||
790 | * Yes, this is vulnerable to long delays, but such | ||
791 | * delays simply cause a false negative for the next | ||
792 | * interval. Besides, we are running at RT priority, | ||
793 | * so delays should be relatively rare. | ||
794 | */ | ||
795 | while (oldstarttime == boost_starttime) { | ||
796 | if (mutex_trylock(&boost_mutex)) { | ||
797 | boost_starttime = jiffies + | ||
798 | test_boost_interval * HZ; | ||
799 | n_rcu_torture_boosts++; | ||
800 | mutex_unlock(&boost_mutex); | ||
801 | break; | ||
802 | } | ||
803 | schedule_timeout_uninterruptible(1); | ||
804 | } | ||
805 | |||
806 | /* Go do the stutter. */ | ||
807 | checkwait: rcu_stutter_wait("rcu_torture_boost"); | ||
808 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
809 | |||
810 | /* Clean up and exit. */ | ||
811 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | ||
812 | destroy_rcu_head_on_stack(&rbi.rcu); | ||
813 | rcutorture_shutdown_absorb("rcu_torture_boost"); | ||
814 | while (!kthread_should_stop() || rbi.inflight) | ||
815 | schedule_timeout_uninterruptible(1); | ||
816 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ | ||
817 | return 0; | ||
818 | } | ||
819 | |||
820 | /* | ||
679 | * RCU torture force-quiescent-state kthread. Repeatedly induces | 821 | * RCU torture force-quiescent-state kthread. Repeatedly induces |
680 | * bursts of calls to force_quiescent_state(), increasing the probability | 822 | * bursts of calls to force_quiescent_state(), increasing the probability |
681 | * of occurrence of some important types of race conditions. | 823 | * of occurrence of some important types of race conditions. |
@@ -731,7 +873,8 @@ rcu_torture_writer(void *arg) | |||
731 | continue; | 873 | continue; |
732 | rp->rtort_pipe_count = 0; | 874 | rp->rtort_pipe_count = 0; |
733 | udelay(rcu_random(&rand) & 0x3ff); | 875 | udelay(rcu_random(&rand) & 0x3ff); |
734 | old_rp = rcu_torture_current; | 876 | old_rp = rcu_dereference_check(rcu_torture_current, |
877 | current == writer_task); | ||
735 | rp->rtort_mbtest = 1; | 878 | rp->rtort_mbtest = 1; |
736 | rcu_assign_pointer(rcu_torture_current, rp); | 879 | rcu_assign_pointer(rcu_torture_current, rp); |
737 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ | 880 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ |
@@ -743,7 +886,7 @@ rcu_torture_writer(void *arg) | |||
743 | old_rp->rtort_pipe_count++; | 886 | old_rp->rtort_pipe_count++; |
744 | cur_ops->deferred_free(old_rp); | 887 | cur_ops->deferred_free(old_rp); |
745 | } | 888 | } |
746 | rcu_torture_current_version++; | 889 | rcutorture_record_progress(++rcu_torture_current_version); |
747 | oldbatch = cur_ops->completed(); | 890 | oldbatch = cur_ops->completed(); |
748 | rcu_stutter_wait("rcu_torture_writer"); | 891 | rcu_stutter_wait("rcu_torture_writer"); |
749 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 892 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
@@ -923,8 +1066,9 @@ rcu_torture_printk(char *page) | |||
923 | } | 1066 | } |
924 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1067 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
925 | cnt += sprintf(&page[cnt], | 1068 | cnt += sprintf(&page[cnt], |
926 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 1069 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
927 | "rtmbe: %d nt: %ld", | 1070 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1071 | "rtbf: %ld rtb: %ld nt: %ld", | ||
928 | rcu_torture_current, | 1072 | rcu_torture_current, |
929 | rcu_torture_current_version, | 1073 | rcu_torture_current_version, |
930 | list_empty(&rcu_torture_freelist), | 1074 | list_empty(&rcu_torture_freelist), |
@@ -932,8 +1076,15 @@ rcu_torture_printk(char *page) | |||
932 | atomic_read(&n_rcu_torture_alloc_fail), | 1076 | atomic_read(&n_rcu_torture_alloc_fail), |
933 | atomic_read(&n_rcu_torture_free), | 1077 | atomic_read(&n_rcu_torture_free), |
934 | atomic_read(&n_rcu_torture_mberror), | 1078 | atomic_read(&n_rcu_torture_mberror), |
1079 | n_rcu_torture_boost_ktrerror, | ||
1080 | n_rcu_torture_boost_rterror, | ||
1081 | n_rcu_torture_boost_failure, | ||
1082 | n_rcu_torture_boosts, | ||
935 | n_rcu_torture_timers); | 1083 | n_rcu_torture_timers); |
936 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 1084 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1085 | n_rcu_torture_boost_ktrerror != 0 || | ||
1086 | n_rcu_torture_boost_rterror != 0 || | ||
1087 | n_rcu_torture_boost_failure != 0) | ||
937 | cnt += sprintf(&page[cnt], " !!!"); | 1088 | cnt += sprintf(&page[cnt], " !!!"); |
938 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1089 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
939 | if (i > 1) { | 1090 | if (i > 1) { |
@@ -1085,28 +1236,98 @@ rcu_torture_stutter(void *arg) | |||
1085 | } | 1236 | } |
1086 | 1237 | ||
1087 | static inline void | 1238 | static inline void |
1088 | rcu_torture_print_module_parms(char *tag) | 1239 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) |
1089 | { | 1240 | { |
1090 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1241 | printk(KERN_ALERT "%s" TORTURE_FLAG |
1091 | "--- %s: nreaders=%d nfakewriters=%d " | 1242 | "--- %s: nreaders=%d nfakewriters=%d " |
1092 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 1243 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
1093 | "shuffle_interval=%d stutter=%d irqreader=%d " | 1244 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1094 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", | 1245 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1246 | "test_boost=%d/%d test_boost_interval=%d " | ||
1247 | "test_boost_duration=%d\n", | ||
1095 | torture_type, tag, nrealreaders, nfakewriters, | 1248 | torture_type, tag, nrealreaders, nfakewriters, |
1096 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1249 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1097 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); | 1250 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1251 | test_boost, cur_ops->can_boost, | ||
1252 | test_boost_interval, test_boost_duration); | ||
1098 | } | 1253 | } |
1099 | 1254 | ||
1100 | static struct notifier_block rcutorture_nb = { | 1255 | static struct notifier_block rcutorture_shutdown_nb = { |
1101 | .notifier_call = rcutorture_shutdown_notify, | 1256 | .notifier_call = rcutorture_shutdown_notify, |
1102 | }; | 1257 | }; |
1103 | 1258 | ||
1259 | static void rcutorture_booster_cleanup(int cpu) | ||
1260 | { | ||
1261 | struct task_struct *t; | ||
1262 | |||
1263 | if (boost_tasks[cpu] == NULL) | ||
1264 | return; | ||
1265 | mutex_lock(&boost_mutex); | ||
1266 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); | ||
1267 | t = boost_tasks[cpu]; | ||
1268 | boost_tasks[cpu] = NULL; | ||
1269 | mutex_unlock(&boost_mutex); | ||
1270 | |||
1271 | /* This must be outside of the mutex, otherwise deadlock! */ | ||
1272 | kthread_stop(t); | ||
1273 | } | ||
1274 | |||
1275 | static int rcutorture_booster_init(int cpu) | ||
1276 | { | ||
1277 | int retval; | ||
1278 | |||
1279 | if (boost_tasks[cpu] != NULL) | ||
1280 | return 0; /* Already created, nothing more to do. */ | ||
1281 | |||
1282 | /* Don't allow time recalculation while creating a new task. */ | ||
1283 | mutex_lock(&boost_mutex); | ||
1284 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); | ||
1285 | boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, | ||
1286 | "rcu_torture_boost"); | ||
1287 | if (IS_ERR(boost_tasks[cpu])) { | ||
1288 | retval = PTR_ERR(boost_tasks[cpu]); | ||
1289 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); | ||
1290 | n_rcu_torture_boost_ktrerror++; | ||
1291 | boost_tasks[cpu] = NULL; | ||
1292 | mutex_unlock(&boost_mutex); | ||
1293 | return retval; | ||
1294 | } | ||
1295 | kthread_bind(boost_tasks[cpu], cpu); | ||
1296 | wake_up_process(boost_tasks[cpu]); | ||
1297 | mutex_unlock(&boost_mutex); | ||
1298 | return 0; | ||
1299 | } | ||
1300 | |||
1301 | static int rcutorture_cpu_notify(struct notifier_block *self, | ||
1302 | unsigned long action, void *hcpu) | ||
1303 | { | ||
1304 | long cpu = (long)hcpu; | ||
1305 | |||
1306 | switch (action) { | ||
1307 | case CPU_ONLINE: | ||
1308 | case CPU_DOWN_FAILED: | ||
1309 | (void)rcutorture_booster_init(cpu); | ||
1310 | break; | ||
1311 | case CPU_DOWN_PREPARE: | ||
1312 | rcutorture_booster_cleanup(cpu); | ||
1313 | break; | ||
1314 | default: | ||
1315 | break; | ||
1316 | } | ||
1317 | return NOTIFY_OK; | ||
1318 | } | ||
1319 | |||
1320 | static struct notifier_block rcutorture_cpu_nb = { | ||
1321 | .notifier_call = rcutorture_cpu_notify, | ||
1322 | }; | ||
1323 | |||
1104 | static void | 1324 | static void |
1105 | rcu_torture_cleanup(void) | 1325 | rcu_torture_cleanup(void) |
1106 | { | 1326 | { |
1107 | int i; | 1327 | int i; |
1108 | 1328 | ||
1109 | mutex_lock(&fullstop_mutex); | 1329 | mutex_lock(&fullstop_mutex); |
1330 | rcutorture_record_test_transition(); | ||
1110 | if (fullstop == FULLSTOP_SHUTDOWN) { | 1331 | if (fullstop == FULLSTOP_SHUTDOWN) { |
1111 | printk(KERN_WARNING /* but going down anyway, so... */ | 1332 | printk(KERN_WARNING /* but going down anyway, so... */ |
1112 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | 1333 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); |
@@ -1118,7 +1339,7 @@ rcu_torture_cleanup(void) | |||
1118 | } | 1339 | } |
1119 | fullstop = FULLSTOP_RMMOD; | 1340 | fullstop = FULLSTOP_RMMOD; |
1120 | mutex_unlock(&fullstop_mutex); | 1341 | mutex_unlock(&fullstop_mutex); |
1121 | unregister_reboot_notifier(&rcutorture_nb); | 1342 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1122 | if (stutter_task) { | 1343 | if (stutter_task) { |
1123 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1344 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
1124 | kthread_stop(stutter_task); | 1345 | kthread_stop(stutter_task); |
@@ -1175,6 +1396,12 @@ rcu_torture_cleanup(void) | |||
1175 | kthread_stop(fqs_task); | 1396 | kthread_stop(fqs_task); |
1176 | } | 1397 | } |
1177 | fqs_task = NULL; | 1398 | fqs_task = NULL; |
1399 | if ((test_boost == 1 && cur_ops->can_boost) || | ||
1400 | test_boost == 2) { | ||
1401 | unregister_cpu_notifier(&rcutorture_cpu_nb); | ||
1402 | for_each_possible_cpu(i) | ||
1403 | rcutorture_booster_cleanup(i); | ||
1404 | } | ||
1178 | 1405 | ||
1179 | /* Wait for all RCU callbacks to fire. */ | 1406 | /* Wait for all RCU callbacks to fire. */ |
1180 | 1407 | ||
@@ -1186,9 +1413,9 @@ rcu_torture_cleanup(void) | |||
1186 | if (cur_ops->cleanup) | 1413 | if (cur_ops->cleanup) |
1187 | cur_ops->cleanup(); | 1414 | cur_ops->cleanup(); |
1188 | if (atomic_read(&n_rcu_torture_error)) | 1415 | if (atomic_read(&n_rcu_torture_error)) |
1189 | rcu_torture_print_module_parms("End of test: FAILURE"); | 1416 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1190 | else | 1417 | else |
1191 | rcu_torture_print_module_parms("End of test: SUCCESS"); | 1418 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
1192 | } | 1419 | } |
1193 | 1420 | ||
1194 | static int __init | 1421 | static int __init |
@@ -1233,7 +1460,7 @@ rcu_torture_init(void) | |||
1233 | nrealreaders = nreaders; | 1460 | nrealreaders = nreaders; |
1234 | else | 1461 | else |
1235 | nrealreaders = 2 * num_online_cpus(); | 1462 | nrealreaders = 2 * num_online_cpus(); |
1236 | rcu_torture_print_module_parms("Start of test"); | 1463 | rcu_torture_print_module_parms(cur_ops, "Start of test"); |
1237 | fullstop = FULLSTOP_DONTSTOP; | 1464 | fullstop = FULLSTOP_DONTSTOP; |
1238 | 1465 | ||
1239 | /* Set up the freelist. */ | 1466 | /* Set up the freelist. */ |
@@ -1254,6 +1481,10 @@ rcu_torture_init(void) | |||
1254 | atomic_set(&n_rcu_torture_free, 0); | 1481 | atomic_set(&n_rcu_torture_free, 0); |
1255 | atomic_set(&n_rcu_torture_mberror, 0); | 1482 | atomic_set(&n_rcu_torture_mberror, 0); |
1256 | atomic_set(&n_rcu_torture_error, 0); | 1483 | atomic_set(&n_rcu_torture_error, 0); |
1484 | n_rcu_torture_boost_ktrerror = 0; | ||
1485 | n_rcu_torture_boost_rterror = 0; | ||
1486 | n_rcu_torture_boost_failure = 0; | ||
1487 | n_rcu_torture_boosts = 0; | ||
1257 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1488 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
1258 | atomic_set(&rcu_torture_wcount[i], 0); | 1489 | atomic_set(&rcu_torture_wcount[i], 0); |
1259 | for_each_possible_cpu(cpu) { | 1490 | for_each_possible_cpu(cpu) { |
@@ -1367,7 +1598,28 @@ rcu_torture_init(void) | |||
1367 | goto unwind; | 1598 | goto unwind; |
1368 | } | 1599 | } |
1369 | } | 1600 | } |
1370 | register_reboot_notifier(&rcutorture_nb); | 1601 | if (test_boost_interval < 1) |
1602 | test_boost_interval = 1; | ||
1603 | if (test_boost_duration < 2) | ||
1604 | test_boost_duration = 2; | ||
1605 | if ((test_boost == 1 && cur_ops->can_boost) || | ||
1606 | test_boost == 2) { | ||
1607 | int retval; | ||
1608 | |||
1609 | boost_starttime = jiffies + test_boost_interval * HZ; | ||
1610 | register_cpu_notifier(&rcutorture_cpu_nb); | ||
1611 | for_each_possible_cpu(i) { | ||
1612 | if (cpu_is_offline(i)) | ||
1613 | continue; /* Heuristic: CPU can go offline. */ | ||
1614 | retval = rcutorture_booster_init(i); | ||
1615 | if (retval < 0) { | ||
1616 | firsterr = retval; | ||
1617 | goto unwind; | ||
1618 | } | ||
1619 | } | ||
1620 | } | ||
1621 | register_reboot_notifier(&rcutorture_shutdown_nb); | ||
1622 | rcutorture_record_test_transition(); | ||
1371 | mutex_unlock(&fullstop_mutex); | 1623 | mutex_unlock(&fullstop_mutex); |
1372 | return 0; | 1624 | return 0; |
1373 | 1625 | ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d5bc43976c5a..ba06207b1dd3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -36,7 +36,7 @@ | |||
36 | #include <linux/interrupt.h> | 36 | #include <linux/interrupt.h> |
37 | #include <linux/sched.h> | 37 | #include <linux/sched.h> |
38 | #include <linux/nmi.h> | 38 | #include <linux/nmi.h> |
39 | #include <asm/atomic.h> | 39 | #include <linux/atomic.h> |
40 | #include <linux/bitops.h> | 40 | #include <linux/bitops.h> |
41 | #include <linux/module.h> | 41 | #include <linux/module.h> |
42 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
@@ -47,6 +47,9 @@ | |||
47 | #include <linux/mutex.h> | 47 | #include <linux/mutex.h> |
48 | #include <linux/time.h> | 48 | #include <linux/time.h> |
49 | #include <linux/kernel_stat.h> | 49 | #include <linux/kernel_stat.h> |
50 | #include <linux/wait.h> | ||
51 | #include <linux/kthread.h> | ||
52 | #include <linux/prefetch.h> | ||
50 | 53 | ||
51 | #include "rcutree.h" | 54 | #include "rcutree.h" |
52 | 55 | ||
@@ -67,9 +70,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
67 | .gpnum = -300, \ | 70 | .gpnum = -300, \ |
68 | .completed = -300, \ | 71 | .completed = -300, \ |
69 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ | 72 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ |
70 | .orphan_cbs_list = NULL, \ | ||
71 | .orphan_cbs_tail = &structname.orphan_cbs_list, \ | ||
72 | .orphan_qlen = 0, \ | ||
73 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ | 73 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ |
74 | .n_force_qs = 0, \ | 74 | .n_force_qs = 0, \ |
75 | .n_force_qs_ngp = 0, \ | 75 | .n_force_qs_ngp = 0, \ |
@@ -82,10 +82,67 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | |||
82 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); | 82 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); |
83 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 83 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
84 | 84 | ||
85 | static struct rcu_state *rcu_state; | ||
86 | |||
87 | /* | ||
88 | * The rcu_scheduler_active variable transitions from zero to one just | ||
89 | * before the first task is spawned. So when this variable is zero, RCU | ||
90 | * can assume that there is but one task, allowing RCU to (for example) | ||
91 | * optimize synchronize_sched() to a simple barrier(). When this variable | ||
92 | * is one, RCU must actually do all the hard work required to detect real | ||
93 | * grace periods. This variable is also used to suppress boot-time false | ||
94 | * positives from lockdep-RCU error checking. | ||
95 | */ | ||
85 | int rcu_scheduler_active __read_mostly; | 96 | int rcu_scheduler_active __read_mostly; |
86 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | 97 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); |
87 | 98 | ||
88 | /* | 99 | /* |
100 | * The rcu_scheduler_fully_active variable transitions from zero to one | ||
101 | * during the early_initcall() processing, which is after the scheduler | ||
102 | * is capable of creating new tasks. So RCU processing (for example, | ||
103 | * creating tasks for RCU priority boosting) must be delayed until after | ||
104 | * rcu_scheduler_fully_active transitions from zero to one. We also | ||
105 | * currently delay invocation of any RCU callbacks until after this point. | ||
106 | * | ||
107 | * It might later prove better for people registering RCU callbacks during | ||
108 | * early boot to take responsibility for these callbacks, but one step at | ||
109 | * a time. | ||
110 | */ | ||
111 | static int rcu_scheduler_fully_active __read_mostly; | ||
112 | |||
113 | #ifdef CONFIG_RCU_BOOST | ||
114 | |||
115 | /* | ||
116 | * Control variables for per-CPU and per-rcu_node kthreads. These | ||
117 | * handle all flavors of RCU. | ||
118 | */ | ||
119 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | ||
120 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
121 | DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); | ||
122 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
123 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | ||
124 | |||
125 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
126 | |||
127 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | ||
128 | static void invoke_rcu_core(void); | ||
129 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | ||
130 | |||
131 | #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ | ||
132 | |||
133 | /* | ||
134 | * Track the rcutorture test sequence number and the update version | ||
135 | * number within a given test. The rcutorture_testseq is incremented | ||
136 | * on every rcutorture module load and unload, so has an odd value | ||
137 | * when a test is running. The rcutorture_vernum is set to zero | ||
138 | * when rcutorture starts and is incremented on each rcutorture update. | ||
139 | * These variables enable correlating rcutorture output with the | ||
140 | * RCU tracing information. | ||
141 | */ | ||
142 | unsigned long rcutorture_testseq; | ||
143 | unsigned long rcutorture_vernum; | ||
144 | |||
145 | /* | ||
89 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 146 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
90 | * permit this function to be invoked without holding the root rcu_node | 147 | * permit this function to be invoked without holding the root rcu_node |
91 | * structure's ->lock, but of course results can be subject to change. | 148 | * structure's ->lock, but of course results can be subject to change. |
@@ -127,11 +184,12 @@ void rcu_note_context_switch(int cpu) | |||
127 | rcu_sched_qs(cpu); | 184 | rcu_sched_qs(cpu); |
128 | rcu_preempt_note_context_switch(cpu); | 185 | rcu_preempt_note_context_switch(cpu); |
129 | } | 186 | } |
187 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | ||
130 | 188 | ||
131 | #ifdef CONFIG_NO_HZ | 189 | #ifdef CONFIG_NO_HZ |
132 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 190 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
133 | .dynticks_nesting = 1, | 191 | .dynticks_nesting = 1, |
134 | .dynticks = 1, | 192 | .dynticks = ATOMIC_INIT(1), |
135 | }; | 193 | }; |
136 | #endif /* #ifdef CONFIG_NO_HZ */ | 194 | #endif /* #ifdef CONFIG_NO_HZ */ |
137 | 195 | ||
@@ -143,6 +201,9 @@ module_param(blimit, int, 0); | |||
143 | module_param(qhimark, int, 0); | 201 | module_param(qhimark, int, 0); |
144 | module_param(qlowmark, int, 0); | 202 | module_param(qlowmark, int, 0); |
145 | 203 | ||
204 | int rcu_cpu_stall_suppress __read_mostly; | ||
205 | module_param(rcu_cpu_stall_suppress, int, 0644); | ||
206 | |||
146 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); | 207 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); |
147 | static int rcu_pending(int cpu); | 208 | static int rcu_pending(int cpu); |
148 | 209 | ||
@@ -174,6 +235,31 @@ void rcu_bh_force_quiescent_state(void) | |||
174 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 235 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); |
175 | 236 | ||
176 | /* | 237 | /* |
238 | * Record the number of times rcutorture tests have been initiated and | ||
239 | * terminated. This information allows the debugfs tracing stats to be | ||
240 | * correlated to the rcutorture messages, even when the rcutorture module | ||
241 | * is being repeatedly loaded and unloaded. In other words, we cannot | ||
242 | * store this state in rcutorture itself. | ||
243 | */ | ||
244 | void rcutorture_record_test_transition(void) | ||
245 | { | ||
246 | rcutorture_testseq++; | ||
247 | rcutorture_vernum = 0; | ||
248 | } | ||
249 | EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); | ||
250 | |||
251 | /* | ||
252 | * Record the number of writer passes through the current rcutorture test. | ||
253 | * This is also used to correlate debugfs tracing stats with the rcutorture | ||
254 | * messages. | ||
255 | */ | ||
256 | void rcutorture_record_progress(unsigned long vernum) | ||
257 | { | ||
258 | rcutorture_vernum++; | ||
259 | } | ||
260 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | ||
261 | |||
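
Editorial aside on the two counters just introduced: rcutorture_testseq is bumped once when a test starts and once when it ends, so it is odd exactly while an rcutorture run is in progress, and rcutorture_vernum counts writer passes within that run. A trace post-processor could rely on this, for instance (hypothetical helper, not in the patch):

    #include <stdbool.h>

    /* Odd test-sequence number => an rcutorture run is currently active. */
    static inline bool rcutorture_test_in_progress(unsigned long testseq)
    {
            return testseq & 1;
    }
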
262 | /* | ||
177 | * Force a quiescent state for RCU-sched. | 263 | * Force a quiescent state for RCU-sched. |
178 | */ | 264 | */ |
179 | void rcu_sched_force_quiescent_state(void) | 265 | void rcu_sched_force_quiescent_state(void) |
@@ -232,8 +318,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
232 | return 1; | 318 | return 1; |
233 | } | 319 | } |
234 | 320 | ||
235 | /* If preemptable RCU, no point in sending reschedule IPI. */ | 321 | /* If preemptible RCU, no point in sending reschedule IPI. */ |
236 | if (rdp->preemptable) | 322 | if (rdp->preemptible) |
237 | return 0; | 323 | return 0; |
238 | 324 | ||
239 | /* The CPU is online, so send it a reschedule IPI. */ | 325 | /* The CPU is online, so send it a reschedule IPI. */ |
@@ -262,13 +348,25 @@ void rcu_enter_nohz(void) | |||
262 | unsigned long flags; | 348 | unsigned long flags; |
263 | struct rcu_dynticks *rdtp; | 349 | struct rcu_dynticks *rdtp; |
264 | 350 | ||
265 | smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ | ||
266 | local_irq_save(flags); | 351 | local_irq_save(flags); |
267 | rdtp = &__get_cpu_var(rcu_dynticks); | 352 | rdtp = &__get_cpu_var(rcu_dynticks); |
268 | rdtp->dynticks++; | 353 | if (--rdtp->dynticks_nesting) { |
269 | rdtp->dynticks_nesting--; | 354 | local_irq_restore(flags); |
270 | WARN_ON_ONCE(rdtp->dynticks & 0x1); | 355 | return; |
356 | } | ||
357 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
358 | smp_mb__before_atomic_inc(); /* See above. */ | ||
359 | atomic_inc(&rdtp->dynticks); | ||
360 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | ||
361 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
271 | local_irq_restore(flags); | 362 | local_irq_restore(flags); |
363 | |||
364 | /* If the interrupt queued a callback, get out of dyntick mode. */ | ||
365 | if (in_irq() && | ||
366 | (__get_cpu_var(rcu_sched_data).nxtlist || | ||
367 | __get_cpu_var(rcu_bh_data).nxtlist || | ||
368 | rcu_preempt_needs_cpu(smp_processor_id()))) | ||
369 | set_need_resched(); | ||
272 | } | 370 | } |
273 | 371 | ||
274 | /* | 372 | /* |
@@ -284,11 +382,16 @@ void rcu_exit_nohz(void) | |||
284 | 382 | ||
285 | local_irq_save(flags); | 383 | local_irq_save(flags); |
286 | rdtp = &__get_cpu_var(rcu_dynticks); | 384 | rdtp = &__get_cpu_var(rcu_dynticks); |
287 | rdtp->dynticks++; | 385 | if (rdtp->dynticks_nesting++) { |
288 | rdtp->dynticks_nesting++; | 386 | local_irq_restore(flags); |
289 | WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); | 387 | return; |
388 | } | ||
389 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | ||
390 | atomic_inc(&rdtp->dynticks); | ||
391 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | ||
392 | smp_mb__after_atomic_inc(); /* See above. */ | ||
393 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
290 | local_irq_restore(flags); | 394 | local_irq_restore(flags); |
291 | smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ | ||
292 | } | 395 | } |
293 | 396 | ||
294 | /** | 397 | /** |
@@ -302,11 +405,15 @@ void rcu_nmi_enter(void) | |||
302 | { | 405 | { |
303 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 406 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); |
304 | 407 | ||
305 | if (rdtp->dynticks & 0x1) | 408 | if (rdtp->dynticks_nmi_nesting == 0 && |
409 | (atomic_read(&rdtp->dynticks) & 0x1)) | ||
306 | return; | 410 | return; |
307 | rdtp->dynticks_nmi++; | 411 | rdtp->dynticks_nmi_nesting++; |
308 | WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); | 412 | smp_mb__before_atomic_inc(); /* Force delay from prior write. */ |
309 | smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ | 413 | atomic_inc(&rdtp->dynticks); |
414 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | ||
415 | smp_mb__after_atomic_inc(); /* See above. */ | ||
416 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
310 | } | 417 | } |
311 | 418 | ||
312 | /** | 419 | /** |
@@ -320,11 +427,14 @@ void rcu_nmi_exit(void) | |||
320 | { | 427 | { |
321 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 428 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); |
322 | 429 | ||
323 | if (rdtp->dynticks & 0x1) | 430 | if (rdtp->dynticks_nmi_nesting == 0 || |
431 | --rdtp->dynticks_nmi_nesting != 0) | ||
324 | return; | 432 | return; |
325 | smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ | 433 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
326 | rdtp->dynticks_nmi++; | 434 | smp_mb__before_atomic_inc(); /* See above. */ |
327 | WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); | 435 | atomic_inc(&rdtp->dynticks); |
436 | smp_mb__after_atomic_inc(); /* Force delay to next write. */ | ||
437 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
328 | } | 438 | } |
329 | 439 | ||
330 | /** | 440 | /** |
@@ -335,13 +445,7 @@ void rcu_nmi_exit(void) | |||
335 | */ | 445 | */ |
336 | void rcu_irq_enter(void) | 446 | void rcu_irq_enter(void) |
337 | { | 447 | { |
338 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 448 | rcu_exit_nohz(); |
339 | |||
340 | if (rdtp->dynticks_nesting++) | ||
341 | return; | ||
342 | rdtp->dynticks++; | ||
343 | WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); | ||
344 | smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ | ||
345 | } | 449 | } |
346 | 450 | ||
347 | /** | 451 | /** |
@@ -353,18 +457,7 @@ void rcu_irq_enter(void) | |||
353 | */ | 457 | */ |
354 | void rcu_irq_exit(void) | 458 | void rcu_irq_exit(void) |
355 | { | 459 | { |
356 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 460 | rcu_enter_nohz(); |
357 | |||
358 | if (--rdtp->dynticks_nesting) | ||
359 | return; | ||
360 | smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ | ||
361 | rdtp->dynticks++; | ||
362 | WARN_ON_ONCE(rdtp->dynticks & 0x1); | ||
363 | |||
364 | /* If the interrupt queued a callback, get out of dyntick mode. */ | ||
365 | if (__get_cpu_var(rcu_sched_data).nxtlist || | ||
366 | __get_cpu_var(rcu_bh_data).nxtlist) | ||
367 | set_need_resched(); | ||
368 | } | 461 | } |
369 | 462 | ||
370 | #ifdef CONFIG_SMP | 463 | #ifdef CONFIG_SMP |
@@ -376,19 +469,8 @@ void rcu_irq_exit(void) | |||
376 | */ | 469 | */ |
377 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | 470 | static int dyntick_save_progress_counter(struct rcu_data *rdp) |
378 | { | 471 | { |
379 | int ret; | 472 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
380 | int snap; | 473 | return 0; |
381 | int snap_nmi; | ||
382 | |||
383 | snap = rdp->dynticks->dynticks; | ||
384 | snap_nmi = rdp->dynticks->dynticks_nmi; | ||
385 | smp_mb(); /* Order sampling of snap with end of grace period. */ | ||
386 | rdp->dynticks_snap = snap; | ||
387 | rdp->dynticks_nmi_snap = snap_nmi; | ||
388 | ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0); | ||
389 | if (ret) | ||
390 | rdp->dynticks_fqs++; | ||
391 | return ret; | ||
392 | } | 474 | } |
393 | 475 | ||
394 | /* | 476 | /* |
@@ -399,16 +481,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
399 | */ | 481 | */ |
400 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 482 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) |
401 | { | 483 | { |
402 | long curr; | 484 | unsigned long curr; |
403 | long curr_nmi; | 485 | unsigned long snap; |
404 | long snap; | ||
405 | long snap_nmi; | ||
406 | 486 | ||
407 | curr = rdp->dynticks->dynticks; | 487 | curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); |
408 | snap = rdp->dynticks_snap; | 488 | snap = (unsigned long)rdp->dynticks_snap; |
409 | curr_nmi = rdp->dynticks->dynticks_nmi; | ||
410 | snap_nmi = rdp->dynticks_nmi_snap; | ||
411 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
412 | 489 | ||
413 | /* | 490 | /* |
414 | * If the CPU passed through or entered a dynticks idle phase with | 491 | * If the CPU passed through or entered a dynticks idle phase with |
@@ -418,8 +495,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
418 | * read-side critical section that started before the beginning | 495 | * read-side critical section that started before the beginning |
419 | * of the current RCU grace period. | 496 | * of the current RCU grace period. |
420 | */ | 497 | */ |
421 | if ((curr != snap || (curr & 0x1) == 0) && | 498 | if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { |
422 | (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) { | ||
423 | rdp->dynticks_fqs++; | 499 | rdp->dynticks_fqs++; |
424 | return 1; | 500 | return 1; |
425 | } | 501 | } |
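
For readers tracking the dynticks rework in this hunk: the two per-CPU counters (dynticks and dynticks_nmi) are replaced by a single atomic counter that is incremented on each transition into or out of dyntick-idle (including via NMIs), so it is even while the CPU is idle and odd otherwise. A CPU is then credited with a quiescent state if it is idle right now (even counter) or has passed through idle since the snapshot (counter advanced by at least two). The following user-space model of that invariant is editorial only and is simplified to ignore counter wraparound, which the kernel handles with ULONG_CMP_GE():

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified model of rcu_implicit_dynticks_qs(): even counter means the
     * CPU is dyntick-idle now; an advance of >= 2 means it was idle at some
     * point since the snapshot.  Either way, a quiescent state is implied. */
    static bool passed_quiescent_state(unsigned long snap, unsigned long curr)
    {
            return (curr & 0x1) == 0 || (curr - snap) >= 2;
    }

    int main(void)
    {
            /* Snapshot taken while busy (odd); CPU idled and resumed since. */
            printf("%d\n", passed_quiescent_state(5, 7));   /* prints 1 */
            /* CPU stayed busy the whole time: no quiescent state yet. */
            printf("%d\n", passed_quiescent_state(5, 5));   /* prints 0 */
            return 0;
    }
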
@@ -448,9 +524,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
448 | 524 | ||
449 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 525 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
450 | 526 | ||
451 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 527 | int rcu_cpu_stall_suppress __read_mostly; |
452 | |||
453 | int rcu_cpu_stall_panicking __read_mostly; | ||
454 | 528 | ||
455 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 529 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
456 | { | 530 | { |
@@ -482,8 +556,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
482 | rcu_print_task_stall(rnp); | 556 | rcu_print_task_stall(rnp); |
483 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 557 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
484 | 558 | ||
485 | /* OK, time to rat on our buddy... */ | 559 | /* |
486 | 560 | * OK, time to rat on our buddy... | |
561 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
562 | * RCU CPU stall warnings. | ||
563 | */ | ||
487 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", | 564 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", |
488 | rsp->name); | 565 | rsp->name); |
489 | rcu_for_each_leaf_node(rsp, rnp) { | 566 | rcu_for_each_leaf_node(rsp, rnp) { |
@@ -512,6 +589,11 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
512 | unsigned long flags; | 589 | unsigned long flags; |
513 | struct rcu_node *rnp = rcu_get_root(rsp); | 590 | struct rcu_node *rnp = rcu_get_root(rsp); |
514 | 591 | ||
592 | /* | ||
593 | * OK, time to rat on ourselves... | ||
594 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
595 | * RCU CPU stall warnings. | ||
596 | */ | ||
515 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", | 597 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", |
516 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); | 598 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); |
517 | trigger_all_cpu_backtrace(); | 599 | trigger_all_cpu_backtrace(); |
@@ -527,31 +609,50 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
527 | 609 | ||
528 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | 610 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) |
529 | { | 611 | { |
530 | long delta; | 612 | unsigned long j; |
613 | unsigned long js; | ||
531 | struct rcu_node *rnp; | 614 | struct rcu_node *rnp; |
532 | 615 | ||
533 | if (rcu_cpu_stall_panicking) | 616 | if (rcu_cpu_stall_suppress) |
534 | return; | 617 | return; |
535 | delta = jiffies - rsp->jiffies_stall; | 618 | j = ACCESS_ONCE(jiffies); |
619 | js = ACCESS_ONCE(rsp->jiffies_stall); | ||
536 | rnp = rdp->mynode; | 620 | rnp = rdp->mynode; |
537 | if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { | 621 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { |
538 | 622 | ||
539 | /* We haven't checked in, so go dump stack. */ | 623 | /* We haven't checked in, so go dump stack. */ |
540 | print_cpu_stall(rsp); | 624 | print_cpu_stall(rsp); |
541 | 625 | ||
542 | } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { | 626 | } else if (rcu_gp_in_progress(rsp) && |
627 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { | ||
543 | 628 | ||
544 | /* They had two time units to dump stack, so complain. */ | 629 | /* They had a few time units to dump stack, so complain. */ |
545 | print_other_cpu_stall(rsp); | 630 | print_other_cpu_stall(rsp); |
546 | } | 631 | } |
547 | } | 632 | } |
548 | 633 | ||
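
One more editorial note on this hunk: the stall check now compares jiffies values with ULONG_CMP_GE() instead of a signed delta, which keeps the "has the deadline passed?" test correct even when the jiffies counter wraps around. A self-contained user-space illustration of the idiom follows; ulong_cmp_ge() here merely mirrors the pattern and is not the kernel's own macro:

    #include <limits.h>
    #include <stdio.h>

    /* "a is not before b", evaluated on the unsigned difference so that the
     * result stays correct across counter wraparound. */
    static int ulong_cmp_ge(unsigned long a, unsigned long b)
    {
            return ULONG_MAX / 2 >= a - b;
    }

    int main(void)
    {
            unsigned long js = ULONG_MAX - 10;      /* deadline set just before wrap */
            unsigned long j  = 5;                   /* current jiffies, after wrap */

            /* j is logically 16 ticks past js, so the deadline has expired. */
            printf("%d\n", ulong_cmp_ge(j, js));            /* prints 1 */
            /* Ten ticks before the deadline: not expired yet. */
            printf("%d\n", ulong_cmp_ge(js - 10, js));      /* prints 0 */
            return 0;
    }
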
549 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | 634 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) |
550 | { | 635 | { |
551 | rcu_cpu_stall_panicking = 1; | 636 | rcu_cpu_stall_suppress = 1; |
552 | return NOTIFY_DONE; | 637 | return NOTIFY_DONE; |
553 | } | 638 | } |
554 | 639 | ||
640 | /** | ||
641 | * rcu_cpu_stall_reset - prevent further stall warnings in current grace period | ||
642 | * | ||
643 | * Set the stall-warning timeout way off into the future, thus preventing | ||
644 | * any RCU CPU stall-warning messages from appearing in the current set of | ||
645 | * RCU grace periods. | ||
646 | * | ||
647 | * The caller must disable hard irqs. | ||
648 | */ | ||
649 | void rcu_cpu_stall_reset(void) | ||
650 | { | ||
651 | rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
652 | rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
653 | rcu_preempt_stall_reset(); | ||
654 | } | ||
655 | |||
555 | static struct notifier_block rcu_panic_block = { | 656 | static struct notifier_block rcu_panic_block = { |
556 | .notifier_call = rcu_panic, | 657 | .notifier_call = rcu_panic, |
557 | }; | 658 | }; |
@@ -561,22 +662,6 @@ static void __init check_cpu_stall_init(void) | |||
561 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); | 662 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); |
562 | } | 663 | } |
563 | 664 | ||
564 | #else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
565 | |||
566 | static void record_gp_stall_check_time(struct rcu_state *rsp) | ||
567 | { | ||
568 | } | ||
569 | |||
570 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | ||
571 | { | ||
572 | } | ||
573 | |||
574 | static void __init check_cpu_stall_init(void) | ||
575 | { | ||
576 | } | ||
577 | |||
578 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
579 | |||
580 | /* | 665 | /* |
581 | * Update CPU-local rcu_data state to record the newly noticed grace period. | 666 | * Update CPU-local rcu_data state to record the newly noticed grace period. |
582 | * This is used both when we started the grace period and when we notice | 667 | * This is used both when we started the grace period and when we notice |
@@ -587,9 +672,17 @@ static void __init check_cpu_stall_init(void) | |||
587 | static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | 672 | static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) |
588 | { | 673 | { |
589 | if (rdp->gpnum != rnp->gpnum) { | 674 | if (rdp->gpnum != rnp->gpnum) { |
590 | rdp->qs_pending = 1; | 675 | /* |
591 | rdp->passed_quiesc = 0; | 676 | * If the current grace period is waiting for this CPU, |
677 | * set up to detect a quiescent state, otherwise don't | ||
678 | * go looking for one. | ||
679 | */ | ||
592 | rdp->gpnum = rnp->gpnum; | 680 | rdp->gpnum = rnp->gpnum; |
681 | if (rnp->qsmask & rdp->grpmask) { | ||
682 | rdp->qs_pending = 1; | ||
683 | rdp->passed_quiesc = 0; | ||
684 | } else | ||
685 | rdp->qs_pending = 0; | ||
593 | } | 686 | } |
594 | } | 687 | } |
595 | 688 | ||
@@ -648,6 +741,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
648 | 741 | ||
649 | /* Remember that we saw this grace-period completion. */ | 742 | /* Remember that we saw this grace-period completion. */ |
650 | rdp->completed = rnp->completed; | 743 | rdp->completed = rnp->completed; |
744 | |||
745 | /* | ||
746 | * If we were in an extended quiescent state, we may have | ||
747 | * missed some grace periods that other CPUs handled on | ||
748 | * our behalf. Catch up with this state to avoid noting | ||
749 | * spurious new grace periods. If another grace period | ||
750 | * has started, then rnp->gpnum will have advanced, so | ||
751 | * we will detect this later on. | ||
752 | */ | ||
753 | if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) | ||
754 | rdp->gpnum = rdp->completed; | ||
755 | |||
756 | /* | ||
757 | * If RCU does not need a quiescent state from this CPU, | ||
758 | * then make sure that this CPU doesn't go looking for one. | ||
759 | */ | ||
760 | if ((rnp->qsmask & rdp->grpmask) == 0) | ||
761 | rdp->qs_pending = 0; | ||
651 | } | 762 | } |
652 | } | 763 | } |
653 | 764 | ||
@@ -712,7 +823,7 @@ static void | |||
712 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | 823 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) |
713 | __releases(rcu_get_root(rsp)->lock) | 824 | __releases(rcu_get_root(rsp)->lock) |
714 | { | 825 | { |
715 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 826 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
716 | struct rcu_node *rnp = rcu_get_root(rsp); | 827 | struct rcu_node *rnp = rcu_get_root(rsp); |
717 | 828 | ||
718 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { | 829 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { |
@@ -753,6 +864,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
753 | rnp->completed = rsp->completed; | 864 | rnp->completed = rsp->completed; |
754 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 865 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ |
755 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 866 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
867 | rcu_preempt_boost_start_gp(rnp); | ||
756 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 868 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
757 | return; | 869 | return; |
758 | } | 870 | } |
@@ -788,6 +900,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
788 | rnp->completed = rsp->completed; | 900 | rnp->completed = rsp->completed; |
789 | if (rnp == rdp->mynode) | 901 | if (rnp == rdp->mynode) |
790 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 902 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
903 | rcu_preempt_boost_start_gp(rnp); | ||
791 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 904 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
792 | } | 905 | } |
793 | 906 | ||
@@ -808,7 +921,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
808 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 921 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) |
809 | __releases(rcu_get_root(rsp)->lock) | 922 | __releases(rcu_get_root(rsp)->lock) |
810 | { | 923 | { |
924 | unsigned long gp_duration; | ||
925 | |||
811 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 926 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
927 | |||
928 | /* | ||
929 | * Ensure that all grace-period and pre-grace-period activity | ||
930 | * is seen before the assignment to rsp->completed. | ||
931 | */ | ||
932 | smp_mb(); /* See above block comment. */ | ||
933 | gp_duration = jiffies - rsp->gp_start; | ||
934 | if (gp_duration > rsp->gp_max) | ||
935 | rsp->gp_max = gp_duration; | ||
812 | rsp->completed = rsp->gpnum; | 936 | rsp->completed = rsp->gpnum; |
813 | rsp->signaled = RCU_GP_IDLE; | 937 | rsp->signaled = RCU_GP_IDLE; |
814 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ | 938 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ |
@@ -838,7 +962,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
838 | return; | 962 | return; |
839 | } | 963 | } |
840 | rnp->qsmask &= ~mask; | 964 | rnp->qsmask &= ~mask; |
841 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 965 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
842 | 966 | ||
843 | /* Other bits still set at this level, so done. */ | 967 | /* Other bits still set at this level, so done. */ |
844 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 968 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
@@ -951,65 +1075,49 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
951 | #ifdef CONFIG_HOTPLUG_CPU | 1075 | #ifdef CONFIG_HOTPLUG_CPU |
952 | 1076 | ||
953 | /* | 1077 | /* |
954 | * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the | 1078 | * Move a dying CPU's RCU callbacks to an online CPU's callback list. |
955 | * specified flavor of RCU. The callbacks will be adopted by the next | 1079 | * Synchronization is not required because this function executes |
956 | * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever | 1080 | * in stop_machine() context. |
957 | * comes first. Because this is invoked from the CPU_DYING notifier, | ||
958 | * irqs are already disabled. | ||
959 | */ | 1081 | */ |
960 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 1082 | static void rcu_send_cbs_to_online(struct rcu_state *rsp) |
961 | { | 1083 | { |
962 | int i; | 1084 | int i; |
963 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 1085 | /* the dying CPU has already been cleared from cpu_online_mask */ |
1086 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
1087 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1088 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
964 | 1089 | ||
965 | if (rdp->nxtlist == NULL) | 1090 | if (rdp->nxtlist == NULL) |
966 | return; /* irqs disabled, so comparison is stable. */ | 1091 | return; /* irqs disabled, so comparison is stable. */ |
967 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 1092 | |
968 | *rsp->orphan_cbs_tail = rdp->nxtlist; | 1093 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; |
969 | rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; | 1094 | receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; |
1095 | receive_rdp->qlen += rdp->qlen; | ||
1096 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
1097 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1098 | |||
970 | rdp->nxtlist = NULL; | 1099 | rdp->nxtlist = NULL; |
971 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1100 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
972 | rdp->nxttail[i] = &rdp->nxtlist; | 1101 | rdp->nxttail[i] = &rdp->nxtlist; |
973 | rsp->orphan_qlen += rdp->qlen; | ||
974 | rdp->qlen = 0; | 1102 | rdp->qlen = 0; |
975 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | ||
976 | } | ||
977 | |||
978 | /* | ||
979 | * Adopt previously orphaned RCU callbacks. | ||
980 | */ | ||
981 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
982 | { | ||
983 | unsigned long flags; | ||
984 | struct rcu_data *rdp; | ||
985 | |||
986 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
987 | rdp = rsp->rda[smp_processor_id()]; | ||
988 | if (rsp->orphan_cbs_list == NULL) { | ||
989 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
990 | return; | ||
991 | } | ||
992 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | ||
993 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; | ||
994 | rdp->qlen += rsp->orphan_qlen; | ||
995 | rsp->orphan_cbs_list = NULL; | ||
996 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | ||
997 | rsp->orphan_qlen = 0; | ||
998 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
999 | } | 1103 | } |
1000 | 1104 | ||
1001 | /* | 1105 | /* |
1002 | * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy | 1106 | * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy |
1003 | * and move all callbacks from the outgoing CPU to the current one. | 1107 | * and move all callbacks from the outgoing CPU to the current one. |
1108 | * There can only be one CPU hotplug operation at a time, so no other | ||
1109 | * CPU can be attempting to update rcu_cpu_kthread_task. | ||
1004 | */ | 1110 | */ |
1005 | static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | 1111 | static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) |
1006 | { | 1112 | { |
1007 | unsigned long flags; | 1113 | unsigned long flags; |
1008 | unsigned long mask; | 1114 | unsigned long mask; |
1009 | int need_report = 0; | 1115 | int need_report = 0; |
1010 | struct rcu_data *rdp = rsp->rda[cpu]; | 1116 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1011 | struct rcu_node *rnp; | 1117 | struct rcu_node *rnp; |
1012 | 1118 | ||
1119 | rcu_stop_cpu_kthread(cpu); | ||
1120 | |||
1013 | /* Exclude any attempts to start a new grace period. */ | 1121 | /* Exclude any attempts to start a new grace period. */ |
1014 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1122 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
1015 | 1123 | ||
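The rcu_send_cbs_to_online() rewrite above splices the dying CPU's entire callback list onto an online CPU in O(1) by keeping tail pointers that address the final ->next slot rather than the last element. A self-contained userspace sketch of that list representation, with made-up type and function names, assuming single-threaded use (the kernel relies on stop_machine() for exclusion):

    #include <stdio.h>

    /* Simplified stand-in for the kernel's callback queue: a singly
     * linked list plus a tail pointer that addresses the final "next"
     * slot, so appending or splicing is O(1). */
    struct cb {
        struct cb *next;
        int id;
    };

    struct cbq {
        struct cb *head;
        struct cb **tail;        /* points at head or at last ->next */
    };

    static void cbq_init(struct cbq *q)  { q->head = NULL; q->tail = &q->head; }

    static void cbq_enqueue(struct cbq *q, struct cb *cb)
    {
        cb->next = NULL;
        *q->tail = cb;           /* fill the current tail slot       */
        q->tail = &cb->next;     /* ...and advance it past the node  */
    }

    /* Move every callback from 'from' onto the end of 'to', the way the
     * dying CPU's list is handed to an online CPU, in O(1). */
    static void cbq_splice(struct cbq *to, struct cbq *from)
    {
        if (from->head == NULL)
            return;
        *to->tail = from->head;
        to->tail = from->tail;
        cbq_init(from);          /* donor queue is now empty */
    }

    int main(void)
    {
        struct cb a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
        struct cbq dying, online;

        cbq_init(&dying); cbq_init(&online);
        cbq_enqueue(&online, &a);
        cbq_enqueue(&dying, &b);
        cbq_enqueue(&dying, &c);
        cbq_splice(&online, &dying);

        for (struct cb *p = online.head; p; p = p->next)
            printf("%d ", p->id);        /* prints: 1 2 3 */
        printf("\n");
        return 0;
    }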
@@ -1046,8 +1154,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1046 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1154 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1047 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1155 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1048 | rcu_report_exp_rnp(rsp, rnp); | 1156 | rcu_report_exp_rnp(rsp, rnp); |
1049 | 1157 | rcu_node_kthread_setaffinity(rnp, -1); | |
1050 | rcu_adopt_orphan_cbs(rsp); | ||
1051 | } | 1158 | } |
1052 | 1159 | ||
1053 | /* | 1160 | /* |
@@ -1065,11 +1172,7 @@ static void rcu_offline_cpu(int cpu) | |||
1065 | 1172 | ||
1066 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1173 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1067 | 1174 | ||
1068 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 1175 | static void rcu_send_cbs_to_online(struct rcu_state *rsp) |
1069 | { | ||
1070 | } | ||
1071 | |||
1072 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1073 | { | 1176 | { |
1074 | } | 1177 | } |
1075 | 1178 | ||
@@ -1113,7 +1216,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1113 | next = list->next; | 1216 | next = list->next; |
1114 | prefetch(next); | 1217 | prefetch(next); |
1115 | debug_rcu_head_unqueue(list); | 1218 | debug_rcu_head_unqueue(list); |
1116 | list->func(list); | 1219 | __rcu_reclaim(list); |
1117 | list = next; | 1220 | list = next; |
1118 | if (++count >= rdp->blimit) | 1221 | if (++count >= rdp->blimit) |
1119 | break; | 1222 | break; |
@@ -1123,6 +1226,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1123 | 1226 | ||
1124 | /* Update count, and requeue any remaining callbacks. */ | 1227 | /* Update count, and requeue any remaining callbacks. */ |
1125 | rdp->qlen -= count; | 1228 | rdp->qlen -= count; |
1229 | rdp->n_cbs_invoked += count; | ||
1126 | if (list != NULL) { | 1230 | if (list != NULL) { |
1127 | *tail = rdp->nxtlist; | 1231 | *tail = rdp->nxtlist; |
1128 | rdp->nxtlist = list; | 1232 | rdp->nxtlist = list; |
@@ -1148,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1148 | 1252 | ||
1149 | /* Re-raise the RCU softirq if there are callbacks remaining. */ | 1253 | /* Re-raise the RCU softirq if there are callbacks remaining. */ |
1150 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 1254 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1151 | raise_softirq(RCU_SOFTIRQ); | 1255 | invoke_rcu_core(); |
1152 | } | 1256 | } |
1153 | 1257 | ||
1154 | /* | 1258 | /* |
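rcu_do_batch() detaches the ready sublist, invokes at most rdp->blimit callbacks, and requeues whatever is left so a later pass can finish the job. A toy userspace version of that batching pattern, with invented names and without the kernel's per-segment bookkeeping:

    #include <stdio.h>

    struct cb {
        struct cb *next;
        void (*func)(struct cb *);
    };

    /* Detach the pending list, invoke at most 'blimit' callbacks, then
     * push any leftovers back onto the front of the queue. */
    static unsigned long do_batch(struct cb **queue, long blimit)
    {
        struct cb *list = *queue, *next;
        unsigned long count = 0;

        *queue = NULL;                   /* detach */
        while (list) {
            next = list->next;
            list->func(list);
            list = next;
            if (++count >= (unsigned long)blimit)
                break;
        }
        if (list) {                      /* requeue the remainder */
            struct cb **tail = &list;
            while (*tail)
                tail = &(*tail)->next;
            *tail = *queue;
            *queue = list;
        }
        return count;
    }

    static void hello(struct cb *unused) { (void)unused; printf("cb\n"); }

    int main(void)
    {
        struct cb c3 = { NULL, hello };
        struct cb c2 = { &c3, hello };
        struct cb c1 = { &c2, hello };
        struct cb *queue = &c1;

        printf("invoked %lu\n", do_batch(&queue, 2));  /* invoked 2 */
        printf("invoked %lu\n", do_batch(&queue, 2));  /* invoked 1 */
        return 0;
    }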
@@ -1194,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
1194 | } | 1298 | } |
1195 | rcu_preempt_check_callbacks(cpu); | 1299 | rcu_preempt_check_callbacks(cpu); |
1196 | if (rcu_pending(cpu)) | 1300 | if (rcu_pending(cpu)) |
1197 | raise_softirq(RCU_SOFTIRQ); | 1301 | invoke_rcu_core(); |
1198 | } | 1302 | } |
1199 | 1303 | ||
1200 | #ifdef CONFIG_SMP | 1304 | #ifdef CONFIG_SMP |
@@ -1202,6 +1306,8 @@ void rcu_check_callbacks(int cpu, int user) | |||
1202 | /* | 1306 | /* |
1203 | * Scan the leaf rcu_node structures, processing dyntick state for any that | 1307 | * Scan the leaf rcu_node structures, processing dyntick state for any that |
1204 | * have not yet encountered a quiescent state, using the function specified. | 1308 | * have not yet encountered a quiescent state, using the function specified. |
1309 | * Also initiate boosting for any threads blocked on the root rcu_node. | ||
1310 | * | ||
1205 | * The caller must have suppressed start of new grace periods. | 1311 | * The caller must have suppressed start of new grace periods. |
1206 | */ | 1312 | */ |
1207 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | 1313 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) |
@@ -1220,13 +1326,14 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
1220 | return; | 1326 | return; |
1221 | } | 1327 | } |
1222 | if (rnp->qsmask == 0) { | 1328 | if (rnp->qsmask == 0) { |
1223 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1329 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ |
1224 | continue; | 1330 | continue; |
1225 | } | 1331 | } |
1226 | cpu = rnp->grplo; | 1332 | cpu = rnp->grplo; |
1227 | bit = 1; | 1333 | bit = 1; |
1228 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 1334 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
1229 | if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) | 1335 | if ((rnp->qsmask & bit) != 0 && |
1336 | f(per_cpu_ptr(rsp->rda, cpu))) | ||
1230 | mask |= bit; | 1337 | mask |= bit; |
1231 | } | 1338 | } |
1232 | if (mask != 0) { | 1339 | if (mask != 0) { |
@@ -1237,6 +1344,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
1237 | } | 1344 | } |
1238 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1345 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1239 | } | 1346 | } |
1347 | rnp = rcu_get_root(rsp); | ||
1348 | if (rnp->qsmask == 0) { | ||
1349 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1350 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
1351 | } | ||
1240 | } | 1352 | } |
1241 | 1353 | ||
1242 | /* | 1354 | /* |
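force_qs_rnp() walks each leaf's CPUs by pairing the CPU number (grplo..grphi) with a shifting bit in qsmask and accumulating the bits that may be cleared. A compact userspace sketch of the same scan, with an invented cpu_is_quiescent() standing in for the dyntick/quiescent-state checks:

    #include <stdio.h>

    /* Illustrative stand-ins: a "leaf node" covers CPUs grplo..grphi and
     * tracks which of them still owe a quiescent state in qsmask. */
    struct leaf {
        int grplo, grphi;
        unsigned long qsmask;
    };

    static int cpu_is_quiescent(int cpu)
    {
        return cpu % 2 == 0;             /* pretend even CPUs reported */
    }

    /* One bit per CPU, starting from bit 0 for grplo; collect the bits
     * that can be cleared from qsmask. */
    static unsigned long scan_leaf(struct leaf *rnp)
    {
        unsigned long bit = 1, mask = 0;
        int cpu;

        for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, bit <<= 1)
            if ((rnp->qsmask & bit) && cpu_is_quiescent(cpu))
                mask |= bit;
        return mask;
    }

    int main(void)
    {
        struct leaf rnp = { .grplo = 4, .grphi = 7, .qsmask = 0xf };

        printf("clear mask: 0x%lx\n", scan_leaf(&rnp));  /* 0x5: CPUs 4 and 6 */
        return 0;
    }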
@@ -1351,7 +1463,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1351 | } | 1463 | } |
1352 | 1464 | ||
1353 | /* If there are callbacks ready, invoke them. */ | 1465 | /* If there are callbacks ready, invoke them. */ |
1354 | rcu_do_batch(rsp, rdp); | 1466 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1467 | invoke_rcu_callbacks(rsp, rdp); | ||
1355 | } | 1468 | } |
1356 | 1469 | ||
1357 | /* | 1470 | /* |
@@ -1359,29 +1472,37 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1359 | */ | 1472 | */ |
1360 | static void rcu_process_callbacks(struct softirq_action *unused) | 1473 | static void rcu_process_callbacks(struct softirq_action *unused) |
1361 | { | 1474 | { |
1362 | /* | ||
1363 | * Memory references from any prior RCU read-side critical sections | ||
1364 | * executed by the interrupted code must be seen before any RCU | ||
1365 | * grace-period manipulations below. | ||
1366 | */ | ||
1367 | smp_mb(); /* See above block comment. */ | ||
1368 | |||
1369 | __rcu_process_callbacks(&rcu_sched_state, | 1475 | __rcu_process_callbacks(&rcu_sched_state, |
1370 | &__get_cpu_var(rcu_sched_data)); | 1476 | &__get_cpu_var(rcu_sched_data)); |
1371 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | 1477 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); |
1372 | rcu_preempt_process_callbacks(); | 1478 | rcu_preempt_process_callbacks(); |
1373 | 1479 | ||
1374 | /* | ||
1375 | * Memory references from any later RCU read-side critical sections | ||
1376 | * executed by the interrupted code must be seen after any RCU | ||
1377 | * grace-period manipulations above. | ||
1378 | */ | ||
1379 | smp_mb(); /* See above block comment. */ | ||
1380 | |||
1381 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ | 1480 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ |
1382 | rcu_needs_cpu_flush(); | 1481 | rcu_needs_cpu_flush(); |
1383 | } | 1482 | } |
1384 | 1483 | ||
1484 | /* | ||
1485 | * Wake up the current CPU's kthread. This replaces raise_softirq() | ||
1486 | * in earlier versions of RCU. Note that because we are running on | ||
1487 | * the current CPU with interrupts disabled, the rcu_cpu_kthread_task | ||
1488 | * cannot disappear out from under us. | ||
1489 | */ | ||
1490 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | ||
1491 | { | ||
1492 | if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) | ||
1493 | return; | ||
1494 | if (likely(!rsp->boost)) { | ||
1495 | rcu_do_batch(rsp, rdp); | ||
1496 | return; | ||
1497 | } | ||
1498 | invoke_rcu_callbacks_kthread(); | ||
1499 | } | ||
1500 | |||
1501 | static void invoke_rcu_core(void) | ||
1502 | { | ||
1503 | raise_softirq(RCU_SOFTIRQ); | ||
1504 | } | ||
1505 | |||
1385 | static void | 1506 | static void |
1386 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 1507 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), |
1387 | struct rcu_state *rsp) | 1508 | struct rcu_state *rsp) |
@@ -1402,21 +1523,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1402 | * a quiescent state betweentimes. | 1523 | * a quiescent state betweentimes. |
1403 | */ | 1524 | */ |
1404 | local_irq_save(flags); | 1525 | local_irq_save(flags); |
1405 | rdp = rsp->rda[smp_processor_id()]; | 1526 | rdp = this_cpu_ptr(rsp->rda); |
1406 | rcu_process_gp_end(rsp, rdp); | ||
1407 | check_for_new_grace_period(rsp, rdp); | ||
1408 | 1527 | ||
1409 | /* Add the callback to our list. */ | 1528 | /* Add the callback to our list. */ |
1410 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | 1529 | *rdp->nxttail[RCU_NEXT_TAIL] = head; |
1411 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | 1530 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
1531 | rdp->qlen++; | ||
1412 | 1532 | ||
1413 | /* Start a new grace period if one not already started. */ | 1533 | /* If interrupts were disabled, don't dive into RCU core. */ |
1414 | if (!rcu_gp_in_progress(rsp)) { | 1534 | if (irqs_disabled_flags(flags)) { |
1415 | unsigned long nestflag; | 1535 | local_irq_restore(flags); |
1416 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 1536 | return; |
1417 | |||
1418 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
1419 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ | ||
1420 | } | 1537 | } |
1421 | 1538 | ||
1422 | /* | 1539 | /* |
@@ -1426,13 +1543,28 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1426 | * invoking force_quiescent_state() if the newly enqueued callback | 1543 | * invoking force_quiescent_state() if the newly enqueued callback |
1427 | * is the only one waiting for a grace period to complete. | 1544 | * is the only one waiting for a grace period to complete. |
1428 | */ | 1545 | */ |
1429 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 1546 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { |
1430 | rdp->blimit = LONG_MAX; | 1547 | |
1431 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | 1548 | /* Are we ignoring a completed grace period? */ |
1432 | *rdp->nxttail[RCU_DONE_TAIL] != head) | 1549 | rcu_process_gp_end(rsp, rdp); |
1433 | force_quiescent_state(rsp, 0); | 1550 | check_for_new_grace_period(rsp, rdp); |
1434 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1551 | |
1435 | rdp->qlen_last_fqs_check = rdp->qlen; | 1552 | /* Start a new grace period if one not already started. */ |
1553 | if (!rcu_gp_in_progress(rsp)) { | ||
1554 | unsigned long nestflag; | ||
1555 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
1556 | |||
1557 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
1558 | rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock */ | ||
1559 | } else { | ||
1560 | /* Give the grace period a kick. */ | ||
1561 | rdp->blimit = LONG_MAX; | ||
1562 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | ||
1563 | *rdp->nxttail[RCU_DONE_TAIL] != head) | ||
1564 | force_quiescent_state(rsp, 0); | ||
1565 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
1566 | rdp->qlen_last_fqs_check = rdp->qlen; | ||
1567 | } | ||
1436 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | 1568 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
1437 | force_quiescent_state(rsp, 1); | 1569 | force_quiescent_state(rsp, 1); |
1438 | local_irq_restore(flags); | 1570 | local_irq_restore(flags); |
@@ -1547,7 +1679,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1547 | * or RCU-bh, force a local reschedule. | 1679 | * or RCU-bh, force a local reschedule. |
1548 | */ | 1680 | */ |
1549 | rdp->n_rp_qs_pending++; | 1681 | rdp->n_rp_qs_pending++; |
1550 | if (!rdp->preemptable && | 1682 | if (!rdp->preemptible && |
1551 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, | 1683 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, |
1552 | jiffies)) | 1684 | jiffies)) |
1553 | set_need_resched(); | 1685 | set_need_resched(); |
@@ -1662,13 +1794,12 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
1662 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 1794 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU |
1663 | * might complete its grace period before all of the other CPUs | 1795 | * might complete its grace period before all of the other CPUs |
1664 | * did their increment, causing this function to return too | 1796 | * did their increment, causing this function to return too |
1665 | * early. | 1797 | * early. Note that on_each_cpu() disables irqs, which prevents |
1798 | * any CPUs from coming online or going offline until each online | ||
1799 | * CPU has queued its RCU-barrier callback. | ||
1666 | */ | 1800 | */ |
1667 | atomic_set(&rcu_barrier_cpu_count, 1); | 1801 | atomic_set(&rcu_barrier_cpu_count, 1); |
1668 | preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ | ||
1669 | rcu_adopt_orphan_cbs(rsp); | ||
1670 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 1802 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); |
1671 | preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ | ||
1672 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 1803 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
1673 | complete(&rcu_barrier_completion); | 1804 | complete(&rcu_barrier_completion); |
1674 | wait_for_completion(&rcu_barrier_completion); | 1805 | wait_for_completion(&rcu_barrier_completion); |
@@ -1701,7 +1832,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1701 | { | 1832 | { |
1702 | unsigned long flags; | 1833 | unsigned long flags; |
1703 | int i; | 1834 | int i; |
1704 | struct rcu_data *rdp = rsp->rda[cpu]; | 1835 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1705 | struct rcu_node *rnp = rcu_get_root(rsp); | 1836 | struct rcu_node *rnp = rcu_get_root(rsp); |
1706 | 1837 | ||
1707 | /* Set up local state, ensuring consistent view of global state. */ | 1838 | /* Set up local state, ensuring consistent view of global state. */ |
@@ -1725,11 +1856,11 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1725 | * that this CPU cannot possibly have any RCU callbacks in flight yet. | 1856 | * that this CPU cannot possibly have any RCU callbacks in flight yet. |
1726 | */ | 1857 | */ |
1727 | static void __cpuinit | 1858 | static void __cpuinit |
1728 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | 1859 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) |
1729 | { | 1860 | { |
1730 | unsigned long flags; | 1861 | unsigned long flags; |
1731 | unsigned long mask; | 1862 | unsigned long mask; |
1732 | struct rcu_data *rdp = rsp->rda[cpu]; | 1863 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1733 | struct rcu_node *rnp = rcu_get_root(rsp); | 1864 | struct rcu_node *rnp = rcu_get_root(rsp); |
1734 | 1865 | ||
1735 | /* Set up local state, ensuring consistent view of global state. */ | 1866 | /* Set up local state, ensuring consistent view of global state. */ |
@@ -1737,7 +1868,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1737 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ | 1868 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ |
1738 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ | 1869 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ |
1739 | rdp->beenonline = 1; /* We have now been online. */ | 1870 | rdp->beenonline = 1; /* We have now been online. */ |
1740 | rdp->preemptable = preemptable; | 1871 | rdp->preemptible = preemptible; |
1741 | rdp->qlen_last_fqs_check = 0; | 1872 | rdp->qlen_last_fqs_check = 0; |
1742 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1873 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1743 | rdp->blimit = blimit; | 1874 | rdp->blimit = blimit; |
@@ -1771,7 +1902,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1771 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 1902 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
1772 | } | 1903 | } |
1773 | 1904 | ||
1774 | static void __cpuinit rcu_online_cpu(int cpu) | 1905 | static void __cpuinit rcu_prepare_cpu(int cpu) |
1775 | { | 1906 | { |
1776 | rcu_init_percpu_data(cpu, &rcu_sched_state, 0); | 1907 | rcu_init_percpu_data(cpu, &rcu_sched_state, 0); |
1777 | rcu_init_percpu_data(cpu, &rcu_bh_state, 0); | 1908 | rcu_init_percpu_data(cpu, &rcu_bh_state, 0); |
@@ -1785,27 +1916,34 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
1785 | unsigned long action, void *hcpu) | 1916 | unsigned long action, void *hcpu) |
1786 | { | 1917 | { |
1787 | long cpu = (long)hcpu; | 1918 | long cpu = (long)hcpu; |
1919 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
1920 | struct rcu_node *rnp = rdp->mynode; | ||
1788 | 1921 | ||
1789 | switch (action) { | 1922 | switch (action) { |
1790 | case CPU_UP_PREPARE: | 1923 | case CPU_UP_PREPARE: |
1791 | case CPU_UP_PREPARE_FROZEN: | 1924 | case CPU_UP_PREPARE_FROZEN: |
1792 | rcu_online_cpu(cpu); | 1925 | rcu_prepare_cpu(cpu); |
1926 | rcu_prepare_kthreads(cpu); | ||
1927 | break; | ||
1928 | case CPU_ONLINE: | ||
1929 | case CPU_DOWN_FAILED: | ||
1930 | rcu_node_kthread_setaffinity(rnp, -1); | ||
1931 | rcu_cpu_kthread_setrt(cpu, 1); | ||
1932 | break; | ||
1933 | case CPU_DOWN_PREPARE: | ||
1934 | rcu_node_kthread_setaffinity(rnp, cpu); | ||
1935 | rcu_cpu_kthread_setrt(cpu, 0); | ||
1793 | break; | 1936 | break; |
1794 | case CPU_DYING: | 1937 | case CPU_DYING: |
1795 | case CPU_DYING_FROZEN: | 1938 | case CPU_DYING_FROZEN: |
1796 | /* | 1939 | /* |
1797 | * preempt_disable() in _rcu_barrier() prevents stop_machine(), | 1940 | * The whole machine is "stopped" except this CPU, so we can |
1798 | * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" | 1941 | * touch any data without introducing corruption. We send the |
1799 | * returns, all online cpus have queued rcu_barrier_func(). | 1942 | * dying CPU's callbacks to an arbitrarily chosen online CPU. |
1800 | * The dying CPU clears its cpu_online_mask bit and | ||
1801 | * moves all of its RCU callbacks to ->orphan_cbs_list | ||
1802 | * in the context of stop_machine(), so subsequent calls | ||
1803 | * to _rcu_barrier() will adopt these callbacks and only | ||
1804 | * then queue rcu_barrier_func() on all remaining CPUs. | ||
1805 | */ | 1943 | */ |
1806 | rcu_send_cbs_to_orphanage(&rcu_bh_state); | 1944 | rcu_send_cbs_to_online(&rcu_bh_state); |
1807 | rcu_send_cbs_to_orphanage(&rcu_sched_state); | 1945 | rcu_send_cbs_to_online(&rcu_sched_state); |
1808 | rcu_preempt_send_cbs_to_orphanage(); | 1946 | rcu_preempt_send_cbs_to_online(); |
1809 | break; | 1947 | break; |
1810 | case CPU_DEAD: | 1948 | case CPU_DEAD: |
1811 | case CPU_DEAD_FROZEN: | 1949 | case CPU_DEAD_FROZEN: |
@@ -1843,8 +1981,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
1843 | { | 1981 | { |
1844 | int i; | 1982 | int i; |
1845 | 1983 | ||
1846 | for (i = NUM_RCU_LVLS - 1; i >= 0; i--) | 1984 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) |
1847 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 1985 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
1986 | rsp->levelspread[0] = RCU_FANOUT_LEAF; | ||
1848 | } | 1987 | } |
1849 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 1988 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
1850 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 1989 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
@@ -1865,7 +2004,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
1865 | /* | 2004 | /* |
1866 | * Helper function for rcu_init() that initializes one rcu_state structure. | 2005 | * Helper function for rcu_init() that initializes one rcu_state structure. |
1867 | */ | 2006 | */ |
1868 | static void __init rcu_init_one(struct rcu_state *rsp) | 2007 | static void __init rcu_init_one(struct rcu_state *rsp, |
2008 | struct rcu_data __percpu *rda) | ||
1869 | { | 2009 | { |
1870 | static char *buf[] = { "rcu_node_level_0", | 2010 | static char *buf[] = { "rcu_node_level_0", |
1871 | "rcu_node_level_1", | 2011 | "rcu_node_level_1", |
@@ -1911,46 +2051,29 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
1911 | j / rsp->levelspread[i - 1]; | 2051 | j / rsp->levelspread[i - 1]; |
1912 | } | 2052 | } |
1913 | rnp->level = i; | 2053 | rnp->level = i; |
1914 | INIT_LIST_HEAD(&rnp->blocked_tasks[0]); | 2054 | INIT_LIST_HEAD(&rnp->blkd_tasks); |
1915 | INIT_LIST_HEAD(&rnp->blocked_tasks[1]); | ||
1916 | INIT_LIST_HEAD(&rnp->blocked_tasks[2]); | ||
1917 | INIT_LIST_HEAD(&rnp->blocked_tasks[3]); | ||
1918 | } | 2055 | } |
1919 | } | 2056 | } |
1920 | 2057 | ||
2058 | rsp->rda = rda; | ||
1921 | rnp = rsp->level[NUM_RCU_LVLS - 1]; | 2059 | rnp = rsp->level[NUM_RCU_LVLS - 1]; |
1922 | for_each_possible_cpu(i) { | 2060 | for_each_possible_cpu(i) { |
1923 | while (i > rnp->grphi) | 2061 | while (i > rnp->grphi) |
1924 | rnp++; | 2062 | rnp++; |
1925 | rsp->rda[i]->mynode = rnp; | 2063 | per_cpu_ptr(rsp->rda, i)->mynode = rnp; |
1926 | rcu_boot_init_percpu_data(i, rsp); | 2064 | rcu_boot_init_percpu_data(i, rsp); |
1927 | } | 2065 | } |
1928 | } | 2066 | } |
1929 | 2067 | ||
1930 | /* | ||
1931 | * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used | ||
1932 | * nowhere else! Assigns leaf node pointers into each CPU's rcu_data | ||
1933 | * structure. | ||
1934 | */ | ||
1935 | #define RCU_INIT_FLAVOR(rsp, rcu_data) \ | ||
1936 | do { \ | ||
1937 | int i; \ | ||
1938 | \ | ||
1939 | for_each_possible_cpu(i) { \ | ||
1940 | (rsp)->rda[i] = &per_cpu(rcu_data, i); \ | ||
1941 | } \ | ||
1942 | rcu_init_one(rsp); \ | ||
1943 | } while (0) | ||
1944 | |||
1945 | void __init rcu_init(void) | 2068 | void __init rcu_init(void) |
1946 | { | 2069 | { |
1947 | int cpu; | 2070 | int cpu; |
1948 | 2071 | ||
1949 | rcu_bootup_announce(); | 2072 | rcu_bootup_announce(); |
1950 | RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); | 2073 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
1951 | RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); | 2074 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
1952 | __rcu_init_preempt(); | 2075 | __rcu_init_preempt(); |
1953 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 2076 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
1954 | 2077 | ||
1955 | /* | 2078 | /* |
1956 | * We don't need protection against CPU-hotplug here because | 2079 | * We don't need protection against CPU-hotplug here because |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14c040b18ed0..01b2ccda26fb 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -31,46 +31,51 @@ | |||
31 | /* | 31 | /* |
32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. | 32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. |
33 | * In theory, it should be possible to add more levels straightforwardly. | 33 | * In theory, it should be possible to add more levels straightforwardly. |
34 | * In practice, this has not been tested, so there is probably some | 34 | * In practice, this did work well going from three levels to four. |
35 | * bug somewhere. | 35 | * Of course, your mileage may vary. |
36 | */ | 36 | */ |
37 | #define MAX_RCU_LVLS 4 | 37 | #define MAX_RCU_LVLS 4 |
38 | #define RCU_FANOUT (CONFIG_RCU_FANOUT) | 38 | #if CONFIG_RCU_FANOUT > 16 |
39 | #define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) | 39 | #define RCU_FANOUT_LEAF 16 |
40 | #define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) | 40 | #else /* #if CONFIG_RCU_FANOUT > 16 */ |
41 | #define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) | 41 | #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) |
42 | 42 | #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | |
43 | #if NR_CPUS <= RCU_FANOUT | 43 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) |
44 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | ||
45 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | ||
46 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | ||
47 | |||
48 | #if NR_CPUS <= RCU_FANOUT_1 | ||
44 | # define NUM_RCU_LVLS 1 | 49 | # define NUM_RCU_LVLS 1 |
45 | # define NUM_RCU_LVL_0 1 | 50 | # define NUM_RCU_LVL_0 1 |
46 | # define NUM_RCU_LVL_1 (NR_CPUS) | 51 | # define NUM_RCU_LVL_1 (NR_CPUS) |
47 | # define NUM_RCU_LVL_2 0 | 52 | # define NUM_RCU_LVL_2 0 |
48 | # define NUM_RCU_LVL_3 0 | 53 | # define NUM_RCU_LVL_3 0 |
49 | # define NUM_RCU_LVL_4 0 | 54 | # define NUM_RCU_LVL_4 0 |
50 | #elif NR_CPUS <= RCU_FANOUT_SQ | 55 | #elif NR_CPUS <= RCU_FANOUT_2 |
51 | # define NUM_RCU_LVLS 2 | 56 | # define NUM_RCU_LVLS 2 |
52 | # define NUM_RCU_LVL_0 1 | 57 | # define NUM_RCU_LVL_0 1 |
53 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 58 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
54 | # define NUM_RCU_LVL_2 (NR_CPUS) | 59 | # define NUM_RCU_LVL_2 (NR_CPUS) |
55 | # define NUM_RCU_LVL_3 0 | 60 | # define NUM_RCU_LVL_3 0 |
56 | # define NUM_RCU_LVL_4 0 | 61 | # define NUM_RCU_LVL_4 0 |
57 | #elif NR_CPUS <= RCU_FANOUT_CUBE | 62 | #elif NR_CPUS <= RCU_FANOUT_3 |
58 | # define NUM_RCU_LVLS 3 | 63 | # define NUM_RCU_LVLS 3 |
59 | # define NUM_RCU_LVL_0 1 | 64 | # define NUM_RCU_LVL_0 1 |
60 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) | 65 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
61 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 66 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
62 | # define NUM_RCU_LVL_3 NR_CPUS | 67 | # define NUM_RCU_LVL_3 (NR_CPUS) |
63 | # define NUM_RCU_LVL_4 0 | 68 | # define NUM_RCU_LVL_4 0 |
64 | #elif NR_CPUS <= RCU_FANOUT_FOURTH | 69 | #elif NR_CPUS <= RCU_FANOUT_4 |
65 | # define NUM_RCU_LVLS 4 | 70 | # define NUM_RCU_LVLS 4 |
66 | # define NUM_RCU_LVL_0 1 | 71 | # define NUM_RCU_LVL_0 1 |
67 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) | 72 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) |
68 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) | 73 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
69 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | 74 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
70 | # define NUM_RCU_LVL_4 NR_CPUS | 75 | # define NUM_RCU_LVL_4 (NR_CPUS) |
71 | #else | 76 | #else |
72 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | 77 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" |
73 | #endif /* #if (NR_CPUS) <= RCU_FANOUT */ | 78 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ |
74 | 79 | ||
75 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) | 80 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) |
76 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) | 81 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) |
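The fanout macros above size the rcu_node tree: the leaf level is capped at RCU_FANOUT_LEAF CPUs per node, and every level above multiplies the coverage by CONFIG_RCU_FANOUT. The arithmetic can be checked with a short userspace program; the NR_CPUS and fanout values below are arbitrary examples, not taken from any real configuration:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    /* Sample configuration, chosen only to exercise the three-level case. */
    #define NR_CPUS            4096
    #define CONFIG_RCU_FANOUT  64
    #define RCU_FANOUT_LEAF    (CONFIG_RCU_FANOUT > 16 ? 16 : CONFIG_RCU_FANOUT)

    int main(void)
    {
        long f1 = RCU_FANOUT_LEAF;           /* CPUs per leaf node        */
        long f2 = f1 * CONFIG_RCU_FANOUT;    /* CPUs per level-2 subtree  */
        long f3 = f2 * CONFIG_RCU_FANOUT;    /* CPUs per level-3 subtree  */

        if (NR_CPUS <= f1)
            printf("1 level:  %d leaf node(s)\n", 1);
        else if (NR_CPUS <= f2)
            printf("2 levels: root + %ld leaves\n",
                   (long)DIV_ROUND_UP(NR_CPUS, f1));
        else if (NR_CPUS <= f3)
            printf("3 levels: root + %ld + %ld nodes\n",
                   (long)DIV_ROUND_UP(NR_CPUS, f2),
                   (long)DIV_ROUND_UP(NR_CPUS, f1));
        return 0;
    }

For 4096 CPUs with fanout 64 and a leaf fanout of 16, this prints a root fanning out to 4 interior nodes over 256 leaves, matching the NUM_RCU_LVL_* definitions above.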
@@ -79,13 +84,19 @@ | |||
79 | * Dynticks per-CPU state. | 84 | * Dynticks per-CPU state. |
80 | */ | 85 | */ |
81 | struct rcu_dynticks { | 86 | struct rcu_dynticks { |
82 | int dynticks_nesting; /* Track nesting level, sort of. */ | 87 | int dynticks_nesting; /* Track irq/process nesting level. */ |
83 | int dynticks; /* Even value for dynticks-idle, else odd. */ | 88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
84 | int dynticks_nmi; /* Even value for either dynticks-idle or */ | 89 | atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ |
85 | /* not in nmi handler, else odd. So this */ | ||
86 | /* remains even for nmi from irq handler. */ | ||
87 | }; | 90 | }; |
88 | 91 | ||
92 | /* RCU's kthread states for tracing. */ | ||
93 | #define RCU_KTHREAD_STOPPED 0 | ||
94 | #define RCU_KTHREAD_RUNNING 1 | ||
95 | #define RCU_KTHREAD_WAITING 2 | ||
96 | #define RCU_KTHREAD_OFFCPU 3 | ||
97 | #define RCU_KTHREAD_YIELDING 4 | ||
98 | #define RCU_KTHREAD_MAX 4 | ||
99 | |||
89 | /* | 100 | /* |
90 | * Definition for node within the RCU grace-period-detection hierarchy. | 101 | * Definition for node within the RCU grace-period-detection hierarchy. |
91 | */ | 102 | */ |
@@ -104,10 +115,13 @@ struct rcu_node { | |||
104 | /* an rcu_data structure, otherwise, each */ | 115 | /* an rcu_data structure, otherwise, each */ |
105 | /* bit corresponds to a child rcu_node */ | 116 | /* bit corresponds to a child rcu_node */ |
106 | /* structure. */ | 117 | /* structure. */ |
107 | unsigned long expmask; /* Groups that have ->blocked_tasks[] */ | 118 | unsigned long expmask; /* Groups that have ->blkd_tasks */ |
108 | /* elements that need to drain to allow the */ | 119 | /* elements that need to drain to allow the */ |
109 | /* current expedited grace period to */ | 120 | /* current expedited grace period to */ |
110 | /* complete (only for TREE_PREEMPT_RCU). */ | 121 | /* complete (only for TREE_PREEMPT_RCU). */ |
122 | atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ | ||
123 | /* Since this has meaning only for leaf */ | ||
124 | /* rcu_node structures, 32 bits suffices. */ | ||
111 | unsigned long qsmaskinit; | 125 | unsigned long qsmaskinit; |
112 | /* Per-GP initial value for qsmask & expmask. */ | 126 | /* Per-GP initial value for qsmask & expmask. */ |
113 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 127 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
@@ -117,11 +131,62 @@ struct rcu_node { | |||
117 | u8 grpnum; /* CPU/group number for next level up. */ | 131 | u8 grpnum; /* CPU/group number for next level up. */ |
118 | u8 level; /* root is at level 0. */ | 132 | u8 level; /* root is at level 0. */ |
119 | struct rcu_node *parent; | 133 | struct rcu_node *parent; |
120 | struct list_head blocked_tasks[4]; | 134 | struct list_head blkd_tasks; |
121 | /* Tasks blocked in RCU read-side critsect. */ | 135 | /* Tasks blocked in RCU read-side critical */ |
122 | /* Grace period number (->gpnum) x blocked */ | 136 | /* section. Tasks are placed at the head */ |
123 | /* by tasks on the (x & 0x1) element of the */ | 137 | /* of this list and age towards the tail. */ |
124 | /* blocked_tasks[] array. */ | 138 | struct list_head *gp_tasks; |
139 | /* Pointer to the first task blocking the */ | ||
140 | /* current grace period, or NULL if there */ | ||
141 | /* is no such task. */ | ||
142 | struct list_head *exp_tasks; | ||
143 | /* Pointer to the first task blocking the */ | ||
144 | /* current expedited grace period, or NULL */ | ||
145 | /* if there is no such task. If there */ | ||
146 | /* is no current expedited grace period, */ | ||
147 | /* then there cannot be any such task. */ | ||
148 | #ifdef CONFIG_RCU_BOOST | ||
149 | struct list_head *boost_tasks; | ||
150 | /* Pointer to first task that needs to be */ | ||
151 | /* priority boosted, or NULL if no priority */ | ||
152 | /* boosting is needed for this rcu_node */ | ||
153 | /* structure. If there are no tasks */ | ||
154 | /* queued on this rcu_node structure that */ | ||
155 | /* are blocking the current grace period, */ | ||
156 | /* there can be no such task. */ | ||
157 | unsigned long boost_time; | ||
158 | /* When to start boosting (jiffies). */ | ||
159 | struct task_struct *boost_kthread_task; | ||
160 | /* kthread that takes care of priority */ | ||
161 | /* boosting for this rcu_node structure. */ | ||
162 | unsigned int boost_kthread_status; | ||
163 | /* State of boost_kthread_task for tracing. */ | ||
164 | unsigned long n_tasks_boosted; | ||
165 | /* Total number of tasks boosted. */ | ||
166 | unsigned long n_exp_boosts; | ||
167 | /* Number of tasks boosted for expedited GP. */ | ||
168 | unsigned long n_normal_boosts; | ||
169 | /* Number of tasks boosted for normal GP. */ | ||
170 | unsigned long n_balk_blkd_tasks; | ||
171 | /* Refused to boost: no blocked tasks. */ | ||
172 | unsigned long n_balk_exp_gp_tasks; | ||
173 | /* Refused to boost: nothing blocking GP. */ | ||
174 | unsigned long n_balk_boost_tasks; | ||
175 | /* Refused to boost: already boosting. */ | ||
176 | unsigned long n_balk_notblocked; | ||
177 | /* Refused to boost: RCU RS CS still running. */ | ||
178 | unsigned long n_balk_notyet; | ||
179 | /* Refused to boost: not yet time. */ | ||
180 | unsigned long n_balk_nos; | ||
181 | /* Refused to boost: not sure why, though. */ | ||
182 | /* This can happen due to race conditions. */ | ||
183 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
184 | struct task_struct *node_kthread_task; | ||
185 | /* kthread that takes care of this rcu_node */ | ||
186 | /* structure, for example, awakening the */ | ||
187 | /* per-CPU kthreads as needed. */ | ||
188 | unsigned int node_kthread_status; | ||
189 | /* State of node_kthread_task for tracing. */ | ||
125 | } ____cacheline_internodealigned_in_smp; | 190 | } ____cacheline_internodealigned_in_smp; |
126 | 191 | ||
127 | /* | 192 | /* |
@@ -170,7 +235,7 @@ struct rcu_data { | |||
170 | bool passed_quiesc; /* User-mode/idle loop etc. */ | 235 | bool passed_quiesc; /* User-mode/idle loop etc. */ |
171 | bool qs_pending; /* Core waits for quiesc state. */ | 236 | bool qs_pending; /* Core waits for quiesc state. */ |
172 | bool beenonline; /* CPU online at least once. */ | 237 | bool beenonline; /* CPU online at least once. */ |
173 | bool preemptable; /* Preemptable RCU? */ | 238 | bool preemptible; /* Preemptible RCU? */ |
174 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 239 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
175 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 240 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
176 | 241 | ||
@@ -202,6 +267,9 @@ struct rcu_data { | |||
202 | long qlen; /* # of queued callbacks */ | 267 | long qlen; /* # of queued callbacks */ |
203 | long qlen_last_fqs_check; | 268 | long qlen_last_fqs_check; |
204 | /* qlen at last check for QS forcing */ | 269 | /* qlen at last check for QS forcing */ |
270 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | ||
271 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ | ||
272 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ | ||
205 | unsigned long n_force_qs_snap; | 273 | unsigned long n_force_qs_snap; |
206 | /* did other CPU force QS recently? */ | 274 | /* did other CPU force QS recently? */ |
207 | long blimit; /* Upper limit on a processed batch */ | 275 | long blimit; /* Upper limit on a processed batch */ |
@@ -210,7 +278,6 @@ struct rcu_data { | |||
210 | /* 3) dynticks interface. */ | 278 | /* 3) dynticks interface. */ |
211 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ | 279 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ |
212 | int dynticks_snap; /* Per-GP tracking for dynticks. */ | 280 | int dynticks_snap; /* Per-GP tracking for dynticks. */ |
213 | int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ | ||
214 | #endif /* #ifdef CONFIG_NO_HZ */ | 281 | #endif /* #ifdef CONFIG_NO_HZ */ |
215 | 282 | ||
216 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ | 283 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ |
@@ -246,7 +313,6 @@ struct rcu_data { | |||
246 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 313 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
247 | 314 | ||
248 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 315 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
249 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
250 | 316 | ||
251 | #ifdef CONFIG_PROVE_RCU | 317 | #ifdef CONFIG_PROVE_RCU |
252 | #define RCU_STALL_DELAY_DELTA (5 * HZ) | 318 | #define RCU_STALL_DELAY_DELTA (5 * HZ) |
@@ -254,19 +320,26 @@ struct rcu_data { | |||
254 | #define RCU_STALL_DELAY_DELTA 0 | 320 | #define RCU_STALL_DELAY_DELTA 0 |
255 | #endif | 321 | #endif |
256 | 322 | ||
257 | #define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) | 323 | #define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ |
324 | RCU_STALL_DELAY_DELTA) | ||
258 | /* for rsp->jiffies_stall */ | 325 | /* for rsp->jiffies_stall */ |
259 | #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) | 326 | #define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) |
260 | /* for rsp->jiffies_stall */ | 327 | /* for rsp->jiffies_stall */ |
261 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ | 328 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ |
262 | /* to take at least one */ | 329 | /* to take at least one */ |
263 | /* scheduling clock irq */ | 330 | /* scheduling clock irq */ |
264 | /* before ratting on them. */ | 331 | /* before ratting on them. */ |
265 | 332 | ||
266 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 333 | #define rcu_wait(cond) \ |
267 | 334 | do { \ | |
268 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) | 335 | for (;;) { \ |
269 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) | 336 | set_current_state(TASK_INTERRUPTIBLE); \ |
337 | if (cond) \ | ||
338 | break; \ | ||
339 | schedule(); \ | ||
340 | } \ | ||
341 | __set_current_state(TASK_RUNNING); \ | ||
342 | } while (0) | ||
270 | 343 | ||
271 | /* | 344 | /* |
272 | * RCU global state, including node hierarchy. This hierarchy is | 345 | * RCU global state, including node hierarchy. This hierarchy is |
@@ -283,7 +356,7 @@ struct rcu_state { | |||
283 | struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ | 356 | struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ |
284 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ | 357 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ |
285 | u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ | 358 | u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ |
286 | struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ | 359 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ |
287 | 360 | ||
288 | /* The following fields are guarded by the root rcu_node's lock. */ | 361 | /* The following fields are guarded by the root rcu_node's lock. */ |
289 | 362 | ||
@@ -296,21 +369,14 @@ struct rcu_state { | |||
296 | /* period because */ | 369 | /* period because */ |
297 | /* force_quiescent_state() */ | 370 | /* force_quiescent_state() */ |
298 | /* was running. */ | 371 | /* was running. */ |
372 | u8 boost; /* Subject to priority boost. */ | ||
299 | unsigned long gpnum; /* Current gp number. */ | 373 | unsigned long gpnum; /* Current gp number. */ |
300 | unsigned long completed; /* # of last completed gp. */ | 374 | unsigned long completed; /* # of last completed gp. */ |
301 | 375 | ||
302 | /* End of fields guarded by root rcu_node's lock. */ | 376 | /* End of fields guarded by root rcu_node's lock. */ |
303 | 377 | ||
304 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 378 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
305 | /* starting new GP. Also */ | 379 | /* starting new GP. */ |
306 | /* protects the following */ | ||
307 | /* orphan_cbs fields. */ | ||
308 | struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ | ||
309 | /* orphaned by all CPUs in */ | ||
310 | /* a given leaf rcu_node */ | ||
311 | /* going offline. */ | ||
312 | struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ | ||
313 | long orphan_qlen; /* Number of orphaned cbs. */ | ||
314 | raw_spinlock_t fqslock; /* Only one task forcing */ | 380 | raw_spinlock_t fqslock; /* Only one task forcing */ |
315 | /* quiescent states. */ | 381 | /* quiescent states. */ |
316 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 382 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -321,12 +387,12 @@ struct rcu_state { | |||
321 | /* due to lock unavailable. */ | 387 | /* due to lock unavailable. */ |
322 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ | 388 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ |
323 | /* due to no GP active. */ | 389 | /* due to no GP active. */ |
324 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
325 | unsigned long gp_start; /* Time at which GP started, */ | 390 | unsigned long gp_start; /* Time at which GP started, */ |
326 | /* but in jiffies. */ | 391 | /* but in jiffies. */ |
327 | unsigned long jiffies_stall; /* Time at which to check */ | 392 | unsigned long jiffies_stall; /* Time at which to check */ |
328 | /* for CPU stalls. */ | 393 | /* for CPU stalls. */ |
329 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 394 | unsigned long gp_max; /* Maximum GP duration in */ |
395 | /* jiffies. */ | ||
330 | char *name; /* Name of structure. */ | 396 | char *name; /* Name of structure. */ |
331 | }; | 397 | }; |
332 | 398 | ||
@@ -357,15 +423,15 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | |||
357 | static void rcu_bootup_announce(void); | 423 | static void rcu_bootup_announce(void); |
358 | long rcu_batches_completed(void); | 424 | long rcu_batches_completed(void); |
359 | static void rcu_preempt_note_context_switch(int cpu); | 425 | static void rcu_preempt_note_context_switch(int cpu); |
360 | static int rcu_preempted_readers(struct rcu_node *rnp); | 426 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
361 | #ifdef CONFIG_HOTPLUG_CPU | 427 | #ifdef CONFIG_HOTPLUG_CPU |
362 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 428 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
363 | unsigned long flags); | 429 | unsigned long flags); |
430 | static void rcu_stop_cpu_kthread(int cpu); | ||
364 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 431 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
365 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
366 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 432 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
367 | static void rcu_print_task_stall(struct rcu_node *rnp); | 433 | static void rcu_print_task_stall(struct rcu_node *rnp); |
368 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 434 | static void rcu_preempt_stall_reset(void); |
369 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 435 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
370 | #ifdef CONFIG_HOTPLUG_CPU | 436 | #ifdef CONFIG_HOTPLUG_CPU |
371 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | 437 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, |
@@ -382,8 +448,23 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); | |||
382 | static int rcu_preempt_pending(int cpu); | 448 | static int rcu_preempt_pending(int cpu); |
383 | static int rcu_preempt_needs_cpu(int cpu); | 449 | static int rcu_preempt_needs_cpu(int cpu); |
384 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | 450 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
385 | static void rcu_preempt_send_cbs_to_orphanage(void); | 451 | static void rcu_preempt_send_cbs_to_online(void); |
386 | static void __init __rcu_init_preempt(void); | 452 | static void __init __rcu_init_preempt(void); |
387 | static void rcu_needs_cpu_flush(void); | 453 | static void rcu_needs_cpu_flush(void); |
454 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | ||
455 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | ||
456 | static void invoke_rcu_callbacks_kthread(void); | ||
457 | #ifdef CONFIG_RCU_BOOST | ||
458 | static void rcu_preempt_do_callbacks(void); | ||
459 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
460 | cpumask_var_t cm); | ||
461 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
462 | struct rcu_node *rnp, | ||
463 | int rnp_index); | ||
464 | static void invoke_rcu_node_kthread(struct rcu_node *rnp); | ||
465 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg); | ||
466 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
467 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); | ||
468 | static void __cpuinit rcu_prepare_kthreads(int cpu); | ||
388 | 469 | ||
389 | #endif /* #ifndef RCU_TREE_NONCORE */ | 470 | #endif /* #ifndef RCU_TREE_NONCORE */ |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e4f420245d9..8aafbb80b8b0 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | 2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) |
3 | * Internal non-public definitions that provide either classic | 3 | * Internal non-public definitions that provide either classic |
4 | * or preemptable semantics. | 4 | * or preemptible semantics. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -25,6 +25,7 @@ | |||
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
28 | #include <linux/stop_machine.h> | ||
28 | 29 | ||
29 | /* | 30 | /* |
30 | * Check the RCU kernel configuration parameters and print informative | 31 | * Check the RCU kernel configuration parameters and print informative |
@@ -53,11 +54,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
53 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | 54 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE |
54 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); | 55 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); |
55 | #endif | 56 | #endif |
56 | #ifndef CONFIG_RCU_CPU_STALL_DETECTOR | 57 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) |
57 | printk(KERN_INFO | ||
58 | "\tRCU-based detection of stalled CPUs is disabled.\n"); | ||
59 | #endif | ||
60 | #ifndef CONFIG_RCU_CPU_STALL_VERBOSE | ||
61 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); | 58 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); |
62 | #endif | 59 | #endif |
63 | #if NUM_RCU_LVL_4 != 0 | 60 | #if NUM_RCU_LVL_4 != 0 |
@@ -69,7 +66,9 @@ static void __init rcu_bootup_announce_oddness(void) | |||
69 | 66 | ||
70 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); | 67 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); |
71 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | 68 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); |
69 | static struct rcu_state *rcu_state = &rcu_preempt_state; | ||
72 | 70 | ||
71 | static void rcu_read_unlock_special(struct task_struct *t); | ||
73 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 72 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
74 | 73 | ||
75 | /* | 74 | /* |
@@ -77,7 +76,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); | |||
77 | */ | 76 | */ |
78 | static void __init rcu_bootup_announce(void) | 77 | static void __init rcu_bootup_announce(void) |
79 | { | 78 | { |
80 | printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); | 79 | printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); |
81 | rcu_bootup_announce_oddness(); | 80 | rcu_bootup_announce_oddness(); |
82 | } | 81 | } |
83 | 82 | ||
@@ -110,7 +109,7 @@ void rcu_force_quiescent_state(void) | |||
110 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 109 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
111 | 110 | ||
112 | /* | 111 | /* |
113 | * Record a preemptable-RCU quiescent state for the specified CPU. Note | 112 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
114 | * that this just means that the task currently running on the CPU is | 113 | * that this just means that the task currently running on the CPU is |
115 | * not in a quiescent state. There might be any number of tasks blocked | 114 | * not in a quiescent state. There might be any number of tasks blocked |
116 | * while in an RCU read-side critical section. | 115 | * while in an RCU read-side critical section. |
@@ -133,12 +132,12 @@ static void rcu_preempt_qs(int cpu) | |||
133 | * We have entered the scheduler, and the current task might soon be | 132 | * We have entered the scheduler, and the current task might soon be |
134 | * context-switched away from. If this task is in an RCU read-side | 133 | * context-switched away from. If this task is in an RCU read-side |
135 | * critical section, we will no longer be able to rely on the CPU to | 134 | * critical section, we will no longer be able to rely on the CPU to |
136 | * record that fact, so we enqueue the task on the appropriate entry | 135 | * record that fact, so we enqueue the task on the blkd_tasks list. |
137 | * of the blocked_tasks[] array. The task will dequeue itself when | 136 | * The task will dequeue itself when it exits the outermost enclosing |
138 | * it exits the outermost enclosing RCU read-side critical section. | 137 | * RCU read-side critical section. Therefore, the current grace period |
139 | * Therefore, the current grace period cannot be permitted to complete | 138 | * cannot be permitted to complete until the blkd_tasks list entries |
140 | * until the blocked_tasks[] entry indexed by the low-order bit of | 139 | * predating the current grace period drain, in other words, until |
141 | * rnp->gpnum empties. | 140 | * rnp->gp_tasks becomes NULL. |
142 | * | 141 | * |
143 | * Caller must disable preemption. | 142 | * Caller must disable preemption. |
144 | */ | 143 | */ |
@@ -146,15 +145,14 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
146 | { | 145 | { |
147 | struct task_struct *t = current; | 146 | struct task_struct *t = current; |
148 | unsigned long flags; | 147 | unsigned long flags; |
149 | int phase; | ||
150 | struct rcu_data *rdp; | 148 | struct rcu_data *rdp; |
151 | struct rcu_node *rnp; | 149 | struct rcu_node *rnp; |
152 | 150 | ||
153 | if (t->rcu_read_lock_nesting && | 151 | if (t->rcu_read_lock_nesting > 0 && |
154 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 152 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
155 | 153 | ||
156 | /* Possibly blocking in an RCU read-side critical section. */ | 154 | /* Possibly blocking in an RCU read-side critical section. */ |
157 | rdp = rcu_preempt_state.rda[cpu]; | 155 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); |
158 | rnp = rdp->mynode; | 156 | rnp = rdp->mynode; |
159 | raw_spin_lock_irqsave(&rnp->lock, flags); | 157 | raw_spin_lock_irqsave(&rnp->lock, flags); |
160 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 158 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
@@ -168,16 +166,39 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
168 | * (i.e., this CPU has not yet passed through a quiescent | 166 | * (i.e., this CPU has not yet passed through a quiescent |
169 | * state for the current grace period), then as long | 167 | * state for the current grace period), then as long |
170 | * as that task remains queued, the current grace period | 168 | * as that task remains queued, the current grace period |
171 | * cannot end. | 169 | * cannot end. Note that there is some uncertainty as |
170 | * to exactly when the current grace period started. | ||
171 | * We take a conservative approach, which can result | ||
172 | * in unnecessarily waiting on tasks that started very | ||
173 | * slightly after the current grace period began. C'est | ||
174 | * la vie!!! | ||
172 | * | 175 | * |
173 | * But first, note that the current CPU must still be | 176 | * But first, note that the current CPU must still be |
174 | * on line! | 177 | * on line! |
175 | */ | 178 | */ |
176 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); | 179 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); |
177 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 180 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); |
178 | phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; | 181 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { |
179 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); | 182 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); |
183 | rnp->gp_tasks = &t->rcu_node_entry; | ||
184 | #ifdef CONFIG_RCU_BOOST | ||
185 | if (rnp->boost_tasks != NULL) | ||
186 | rnp->boost_tasks = rnp->gp_tasks; | ||
187 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
188 | } else { | ||
189 | list_add(&t->rcu_node_entry, &rnp->blkd_tasks); | ||
190 | if (rnp->qsmask & rdp->grpmask) | ||
191 | rnp->gp_tasks = &t->rcu_node_entry; | ||
192 | } | ||
180 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 193 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
194 | } else if (t->rcu_read_lock_nesting < 0 && | ||
195 | t->rcu_read_unlock_special) { | ||
196 | |||
197 | /* | ||
198 | * Complete exit from RCU read-side critical section on | ||
199 | * behalf of preempted instance of __rcu_read_unlock(). | ||
200 | */ | ||
201 | rcu_read_unlock_special(t); | ||
181 | } | 202 | } |
182 | 203 | ||
183 | /* | 204 | /* |
@@ -195,13 +216,13 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
195 | } | 216 | } |
196 | 217 | ||
197 | /* | 218 | /* |
198 | * Tree-preemptable RCU implementation for rcu_read_lock(). | 219 | * Tree-preemptible RCU implementation for rcu_read_lock(). |
199 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | 220 | * Just increment ->rcu_read_lock_nesting, shared state will be updated |
200 | * if we block. | 221 | * if we block. |
201 | */ | 222 | */ |
202 | void __rcu_read_lock(void) | 223 | void __rcu_read_lock(void) |
203 | { | 224 | { |
204 | ACCESS_ONCE(current->rcu_read_lock_nesting)++; | 225 | current->rcu_read_lock_nesting++; |
205 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ | 226 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ |
206 | } | 227 | } |
207 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | 228 | EXPORT_SYMBOL_GPL(__rcu_read_lock); |
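A stand-alone sketch of what the hunk above relies on: rcu_read_lock() reduces to a per-task counter increment, nesting simply stacks increments, and all the interesting work happens at the outermost unlock. The model_* names are invented for the sketch.

/* Illustrative model of the read-side nesting counter; not the kernel code. */
#include <stdio.h>

static int rcu_read_lock_nesting;    /* per-task in the kernel; global here */

static void model_read_lock(void)
{
    rcu_read_lock_nesting++;
}

static void model_read_unlock(void)
{
    if (--rcu_read_lock_nesting == 0)
        printf("outermost unlock: check ->rcu_read_unlock_special here\n");
}

int main(void)
{
    model_read_lock();
    model_read_lock();           /* nested critical section */
    model_read_unlock();         /* inner unlock: nothing special */
    model_read_unlock();         /* outermost unlock: special processing */
    return 0;
}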
@@ -211,12 +232,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); | |||
211 | * for the specified rcu_node structure. If the caller needs a reliable | 232 | * for the specified rcu_node structure. If the caller needs a reliable |
212 | * answer, it must hold the rcu_node's ->lock. | 233 | * answer, it must hold the rcu_node's ->lock. |
213 | */ | 234 | */ |
214 | static int rcu_preempted_readers(struct rcu_node *rnp) | 235 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) |
215 | { | 236 | { |
216 | int phase = rnp->gpnum & 0x1; | 237 | return rnp->gp_tasks != NULL; |
217 | |||
218 | return !list_empty(&rnp->blocked_tasks[phase]) || | ||
219 | !list_empty(&rnp->blocked_tasks[phase + 2]); | ||
220 | } | 238 | } |
221 | 239 | ||
222 | /* | 240 | /* |
@@ -232,7 +250,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
232 | unsigned long mask; | 250 | unsigned long mask; |
233 | struct rcu_node *rnp_p; | 251 | struct rcu_node *rnp_p; |
234 | 252 | ||
235 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 253 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
236 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 254 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
237 | return; /* Still need more quiescent states! */ | 255 | return; /* Still need more quiescent states! */ |
238 | } | 256 | } |
@@ -256,15 +274,31 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
256 | } | 274 | } |
257 | 275 | ||
258 | /* | 276 | /* |
277 | * Advance a ->blkd_tasks-list pointer to the next entry, returning | ||
278 | * NULL instead if at the end of the list. | ||
279 | */ | ||
280 | static struct list_head *rcu_next_node_entry(struct task_struct *t, | ||
281 | struct rcu_node *rnp) | ||
282 | { | ||
283 | struct list_head *np; | ||
284 | |||
285 | np = t->rcu_node_entry.next; | ||
286 | if (np == &rnp->blkd_tasks) | ||
287 | np = NULL; | ||
288 | return np; | ||
289 | } | ||
290 | |||
291 | /* | ||
259 | * Handle special cases during rcu_read_unlock(), such as needing to | 292 | * Handle special cases during rcu_read_unlock(), such as needing to |
260 | * notify RCU core processing or task having blocked during the RCU | 293 | * notify RCU core processing or task having blocked during the RCU |
261 | * read-side critical section. | 294 | * read-side critical section. |
262 | */ | 295 | */ |
263 | static void rcu_read_unlock_special(struct task_struct *t) | 296 | static noinline void rcu_read_unlock_special(struct task_struct *t) |
264 | { | 297 | { |
265 | int empty; | 298 | int empty; |
266 | int empty_exp; | 299 | int empty_exp; |
267 | unsigned long flags; | 300 | unsigned long flags; |
301 | struct list_head *np; | ||
268 | struct rcu_node *rnp; | 302 | struct rcu_node *rnp; |
269 | int special; | 303 | int special; |
270 | 304 | ||
@@ -284,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
284 | } | 318 | } |
285 | 319 | ||
286 | /* Hardware IRQ handlers cannot block. */ | 320 | /* Hardware IRQ handlers cannot block. */ |
287 | if (in_irq()) { | 321 | if (in_irq() || in_serving_softirq()) { |
288 | local_irq_restore(flags); | 322 | local_irq_restore(flags); |
289 | return; | 323 | return; |
290 | } | 324 | } |
@@ -305,10 +339,24 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
305 | break; | 339 | break; |
306 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 340 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
307 | } | 341 | } |
308 | empty = !rcu_preempted_readers(rnp); | 342 | empty = !rcu_preempt_blocked_readers_cgp(rnp); |
309 | empty_exp = !rcu_preempted_readers_exp(rnp); | 343 | empty_exp = !rcu_preempted_readers_exp(rnp); |
310 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 344 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
345 | np = rcu_next_node_entry(t, rnp); | ||
311 | list_del_init(&t->rcu_node_entry); | 346 | list_del_init(&t->rcu_node_entry); |
347 | if (&t->rcu_node_entry == rnp->gp_tasks) | ||
348 | rnp->gp_tasks = np; | ||
349 | if (&t->rcu_node_entry == rnp->exp_tasks) | ||
350 | rnp->exp_tasks = np; | ||
351 | #ifdef CONFIG_RCU_BOOST | ||
352 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
353 | rnp->boost_tasks = np; | ||
354 | /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ | ||
355 | if (t->rcu_boosted) { | ||
356 | special |= RCU_READ_UNLOCK_BOOSTED; | ||
357 | t->rcu_boosted = 0; | ||
358 | } | ||
359 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
312 | t->rcu_blocked_node = NULL; | 360 | t->rcu_blocked_node = NULL; |
313 | 361 | ||
314 | /* | 362 | /* |
@@ -321,6 +369,14 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
321 | else | 369 | else |
322 | rcu_report_unblock_qs_rnp(rnp, flags); | 370 | rcu_report_unblock_qs_rnp(rnp, flags); |
323 | 371 | ||
372 | #ifdef CONFIG_RCU_BOOST | ||
373 | /* Unboost if we were boosted. */ | ||
374 | if (special & RCU_READ_UNLOCK_BOOSTED) { | ||
375 | rt_mutex_unlock(t->rcu_boost_mutex); | ||
376 | t->rcu_boost_mutex = NULL; | ||
377 | } | ||
378 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
379 | |||
324 | /* | 380 | /* |
325 | * If this was the last task on the expedited lists, | 381 | * If this was the last task on the expedited lists, |
326 | * then we need to report up the rcu_node hierarchy. | 382 | * then we need to report up the rcu_node hierarchy. |
@@ -333,7 +389,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
333 | } | 389 | } |
334 | 390 | ||
335 | /* | 391 | /* |
336 | * Tree-preemptable RCU implementation for rcu_read_unlock(). | 392 | * Tree-preemptible RCU implementation for rcu_read_unlock(). |
337 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | 393 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost |
338 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | 394 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then |
339 | * invoke rcu_read_unlock_special() to clean up after a context switch | 395 | * invoke rcu_read_unlock_special() to clean up after a context switch |
@@ -344,17 +400,26 @@ void __rcu_read_unlock(void) | |||
344 | struct task_struct *t = current; | 400 | struct task_struct *t = current; |
345 | 401 | ||
346 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ | 402 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ |
347 | if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && | 403 | if (t->rcu_read_lock_nesting != 1) |
348 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 404 | --t->rcu_read_lock_nesting; |
349 | rcu_read_unlock_special(t); | 405 | else { |
406 | t->rcu_read_lock_nesting = INT_MIN; | ||
407 | barrier(); /* assign before ->rcu_read_unlock_special load */ | ||
408 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
409 | rcu_read_unlock_special(t); | ||
410 | barrier(); /* ->rcu_read_unlock_special load before assign */ | ||
411 | t->rcu_read_lock_nesting = 0; | ||
412 | } | ||
350 | #ifdef CONFIG_PROVE_LOCKING | 413 | #ifdef CONFIG_PROVE_LOCKING |
351 | WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); | 414 | { |
415 | int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
416 | |||
417 | WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); | ||
418 | } | ||
352 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | 419 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ |
353 | } | 420 | } |
354 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | 421 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); |
355 | 422 | ||
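The reworked __rcu_read_unlock() above parks ->rcu_read_lock_nesting at INT_MIN while the outermost unlock runs rcu_read_unlock_special(), which is why the PROVE_LOCKING check now tolerates values near INT_MIN but still warns on small negative counts. A self-contained sketch of that sentinel, with invented model_* names:

/* Sketch of the INT_MIN sentinel used by __rcu_read_unlock() above: while the
 * outermost unlock runs its special cleanup, the nesting counter is parked at
 * INT_MIN, so anything that inspects it in that window sees a large negative
 * value rather than 0 or a small count.  Illustrative only. */
#include <limits.h>
#include <stdio.h>

static int nesting;   /* stands in for t->rcu_read_lock_nesting */

static int unlock_in_progress(void)
{
    /* Values near INT_MIN mean "outermost unlock cleanup running". */
    return nesting <= INT_MIN / 2;
}

static void model_read_unlock(void)
{
    if (nesting != 1) {
        --nesting;                         /* inner unlock: plain decrement */
    } else {
        nesting = INT_MIN;                 /* park the counter */
        printf("special cleanup, in progress=%d\n", unlock_in_progress());
        nesting = 0;                       /* done: back to "not in a reader" */
    }
}

int main(void)
{
    nesting = 2;                           /* two nested read-side critical sections */
    model_read_unlock();                   /* inner */
    model_read_unlock();                   /* outermost: uses the sentinel */
    printf("final nesting=%d, in progress=%d\n", nesting, unlock_in_progress());
    return 0;
}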
356 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
357 | |||
358 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | 423 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE |
359 | 424 | ||
360 | /* | 425 | /* |
@@ -364,18 +429,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock); | |||
364 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | 429 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) |
365 | { | 430 | { |
366 | unsigned long flags; | 431 | unsigned long flags; |
367 | struct list_head *lp; | ||
368 | int phase; | ||
369 | struct task_struct *t; | 432 | struct task_struct *t; |
370 | 433 | ||
371 | if (rcu_preempted_readers(rnp)) { | 434 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
372 | raw_spin_lock_irqsave(&rnp->lock, flags); | 435 | return; |
373 | phase = rnp->gpnum & 0x1; | 436 | raw_spin_lock_irqsave(&rnp->lock, flags); |
374 | lp = &rnp->blocked_tasks[phase]; | 437 | t = list_entry(rnp->gp_tasks, |
375 | list_for_each_entry(t, lp, rcu_node_entry) | 438 | struct task_struct, rcu_node_entry); |
376 | sched_show_task(t); | 439 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) |
377 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 440 | sched_show_task(t); |
378 | } | 441 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
379 | } | 442 | } |
380 | 443 | ||
381 | /* | 444 | /* |
@@ -405,19 +468,25 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
405 | */ | 468 | */ |
406 | static void rcu_print_task_stall(struct rcu_node *rnp) | 469 | static void rcu_print_task_stall(struct rcu_node *rnp) |
407 | { | 470 | { |
408 | struct list_head *lp; | ||
409 | int phase; | ||
410 | struct task_struct *t; | 471 | struct task_struct *t; |
411 | 472 | ||
412 | if (rcu_preempted_readers(rnp)) { | 473 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
413 | phase = rnp->gpnum & 0x1; | 474 | return; |
414 | lp = &rnp->blocked_tasks[phase]; | 475 | t = list_entry(rnp->gp_tasks, |
415 | list_for_each_entry(t, lp, rcu_node_entry) | 476 | struct task_struct, rcu_node_entry); |
416 | printk(" P%d", t->pid); | 477 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) |
417 | } | 478 | printk(" P%d", t->pid); |
418 | } | 479 | } |
419 | 480 | ||
420 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 481 | /* |
482 | * Suppress preemptible RCU's CPU stall warnings by pushing the | ||
483 | * time of the next stall-warning message comfortably far into the | ||
484 | * future. | ||
485 | */ | ||
486 | static void rcu_preempt_stall_reset(void) | ||
487 | { | ||
488 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
489 | } | ||
421 | 490 | ||
422 | /* | 491 | /* |
423 | * Check that the list of blocked tasks for the newly completed grace | 492 | * Check that the list of blocked tasks for the newly completed grace |
@@ -425,10 +494,15 @@ static void rcu_print_task_stall(struct rcu_node *rnp) | |||
425 | * period that still has RCU readers blocked! This function must be | 494 | * period that still has RCU readers blocked! This function must be |
426 | * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock | 495 | * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock |
427 | * must be held by the caller. | 496 | * must be held by the caller. |
497 | * | ||
498 | * Also, if there are blocked tasks on the list, they automatically | ||
499 | * block the newly created grace period, so set up ->gp_tasks accordingly. | ||
428 | */ | 500 | */ |
429 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | 501 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) |
430 | { | 502 | { |
431 | WARN_ON_ONCE(rcu_preempted_readers(rnp)); | 503 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); |
504 | if (!list_empty(&rnp->blkd_tasks)) | ||
505 | rnp->gp_tasks = rnp->blkd_tasks.next; | ||
432 | WARN_ON_ONCE(rnp->qsmask); | 506 | WARN_ON_ONCE(rnp->qsmask); |
433 | } | 507 | } |
434 | 508 | ||
@@ -452,50 +526,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
452 | struct rcu_node *rnp, | 526 | struct rcu_node *rnp, |
453 | struct rcu_data *rdp) | 527 | struct rcu_data *rdp) |
454 | { | 528 | { |
455 | int i; | ||
456 | struct list_head *lp; | 529 | struct list_head *lp; |
457 | struct list_head *lp_root; | 530 | struct list_head *lp_root; |
458 | int retval = 0; | 531 | int retval = 0; |
459 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 532 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
460 | struct task_struct *tp; | 533 | struct task_struct *t; |
461 | 534 | ||
462 | if (rnp == rnp_root) { | 535 | if (rnp == rnp_root) { |
463 | WARN_ONCE(1, "Last CPU thought to be offlined?"); | 536 | WARN_ONCE(1, "Last CPU thought to be offlined?"); |
464 | return 0; /* Shouldn't happen: at least one CPU online. */ | 537 | return 0; /* Shouldn't happen: at least one CPU online. */ |
465 | } | 538 | } |
466 | WARN_ON_ONCE(rnp != rdp->mynode && | 539 | |
467 | (!list_empty(&rnp->blocked_tasks[0]) || | 540 | /* If we are on an internal node, complain bitterly. */ |
468 | !list_empty(&rnp->blocked_tasks[1]) || | 541 | WARN_ON_ONCE(rnp != rdp->mynode); |
469 | !list_empty(&rnp->blocked_tasks[2]) || | ||
470 | !list_empty(&rnp->blocked_tasks[3]))); | ||
471 | 542 | ||
472 | /* | 543 | /* |
473 | * Move tasks up to root rcu_node. Rely on the fact that the | 544 | * Move tasks up to root rcu_node. Don't try to get fancy for |
474 | * root rcu_node can be at most one ahead of the rest of the | 545 | * this corner-case operation -- just put this node's tasks |
475 | * rcu_nodes in terms of gp_num value. This fact allows us to | 546 | * at the head of the root node's list, and update the root node's |
476 | * move the blocked_tasks[] array directly, element by element. | 547 | * ->gp_tasks and ->exp_tasks pointers to those of this node's, |
548 | * if non-NULL. This might result in waiting for more tasks than | ||
549 | * absolutely necessary, but this is a good performance/complexity | ||
550 | * tradeoff. | ||
477 | */ | 551 | */ |
478 | if (rcu_preempted_readers(rnp)) | 552 | if (rcu_preempt_blocked_readers_cgp(rnp)) |
479 | retval |= RCU_OFL_TASKS_NORM_GP; | 553 | retval |= RCU_OFL_TASKS_NORM_GP; |
480 | if (rcu_preempted_readers_exp(rnp)) | 554 | if (rcu_preempted_readers_exp(rnp)) |
481 | retval |= RCU_OFL_TASKS_EXP_GP; | 555 | retval |= RCU_OFL_TASKS_EXP_GP; |
482 | for (i = 0; i < 4; i++) { | 556 | lp = &rnp->blkd_tasks; |
483 | lp = &rnp->blocked_tasks[i]; | 557 | lp_root = &rnp_root->blkd_tasks; |
484 | lp_root = &rnp_root->blocked_tasks[i]; | 558 | while (!list_empty(lp)) { |
485 | while (!list_empty(lp)) { | 559 | t = list_entry(lp->next, typeof(*t), rcu_node_entry); |
486 | tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); | 560 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
487 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | 561 | list_del(&t->rcu_node_entry); |
488 | list_del(&tp->rcu_node_entry); | 562 | t->rcu_blocked_node = rnp_root; |
489 | tp->rcu_blocked_node = rnp_root; | 563 | list_add(&t->rcu_node_entry, lp_root); |
490 | list_add(&tp->rcu_node_entry, lp_root); | 564 | if (&t->rcu_node_entry == rnp->gp_tasks) |
491 | raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ | 565 | rnp_root->gp_tasks = rnp->gp_tasks; |
492 | } | 566 | if (&t->rcu_node_entry == rnp->exp_tasks) |
567 | rnp_root->exp_tasks = rnp->exp_tasks; | ||
568 | #ifdef CONFIG_RCU_BOOST | ||
569 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
570 | rnp_root->boost_tasks = rnp->boost_tasks; | ||
571 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
572 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
493 | } | 573 | } |
574 | |||
575 | #ifdef CONFIG_RCU_BOOST | ||
576 | /* In case root is being boosted and leaf is not. */ | ||
577 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
578 | if (rnp_root->boost_tasks != NULL && | ||
579 | rnp_root->boost_tasks != rnp_root->gp_tasks) | ||
580 | rnp_root->boost_tasks = rnp_root->gp_tasks; | ||
581 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
582 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
583 | |||
584 | rnp->gp_tasks = NULL; | ||
585 | rnp->exp_tasks = NULL; | ||
494 | return retval; | 586 | return retval; |
495 | } | 587 | } |
496 | 588 | ||
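A compact model of what rcu_preempt_offline_tasks() now does when a leaf rcu_node loses its last CPU: every blocked task moves to the head of the root node's list, and the root inherits the leaf's ->gp_tasks marker. The sketch below uses a simplified singly linked list and invented names; it is not the kernel code.

/* Sketch: draining one node's blocked-task list into the root node's list. */
#include <stdio.h>
#include <stddef.h>

struct task { int pid; struct task *next; };

struct rnode {
    struct task *blkd;      /* blocked tasks, newest first */
    struct task *gp_tasks;  /* first task blocking this node's current GP */
};

/* Move every task from @leaf to the head of @root, and let the root node
 * inherit the leaf's ->gp_tasks marker if it had one. */
static void offline_tasks(struct rnode *leaf, struct rnode *root)
{
    while (leaf->blkd) {
        struct task *t = leaf->blkd;
        leaf->blkd = t->next;
        t->next = root->blkd;          /* push onto the root's list head */
        root->blkd = t;
        if (leaf->gp_tasks == t)       /* root now waits on this task too */
            root->gp_tasks = t;
    }
    leaf->gp_tasks = NULL;
}

int main(void)
{
    struct task a = { 1, NULL }, b = { 2, &a };
    struct rnode leaf = { &b, &a };    /* task 1 blocks the leaf's current GP */
    struct rnode root = { NULL, NULL };

    offline_tasks(&leaf, &root);
    printf("root gp_tasks pid: %d\n", root.gp_tasks ? root.gp_tasks->pid : -1);
    return 0;
}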
497 | /* | 589 | /* |
498 | * Do CPU-offline processing for preemptable RCU. | 590 | * Do CPU-offline processing for preemptible RCU. |
499 | */ | 591 | */ |
500 | static void rcu_preempt_offline_cpu(int cpu) | 592 | static void rcu_preempt_offline_cpu(int cpu) |
501 | { | 593 | { |
@@ -519,12 +611,13 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
519 | rcu_preempt_qs(cpu); | 611 | rcu_preempt_qs(cpu); |
520 | return; | 612 | return; |
521 | } | 613 | } |
522 | if (per_cpu(rcu_preempt_data, cpu).qs_pending) | 614 | if (t->rcu_read_lock_nesting > 0 && |
615 | per_cpu(rcu_preempt_data, cpu).qs_pending) | ||
523 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | 616 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; |
524 | } | 617 | } |
525 | 618 | ||
526 | /* | 619 | /* |
527 | * Process callbacks for preemptable RCU. | 620 | * Process callbacks for preemptible RCU. |
528 | */ | 621 | */ |
529 | static void rcu_preempt_process_callbacks(void) | 622 | static void rcu_preempt_process_callbacks(void) |
530 | { | 623 | { |
@@ -532,8 +625,17 @@ static void rcu_preempt_process_callbacks(void) | |||
532 | &__get_cpu_var(rcu_preempt_data)); | 625 | &__get_cpu_var(rcu_preempt_data)); |
533 | } | 626 | } |
534 | 627 | ||
628 | #ifdef CONFIG_RCU_BOOST | ||
629 | |||
630 | static void rcu_preempt_do_callbacks(void) | ||
631 | { | ||
632 | rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); | ||
633 | } | ||
634 | |||
635 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
636 | |||
535 | /* | 637 | /* |
536 | * Queue a preemptable-RCU callback for invocation after a grace period. | 638 | * Queue a preemptible-RCU callback for invocation after a grace period. |
537 | */ | 639 | */ |
538 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 640 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
539 | { | 641 | { |
@@ -546,9 +648,11 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
546 | * | 648 | * |
547 | * Control will return to the caller some time after a full grace | 649 | * Control will return to the caller some time after a full grace |
548 | * period has elapsed, in other words after all currently executing RCU | 650 | * period has elapsed, in other words after all currently executing RCU |
549 | * read-side critical sections have completed. RCU read-side critical | 651 | * read-side critical sections have completed. Note, however, that |
550 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | 652 | * upon return from synchronize_rcu(), the caller might well be executing |
551 | * and may be nested. | 653 | * concurrently with new RCU read-side critical sections that began while |
654 | * synchronize_rcu() was waiting. RCU read-side critical sections are | ||
655 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. | ||
552 | */ | 656 | */ |
553 | void synchronize_rcu(void) | 657 | void synchronize_rcu(void) |
554 | { | 658 | { |
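The updated comment spells out the guarantee that the classic updater pattern relies on: publish a new version, call synchronize_rcu() to wait out pre-existing readers, then reclaim the old version, accepting that new readers may already be running by the time it returns. A hedged user-space rendering using liburcu rather than the in-kernel API (assuming the default urcu.h flavor; link with -lurcu or -lurcu-memb depending on the library version):

/* Classic publish/wait/reclaim updater pattern, shown with userspace RCU
 * (liburcu); a sketch, not the kernel interface.
 * Build with: gcc demo.c -lurcu -lpthread */
#include <stdio.h>
#include <stdlib.h>
#include <urcu.h>

struct config { int value; };

static struct config *global_cfg;

static void update_config(int new_value)
{
    struct config *newc = malloc(sizeof(*newc));
    struct config *oldc;

    newc->value = new_value;
    oldc = global_cfg;
    rcu_assign_pointer(global_cfg, newc);  /* publish the new version */
    synchronize_rcu();                     /* wait for pre-existing readers... */
    free(oldc);                            /* ...then reclaim the old version */
}

int main(void)
{
    rcu_register_thread();                 /* this thread may also read */
    update_config(1);
    update_config(2);

    rcu_read_lock();
    printf("value=%d\n", rcu_dereference(global_cfg)->value);
    rcu_read_unlock();

    rcu_unregister_thread();
    free(global_cfg);
    return 0;
}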
@@ -579,8 +683,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | |||
579 | */ | 683 | */ |
580 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) | 684 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) |
581 | { | 685 | { |
582 | return !list_empty(&rnp->blocked_tasks[2]) || | 686 | return rnp->exp_tasks != NULL; |
583 | !list_empty(&rnp->blocked_tasks[3]); | ||
584 | } | 687 | } |
585 | 688 | ||
586 | /* | 689 | /* |
@@ -615,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
615 | 718 | ||
616 | raw_spin_lock_irqsave(&rnp->lock, flags); | 719 | raw_spin_lock_irqsave(&rnp->lock, flags); |
617 | for (;;) { | 720 | for (;;) { |
618 | if (!sync_rcu_preempt_exp_done(rnp)) | 721 | if (!sync_rcu_preempt_exp_done(rnp)) { |
722 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
619 | break; | 723 | break; |
724 | } | ||
620 | if (rnp->parent == NULL) { | 725 | if (rnp->parent == NULL) { |
726 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
621 | wake_up(&sync_rcu_preempt_exp_wq); | 727 | wake_up(&sync_rcu_preempt_exp_wq); |
622 | break; | 728 | break; |
623 | } | 729 | } |
@@ -627,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
627 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 733 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
628 | rnp->expmask &= ~mask; | 734 | rnp->expmask &= ~mask; |
629 | } | 735 | } |
630 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
631 | } | 736 | } |
632 | 737 | ||
633 | /* | 738 | /* |
@@ -640,13 +745,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
640 | static void | 745 | static void |
641 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | 746 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) |
642 | { | 747 | { |
643 | int must_wait; | 748 | unsigned long flags; |
749 | int must_wait = 0; | ||
644 | 750 | ||
645 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 751 | raw_spin_lock_irqsave(&rnp->lock, flags); |
646 | list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); | 752 | if (list_empty(&rnp->blkd_tasks)) |
647 | list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); | 753 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
648 | must_wait = rcu_preempted_readers_exp(rnp); | 754 | else { |
649 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 755 | rnp->exp_tasks = rnp->blkd_tasks.next; |
756 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | ||
757 | must_wait = 1; | ||
758 | } | ||
650 | if (!must_wait) | 759 | if (!must_wait) |
651 | rcu_report_exp_rnp(rsp, rnp); | 760 | rcu_report_exp_rnp(rsp, rnp); |
652 | } | 761 | } |
@@ -654,9 +763,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
654 | /* | 763 | /* |
655 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea | 764 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea |
656 | * is to invoke synchronize_sched_expedited() to push all the tasks to | 765 | * is to invoke synchronize_sched_expedited() to push all the tasks to |
657 | * the ->blocked_tasks[] lists, move all entries from the first set of | 766 | * the ->blkd_tasks lists and wait for this list to drain. |
658 | * ->blocked_tasks[] lists to the second set, and finally wait for this | ||
659 | * second set to drain. | ||
660 | */ | 767 | */ |
661 | void synchronize_rcu_expedited(void) | 768 | void synchronize_rcu_expedited(void) |
662 | { | 769 | { |
@@ -688,7 +795,7 @@ void synchronize_rcu_expedited(void) | |||
688 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) | 795 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) |
689 | goto unlock_mb_ret; /* Others did our work for us. */ | 796 | goto unlock_mb_ret; /* Others did our work for us. */ |
690 | 797 | ||
691 | /* force all RCU readers onto blocked_tasks[]. */ | 798 | /* force all RCU readers onto ->blkd_tasks lists. */ |
692 | synchronize_sched_expedited(); | 799 | synchronize_sched_expedited(); |
693 | 800 | ||
694 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 801 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
@@ -700,7 +807,7 @@ void synchronize_rcu_expedited(void) | |||
700 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 807 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
701 | } | 808 | } |
702 | 809 | ||
703 | /* Snapshot current state of ->blocked_tasks[] lists. */ | 810 | /* Snapshot current state of ->blkd_tasks lists. */ |
704 | rcu_for_each_leaf_node(rsp, rnp) | 811 | rcu_for_each_leaf_node(rsp, rnp) |
705 | sync_rcu_preempt_exp_init(rsp, rnp); | 812 | sync_rcu_preempt_exp_init(rsp, rnp); |
706 | if (NUM_RCU_NODES > 1) | 813 | if (NUM_RCU_NODES > 1) |
@@ -708,7 +815,7 @@ void synchronize_rcu_expedited(void) | |||
708 | 815 | ||
709 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 816 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
710 | 817 | ||
711 | /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ | 818 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ |
712 | rnp = rcu_get_root(rsp); | 819 | rnp = rcu_get_root(rsp); |
713 | wait_event(sync_rcu_preempt_exp_wq, | 820 | wait_event(sync_rcu_preempt_exp_wq, |
714 | sync_rcu_preempt_exp_done(rnp)); | 821 | sync_rcu_preempt_exp_done(rnp)); |
@@ -724,7 +831,7 @@ mb_ret: | |||
724 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 831 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
725 | 832 | ||
726 | /* | 833 | /* |
727 | * Check to see if there is any immediate preemptable-RCU-related work | 834 | * Check to see if there is any immediate preemptible-RCU-related work |
728 | * to be done. | 835 | * to be done. |
729 | */ | 836 | */ |
730 | static int rcu_preempt_pending(int cpu) | 837 | static int rcu_preempt_pending(int cpu) |
@@ -734,7 +841,7 @@ static int rcu_preempt_pending(int cpu) | |||
734 | } | 841 | } |
735 | 842 | ||
736 | /* | 843 | /* |
737 | * Does preemptable RCU need the CPU to stay out of dynticks mode? | 844 | * Does preemptible RCU need the CPU to stay out of dynticks mode? |
738 | */ | 845 | */ |
739 | static int rcu_preempt_needs_cpu(int cpu) | 846 | static int rcu_preempt_needs_cpu(int cpu) |
740 | { | 847 | { |
@@ -751,7 +858,7 @@ void rcu_barrier(void) | |||
751 | EXPORT_SYMBOL_GPL(rcu_barrier); | 858 | EXPORT_SYMBOL_GPL(rcu_barrier); |
752 | 859 | ||
753 | /* | 860 | /* |
754 | * Initialize preemptable RCU's per-CPU data. | 861 | * Initialize preemptible RCU's per-CPU data. |
755 | */ | 862 | */ |
756 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | 863 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) |
757 | { | 864 | { |
@@ -759,23 +866,23 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
759 | } | 866 | } |
760 | 867 | ||
761 | /* | 868 | /* |
762 | * Move preemptable RCU's callbacks to ->orphan_cbs_list. | 869 | * Move preemptible RCU's callbacks from dying CPU to other online CPU. |
763 | */ | 870 | */ |
764 | static void rcu_preempt_send_cbs_to_orphanage(void) | 871 | static void rcu_preempt_send_cbs_to_online(void) |
765 | { | 872 | { |
766 | rcu_send_cbs_to_orphanage(&rcu_preempt_state); | 873 | rcu_send_cbs_to_online(&rcu_preempt_state); |
767 | } | 874 | } |
768 | 875 | ||
769 | /* | 876 | /* |
770 | * Initialize preemptable RCU's state structures. | 877 | * Initialize preemptible RCU's state structures. |
771 | */ | 878 | */ |
772 | static void __init __rcu_init_preempt(void) | 879 | static void __init __rcu_init_preempt(void) |
773 | { | 880 | { |
774 | RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); | 881 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
775 | } | 882 | } |
776 | 883 | ||
777 | /* | 884 | /* |
778 | * Check for a task exiting while in a preemptable-RCU read-side | 885 | * Check for a task exiting while in a preemptible-RCU read-side |
779 | * critical section, clean up if so. No need to issue warnings, | 886 | * critical section, clean up if so. No need to issue warnings, |
780 | * as debug_check_no_locks_held() already does this if lockdep | 887 | * as debug_check_no_locks_held() already does this if lockdep |
781 | * is enabled. | 888 | * is enabled. |
@@ -787,11 +894,13 @@ void exit_rcu(void) | |||
787 | if (t->rcu_read_lock_nesting == 0) | 894 | if (t->rcu_read_lock_nesting == 0) |
788 | return; | 895 | return; |
789 | t->rcu_read_lock_nesting = 1; | 896 | t->rcu_read_lock_nesting = 1; |
790 | rcu_read_unlock(); | 897 | __rcu_read_unlock(); |
791 | } | 898 | } |
792 | 899 | ||
793 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 900 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
794 | 901 | ||
902 | static struct rcu_state *rcu_state = &rcu_sched_state; | ||
903 | |||
795 | /* | 904 | /* |
796 | * Tell them what RCU they are running. | 905 | * Tell them what RCU they are running. |
797 | */ | 906 | */ |
@@ -821,7 +930,7 @@ void rcu_force_quiescent_state(void) | |||
821 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 930 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
822 | 931 | ||
823 | /* | 932 | /* |
824 | * Because preemptable RCU does not exist, we never have to check for | 933 | * Because preemptible RCU does not exist, we never have to check for |
825 | * CPUs being in quiescent states. | 934 | * CPUs being in quiescent states. |
826 | */ | 935 | */ |
827 | static void rcu_preempt_note_context_switch(int cpu) | 936 | static void rcu_preempt_note_context_switch(int cpu) |
@@ -829,10 +938,10 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
829 | } | 938 | } |
830 | 939 | ||
831 | /* | 940 | /* |
832 | * Because preemptable RCU does not exist, there are never any preempted | 941 | * Because preemptible RCU does not exist, there are never any preempted |
833 | * RCU readers. | 942 | * RCU readers. |
834 | */ | 943 | */ |
835 | static int rcu_preempted_readers(struct rcu_node *rnp) | 944 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) |
836 | { | 945 | { |
837 | return 0; | 946 | return 0; |
838 | } | 947 | } |
@@ -847,10 +956,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
847 | 956 | ||
848 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 957 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
849 | 958 | ||
850 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
851 | |||
852 | /* | 959 | /* |
853 | * Because preemptable RCU does not exist, we never have to check for | 960 | * Because preemptible RCU does not exist, we never have to check for |
854 | * tasks blocked within RCU read-side critical sections. | 961 | * tasks blocked within RCU read-side critical sections. |
855 | */ | 962 | */ |
856 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | 963 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) |
@@ -858,17 +965,23 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
858 | } | 965 | } |
859 | 966 | ||
860 | /* | 967 | /* |
861 | * Because preemptable RCU does not exist, we never have to check for | 968 | * Because preemptible RCU does not exist, we never have to check for |
862 | * tasks blocked within RCU read-side critical sections. | 969 | * tasks blocked within RCU read-side critical sections. |
863 | */ | 970 | */ |
864 | static void rcu_print_task_stall(struct rcu_node *rnp) | 971 | static void rcu_print_task_stall(struct rcu_node *rnp) |
865 | { | 972 | { |
866 | } | 973 | } |
867 | 974 | ||
868 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 975 | /* |
976 | * Because preemptible RCU does not exist, there is no need to suppress | ||
977 | * its CPU stall warnings. | ||
978 | */ | ||
979 | static void rcu_preempt_stall_reset(void) | ||
980 | { | ||
981 | } | ||
869 | 982 | ||
870 | /* | 983 | /* |
871 | * Because there is no preemptable RCU, there can be no readers blocked, | 984 | * Because there is no preemptible RCU, there can be no readers blocked, |
872 | * so there is no need to check for blocked tasks. So check only for | 985 | * so there is no need to check for blocked tasks. So check only for |
873 | * bogus qsmask values. | 986 | * bogus qsmask values. |
874 | */ | 987 | */ |
@@ -880,7 +993,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |||
880 | #ifdef CONFIG_HOTPLUG_CPU | 993 | #ifdef CONFIG_HOTPLUG_CPU |
881 | 994 | ||
882 | /* | 995 | /* |
883 | * Because preemptable RCU does not exist, it never needs to migrate | 996 | * Because preemptible RCU does not exist, it never needs to migrate |
884 | * tasks that were blocked within RCU read-side critical sections, and | 997 | * tasks that were blocked within RCU read-side critical sections, and |
885 | * such non-existent tasks cannot possibly have been blocking the current | 998 | * such non-existent tasks cannot possibly have been blocking the current |
886 | * grace period. | 999 | * grace period. |
@@ -893,7 +1006,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
893 | } | 1006 | } |
894 | 1007 | ||
895 | /* | 1008 | /* |
896 | * Because preemptable RCU does not exist, it never needs CPU-offline | 1009 | * Because preemptible RCU does not exist, it never needs CPU-offline |
897 | * processing. | 1010 | * processing. |
898 | */ | 1011 | */ |
899 | static void rcu_preempt_offline_cpu(int cpu) | 1012 | static void rcu_preempt_offline_cpu(int cpu) |
@@ -903,7 +1016,7 @@ static void rcu_preempt_offline_cpu(int cpu) | |||
903 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1016 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
904 | 1017 | ||
905 | /* | 1018 | /* |
906 | * Because preemptable RCU does not exist, it never has any callbacks | 1019 | * Because preemptible RCU does not exist, it never has any callbacks |
907 | * to check. | 1020 | * to check. |
908 | */ | 1021 | */ |
909 | static void rcu_preempt_check_callbacks(int cpu) | 1022 | static void rcu_preempt_check_callbacks(int cpu) |
@@ -911,7 +1024,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
911 | } | 1024 | } |
912 | 1025 | ||
913 | /* | 1026 | /* |
914 | * Because preemptable RCU does not exist, it never has any callbacks | 1027 | * Because preemptible RCU does not exist, it never has any callbacks |
915 | * to process. | 1028 | * to process. |
916 | */ | 1029 | */ |
917 | static void rcu_preempt_process_callbacks(void) | 1030 | static void rcu_preempt_process_callbacks(void) |
@@ -919,17 +1032,8 @@ static void rcu_preempt_process_callbacks(void) | |||
919 | } | 1032 | } |
920 | 1033 | ||
921 | /* | 1034 | /* |
922 | * In classic RCU, call_rcu() is just call_rcu_sched(). | ||
923 | */ | ||
924 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
925 | { | ||
926 | call_rcu_sched(head, func); | ||
927 | } | ||
928 | EXPORT_SYMBOL_GPL(call_rcu); | ||
929 | |||
930 | /* | ||
931 | * Wait for an rcu-preempt grace period, but make it happen quickly. | 1035 | * Wait for an rcu-preempt grace period, but make it happen quickly. |
932 | * But because preemptable RCU does not exist, map to rcu-sched. | 1036 | * But because preemptible RCU does not exist, map to rcu-sched. |
933 | */ | 1037 | */ |
934 | void synchronize_rcu_expedited(void) | 1038 | void synchronize_rcu_expedited(void) |
935 | { | 1039 | { |
@@ -940,7 +1044,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
940 | #ifdef CONFIG_HOTPLUG_CPU | 1044 | #ifdef CONFIG_HOTPLUG_CPU |
941 | 1045 | ||
942 | /* | 1046 | /* |
943 | * Because preemptable RCU does not exist, there is never any need to | 1047 | * Because preemptible RCU does not exist, there is never any need to |
944 | * report on tasks preempted in RCU read-side critical sections during | 1048 | * report on tasks preempted in RCU read-side critical sections during |
945 | * expedited RCU grace periods. | 1049 | * expedited RCU grace periods. |
946 | */ | 1050 | */ |
@@ -952,7 +1056,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
952 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1056 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
953 | 1057 | ||
954 | /* | 1058 | /* |
955 | * Because preemptable RCU does not exist, it never has any work to do. | 1059 | * Because preemptible RCU does not exist, it never has any work to do. |
956 | */ | 1060 | */ |
957 | static int rcu_preempt_pending(int cpu) | 1061 | static int rcu_preempt_pending(int cpu) |
958 | { | 1062 | { |
@@ -960,7 +1064,7 @@ static int rcu_preempt_pending(int cpu) | |||
960 | } | 1064 | } |
961 | 1065 | ||
962 | /* | 1066 | /* |
963 | * Because preemptable RCU does not exist, it never needs any CPU. | 1067 | * Because preemptible RCU does not exist, it never needs any CPU. |
964 | */ | 1068 | */ |
965 | static int rcu_preempt_needs_cpu(int cpu) | 1069 | static int rcu_preempt_needs_cpu(int cpu) |
966 | { | 1070 | { |
@@ -968,7 +1072,7 @@ static int rcu_preempt_needs_cpu(int cpu) | |||
968 | } | 1072 | } |
969 | 1073 | ||
970 | /* | 1074 | /* |
971 | * Because preemptable RCU does not exist, rcu_barrier() is just | 1075 | * Because preemptible RCU does not exist, rcu_barrier() is just |
972 | * another name for rcu_barrier_sched(). | 1076 | * another name for rcu_barrier_sched(). |
973 | */ | 1077 | */ |
974 | void rcu_barrier(void) | 1078 | void rcu_barrier(void) |
@@ -978,7 +1082,7 @@ void rcu_barrier(void) | |||
978 | EXPORT_SYMBOL_GPL(rcu_barrier); | 1082 | EXPORT_SYMBOL_GPL(rcu_barrier); |
979 | 1083 | ||
980 | /* | 1084 | /* |
981 | * Because preemptable RCU does not exist, there is no per-CPU | 1085 | * Because preemptible RCU does not exist, there is no per-CPU |
982 | * data to initialize. | 1086 | * data to initialize. |
983 | */ | 1087 | */ |
984 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | 1088 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) |
@@ -986,14 +1090,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
986 | } | 1090 | } |
987 | 1091 | ||
988 | /* | 1092 | /* |
989 | * Because there is no preemptable RCU, there are no callbacks to move. | 1093 | * Because there is no preemptible RCU, there are no callbacks to move. |
990 | */ | 1094 | */ |
991 | static void rcu_preempt_send_cbs_to_orphanage(void) | 1095 | static void rcu_preempt_send_cbs_to_online(void) |
992 | { | 1096 | { |
993 | } | 1097 | } |
994 | 1098 | ||
995 | /* | 1099 | /* |
996 | * Because preemptable RCU does not exist, it need not be initialized. | 1100 | * Because preemptible RCU does not exist, it need not be initialized. |
997 | */ | 1101 | */ |
998 | static void __init __rcu_init_preempt(void) | 1102 | static void __init __rcu_init_preempt(void) |
999 | { | 1103 | { |
@@ -1001,6 +1105,791 @@ static void __init __rcu_init_preempt(void) | |||
1001 | 1105 | ||
1002 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1106 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
1003 | 1107 | ||
1108 | #ifdef CONFIG_RCU_BOOST | ||
1109 | |||
1110 | #include "rtmutex_common.h" | ||
1111 | |||
1112 | #ifdef CONFIG_RCU_TRACE | ||
1113 | |||
1114 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
1115 | { | ||
1116 | if (list_empty(&rnp->blkd_tasks)) | ||
1117 | rnp->n_balk_blkd_tasks++; | ||
1118 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) | ||
1119 | rnp->n_balk_exp_gp_tasks++; | ||
1120 | else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) | ||
1121 | rnp->n_balk_boost_tasks++; | ||
1122 | else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) | ||
1123 | rnp->n_balk_notblocked++; | ||
1124 | else if (rnp->gp_tasks != NULL && | ||
1125 | ULONG_CMP_LT(jiffies, rnp->boost_time)) | ||
1126 | rnp->n_balk_notyet++; | ||
1127 | else | ||
1128 | rnp->n_balk_nos++; | ||
1129 | } | ||
1130 | |||
1131 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
1132 | |||
1133 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
1134 | { | ||
1135 | } | ||
1136 | |||
1137 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
1138 | |||
1139 | /* | ||
1140 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | ||
1141 | * or ->boost_tasks, advancing the pointer to the next task in the | ||
1142 | * ->blkd_tasks list. | ||
1143 | * | ||
1144 | * Note that irqs must be enabled: boosting the task can block. | ||
1145 | * Returns 1 if there are more tasks needing to be boosted. | ||
1146 | */ | ||
1147 | static int rcu_boost(struct rcu_node *rnp) | ||
1148 | { | ||
1149 | unsigned long flags; | ||
1150 | struct rt_mutex mtx; | ||
1151 | struct task_struct *t; | ||
1152 | struct list_head *tb; | ||
1153 | |||
1154 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) | ||
1155 | return 0; /* Nothing left to boost. */ | ||
1156 | |||
1157 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1158 | |||
1159 | /* | ||
1160 | * Recheck under the lock: all tasks in need of boosting | ||
1161 | * might exit their RCU read-side critical sections on their own. | ||
1162 | */ | ||
1163 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { | ||
1164 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1165 | return 0; | ||
1166 | } | ||
1167 | |||
1168 | /* | ||
1169 | * Preferentially boost tasks blocking expedited grace periods. | ||
1170 | * This cannot starve the normal grace periods because a second | ||
1171 | * expedited grace period must boost all blocked tasks, including | ||
1172 | * those blocking the pre-existing normal grace period. | ||
1173 | */ | ||
1174 | if (rnp->exp_tasks != NULL) { | ||
1175 | tb = rnp->exp_tasks; | ||
1176 | rnp->n_exp_boosts++; | ||
1177 | } else { | ||
1178 | tb = rnp->boost_tasks; | ||
1179 | rnp->n_normal_boosts++; | ||
1180 | } | ||
1181 | rnp->n_tasks_boosted++; | ||
1182 | |||
1183 | /* | ||
1184 | * We boost task t by manufacturing an rt_mutex that appears to | ||
1185 | * be held by task t. We leave a pointer to that rt_mutex where | ||
1186 | * task t can find it, and task t will release the mutex when it | ||
1187 | * exits its outermost RCU read-side critical section. Then | ||
1188 | * simply acquiring this artificial rt_mutex will boost task | ||
1189 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
1190 | * | ||
1191 | * Note that task t must acquire rnp->lock to remove itself from | ||
1192 | * the ->blkd_tasks list, which it will do from exit() if from | ||
1193 | * nowhere else. We therefore are guaranteed that task t will | ||
1194 | * stay around at least until we drop rnp->lock. Note that | ||
1195 | * rnp->lock also resolves races between our priority boosting | ||
1196 | * and task t's exiting its outermost RCU read-side critical | ||
1197 | * section. | ||
1198 | */ | ||
1199 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
1200 | rt_mutex_init_proxy_locked(&mtx, t); | ||
1201 | t->rcu_boost_mutex = &mtx; | ||
1202 | t->rcu_boosted = 1; | ||
1203 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1204 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | ||
1205 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | ||
1206 | |||
1207 | return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; | ||
1208 | } | ||
1209 | |||
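The policy rcu_boost() implements, reduced to a runnable sketch: if ->exp_tasks is non-NULL, boost on behalf of the expedited grace period first, otherwise work from ->boost_tasks, bumping the corresponding counter either way. The rnode_model type and the printout are invented stand-ins for the real rt_mutex handoff.

/* Sketch of the selection policy in rcu_boost() above; illustrative only. */
#include <stdio.h>
#include <stddef.h>

struct rnode_model {
    const char *exp_tasks;     /* first task blocking an expedited GP, or NULL */
    const char *boost_tasks;   /* first task due for normal-GP boosting, or NULL */
    int n_exp_boosts;
    int n_normal_boosts;
};

static int boost_one(struct rnode_model *rnp)
{
    const char *tb;

    if (!rnp->exp_tasks && !rnp->boost_tasks)
        return 0;                          /* nothing left to boost */
    if (rnp->exp_tasks) {                  /* expedited readers take priority */
        tb = rnp->exp_tasks;
        rnp->n_exp_boosts++;
        rnp->exp_tasks = NULL;             /* pretend the queue drained */
    } else {
        tb = rnp->boost_tasks;
        rnp->n_normal_boosts++;
        rnp->boost_tasks = NULL;
    }
    printf("boosting %s\n", tb);
    return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
}

int main(void)
{
    struct rnode_model rnp = { "exp-reader", "normal-reader", 0, 0 };

    while (boost_one(&rnp))
        ;
    boost_one(&rnp);   /* queues drained: returns 0 immediately */
    printf("exp boosts=%d normal boosts=%d\n", rnp.n_exp_boosts, rnp.n_normal_boosts);
    return 0;
}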
1210 | /* | ||
1211 | * Timer handler to initiate waking up of boost kthreads that | ||
1212 | * have yielded the CPU due to excessive numbers of tasks to | ||
1213 | * boost. We wake up the per-rcu_node kthread, which in turn | ||
1214 | * will wake up the booster kthread. | ||
1215 | */ | ||
1216 | static void rcu_boost_kthread_timer(unsigned long arg) | ||
1217 | { | ||
1218 | invoke_rcu_node_kthread((struct rcu_node *)arg); | ||
1219 | } | ||
1220 | |||
1221 | /* | ||
1222 | * Priority-boosting kthread. One per leaf rcu_node and one for the | ||
1223 | * root rcu_node. | ||
1224 | */ | ||
1225 | static int rcu_boost_kthread(void *arg) | ||
1226 | { | ||
1227 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
1228 | int spincnt = 0; | ||
1229 | int more2boost; | ||
1230 | |||
1231 | for (;;) { | ||
1232 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | ||
1233 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | ||
1234 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | ||
1235 | more2boost = rcu_boost(rnp); | ||
1236 | if (more2boost) | ||
1237 | spincnt++; | ||
1238 | else | ||
1239 | spincnt = 0; | ||
1240 | if (spincnt > 10) { | ||
1241 | rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); | ||
1242 | spincnt = 0; | ||
1243 | } | ||
1244 | } | ||
1245 | /* NOTREACHED */ | ||
1246 | return 0; | ||
1247 | } | ||
1248 | |||
1249 | /* | ||
1250 | * Check to see if it is time to start boosting RCU readers that are | ||
1251 | * blocking the current grace period, and, if so, tell the per-rcu_node | ||
1252 | * kthread to start boosting them. If there is an expedited grace | ||
1253 | * period in progress, it is always time to boost. | ||
1254 | * | ||
1255 | * The caller must hold rnp->lock, which this function releases, | ||
1256 | * but irqs remain disabled. The ->boost_kthread_task is immortal, | ||
1257 | * so we don't need to worry about it going away. | ||
1258 | */ | ||
1259 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
1260 | { | ||
1261 | struct task_struct *t; | ||
1262 | |||
1263 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { | ||
1264 | rnp->n_balk_exp_gp_tasks++; | ||
1265 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1266 | return; | ||
1267 | } | ||
1268 | if (rnp->exp_tasks != NULL || | ||
1269 | (rnp->gp_tasks != NULL && | ||
1270 | rnp->boost_tasks == NULL && | ||
1271 | rnp->qsmask == 0 && | ||
1272 | ULONG_CMP_GE(jiffies, rnp->boost_time))) { | ||
1273 | if (rnp->exp_tasks == NULL) | ||
1274 | rnp->boost_tasks = rnp->gp_tasks; | ||
1275 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1276 | t = rnp->boost_kthread_task; | ||
1277 | if (t != NULL) | ||
1278 | wake_up_process(t); | ||
1279 | } else { | ||
1280 | rcu_initiate_boost_trace(rnp); | ||
1281 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1282 | } | ||
1283 | } | ||
1284 | |||
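rcu_initiate_boost() above encodes a small decision: boost immediately for an expedited grace period, and for a normal one only when blocked readers are all that remains (qsmask == 0) and ->boost_time has passed. A sketch of that predicate with invented names; note that the kernel compares jiffies with ULONG_CMP_GE() to survive wraparound, which the sketch glosses over.

/* The decision rcu_initiate_boost() makes, pulled out as a pure function. */
#include <stdio.h>

struct boost_state {
    int have_exp_tasks;     /* rnp->exp_tasks != NULL   */
    int have_gp_tasks;      /* rnp->gp_tasks != NULL    */
    int boosting_already;   /* rnp->boost_tasks != NULL */
    unsigned long qsmask;   /* CPUs still owing a quiescent state */
    unsigned long now;      /* jiffies */
    unsigned long boost_time;
};

static int should_start_boosting(const struct boost_state *s)
{
    if (s->have_exp_tasks)
        return 1;                            /* expedited GP: always time to boost */
    return s->have_gp_tasks &&               /* someone is blocking the normal GP    */
           !s->boosting_already &&           /* and boosting has not started yet     */
           s->qsmask == 0 &&                 /* and only blocked tasks hold it up    */
           s->now >= s->boost_time;          /* and the grace period is overdue      */
}

int main(void)
{
    struct boost_state s = { 0, 1, 0, 0, 1000, 900 };

    printf("start boosting? %d\n", should_start_boosting(&s));  /* 1 */
    s.qsmask = 0x4;                          /* a CPU still owes a quiescent state */
    printf("start boosting? %d\n", should_start_boosting(&s));  /* 0 */
    return 0;
}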
1285 | /* | ||
1286 | * Wake up the per-CPU kthread to invoke RCU callbacks. | ||
1287 | */ | ||
1288 | static void invoke_rcu_callbacks_kthread(void) | ||
1289 | { | ||
1290 | unsigned long flags; | ||
1291 | |||
1292 | local_irq_save(flags); | ||
1293 | __this_cpu_write(rcu_cpu_has_work, 1); | ||
1294 | if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { | ||
1295 | local_irq_restore(flags); | ||
1296 | return; | ||
1297 | } | ||
1298 | wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); | ||
1299 | local_irq_restore(flags); | ||
1300 | } | ||
1301 | |||
1302 | /* | ||
1303 | * Set the affinity of the boost kthread. The CPU-hotplug locks are | ||
1304 | * held, so no one should be messing with the existence of the boost | ||
1305 | * kthread. | ||
1306 | */ | ||
1307 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
1308 | cpumask_var_t cm) | ||
1309 | { | ||
1310 | struct task_struct *t; | ||
1311 | |||
1312 | t = rnp->boost_kthread_task; | ||
1313 | if (t != NULL) | ||
1314 | set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); | ||
1315 | } | ||
1316 | |||
1317 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | ||
1318 | |||
1319 | /* | ||
1320 | * Do priority-boost accounting for the start of a new grace period. | ||
1321 | */ | ||
1322 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
1323 | { | ||
1324 | rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
1325 | } | ||
1326 | |||
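The RCU_BOOST_DELAY_JIFFIES macro converts a millisecond Kconfig value to jiffies, rounding up so that a non-zero delay never truncates to zero ticks. A worked example with assumed HZ and delay values:

/* RCU_BOOST_DELAY_JIFFIES conversion, worked through numerically. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned int hz = 250;          /* example CONFIG_HZ              */
    unsigned int delay_ms = 500;    /* example CONFIG_RCU_BOOST_DELAY */

    /* 500 ms at HZ=250 -> 125 jiffies; 1 ms at HZ=100 -> 1 jiffy, not 0. */
    printf("%u ms -> %u jiffies\n", delay_ms, DIV_ROUND_UP(delay_ms * hz, 1000));
    printf("%u ms -> %u jiffies\n", 1u, DIV_ROUND_UP(1u * 100u, 1000u));
    return 0;
}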
1327 | /* | ||
1328 | * Create an RCU-boost kthread for the specified node if one does not | ||
1329 | * already exist. We only create this kthread for preemptible RCU. | ||
1330 | * Returns zero if all is well, a negated errno otherwise. | ||
1331 | */ | ||
1332 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
1333 | struct rcu_node *rnp, | ||
1334 | int rnp_index) | ||
1335 | { | ||
1336 | unsigned long flags; | ||
1337 | struct sched_param sp; | ||
1338 | struct task_struct *t; | ||
1339 | |||
1340 | if (&rcu_preempt_state != rsp) | ||
1341 | return 0; | ||
1342 | rsp->boost = 1; | ||
1343 | if (rnp->boost_kthread_task != NULL) | ||
1344 | return 0; | ||
1345 | t = kthread_create(rcu_boost_kthread, (void *)rnp, | ||
1346 | "rcub%d", rnp_index); | ||
1347 | if (IS_ERR(t)) | ||
1348 | return PTR_ERR(t); | ||
1349 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1350 | rnp->boost_kthread_task = t; | ||
1351 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1352 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1353 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1354 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | ||
1355 | return 0; | ||
1356 | } | ||
1357 | |||
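rcu_spawn_one_boost_kthread() creates the kthread, records it under rnp->lock, raises it to SCHED_FIFO, and wakes it. The closest user-space analog, shown only as a sketch (build with -pthread; elevating to SCHED_FIFO normally needs CAP_SYS_NICE):

/* Userspace analog of the kthread spawning above; kthread_create() and
 * sched_setscheduler_nocheck() have no exact userspace equivalent. */
#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

static void *booster(void *arg)
{
    (void)arg;
    /* A real booster would loop waiting for work; see rcu_boost_kthread(). */
    return NULL;
}

int main(void)
{
    pthread_t t;
    struct sched_param sp = { .sched_priority = 1 };   /* stands in for RCU_KTHREAD_PRIO */
    int ret;

    ret = pthread_create(&t, NULL, booster, NULL);
    if (ret) {
        fprintf(stderr, "pthread_create: %s\n", strerror(ret));
        return 1;
    }
    ret = pthread_setschedparam(t, SCHED_FIFO, &sp);
    if (ret)                                            /* typically EPERM without CAP_SYS_NICE */
        fprintf(stderr, "not elevated to SCHED_FIFO: %s\n", strerror(ret));

    pthread_join(t, NULL);
    return 0;
}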
1358 | #ifdef CONFIG_HOTPLUG_CPU | ||
1359 | |||
1360 | /* | ||
1361 | * Stop RCU's per-CPU kthread when its CPU goes offline. | ||
1362 | */ | ||
1363 | static void rcu_stop_cpu_kthread(int cpu) | ||
1364 | { | ||
1365 | struct task_struct *t; | ||
1366 | |||
1367 | /* Stop the CPU's kthread. */ | ||
1368 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1369 | if (t != NULL) { | ||
1370 | per_cpu(rcu_cpu_kthread_task, cpu) = NULL; | ||
1371 | kthread_stop(t); | ||
1372 | } | ||
1373 | } | ||
1374 | |||
1375 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1376 | |||
1377 | static void rcu_kthread_do_work(void) | ||
1378 | { | ||
1379 | rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); | ||
1380 | rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | ||
1381 | rcu_preempt_do_callbacks(); | ||
1382 | } | ||
1383 | |||
1384 | /* | ||
1385 | * Wake up the specified per-rcu_node-structure kthread. | ||
1386 | * Because the per-rcu_node kthreads are immortal, we don't need | ||
1387 | * to do anything to keep them alive. | ||
1388 | */ | ||
1389 | static void invoke_rcu_node_kthread(struct rcu_node *rnp) | ||
1390 | { | ||
1391 | struct task_struct *t; | ||
1392 | |||
1393 | t = rnp->node_kthread_task; | ||
1394 | if (t != NULL) | ||
1395 | wake_up_process(t); | ||
1396 | } | ||
1397 | |||
1398 | /* | ||
1399 | * Set the specified CPU's kthread to run RT or not, as specified by | ||
1400 | * the to_rt argument. The CPU-hotplug locks are held, so the task | ||
1401 | * is not going away. | ||
1402 | */ | ||
1403 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
1404 | { | ||
1405 | int policy; | ||
1406 | struct sched_param sp; | ||
1407 | struct task_struct *t; | ||
1408 | |||
1409 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1410 | if (t == NULL) | ||
1411 | return; | ||
1412 | if (to_rt) { | ||
1413 | policy = SCHED_FIFO; | ||
1414 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1415 | } else { | ||
1416 | policy = SCHED_NORMAL; | ||
1417 | sp.sched_priority = 0; | ||
1418 | } | ||
1419 | sched_setscheduler_nocheck(t, policy, &sp); | ||
1420 | } | ||
1421 | |||
1422 | /* | ||
1423 | * Timer handler to initiate the waking up of per-CPU kthreads that | ||
1424 | * have yielded the CPU due to excess numbers of RCU callbacks. | ||
1425 | * We wake up the per-rcu_node kthread, which in turn will wake up | ||
1426 | * the booster kthread. | ||
1427 | */ | ||
1428 | static void rcu_cpu_kthread_timer(unsigned long arg) | ||
1429 | { | ||
1430 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); | ||
1431 | struct rcu_node *rnp = rdp->mynode; | ||
1432 | |||
1433 | atomic_or(rdp->grpmask, &rnp->wakemask); | ||
1434 | invoke_rcu_node_kthread(rnp); | ||
1435 | } | ||
1436 | |||
1437 | /* | ||
1438 | * Drop to non-real-time priority and yield, but only after posting a | ||
1439 | * timer that will cause us to regain our real-time priority if we | ||
1440 | * remain preempted. Either way, we restore our real-time priority | ||
1441 | * before returning. | ||
1442 | */ | ||
1443 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg) | ||
1444 | { | ||
1445 | struct sched_param sp; | ||
1446 | struct timer_list yield_timer; | ||
1447 | |||
1448 | setup_timer_on_stack(&yield_timer, f, arg); | ||
1449 | mod_timer(&yield_timer, jiffies + 2); | ||
1450 | sp.sched_priority = 0; | ||
1451 | sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); | ||
1452 | set_user_nice(current, 19); | ||
1453 | schedule(); | ||
1454 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1455 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | ||
1456 | del_timer(&yield_timer); | ||
1457 | } | ||
1458 | |||
1459 | /* | ||
1460 | * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. | ||
1461 | * This can happen while the corresponding CPU is either coming online | ||
1462 | * or going offline. We cannot wait until the CPU is fully online | ||
1463 | * before starting the kthread, because the various notifier functions | ||
1464 | * can wait for RCU grace periods. So we park rcu_cpu_kthread() until | ||
1465 | * the corresponding CPU is online. | ||
1466 | * | ||
1467 | * Return 1 if the kthread needs to stop, 0 otherwise. | ||
1468 | * | ||
1469 | * Caller must disable bh. This function can momentarily enable it. | ||
1470 | */ | ||
1471 | static int rcu_cpu_kthread_should_stop(int cpu) | ||
1472 | { | ||
1473 | while (cpu_is_offline(cpu) || | ||
1474 | !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || | ||
1475 | smp_processor_id() != cpu) { | ||
1476 | if (kthread_should_stop()) | ||
1477 | return 1; | ||
1478 | per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | ||
1479 | per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); | ||
1480 | local_bh_enable(); | ||
1481 | schedule_timeout_uninterruptible(1); | ||
1482 | if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) | ||
1483 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
1484 | local_bh_disable(); | ||
1485 | } | ||
1486 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
1487 | return 0; | ||
1488 | } | ||
1489 | |||
1490 | /* | ||
1491 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces the | ||
1492 | * earlier RCU softirq. | ||
1493 | */ | ||
1494 | static int rcu_cpu_kthread(void *arg) | ||
1495 | { | ||
1496 | int cpu = (int)(long)arg; | ||
1497 | unsigned long flags; | ||
1498 | int spincnt = 0; | ||
1499 | unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); | ||
1500 | char work; | ||
1501 | char *workp = &per_cpu(rcu_cpu_has_work, cpu); | ||
1502 | |||
1503 | for (;;) { | ||
1504 | *statusp = RCU_KTHREAD_WAITING; | ||
1505 | rcu_wait(*workp != 0 || kthread_should_stop()); | ||
1506 | local_bh_disable(); | ||
1507 | if (rcu_cpu_kthread_should_stop(cpu)) { | ||
1508 | local_bh_enable(); | ||
1509 | break; | ||
1510 | } | ||
1511 | *statusp = RCU_KTHREAD_RUNNING; | ||
1512 | per_cpu(rcu_cpu_kthread_loops, cpu)++; | ||
1513 | local_irq_save(flags); | ||
1514 | work = *workp; | ||
1515 | *workp = 0; | ||
1516 | local_irq_restore(flags); | ||
1517 | if (work) | ||
1518 | rcu_kthread_do_work(); | ||
1519 | local_bh_enable(); | ||
1520 | if (*workp != 0) | ||
1521 | spincnt++; | ||
1522 | else | ||
1523 | spincnt = 0; | ||
1524 | if (spincnt > 10) { | ||
1525 | *statusp = RCU_KTHREAD_YIELDING; | ||
1526 | rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); | ||
1527 | spincnt = 0; | ||
1528 | } | ||
1529 | } | ||
1530 | *statusp = RCU_KTHREAD_STOPPED; | ||
1531 | return 0; | ||
1532 | } | ||
1533 | |||
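rcu_cpu_kthread() above combines two small patterns: claim all pending work in a way that cannot race with new arrivals, and back off with a yield after too many consecutive busy passes. A userspace sketch with C11 atomics; do_work() is only a stand-in for rcu_kthread_do_work(), and the atomic exchange replaces the kernel's read-and-clear under local_irq_save().

#include <stdatomic.h>
#include <sched.h>

static _Atomic int has_work;

static void do_work(void)
{
        /* stand-in for the real callback processing */
}

static void worker_loop(void)
{
        int spincnt = 0;

        for (;;) {
                /* Claim the pending-work flag atomically. */
                int work = atomic_exchange(&has_work, 0);

                if (work)
                        do_work();
                if (atomic_load(&has_work))
                        spincnt++;      /* more work arrived while we ran */
                else
                        spincnt = 0;
                if (spincnt > 10) {
                        sched_yield();  /* stop monopolizing the CPU */
                        spincnt = 0;
                }
        }
}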
1534 | /* | ||
1535 | * Spawn a per-CPU kthread, setting up affinity and priority. | ||
1536 | * Because the CPU hotplug lock is held, no other CPU will be attempting | ||
1537 | * to manipulate rcu_cpu_kthread_task. There might be another CPU | ||
1538 | * attempting to access it during boot, but the locking in kthread_bind() | ||
1539 | * will enforce sufficient ordering. | ||
1540 | * | ||
1541 | * Please note that we cannot simply refuse to wake up the per-CPU | ||
1542 | * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, | ||
1543 | * which can result in softlockup complaints if the task ends up being | ||
1544 | * idle for more than a couple of minutes. | ||
1545 | * | ||
1546 | * However, please note also that we cannot bind the per-CPU kthread to its | ||
1547 | * CPU until that CPU is fully online. We also cannot wait until the | ||
1548 | * CPU is fully online before we create its per-CPU kthread, as this would | ||
1549 | * deadlock the system when CPU notifiers tried waiting for grace | ||
1550 | * periods. So we bind the per-CPU kthread to its CPU only if the CPU | ||
1551 | * is online. If its CPU is not yet fully online, then the code in | ||
1552 | * rcu_cpu_kthread() will wait until it is fully online, and then do | ||
1553 | * the binding. | ||
1554 | */ | ||
1555 | static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) | ||
1556 | { | ||
1557 | struct sched_param sp; | ||
1558 | struct task_struct *t; | ||
1559 | |||
1560 | if (!rcu_scheduler_fully_active || | ||
1561 | per_cpu(rcu_cpu_kthread_task, cpu) != NULL) | ||
1562 | return 0; | ||
1563 | t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); | ||
1564 | if (IS_ERR(t)) | ||
1565 | return PTR_ERR(t); | ||
1566 | if (cpu_online(cpu)) | ||
1567 | kthread_bind(t, cpu); | ||
1568 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
1569 | WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); | ||
1570 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1571 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1572 | per_cpu(rcu_cpu_kthread_task, cpu) = t; | ||
1573 | wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ | ||
1574 | return 0; | ||
1575 | } | ||
1576 | |||
1577 | /* | ||
1578 | * Per-rcu_node kthread, which is in charge of waking up the per-CPU | ||
1579 | * kthreads when needed. We ignore requests to wake up kthreads | ||
1580 | * for offline CPUs, which is OK because force_quiescent_state() | ||
1581 | * takes care of this case. | ||
1582 | */ | ||
1583 | static int rcu_node_kthread(void *arg) | ||
1584 | { | ||
1585 | int cpu; | ||
1586 | unsigned long flags; | ||
1587 | unsigned long mask; | ||
1588 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
1589 | struct sched_param sp; | ||
1590 | struct task_struct *t; | ||
1591 | |||
1592 | for (;;) { | ||
1593 | rnp->node_kthread_status = RCU_KTHREAD_WAITING; | ||
1594 | rcu_wait(atomic_read(&rnp->wakemask) != 0); | ||
1595 | rnp->node_kthread_status = RCU_KTHREAD_RUNNING; | ||
1596 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1597 | mask = atomic_xchg(&rnp->wakemask, 0); | ||
1598 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
1599 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { | ||
1600 | if ((mask & 0x1) == 0) | ||
1601 | continue; | ||
1602 | preempt_disable(); | ||
1603 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1604 | if (!cpu_online(cpu) || t == NULL) { | ||
1605 | preempt_enable(); | ||
1606 | continue; | ||
1607 | } | ||
1608 | per_cpu(rcu_cpu_has_work, cpu) = 1; | ||
1609 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1610 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1611 | preempt_enable(); | ||
1612 | } | ||
1613 | } | ||
1614 | /* NOTREACHED */ | ||
1615 | rnp->node_kthread_status = RCU_KTHREAD_STOPPED; | ||
1616 | return 0; | ||
1617 | } | ||
1618 | |||
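The wakemask walk above uses the common idiom of advancing the CPU number and shifting the mask together, testing bit 0 on every step (the kernel also offers for_each_set_bit() for the general case). Extracted as a standalone helper, with visit_cpu() invented for the example:

/* Visit every CPU whose bit is set in 'mask', numbering from 'lo'. */
static void for_each_set_cpu(unsigned long mask, int lo, int hi,
                             void (*visit_cpu)(int cpu))
{
        int cpu;

        for (cpu = lo; cpu <= hi; cpu++, mask >>= 1) {
                if ((mask & 0x1) == 0)
                        continue;       /* this CPU did not request a wakeup */
                visit_cpu(cpu);
        }
}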
1619 | /* | ||
1620 | * Set the per-rcu_node kthread's affinity to cover all CPUs that are | ||
1621 | * served by the rcu_node in question. The CPU hotplug lock is still | ||
1622 | * held, so the value of rnp->qsmaskinit will be stable. | ||
1623 | * | ||
1624 | * We don't include outgoingcpu in the affinity set, use -1 if there is | ||
1625 | * no outgoing CPU. If there are no CPUs left in the affinity set, | ||
1626 | * this function allows the kthread to execute on any CPU. | ||
1627 | */ | ||
1628 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
1629 | { | ||
1630 | cpumask_var_t cm; | ||
1631 | int cpu; | ||
1632 | unsigned long mask = rnp->qsmaskinit; | ||
1633 | |||
1634 | if (rnp->node_kthread_task == NULL) | ||
1635 | return; | ||
1636 | if (!alloc_cpumask_var(&cm, GFP_KERNEL)) | ||
1637 | return; | ||
1638 | cpumask_clear(cm); | ||
1639 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | ||
1640 | if ((mask & 0x1) && cpu != outgoingcpu) | ||
1641 | cpumask_set_cpu(cpu, cm); | ||
1642 | if (cpumask_weight(cm) == 0) { | ||
1643 | cpumask_setall(cm); | ||
1644 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) | ||
1645 | cpumask_clear_cpu(cpu, cm); | ||
1646 | WARN_ON_ONCE(cpumask_weight(cm) == 0); | ||
1647 | } | ||
1648 | set_cpus_allowed_ptr(rnp->node_kthread_task, cm); | ||
1649 | rcu_boost_kthread_setaffinity(rnp, cm); | ||
1650 | free_cpumask_var(cm); | ||
1651 | } | ||
1652 | |||
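Building an affinity mask that covers a group of CPUs while leaving out one outgoing CPU maps naturally onto the userspace cpu_set_t API. A rough sketch, assuming glibc and pthreads; as in the comment above, pass -1 for 'outgoing' when no CPU is leaving.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/* Bind 'thread' to the CPUs in [lo, hi] whose bit is set in 'mask',
 * skipping 'outgoing'.  If that leaves nothing, fall back to "anywhere
 * but this group", mirroring the cpumask_setall()/clear loop above. */
static int bind_to_group(pthread_t thread, unsigned long mask,
                         int lo, int hi, int outgoing)
{
        cpu_set_t cm;
        int cpu;

        CPU_ZERO(&cm);
        for (cpu = lo; cpu <= hi; cpu++, mask >>= 1)
                if ((mask & 0x1) && cpu != outgoing)
                        CPU_SET(cpu, &cm);
        if (CPU_COUNT(&cm) == 0) {
                for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                        CPU_SET(cpu, &cm);
                for (cpu = lo; cpu <= hi; cpu++)
                        CPU_CLR(cpu, &cm);
        }
        return pthread_setaffinity_np(thread, sizeof(cm), &cm);
}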
1653 | /* | ||
1654 | * Spawn a per-rcu_node kthread, setting priority and affinity. | ||
1655 | * Called during boot before online/offline can happen, or, if | ||
1656 | * during runtime, with the main CPU-hotplug locks held. So only | ||
1657 | * one of these can be executing at a time. | ||
1658 | */ | ||
1659 | static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, | ||
1660 | struct rcu_node *rnp) | ||
1661 | { | ||
1662 | unsigned long flags; | ||
1663 | int rnp_index = rnp - &rsp->node[0]; | ||
1664 | struct sched_param sp; | ||
1665 | struct task_struct *t; | ||
1666 | |||
1667 | if (!rcu_scheduler_fully_active || | ||
1668 | rnp->qsmaskinit == 0) | ||
1669 | return 0; | ||
1670 | if (rnp->node_kthread_task == NULL) { | ||
1671 | t = kthread_create(rcu_node_kthread, (void *)rnp, | ||
1672 | "rcun%d", rnp_index); | ||
1673 | if (IS_ERR(t)) | ||
1674 | return PTR_ERR(t); | ||
1675 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1676 | rnp->node_kthread_task = t; | ||
1677 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1678 | sp.sched_priority = 99; | ||
1679 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1680 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | ||
1681 | } | ||
1682 | return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); | ||
1683 | } | ||
1684 | |||
1685 | /* | ||
1686 | * Spawn all kthreads -- called as soon as the scheduler is running. | ||
1687 | */ | ||
1688 | static int __init rcu_spawn_kthreads(void) | ||
1689 | { | ||
1690 | int cpu; | ||
1691 | struct rcu_node *rnp; | ||
1692 | |||
1693 | rcu_scheduler_fully_active = 1; | ||
1694 | for_each_possible_cpu(cpu) { | ||
1695 | per_cpu(rcu_cpu_has_work, cpu) = 0; | ||
1696 | if (cpu_online(cpu)) | ||
1697 | (void)rcu_spawn_one_cpu_kthread(cpu); | ||
1698 | } | ||
1699 | rnp = rcu_get_root(rcu_state); | ||
1700 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
1701 | if (NUM_RCU_NODES > 1) { | ||
1702 | rcu_for_each_leaf_node(rcu_state, rnp) | ||
1703 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
1704 | } | ||
1705 | return 0; | ||
1706 | } | ||
1707 | early_initcall(rcu_spawn_kthreads); | ||
1708 | |||
1709 | static void __cpuinit rcu_prepare_kthreads(int cpu) | ||
1710 | { | ||
1711 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
1712 | struct rcu_node *rnp = rdp->mynode; | ||
1713 | |||
1714 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | ||
1715 | if (rcu_scheduler_fully_active) { | ||
1716 | (void)rcu_spawn_one_cpu_kthread(cpu); | ||
1717 | if (rnp->node_kthread_task == NULL) | ||
1718 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
1719 | } | ||
1720 | } | ||
1721 | |||
1722 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
1723 | |||
1724 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
1725 | { | ||
1726 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1727 | } | ||
1728 | |||
1729 | static void invoke_rcu_callbacks_kthread(void) | ||
1730 | { | ||
1731 | WARN_ON_ONCE(1); | ||
1732 | } | ||
1733 | |||
1734 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
1735 | { | ||
1736 | } | ||
1737 | |||
1738 | #ifdef CONFIG_HOTPLUG_CPU | ||
1739 | |||
1740 | static void rcu_stop_cpu_kthread(int cpu) | ||
1741 | { | ||
1742 | } | ||
1743 | |||
1744 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1745 | |||
1746 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
1747 | { | ||
1748 | } | ||
1749 | |||
1750 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
1751 | { | ||
1752 | } | ||
1753 | |||
1754 | static int __init rcu_scheduler_really_started(void) | ||
1755 | { | ||
1756 | rcu_scheduler_fully_active = 1; | ||
1757 | return 0; | ||
1758 | } | ||
1759 | early_initcall(rcu_scheduler_really_started); | ||
1760 | |||
1761 | static void __cpuinit rcu_prepare_kthreads(int cpu) | ||
1762 | { | ||
1763 | } | ||
1764 | |||
1765 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
1766 | |||
1767 | #ifndef CONFIG_SMP | ||
1768 | |||
1769 | void synchronize_sched_expedited(void) | ||
1770 | { | ||
1771 | cond_resched(); | ||
1772 | } | ||
1773 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
1774 | |||
1775 | #else /* #ifndef CONFIG_SMP */ | ||
1776 | |||
1777 | static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | ||
1778 | static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | ||
1779 | |||
1780 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
1781 | { | ||
1782 | /* | ||
1783 | * There must be a full memory barrier on each affected CPU | ||
1784 | * between the time that try_stop_cpus() is called and the | ||
1785 | * time that it returns. | ||
1786 | * | ||
1787 | * In the current initial implementation of cpu_stop, the | ||
1788 | * above condition is already met when the control reaches | ||
1789 | * this point and the following smp_mb() is not strictly | ||
1790 | * necessary. Do smp_mb() anyway for documentation and | ||
1791 | * robustness against future implementation changes. | ||
1792 | */ | ||
1793 | smp_mb(); /* See above comment block. */ | ||
1794 | return 0; | ||
1795 | } | ||
1796 | |||
1797 | /* | ||
1798 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
1799 | * approach to force grace period to end quickly. This consumes | ||
1800 | * significant time on all CPUs, and is thus not recommended for | ||
1801 | * any sort of common-case code. | ||
1802 | * | ||
1803 | * Note that it is illegal to call this function while holding any | ||
1804 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
1805 | * observe this restriction will result in deadlock. | ||
1806 | * | ||
1807 | * This implementation can be thought of as an application of ticket | ||
1808 | * locking to RCU, with sync_sched_expedited_started and | ||
1809 | * sync_sched_expedited_done taking on the roles of the halves | ||
1810 | * of the ticket-lock word. Each task atomically increments | ||
1811 | * sync_sched_expedited_started upon entry, snapshotting the old value, | ||
1812 | * then attempts to stop all the CPUs. If this succeeds, then each | ||
1813 | * CPU will have executed a context switch, resulting in an RCU-sched | ||
1814 | * grace period. We are then done, so we use atomic_cmpxchg() to | ||
1815 | * update sync_sched_expedited_done to match our snapshot -- but | ||
1816 | * only if someone else has not already advanced past our snapshot. | ||
1817 | * | ||
1818 | * On the other hand, if try_stop_cpus() fails, we check the value | ||
1819 | * of sync_sched_expedited_done. If it has advanced past our | ||
1820 | * initial snapshot, then someone else must have forced a grace period | ||
1821 | * some time after we took our snapshot. In this case, our work is | ||
1822 | * done for us, and we can simply return. Otherwise, we try again, | ||
1823 | * but keep our initial snapshot for purposes of checking for someone | ||
1824 | * doing our work for us. | ||
1825 | * | ||
1826 | * If we fail too many times in a row, we fall back to synchronize_sched(). | ||
1827 | */ | ||
1828 | void synchronize_sched_expedited(void) | ||
1829 | { | ||
1830 | int firstsnap, s, snap, trycount = 0; | ||
1831 | |||
1832 | /* Note that atomic_inc_return() implies full memory barrier. */ | ||
1833 | firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | ||
1834 | get_online_cpus(); | ||
1835 | |||
1836 | /* | ||
1837 | * Each pass through the following loop attempts to force a | ||
1838 | * context switch on each CPU. | ||
1839 | */ | ||
1840 | while (try_stop_cpus(cpu_online_mask, | ||
1841 | synchronize_sched_expedited_cpu_stop, | ||
1842 | NULL) == -EAGAIN) { | ||
1843 | put_online_cpus(); | ||
1844 | |||
1845 | /* No joy, try again later. Or just synchronize_sched(). */ | ||
1846 | if (trycount++ < 10) | ||
1847 | udelay(trycount * num_online_cpus()); | ||
1848 | else { | ||
1849 | synchronize_sched(); | ||
1850 | return; | ||
1851 | } | ||
1852 | |||
1853 | /* Check to see if someone else did our work for us. */ | ||
1854 | s = atomic_read(&sync_sched_expedited_done); | ||
1855 | if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | ||
1856 | smp_mb(); /* ensure test happens before caller kfree */ | ||
1857 | return; | ||
1858 | } | ||
1859 | |||
1860 | /* | ||
1861 | * Refetching sync_sched_expedited_started allows later | ||
1862 | * callers to piggyback on our grace period. We subtract | ||
1863 | * 1 to get the same token that the last incrementer got. | ||
1864 | * We retry after they started, so our grace period works | ||
1865 | * for them, and they started after our first try, so their | ||
1866 | * grace period works for us. | ||
1867 | */ | ||
1868 | get_online_cpus(); | ||
1869 | snap = atomic_read(&sync_sched_expedited_started) - 1; | ||
1870 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | ||
1871 | } | ||
1872 | |||
1873 | /* | ||
1874 | * Everyone up to our most recent fetch is covered by our grace | ||
1875 | * period. Update the counter, but only if our work is still | ||
1876 | * relevant -- which it won't be if someone who started later | ||
1877 | * than we did beat us to the punch. | ||
1878 | */ | ||
1879 | do { | ||
1880 | s = atomic_read(&sync_sched_expedited_done); | ||
1881 | if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | ||
1882 | smp_mb(); /* ensure test happens before caller kfree */ | ||
1883 | break; | ||
1884 | } | ||
1885 | } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | ||
1886 | |||
1887 | put_online_cpus(); | ||
1888 | } | ||
1889 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
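The ticket-style bookkeeping described in the long comment above is easier to see in a compressed model. The sketch below uses C11 atomics; force_all_cpus() is a stand-in for try_stop_cpus(), counter_ge() plays the role of UINT_CMP_GE(), and the explicit memory-barrier placement of the real code is omitted.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint started;     /* tickets handed out */
static atomic_uint done;        /* highest ticket known to be covered */

/* Wrap-safe "a >= b" for free-running counters. */
static bool counter_ge(unsigned a, unsigned b)
{
        return (int)(a - b) >= 0;
}

/* Stand-in for try_stop_cpus(): pretend every CPU context-switched. */
static bool force_all_cpus(void)
{
        return true;
}

static void expedite(void)
{
        unsigned firstsnap, snap, s;

        /* Take a ticket; whichever pass serves this ticket covers us. */
        firstsnap = snap = atomic_fetch_add(&started, 1) + 1;

        while (!force_all_cpus()) {
                /* Did somebody else's pass already cover our ticket? */
                s = atomic_load(&done);
                if (counter_ge(s, firstsnap))
                        return;
                /* Retry, letting later arrivals piggyback on our next pass. */
                snap = atomic_load(&started);
        }

        /* Our pass worked: advance 'done' to our snapshot unless a later
         * pass has already advanced it further. */
        do {
                s = atomic_load(&done);
                if (counter_ge(s, snap))
                        break;
        } while (!atomic_compare_exchange_weak(&done, &s, snap));
}

The essential property is the same as in the kernel code: 'done' only ever moves forward, so a caller whose snapshot is already covered can return without doing any work of its own.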
1890 | |||
1891 | #endif /* #else #ifndef CONFIG_SMP */ | ||
1892 | |||
1004 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | 1893 | #if !defined(CONFIG_RCU_FAST_NO_HZ) |
1005 | 1894 | ||
1006 | /* | 1895 | /* |
@@ -1047,14 +1936,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | |||
1047 | * | 1936 | * |
1048 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 1937 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
1049 | * disabled, we do one pass of force_quiescent_state(), then do a | 1938 | * disabled, we do one pass of force_quiescent_state(), then do a |
1050 | * raise_softirq() to cause rcu_process_callbacks() to be invoked later. | 1939 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
1051 | * The per-cpu rcu_dyntick_drain variable controls the sequencing. | 1940 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. |
1052 | */ | 1941 | */ |
1053 | int rcu_needs_cpu(int cpu) | 1942 | int rcu_needs_cpu(int cpu) |
1054 | { | 1943 | { |
1055 | int c = 0; | 1944 | int c = 0; |
1056 | int snap; | 1945 | int snap; |
1057 | int snap_nmi; | ||
1058 | int thatcpu; | 1946 | int thatcpu; |
1059 | 1947 | ||
1060 | /* Check for being in the holdoff period. */ | 1948 | /* Check for being in the holdoff period. */ |
@@ -1065,10 +1953,10 @@ int rcu_needs_cpu(int cpu) | |||
1065 | for_each_online_cpu(thatcpu) { | 1953 | for_each_online_cpu(thatcpu) { |
1066 | if (thatcpu == cpu) | 1954 | if (thatcpu == cpu) |
1067 | continue; | 1955 | continue; |
1068 | snap = per_cpu(rcu_dynticks, thatcpu).dynticks; | 1956 | snap = atomic_add_return(0, &per_cpu(rcu_dynticks, |
1069 | snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; | 1957 | thatcpu).dynticks); |
1070 | smp_mb(); /* Order sampling of snap with end of grace period. */ | 1958 | smp_mb(); /* Order sampling of snap with end of grace period. */ |
1071 | if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { | 1959 | if ((snap & 0x1) != 0) { |
1072 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 1960 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
1073 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 1961 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
1074 | return rcu_needs_cpu_quick_check(cpu); | 1962 | return rcu_needs_cpu_quick_check(cpu); |
@@ -1099,7 +1987,7 @@ int rcu_needs_cpu(int cpu) | |||
1099 | 1987 | ||
1100 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ | 1988 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ |
1101 | if (c) | 1989 | if (c) |
1102 | raise_softirq(RCU_SOFTIRQ); | 1990 | invoke_rcu_core(); |
1103 | return c; | 1991 | return c; |
1104 | } | 1992 | } |
1105 | 1993 | ||
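The "snap & 0x1" test above reads an even/odd activity counter: the counter is bumped on every transition into and out of the dyntick-idle state, so its parity tells a remote observer which side of the transition the CPU is on, and two differing snapshots prove that at least one transition happened in between. A generic sketch of the idea, with invented names and without the memory ordering the real code gets from atomic_add_return() and smp_mb():

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint activity;    /* even = quiet, odd = busy, by convention here */

static void enter_busy(void) { atomic_fetch_add(&activity, 1); }
static void exit_busy(void)  { atomic_fetch_add(&activity, 1); }

/* Take a snapshot for a later comparison. */
static unsigned snapshot(void)
{
        return atomic_load(&activity);
}

/* True if the observed side was quiet at the snapshot, or has passed
 * through at least one quiet period since the snapshot was taken. */
static bool passed_quiet_period(unsigned snap)
{
        unsigned cur = atomic_load(&activity);

        return (snap & 0x1) == 0 || cur != snap;
}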
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 36c95b45738e..4e144876dc68 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -46,6 +46,22 @@ | |||
46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
48 | 48 | ||
49 | #ifdef CONFIG_RCU_BOOST | ||
50 | |||
51 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
52 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); | ||
53 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
54 | DECLARE_PER_CPU(char, rcu_cpu_has_work); | ||
55 | |||
56 | static char convert_kthread_status(unsigned int kthread_status) | ||
57 | { | ||
58 | if (kthread_status > RCU_KTHREAD_MAX) | ||
59 | return '?'; | ||
60 | return "SRWOY"[kthread_status]; | ||
61 | } | ||
62 | |||
63 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
64 | |||
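convert_kthread_status() above, and the ".N"[...] expressions added to the trace output further down, both index a string literal directly to turn a small integer or a boolean into a single display character. A standalone illustration; the enum and its ordering are invented for the example.

#include <stdio.h>

enum state { STOPPED, RUNNING, WAITING, OFFCPU, YIELDING, STATE_MAX };

static char state_char(unsigned int s)
{
        if (s >= STATE_MAX)
                return '?';
        return "SRWOY"[s];      /* a string literal indexed like an array */
}

int main(void)
{
        int has_work = 1;

        /* A two-character literal indexed by a boolean: '.' or 'N'. */
        printf("state=%c flag=%c\n", state_char(RUNNING), ".N"[has_work != 0]);
        return 0;
}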
49 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | 65 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) |
50 | { | 66 | { |
51 | if (!rdp->beenonline) | 67 | if (!rdp->beenonline) |
@@ -57,14 +73,33 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
57 | rdp->passed_quiesc, rdp->passed_quiesc_completed, | 73 | rdp->passed_quiesc, rdp->passed_quiesc_completed, |
58 | rdp->qs_pending); | 74 | rdp->qs_pending); |
59 | #ifdef CONFIG_NO_HZ | 75 | #ifdef CONFIG_NO_HZ |
60 | seq_printf(m, " dt=%d/%d dn=%d df=%lu", | 76 | seq_printf(m, " dt=%d/%d/%d df=%lu", |
61 | rdp->dynticks->dynticks, | 77 | atomic_read(&rdp->dynticks->dynticks), |
62 | rdp->dynticks->dynticks_nesting, | 78 | rdp->dynticks->dynticks_nesting, |
63 | rdp->dynticks->dynticks_nmi, | 79 | rdp->dynticks->dynticks_nmi_nesting, |
64 | rdp->dynticks_fqs); | 80 | rdp->dynticks_fqs); |
65 | #endif /* #ifdef CONFIG_NO_HZ */ | 81 | #endif /* #ifdef CONFIG_NO_HZ */ |
66 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 82 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); |
67 | seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); | 83 | seq_printf(m, " ql=%ld qs=%c%c%c%c", |
84 | rdp->qlen, | ||
85 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
86 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
87 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
88 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
89 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
90 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
91 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
92 | #ifdef CONFIG_RCU_BOOST | ||
93 | seq_printf(m, " kt=%d/%c/%d ktl=%x", | ||
94 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
95 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
96 | rdp->cpu)), | ||
97 | per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), | ||
98 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); | ||
99 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
100 | seq_printf(m, " b=%ld", rdp->blimit); | ||
101 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", | ||
102 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
68 | } | 103 | } |
69 | 104 | ||
70 | #define PRINT_RCU_DATA(name, func, m) \ | 105 | #define PRINT_RCU_DATA(name, func, m) \ |
@@ -113,22 +148,42 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
113 | rdp->qs_pending); | 148 | rdp->qs_pending); |
114 | #ifdef CONFIG_NO_HZ | 149 | #ifdef CONFIG_NO_HZ |
115 | seq_printf(m, ",%d,%d,%d,%lu", | 150 | seq_printf(m, ",%d,%d,%d,%lu", |
116 | rdp->dynticks->dynticks, | 151 | atomic_read(&rdp->dynticks->dynticks), |
117 | rdp->dynticks->dynticks_nesting, | 152 | rdp->dynticks->dynticks_nesting, |
118 | rdp->dynticks->dynticks_nmi, | 153 | rdp->dynticks->dynticks_nmi_nesting, |
119 | rdp->dynticks_fqs); | 154 | rdp->dynticks_fqs); |
120 | #endif /* #ifdef CONFIG_NO_HZ */ | 155 | #endif /* #ifdef CONFIG_NO_HZ */ |
121 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 156 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); |
122 | seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); | 157 | seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, |
158 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
159 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
160 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
161 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
162 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
163 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
164 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
165 | #ifdef CONFIG_RCU_BOOST | ||
166 | seq_printf(m, ",%d,\"%c\"", | ||
167 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
168 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
169 | rdp->cpu))); | ||
170 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
171 | seq_printf(m, ",%ld", rdp->blimit); | ||
172 | seq_printf(m, ",%lu,%lu,%lu\n", | ||
173 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
123 | } | 174 | } |
124 | 175 | ||
125 | static int show_rcudata_csv(struct seq_file *m, void *unused) | 176 | static int show_rcudata_csv(struct seq_file *m, void *unused) |
126 | { | 177 | { |
127 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); | 178 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); |
128 | #ifdef CONFIG_NO_HZ | 179 | #ifdef CONFIG_NO_HZ |
129 | seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); | 180 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
130 | #endif /* #ifdef CONFIG_NO_HZ */ | 181 | #endif /* #ifdef CONFIG_NO_HZ */ |
131 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); | 182 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); |
183 | #ifdef CONFIG_RCU_BOOST | ||
184 | seq_puts(m, "\"kt\",\"ktl\""); | ||
185 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
186 | seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); | ||
132 | #ifdef CONFIG_TREE_PREEMPT_RCU | 187 | #ifdef CONFIG_TREE_PREEMPT_RCU |
133 | seq_puts(m, "\"rcu_preempt:\"\n"); | 188 | seq_puts(m, "\"rcu_preempt:\"\n"); |
134 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); | 189 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); |
@@ -153,34 +208,97 @@ static const struct file_operations rcudata_csv_fops = { | |||
153 | .release = single_release, | 208 | .release = single_release, |
154 | }; | 209 | }; |
155 | 210 | ||
211 | #ifdef CONFIG_RCU_BOOST | ||
212 | |||
213 | static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) | ||
214 | { | ||
215 | seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " | ||
216 | "j=%04x bt=%04x\n", | ||
217 | rnp->grplo, rnp->grphi, | ||
218 | "T."[list_empty(&rnp->blkd_tasks)], | ||
219 | "N."[!rnp->gp_tasks], | ||
220 | "E."[!rnp->exp_tasks], | ||
221 | "B."[!rnp->boost_tasks], | ||
222 | convert_kthread_status(rnp->boost_kthread_status), | ||
223 | rnp->n_tasks_boosted, rnp->n_exp_boosts, | ||
224 | rnp->n_normal_boosts, | ||
225 | (int)(jiffies & 0xffff), | ||
226 | (int)(rnp->boost_time & 0xffff)); | ||
227 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", | ||
228 | " balk", | ||
229 | rnp->n_balk_blkd_tasks, | ||
230 | rnp->n_balk_exp_gp_tasks, | ||
231 | rnp->n_balk_boost_tasks, | ||
232 | rnp->n_balk_notblocked, | ||
233 | rnp->n_balk_notyet, | ||
234 | rnp->n_balk_nos); | ||
235 | } | ||
236 | |||
237 | static int show_rcu_node_boost(struct seq_file *m, void *unused) | ||
238 | { | ||
239 | struct rcu_node *rnp; | ||
240 | |||
241 | rcu_for_each_leaf_node(&rcu_preempt_state, rnp) | ||
242 | print_one_rcu_node_boost(m, rnp); | ||
243 | return 0; | ||
244 | } | ||
245 | |||
246 | static int rcu_node_boost_open(struct inode *inode, struct file *file) | ||
247 | { | ||
248 | return single_open(file, show_rcu_node_boost, NULL); | ||
249 | } | ||
250 | |||
251 | static const struct file_operations rcu_node_boost_fops = { | ||
252 | .owner = THIS_MODULE, | ||
253 | .open = rcu_node_boost_open, | ||
254 | .read = seq_read, | ||
255 | .llseek = seq_lseek, | ||
256 | .release = single_release, | ||
257 | }; | ||
258 | |||
259 | /* | ||
260 | * Create the rcuboost debugfs entry. Standard error return. | ||
261 | */ | ||
262 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
263 | { | ||
264 | return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, | ||
265 | &rcu_node_boost_fops); | ||
266 | } | ||
267 | |||
268 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
269 | |||
270 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
271 | { | ||
272 | return 0; /* There cannot be an error if we didn't create it! */ | ||
273 | } | ||
274 | |||
275 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
276 | |||
156 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | 277 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) |
157 | { | 278 | { |
158 | unsigned long gpnum; | 279 | unsigned long gpnum; |
159 | int level = 0; | 280 | int level = 0; |
160 | int phase; | ||
161 | struct rcu_node *rnp; | 281 | struct rcu_node *rnp; |
162 | 282 | ||
163 | gpnum = rsp->gpnum; | 283 | gpnum = rsp->gpnum; |
164 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 284 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
165 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", | 285 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", |
166 | rsp->completed, gpnum, rsp->signaled, | 286 | rsp->completed, gpnum, rsp->signaled, |
167 | (long)(rsp->jiffies_force_qs - jiffies), | 287 | (long)(rsp->jiffies_force_qs - jiffies), |
168 | (int)(jiffies & 0xffff), | 288 | (int)(jiffies & 0xffff), |
169 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 289 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
170 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 290 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
171 | rsp->n_force_qs_lh, rsp->orphan_qlen); | 291 | rsp->n_force_qs_lh); |
172 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 292 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
173 | if (rnp->level != level) { | 293 | if (rnp->level != level) { |
174 | seq_puts(m, "\n"); | 294 | seq_puts(m, "\n"); |
175 | level = rnp->level; | 295 | level = rnp->level; |
176 | } | 296 | } |
177 | phase = gpnum & 0x1; | 297 | seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", |
178 | seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ", | ||
179 | rnp->qsmask, rnp->qsmaskinit, | 298 | rnp->qsmask, rnp->qsmaskinit, |
180 | "T."[list_empty(&rnp->blocked_tasks[phase])], | 299 | ".G"[rnp->gp_tasks != NULL], |
181 | "E."[list_empty(&rnp->blocked_tasks[phase + 2])], | 300 | ".E"[rnp->exp_tasks != NULL], |
182 | "T."[list_empty(&rnp->blocked_tasks[!phase])], | 301 | ".T"[!list_empty(&rnp->blkd_tasks)], |
183 | "E."[list_empty(&rnp->blocked_tasks[!phase + 2])], | ||
184 | rnp->grplo, rnp->grphi, rnp->grpnum); | 302 | rnp->grplo, rnp->grphi, rnp->grpnum); |
185 | } | 303 | } |
186 | seq_puts(m, "\n"); | 304 | seq_puts(m, "\n"); |
@@ -212,16 +330,35 @@ static const struct file_operations rcuhier_fops = { | |||
212 | .release = single_release, | 330 | .release = single_release, |
213 | }; | 331 | }; |
214 | 332 | ||
333 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | ||
334 | { | ||
335 | unsigned long flags; | ||
336 | unsigned long completed; | ||
337 | unsigned long gpnum; | ||
338 | unsigned long gpage; | ||
339 | unsigned long gpmax; | ||
340 | struct rcu_node *rnp = &rsp->node[0]; | ||
341 | |||
342 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
343 | completed = rsp->completed; | ||
344 | gpnum = rsp->gpnum; | ||
345 | if (rsp->completed == rsp->gpnum) | ||
346 | gpage = 0; | ||
347 | else | ||
348 | gpage = jiffies - rsp->gp_start; | ||
349 | gpmax = rsp->gp_max; | ||
350 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
351 | seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", | ||
352 | rsp->name, completed, gpnum, gpage, gpmax); | ||
353 | } | ||
354 | |||
215 | static int show_rcugp(struct seq_file *m, void *unused) | 355 | static int show_rcugp(struct seq_file *m, void *unused) |
216 | { | 356 | { |
217 | #ifdef CONFIG_TREE_PREEMPT_RCU | 357 | #ifdef CONFIG_TREE_PREEMPT_RCU |
218 | seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", | 358 | show_one_rcugp(m, &rcu_preempt_state); |
219 | rcu_preempt_state.completed, rcu_preempt_state.gpnum); | ||
220 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 359 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
221 | seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", | 360 | show_one_rcugp(m, &rcu_sched_state); |
222 | rcu_sched_state.completed, rcu_sched_state.gpnum); | 361 | show_one_rcugp(m, &rcu_bh_state); |
223 | seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", | ||
224 | rcu_bh_state.completed, rcu_bh_state.gpnum); | ||
225 | return 0; | 362 | return 0; |
226 | } | 363 | } |
227 | 364 | ||
@@ -262,7 +399,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) | |||
262 | struct rcu_data *rdp; | 399 | struct rcu_data *rdp; |
263 | 400 | ||
264 | for_each_possible_cpu(cpu) { | 401 | for_each_possible_cpu(cpu) { |
265 | rdp = rsp->rda[cpu]; | 402 | rdp = per_cpu_ptr(rsp->rda, cpu); |
266 | if (rdp->beenonline) | 403 | if (rdp->beenonline) |
267 | print_one_rcu_pending(m, rdp); | 404 | print_one_rcu_pending(m, rdp); |
268 | } | 405 | } |
@@ -294,9 +431,32 @@ static const struct file_operations rcu_pending_fops = { | |||
294 | .release = single_release, | 431 | .release = single_release, |
295 | }; | 432 | }; |
296 | 433 | ||
434 | static int show_rcutorture(struct seq_file *m, void *unused) | ||
435 | { | ||
436 | seq_printf(m, "rcutorture test sequence: %lu %s\n", | ||
437 | rcutorture_testseq >> 1, | ||
438 | (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); | ||
439 | seq_printf(m, "rcutorture update version number: %lu\n", | ||
440 | rcutorture_vernum); | ||
441 | return 0; | ||
442 | } | ||
443 | |||
444 | static int rcutorture_open(struct inode *inode, struct file *file) | ||
445 | { | ||
446 | return single_open(file, show_rcutorture, NULL); | ||
447 | } | ||
448 | |||
449 | static const struct file_operations rcutorture_fops = { | ||
450 | .owner = THIS_MODULE, | ||
451 | .open = rcutorture_open, | ||
452 | .read = seq_read, | ||
453 | .llseek = seq_lseek, | ||
454 | .release = single_release, | ||
455 | }; | ||
456 | |||
297 | static struct dentry *rcudir; | 457 | static struct dentry *rcudir; |
298 | 458 | ||
299 | static int __init rcuclassic_trace_init(void) | 459 | static int __init rcutree_trace_init(void) |
300 | { | 460 | { |
301 | struct dentry *retval; | 461 | struct dentry *retval; |
302 | 462 | ||
@@ -314,6 +474,9 @@ static int __init rcuclassic_trace_init(void) | |||
314 | if (!retval) | 474 | if (!retval) |
315 | goto free_out; | 475 | goto free_out; |
316 | 476 | ||
477 | if (rcu_boost_trace_create_file(rcudir)) | ||
478 | goto free_out; | ||
479 | |||
317 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | 480 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); |
318 | if (!retval) | 481 | if (!retval) |
319 | goto free_out; | 482 | goto free_out; |
@@ -327,20 +490,25 @@ static int __init rcuclassic_trace_init(void) | |||
327 | NULL, &rcu_pending_fops); | 490 | NULL, &rcu_pending_fops); |
328 | if (!retval) | 491 | if (!retval) |
329 | goto free_out; | 492 | goto free_out; |
493 | |||
494 | retval = debugfs_create_file("rcutorture", 0444, rcudir, | ||
495 | NULL, &rcutorture_fops); | ||
496 | if (!retval) | ||
497 | goto free_out; | ||
330 | return 0; | 498 | return 0; |
331 | free_out: | 499 | free_out: |
332 | debugfs_remove_recursive(rcudir); | 500 | debugfs_remove_recursive(rcudir); |
333 | return 1; | 501 | return 1; |
334 | } | 502 | } |
335 | 503 | ||
336 | static void __exit rcuclassic_trace_cleanup(void) | 504 | static void __exit rcutree_trace_cleanup(void) |
337 | { | 505 | { |
338 | debugfs_remove_recursive(rcudir); | 506 | debugfs_remove_recursive(rcudir); |
339 | } | 507 | } |
340 | 508 | ||
341 | 509 | ||
342 | module_init(rcuclassic_trace_init); | 510 | module_init(rcutree_trace_init); |
343 | module_exit(rcuclassic_trace_cleanup); | 511 | module_exit(rcutree_trace_cleanup); |
344 | 512 | ||
345 | MODULE_AUTHOR("Paul E. McKenney"); | 513 | MODULE_AUTHOR("Paul E. McKenney"); |
346 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); | 514 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); |
diff --git a/kernel/relay.c b/kernel/relay.c index c7cf397fb929..859ea5a9605f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = { | |||
70 | */ | 70 | */ |
71 | static struct page **relay_alloc_page_array(unsigned int n_pages) | 71 | static struct page **relay_alloc_page_array(unsigned int n_pages) |
72 | { | 72 | { |
73 | struct page **array; | 73 | const size_t pa_size = n_pages * sizeof(struct page *); |
74 | size_t pa_size = n_pages * sizeof(struct page *); | 74 | if (pa_size > PAGE_SIZE) |
75 | 75 | return vzalloc(pa_size); | |
76 | if (pa_size > PAGE_SIZE) { | 76 | return kzalloc(pa_size, GFP_KERNEL); |
77 | array = vmalloc(pa_size); | ||
78 | if (array) | ||
79 | memset(array, 0, pa_size); | ||
80 | } else { | ||
81 | array = kzalloc(pa_size, GFP_KERNEL); | ||
82 | } | ||
83 | return array; | ||
84 | } | 77 | } |
85 | 78 | ||
86 | /* | 79 | /* |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index c7eaa37a768b..34683efa2cce 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member, | |||
126 | pos, buf, s - buf); | 126 | pos, buf, s - buf); |
127 | } | 127 | } |
128 | 128 | ||
129 | #if BITS_PER_LONG == 32 | ||
130 | u64 res_counter_read_u64(struct res_counter *counter, int member) | ||
131 | { | ||
132 | unsigned long flags; | ||
133 | u64 ret; | ||
134 | |||
135 | spin_lock_irqsave(&counter->lock, flags); | ||
136 | ret = *res_counter_member(counter, member); | ||
137 | spin_unlock_irqrestore(&counter->lock, flags); | ||
138 | |||
139 | return ret; | ||
140 | } | ||
141 | #else | ||
129 | u64 res_counter_read_u64(struct res_counter *counter, int member) | 142 | u64 res_counter_read_u64(struct res_counter *counter, int member) |
130 | { | 143 | { |
131 | return *res_counter_member(counter, member); | 144 | return *res_counter_member(counter, member); |
132 | } | 145 | } |
146 | #endif | ||
133 | 147 | ||
134 | int res_counter_memparse_write_strategy(const char *buf, | 148 | int res_counter_memparse_write_strategy(const char *buf, |
135 | unsigned long long *res) | 149 | unsigned long long *res) |
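The 32-bit variant above takes the counter spinlock because a 64-bit load is not a single instruction there, so an unlocked reader could observe half of an in-flight update (a torn read). A userspace sketch of the same guard, assuming pthreads; the mutex is expected to be initialized with PTHREAD_MUTEX_INITIALIZER or pthread_mutex_init() before use.

#include <pthread.h>
#include <stdint.h>

struct counter64 {
        pthread_mutex_t lock;
        uint64_t val;
};

/* On a 32-bit target 'val' is written as two 32-bit halves, so both the
 * reader and the writer take the lock to avoid a half-updated value. */
static uint64_t counter_read(struct counter64 *c)
{
        uint64_t v;

        pthread_mutex_lock(&c->lock);
        v = c->val;
        pthread_mutex_unlock(&c->lock);
        return v;
}

static void counter_add(struct counter64 *c, uint64_t delta)
{
        pthread_mutex_lock(&c->lock);
        c->val += delta;
        pthread_mutex_unlock(&c->lock);
}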
diff --git a/kernel/resource.c b/kernel/resource.c index 7b36976e5dea..3ff40178dce7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -38,6 +38,14 @@ struct resource iomem_resource = { | |||
38 | }; | 38 | }; |
39 | EXPORT_SYMBOL(iomem_resource); | 39 | EXPORT_SYMBOL(iomem_resource); |
40 | 40 | ||
41 | /* constraints to be met while allocating resources */ | ||
42 | struct resource_constraint { | ||
43 | resource_size_t min, max, align; | ||
44 | resource_size_t (*alignf)(void *, const struct resource *, | ||
45 | resource_size_t, resource_size_t); | ||
46 | void *alignf_data; | ||
47 | }; | ||
48 | |||
41 | static DEFINE_RWLOCK(resource_lock); | 49 | static DEFINE_RWLOCK(resource_lock); |
42 | 50 | ||
43 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 51 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
@@ -357,57 +365,148 @@ int __weak page_is_ram(unsigned long pfn) | |||
357 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | 365 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; |
358 | } | 366 | } |
359 | 367 | ||
368 | void __weak arch_remove_reservations(struct resource *avail) | ||
369 | { | ||
370 | } | ||
371 | |||
372 | static resource_size_t simple_align_resource(void *data, | ||
373 | const struct resource *avail, | ||
374 | resource_size_t size, | ||
375 | resource_size_t align) | ||
376 | { | ||
377 | return avail->start; | ||
378 | } | ||
379 | |||
380 | static void resource_clip(struct resource *res, resource_size_t min, | ||
381 | resource_size_t max) | ||
382 | { | ||
383 | if (res->start < min) | ||
384 | res->start = min; | ||
385 | if (res->end > max) | ||
386 | res->end = max; | ||
387 | } | ||
388 | |||
389 | static bool resource_contains(struct resource *res1, struct resource *res2) | ||
390 | { | ||
391 | return res1->start <= res2->start && res1->end >= res2->end; | ||
392 | } | ||
393 | |||
360 | /* | 394 | /* |
361 | * Find empty slot in the resource tree given range and alignment. | 395 | * Find empty slot in the resource tree with the given range and |
396 | * alignment constraints | ||
362 | */ | 397 | */ |
363 | static int find_resource(struct resource *root, struct resource *new, | 398 | static int __find_resource(struct resource *root, struct resource *old, |
364 | resource_size_t size, resource_size_t min, | 399 | struct resource *new, |
365 | resource_size_t max, resource_size_t align, | 400 | resource_size_t size, |
366 | resource_size_t (*alignf)(void *, | 401 | struct resource_constraint *constraint) |
367 | const struct resource *, | ||
368 | resource_size_t, | ||
369 | resource_size_t), | ||
370 | void *alignf_data) | ||
371 | { | 402 | { |
372 | struct resource *this = root->child; | 403 | struct resource *this = root->child; |
373 | struct resource tmp = *new; | 404 | struct resource tmp = *new, avail, alloc; |
374 | 405 | ||
406 | tmp.flags = new->flags; | ||
375 | tmp.start = root->start; | 407 | tmp.start = root->start; |
376 | /* | 408 | /* |
377 | * Skip past an allocated resource that starts at 0, since the assignment | 409 | * Skip past an allocated resource that starts at 0, since the assignment |
378 | * of this->start - 1 to tmp->end below would cause an underflow. | 410 | * of this->start - 1 to tmp->end below would cause an underflow. |
379 | */ | 411 | */ |
380 | if (this && this->start == 0) { | 412 | if (this && this->start == root->start) { |
381 | tmp.start = this->end + 1; | 413 | tmp.start = (this == old) ? old->start : this->end + 1; |
382 | this = this->sibling; | 414 | this = this->sibling; |
383 | } | 415 | } |
384 | for(;;) { | 416 | for(;;) { |
385 | if (this) | 417 | if (this) |
386 | tmp.end = this->start - 1; | 418 | tmp.end = (this == old) ? this->end : this->start - 1; |
387 | else | 419 | else |
388 | tmp.end = root->end; | 420 | tmp.end = root->end; |
389 | if (tmp.start < min) | 421 | |
390 | tmp.start = min; | 422 | resource_clip(&tmp, constraint->min, constraint->max); |
391 | if (tmp.end > max) | 423 | arch_remove_reservations(&tmp); |
392 | tmp.end = max; | 424 | |
393 | tmp.start = ALIGN(tmp.start, align); | 425 | /* Check for overflow after ALIGN() */ |
394 | if (alignf) | 426 | avail = *new; |
395 | tmp.start = alignf(alignf_data, &tmp, size, align); | 427 | avail.start = ALIGN(tmp.start, constraint->align); |
396 | if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { | 428 | avail.end = tmp.end; |
397 | new->start = tmp.start; | 429 | if (avail.start >= tmp.start) { |
398 | new->end = tmp.start + size - 1; | 430 | alloc.start = constraint->alignf(constraint->alignf_data, &avail, |
399 | return 0; | 431 | size, constraint->align); |
432 | alloc.end = alloc.start + size - 1; | ||
433 | if (resource_contains(&avail, &alloc)) { | ||
434 | new->start = alloc.start; | ||
435 | new->end = alloc.end; | ||
436 | return 0; | ||
437 | } | ||
400 | } | 438 | } |
401 | if (!this) | 439 | if (!this) |
402 | break; | 440 | break; |
403 | tmp.start = this->end + 1; | 441 | if (this != old) |
442 | tmp.start = this->end + 1; | ||
404 | this = this->sibling; | 443 | this = this->sibling; |
405 | } | 444 | } |
406 | return -EBUSY; | 445 | return -EBUSY; |
407 | } | 446 | } |
408 | 447 | ||
448 | /* | ||
449 | * Find empty slot in the resource tree given range and alignment. | ||
450 | */ | ||
451 | static int find_resource(struct resource *root, struct resource *new, | ||
452 | resource_size_t size, | ||
453 | struct resource_constraint *constraint) | ||
454 | { | ||
455 | return __find_resource(root, NULL, new, size, constraint); | ||
456 | } | ||
457 | |||
458 | /** | ||
459 | * reallocate_resource - allocate a slot in the resource tree given range & alignment. | ||
460 | * The resource will be relocated if the new size cannot be reallocated in the | ||
461 | * current location. | ||
462 | * | ||
463 | * @root: root resource descriptor | ||
464 | * @old: resource descriptor desired by caller | ||
465 | * @newsize: new size of the resource descriptor | ||
466 | * @constraint: the size and alignment constraints to be met. | ||
467 | */ | ||
468 | int reallocate_resource(struct resource *root, struct resource *old, | ||
469 | resource_size_t newsize, | ||
470 | struct resource_constraint *constraint) | ||
471 | { | ||
472 | int err=0; | ||
473 | struct resource new = *old; | ||
474 | struct resource *conflict; | ||
475 | |||
476 | write_lock(&resource_lock); | ||
477 | |||
478 | if ((err = __find_resource(root, old, &new, newsize, constraint))) | ||
479 | goto out; | ||
480 | |||
481 | if (resource_contains(&new, old)) { | ||
482 | old->start = new.start; | ||
483 | old->end = new.end; | ||
484 | goto out; | ||
485 | } | ||
486 | |||
487 | if (old->child) { | ||
488 | err = -EBUSY; | ||
489 | goto out; | ||
490 | } | ||
491 | |||
492 | if (resource_contains(old, &new)) { | ||
493 | old->start = new.start; | ||
494 | old->end = new.end; | ||
495 | } else { | ||
496 | __release_resource(old); | ||
497 | *old = new; | ||
498 | conflict = __request_resource(root, old); | ||
499 | BUG_ON(conflict); | ||
500 | } | ||
501 | out: | ||
502 | write_unlock(&resource_lock); | ||
503 | return err; | ||
504 | } | ||
505 | |||
506 | |||
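__find_resource() above walks the gaps between sibling resources, clips each gap to [min, max], aligns the start, and accepts the first gap that still holds the requested size. The same search over a plain sorted array of busy ranges looks roughly like the sketch below; the types and names are invented, 'align' is assumed to be a nonzero power of two, and the underflow guard the kernel applies when a child starts at the root's own start is omitted.

#include <stddef.h>
#include <stdint.h>

struct range { uint64_t start, end; };          /* inclusive, sorted by start */

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

/* Find a free [start, start+size-1] inside 'root' that avoids every busy
 * range and satisfies min/max/align.  Fills *out and returns 0 on success. */
static int find_gap(const struct range *root,
                    const struct range *busy, size_t nbusy,
                    uint64_t size, uint64_t min, uint64_t max,
                    uint64_t align, struct range *out)
{
        uint64_t start = root->start;
        size_t i;

        for (i = 0; i <= nbusy; i++) {
                /* Gap between the previous busy range and the next one. */
                uint64_t gap_end = (i < nbusy) ? busy[i].start - 1 : root->end;
                uint64_t lo = start < min ? min : start;        /* clip */
                uint64_t hi = gap_end > max ? max : gap_end;

                lo = ALIGN_UP(lo, align);
                if (lo <= hi && hi - lo >= size - 1) {
                        out->start = lo;
                        out->end = lo + size - 1;
                        return 0;
                }
                if (i < nbusy)
                        start = busy[i].end + 1;
        }
        return -1;      /* no gap fits: the kernel returns -EBUSY here */
}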
409 | /** | 507 | /** |
410 | * allocate_resource - allocate empty slot in the resource tree given range & alignment | 508 | * allocate_resource - allocate empty slot in the resource tree given range & alignment. |
509 | * The resource will be reallocated with a new size if it was already allocated | ||
411 | * @root: root resource descriptor | 510 | * @root: root resource descriptor |
412 | * @new: resource descriptor desired by caller | 511 | * @new: resource descriptor desired by caller |
413 | * @size: requested resource region size | 512 | * @size: requested resource region size |
@@ -427,9 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
427 | void *alignf_data) | 526 | void *alignf_data) |
428 | { | 527 | { |
429 | int err; | 528 | int err; |
529 | struct resource_constraint constraint; | ||
530 | |||
531 | if (!alignf) | ||
532 | alignf = simple_align_resource; | ||
533 | |||
534 | constraint.min = min; | ||
535 | constraint.max = max; | ||
536 | constraint.align = align; | ||
537 | constraint.alignf = alignf; | ||
538 | constraint.alignf_data = alignf_data; | ||
539 | |||
540 | if ( new->parent ) { | ||
541 | /* resource is already allocated, try reallocating with | ||
542 | the new constraints */ | ||
543 | return reallocate_resource(root, new, size, &constraint); | ||
544 | } | ||
430 | 545 | ||
431 | write_lock(&resource_lock); | 546 | write_lock(&resource_lock); |
432 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | 547 | err = find_resource(root, new, size, &constraint); |
433 | if (err >= 0 && __request_resource(root, new)) | 548 | if (err >= 0 && __request_resource(root, new)) |
434 | err = -EBUSY; | 549 | err = -EBUSY; |
435 | write_unlock(&resource_lock); | 550 | write_unlock(&resource_lock); |
@@ -453,6 +568,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou | |||
453 | 568 | ||
454 | if (first == parent) | 569 | if (first == parent) |
455 | return first; | 570 | return first; |
571 | if (WARN_ON(first == new)) /* duplicated insertion */ | ||
572 | return first; | ||
456 | 573 | ||
457 | if ((first->start > new->start) || (first->end < new->end)) | 574 | if ((first->start > new->start) || (first->end < new->end)) |
458 | break; | 575 | break; |
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index ddabb54bb5c8..3c7cbc2c33be 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c | |||
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | |||
215 | put_pid(waiter->deadlock_task_pid); | 215 | put_pid(waiter->deadlock_task_pid); |
216 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); | 216 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); |
217 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | 217 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); |
218 | TRACE_WARN_ON(waiter->task); | ||
219 | memset(waiter, 0x22, sizeof(*waiter)); | 218 | memset(waiter, 0x22, sizeof(*waiter)); |
220 | } | 219 | } |
221 | 220 | ||
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index a56f629b057a..5c9ccd380966 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/kthread.h> | 9 | #include <linux/kthread.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
12 | #include <linux/smp_lock.h> | ||
13 | #include <linux/spinlock.h> | 12 | #include <linux/spinlock.h> |
14 | #include <linux/sysdev.h> | 13 | #include <linux/sysdev.h> |
15 | #include <linux/timer.h> | 14 | #include <linux/timer.h> |
@@ -27,7 +26,6 @@ struct test_thread_data { | |||
27 | int opcode; | 26 | int opcode; |
28 | int opdata; | 27 | int opdata; |
29 | int mutexes[MAX_RT_TEST_MUTEXES]; | 28 | int mutexes[MAX_RT_TEST_MUTEXES]; |
30 | int bkl; | ||
31 | int event; | 29 | int event; |
32 | struct sys_device sysdev; | 30 | struct sys_device sysdev; |
33 | }; | 31 | }; |
@@ -46,9 +44,8 @@ enum test_opcodes { | |||
46 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ | 44 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ |
47 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ | 45 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ |
48 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ | 46 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ |
49 | RTTEST_LOCKBKL, /* 9 Lock BKL */ | 47 | /* 9, 10 - reserved for BKL commemoration */ |
50 | RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ | 48 | RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ |
51 | RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ | ||
52 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ | 49 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ |
53 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ | 50 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ |
54 | }; | 51 | }; |
@@ -74,11 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) | |||
74 | td->mutexes[i] = 0; | 71 | td->mutexes[i] = 0; |
75 | } | 72 | } |
76 | } | 73 | } |
77 | |||
78 | if (!lockwakeup && td->bkl == 4) { | ||
79 | unlock_kernel(); | ||
80 | td->bkl = 0; | ||
81 | } | ||
82 | return 0; | 74 | return 0; |
83 | 75 | ||
84 | case RTTEST_RESETEVENT: | 76 | case RTTEST_RESETEVENT: |
@@ -129,21 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) | |||
129 | td->mutexes[id] = 0; | 121 | td->mutexes[id] = 0; |
130 | return 0; | 122 | return 0; |
131 | 123 | ||
132 | case RTTEST_LOCKBKL: | ||
133 | if (td->bkl) | ||
134 | return 0; | ||
135 | td->bkl = 1; | ||
136 | lock_kernel(); | ||
137 | td->bkl = 4; | ||
138 | return 0; | ||
139 | |||
140 | case RTTEST_UNLOCKBKL: | ||
141 | if (td->bkl != 4) | ||
142 | break; | ||
143 | unlock_kernel(); | ||
144 | td->bkl = 0; | ||
145 | return 0; | ||
146 | |||
147 | default: | 124 | default: |
148 | break; | 125 | break; |
149 | } | 126 | } |
@@ -190,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) | |||
190 | td->event = atomic_add_return(1, &rttest_event); | 167 | td->event = atomic_add_return(1, &rttest_event); |
191 | break; | 168 | break; |
192 | 169 | ||
193 | case RTTEST_LOCKBKL: | ||
194 | default: | 170 | default: |
195 | break; | 171 | break; |
196 | } | 172 | } |
@@ -223,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) | |||
223 | td->event = atomic_add_return(1, &rttest_event); | 199 | td->event = atomic_add_return(1, &rttest_event); |
224 | return; | 200 | return; |
225 | 201 | ||
226 | case RTTEST_LOCKBKL: | ||
227 | return; | ||
228 | default: | 202 | default: |
229 | return; | 203 | return; |
230 | } | 204 | } |
@@ -374,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute | |||
374 | spin_lock(&rttest_lock); | 348 | spin_lock(&rttest_lock); |
375 | 349 | ||
376 | curr += sprintf(curr, | 350 | curr += sprintf(curr, |
377 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", | 351 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", |
378 | td->opcode, td->event, tsk->state, | 352 | td->opcode, td->event, tsk->state, |
379 | (MAX_RT_PRIO - 1) - tsk->prio, | 353 | (MAX_RT_PRIO - 1) - tsk->prio, |
380 | (MAX_RT_PRIO - 1) - tsk->normal_prio, | 354 | (MAX_RT_PRIO - 1) - tsk->normal_prio, |
381 | tsk->pi_blocked_on, td->bkl); | 355 | tsk->pi_blocked_on); |
382 | 356 | ||
383 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) | 357 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) |
384 | curr += sprintf(curr, "%d", td->mutexes[i]); | 358 | curr += sprintf(curr, "%d", td->mutexes[i]); |
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a9604815786a..ab449117aaf2 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -20,41 +20,34 @@ | |||
20 | /* | 20 | /* |
21 | * lock->owner state tracking: | 21 | * lock->owner state tracking: |
22 | * | 22 | * |
23 | * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 | 23 | * lock->owner holds the task_struct pointer of the owner. Bit 0 |
24 | * are used to keep track of the "owner is pending" and "lock has | 24 | * is used to keep track of the "lock has waiters" state. |
25 | * waiters" state. | ||
26 | * | 25 | * |
27 | * owner bit1 bit0 | 26 | * owner bit0 |
28 | * NULL 0 0 lock is free (fast acquire possible) | 27 | * NULL 0 lock is free (fast acquire possible) |
29 | * NULL 0 1 invalid state | 28 | * NULL 1 lock is free and has waiters and the top waiter |
30 | * NULL 1 0 Transitional State* | 29 | * is going to take the lock* |
31 | * NULL 1 1 invalid state | 30 | * taskpointer 0 lock is held (fast release possible) |
32 | * taskpointer 0 0 lock is held (fast release possible) | 31 | * taskpointer 1 lock is held and has waiters** |
33 | * taskpointer 0 1 task is pending owner | ||
34 | * taskpointer 1 0 lock is held and has waiters | ||
35 | * taskpointer 1 1 task is pending owner and lock has more waiters | ||
36 | * | ||
37 | * Pending ownership is assigned to the top (highest priority) | ||
38 | * waiter of the lock, when the lock is released. The thread is woken | ||
39 | * up and can now take the lock. Until the lock is taken (bit 0 | ||
40 | * cleared) a competing higher priority thread can steal the lock | ||
41 | * which puts the woken up thread back on the waiters list. | ||
42 | * | 32 | * |
43 | * The fast atomic compare exchange based acquire and release is only | 33 | * The fast atomic compare exchange based acquire and release is only |
44 | * possible when bit 0 and 1 of lock->owner are 0. | 34 | * possible when bit 0 of lock->owner is 0. |
35 | * | ||
36 | * (*) It also can be a transitional state when grabbing the lock | ||
37 | * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, | ||
38 | * we need to set the bit0 before looking at the lock, and the owner may be | ||
39 | * NULL in this small time, hence this can be a transitional state. | ||
45 | * | 40 | * |
46 | * (*) There's a small time where the owner can be NULL and the | 41 | * (**) There is a small time when bit 0 is set but there are no |
47 | * "lock has waiters" bit is set. This can happen when grabbing the lock. | 42 | * waiters. This can happen when grabbing the lock in the slow path. |
48 | * To prevent a cmpxchg of the owner releasing the lock, we need to set this | 43 | * To prevent a cmpxchg of the owner releasing the lock, we need to |
49 | * bit before looking at the lock, hence the reason this is a transitional | 44 | * set this bit before looking at the lock. |
50 | * state. | ||
51 | */ | 45 | */ |
52 | 46 | ||
53 | static void | 47 | static void |
54 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, | 48 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) |
55 | unsigned long mask) | ||
56 | { | 49 | { |
57 | unsigned long val = (unsigned long)owner | mask; | 50 | unsigned long val = (unsigned long)owner; |
58 | 51 | ||
59 | if (rt_mutex_has_waiters(lock)) | 52 | if (rt_mutex_has_waiters(lock)) |
60 | val |= RT_MUTEX_HAS_WAITERS; | 53 | val |= RT_MUTEX_HAS_WAITERS; |
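The state table above packs the "lock has waiters" flag into bit 0 of the owner pointer, so a single word carries both pieces of state and the fast path can still update it with one cmpxchg while bit 0 is clear. A minimal pointer-tagging sketch; struct task here is only a stand-in for task_struct, and no atomicity is shown.

#include <stdbool.h>
#include <stdint.h>

#define HAS_WAITERS     1UL     /* bit 0 of the owner word */

struct task;                    /* stand-in for task_struct */

struct lock {
        uintptr_t owner;        /* task pointer | HAS_WAITERS */
};

static struct task *lock_owner(const struct lock *l)
{
        return (struct task *)(l->owner & ~HAS_WAITERS);
}

static bool lock_has_waiters(const struct lock *l)
{
        return (l->owner & HAS_WAITERS) != 0;
}

static void lock_set_owner(struct lock *l, struct task *owner, bool waiters)
{
        /* Pointers to properly aligned objects have bit 0 clear, which is
         * what makes the flag safe to fold into the same word. */
        l->owner = (uintptr_t)owner | (waiters ? HAS_WAITERS : 0);
}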
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
203 | * reached or the state of the chain has changed while we | 196 | * reached or the state of the chain has changed while we |
204 | * dropped the locks. | 197 | * dropped the locks. |
205 | */ | 198 | */ |
206 | if (!waiter || !waiter->task) | 199 | if (!waiter) |
207 | goto out_unlock_pi; | 200 | goto out_unlock_pi; |
208 | 201 | ||
209 | /* | 202 | /* |
210 | * Check the orig_waiter state. After we dropped the locks, | 203 | * Check the orig_waiter state. After we dropped the locks, |
211 | * the previous owner of the lock might have released the lock | 204 | * the previous owner of the lock might have released the lock. |
212 | * and made us the pending owner: | ||
213 | */ | 205 | */ |
214 | if (orig_waiter && !orig_waiter->task) | 206 | if (orig_waiter && !rt_mutex_owner(orig_lock)) |
215 | goto out_unlock_pi; | 207 | goto out_unlock_pi; |
216 | 208 | ||
217 | /* | 209 | /* |
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
254 | 246 | ||
255 | /* Release the task */ | 247 | /* Release the task */ |
256 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 248 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
249 | if (!rt_mutex_owner(lock)) { | ||
250 | /* | ||
251 | * If the requeue above changed the top waiter, then we need | ||
252 | * to wake the new top waiter up to try to get the lock. | ||
253 | */ | ||
254 | |||
255 | if (top_waiter != rt_mutex_top_waiter(lock)) | ||
256 | wake_up_process(rt_mutex_top_waiter(lock)->task); | ||
257 | raw_spin_unlock(&lock->wait_lock); | ||
258 | goto out_put_task; | ||
259 | } | ||
257 | put_task_struct(task); | 260 | put_task_struct(task); |
258 | 261 | ||
259 | /* Grab the next task */ | 262 | /* Grab the next task */ |
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
296 | } | 299 | } |
297 | 300 | ||
298 | /* | 301 | /* |
299 | * Optimization: check if we can steal the lock from the | ||
300 | * assigned pending owner [which might not have taken the | ||
301 | * lock yet]: | ||
302 | */ | ||
303 | static inline int try_to_steal_lock(struct rt_mutex *lock, | ||
304 | struct task_struct *task) | ||
305 | { | ||
306 | struct task_struct *pendowner = rt_mutex_owner(lock); | ||
307 | struct rt_mutex_waiter *next; | ||
308 | unsigned long flags; | ||
309 | |||
310 | if (!rt_mutex_owner_pending(lock)) | ||
311 | return 0; | ||
312 | |||
313 | if (pendowner == task) | ||
314 | return 1; | ||
315 | |||
316 | raw_spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
317 | if (task->prio >= pendowner->prio) { | ||
318 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
319 | return 0; | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * Check if a waiter is enqueued on the pending owners | ||
324 | * pi_waiters list. Remove it and readjust pending owners | ||
325 | * priority. | ||
326 | */ | ||
327 | if (likely(!rt_mutex_has_waiters(lock))) { | ||
328 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
329 | return 1; | ||
330 | } | ||
331 | |||
332 | /* No chain handling, pending owner is not blocked on anything: */ | ||
333 | next = rt_mutex_top_waiter(lock); | ||
334 | plist_del(&next->pi_list_entry, &pendowner->pi_waiters); | ||
335 | __rt_mutex_adjust_prio(pendowner); | ||
336 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
337 | |||
338 | /* | ||
339 | * We are going to steal the lock and a waiter was | ||
340 | * enqueued on the pending owners pi_waiters queue. So | ||
341 | * we have to enqueue this waiter into | ||
342 | * task->pi_waiters list. This covers the case, | ||
343 | * where task is boosted because it holds another | ||
344 | * lock and gets unboosted because the booster is | ||
345 | * interrupted, so we would delay a waiter with higher | ||
346 | * priority as task->normal_prio. | ||
347 | * | ||
348 | * Note: in the rare case of a SCHED_OTHER task changing | ||
349 | * its priority and thus stealing the lock, next->task | ||
350 | * might be task: | ||
351 | */ | ||
352 | if (likely(next->task != task)) { | ||
353 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
354 | plist_add(&next->pi_list_entry, &task->pi_waiters); | ||
355 | __rt_mutex_adjust_prio(task); | ||
356 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
357 | } | ||
358 | return 1; | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * Try to take an rt-mutex | 302 | * Try to take an rt-mutex |
363 | * | 303 | * |
364 | * This fails | ||
365 | * - when the lock has a real owner | ||
366 | * - when a different pending owner exists and has higher priority than current | ||
367 | * | ||
368 | * Must be called with lock->wait_lock held. | 304 | * Must be called with lock->wait_lock held. |
305 | * | ||
306 | * @lock: the lock to be acquired. | ||
307 | * @task: the task which wants to acquire the lock | ||
308 | * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) | ||
369 | */ | 309 | */ |
370 | static int try_to_take_rt_mutex(struct rt_mutex *lock) | 310 | static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, |
311 | struct rt_mutex_waiter *waiter) | ||
371 | { | 312 | { |
372 | /* | 313 | /* |
373 | * We have to be careful here if the atomic speedups are | 314 | * We have to be careful here if the atomic speedups are |
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) | |||
390 | */ | 331 | */ |
391 | mark_rt_mutex_waiters(lock); | 332 | mark_rt_mutex_waiters(lock); |
392 | 333 | ||
393 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) | 334 | if (rt_mutex_owner(lock)) |
394 | return 0; | 335 | return 0; |
395 | 336 | ||
337 | /* | ||
338 | * The task can take the lock when one of these conditions holds: | ||
339 | * 1) there is no waiter | ||
340 | * 2) the task has higher priority than all waiters | ||
341 | * 3) the task is the top waiter | ||
342 | */ | ||
343 | if (rt_mutex_has_waiters(lock)) { | ||
344 | if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { | ||
345 | if (!waiter || waiter != rt_mutex_top_waiter(lock)) | ||
346 | return 0; | ||
347 | } | ||
348 | } | ||
349 | |||
350 | if (waiter || rt_mutex_has_waiters(lock)) { | ||
351 | unsigned long flags; | ||
352 | struct rt_mutex_waiter *top; | ||
353 | |||
354 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
355 | |||
356 | /* remove the queued waiter. */ | ||
357 | if (waiter) { | ||
358 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
359 | task->pi_blocked_on = NULL; | ||
360 | } | ||
361 | |||
362 | /* | ||
362 | * We have to enqueue the top waiter (if it exists) into | ||
363 | * the task->pi_waiters list. | ||
365 | */ | ||
366 | if (rt_mutex_has_waiters(lock)) { | ||
367 | top = rt_mutex_top_waiter(lock); | ||
368 | top->pi_list_entry.prio = top->list_entry.prio; | ||
369 | plist_add(&top->pi_list_entry, &task->pi_waiters); | ||
370 | } | ||
371 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
372 | } | ||
373 | |||
396 | /* We got the lock. */ | 374 | /* We got the lock. */ |
397 | debug_rt_mutex_lock(lock); | 375 | debug_rt_mutex_lock(lock); |
398 | 376 | ||
399 | rt_mutex_set_owner(lock, current, 0); | 377 | rt_mutex_set_owner(lock, task); |
400 | 378 | ||
401 | rt_mutex_deadlock_account_lock(lock, current); | 379 | rt_mutex_deadlock_account_lock(lock, task); |
402 | 380 | ||
403 | return 1; | 381 | return 1; |
404 | } | 382 | } |
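The three conditions in the new try_to_take_rt_mutex() collapse into one small predicate. A hedged sketch follows (plain userspace C; fake_waiter and may_take() are made-up names, and prio follows the kernel convention that a lower value means higher priority).

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only; lower prio value = higher priority, as in the kernel. */
struct fake_waiter { int prio; };

/*
 * Decide whether a task may take an ownerless lock, mirroring the three
 * conditions above: no waiters, strictly higher priority than the top
 * waiter, or the task *is* the queued top waiter.
 */
static bool may_take(int task_prio, const struct fake_waiter *top,
		     const struct fake_waiter *own_waiter)
{
	if (!top)
		return true;			/* 1) no waiter */
	if (task_prio < top->prio)
		return true;			/* 2) beats the top waiter */
	return own_waiter == top;		/* 3) task is the top waiter */
}

int main(void)
{
	struct fake_waiter top = { .prio = 50 };

	printf("%d %d %d\n",
	       may_take(10, NULL, NULL),	/* empty wait list      -> 1 */
	       may_take(40, &top, NULL),	/* higher prio than top -> 1 */
	       may_take(60, &top, &top));	/* is the top waiter    -> 1 */
	return 0;
}

Condition 3 is what replaces the old pending-owner handoff: the woken top waiter may still take the lock even when its priority alone would not, while a late arrival of equal priority cannot.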
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
436 | 414 | ||
437 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 415 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
438 | 416 | ||
417 | if (!owner) | ||
418 | return 0; | ||
419 | |||
439 | if (waiter == rt_mutex_top_waiter(lock)) { | 420 | if (waiter == rt_mutex_top_waiter(lock)) { |
440 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 421 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
441 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | 422 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); |
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
472 | /* | 453 | /* |
473 | * Wake up the next waiter on the lock. | 454 | * Wake up the next waiter on the lock. |
474 | * | 455 | * |
475 | * Remove the top waiter from the current tasks waiter list and from | 456 | * Remove the top waiter from the current tasks waiter list and wake it up. |
476 | * the lock waiter list. Set it as pending owner. Then wake it up. | ||
477 | * | 457 | * |
478 | * Called with lock->wait_lock held. | 458 | * Called with lock->wait_lock held. |
479 | */ | 459 | */ |
480 | static void wakeup_next_waiter(struct rt_mutex *lock) | 460 | static void wakeup_next_waiter(struct rt_mutex *lock) |
481 | { | 461 | { |
482 | struct rt_mutex_waiter *waiter; | 462 | struct rt_mutex_waiter *waiter; |
483 | struct task_struct *pendowner; | ||
484 | unsigned long flags; | 463 | unsigned long flags; |
485 | 464 | ||
486 | raw_spin_lock_irqsave(¤t->pi_lock, flags); | 465 | raw_spin_lock_irqsave(¤t->pi_lock, flags); |
487 | 466 | ||
488 | waiter = rt_mutex_top_waiter(lock); | 467 | waiter = rt_mutex_top_waiter(lock); |
489 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
490 | 468 | ||
491 | /* | 469 | /* |
492 | * Remove it from current->pi_waiters. We do not adjust a | 470 | * Remove it from current->pi_waiters. We do not adjust a |
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock) | |||
495 | * lock->wait_lock. | 473 | * lock->wait_lock. |
496 | */ | 474 | */ |
497 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); | 475 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); |
498 | pendowner = waiter->task; | ||
499 | waiter->task = NULL; | ||
500 | 476 | ||
501 | rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); | 477 | rt_mutex_set_owner(lock, NULL); |
502 | 478 | ||
503 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); | 479 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); |
504 | 480 | ||
505 | /* | 481 | wake_up_process(waiter->task); |
506 | * Clear the pi_blocked_on variable and enqueue a possible | ||
507 | * waiter into the pi_waiters list of the pending owner. This | ||
508 | * prevents that in case the pending owner gets unboosted a | ||
509 | * waiter with higher priority than pending-owner->normal_prio | ||
510 | * is blocked on the unboosted (pending) owner. | ||
511 | */ | ||
512 | raw_spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
513 | |||
514 | WARN_ON(!pendowner->pi_blocked_on); | ||
515 | WARN_ON(pendowner->pi_blocked_on != waiter); | ||
516 | WARN_ON(pendowner->pi_blocked_on->lock != lock); | ||
517 | |||
518 | pendowner->pi_blocked_on = NULL; | ||
519 | |||
520 | if (rt_mutex_has_waiters(lock)) { | ||
521 | struct rt_mutex_waiter *next; | ||
522 | |||
523 | next = rt_mutex_top_waiter(lock); | ||
524 | plist_add(&next->pi_list_entry, &pendowner->pi_waiters); | ||
525 | } | ||
526 | raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
527 | |||
528 | wake_up_process(pendowner); | ||
529 | } | 482 | } |
530 | 483 | ||
531 | /* | 484 | /* |
532 | * Remove a waiter from a lock | 485 | * Remove a waiter from a lock and give up |
533 | * | 486 | * |
534 | * Must be called with lock->wait_lock held | 487 | * Must be called with lock->wait_lock held, and only |
488 | * after try_to_take_rt_mutex() has just failed. | ||
535 | */ | 489 | */ |
536 | static void remove_waiter(struct rt_mutex *lock, | 490 | static void remove_waiter(struct rt_mutex *lock, |
537 | struct rt_mutex_waiter *waiter) | 491 | struct rt_mutex_waiter *waiter) |
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock, | |||
543 | 497 | ||
544 | raw_spin_lock_irqsave(¤t->pi_lock, flags); | 498 | raw_spin_lock_irqsave(¤t->pi_lock, flags); |
545 | plist_del(&waiter->list_entry, &lock->wait_list); | 499 | plist_del(&waiter->list_entry, &lock->wait_list); |
546 | waiter->task = NULL; | ||
547 | current->pi_blocked_on = NULL; | 500 | current->pi_blocked_on = NULL; |
548 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); | 501 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); |
549 | 502 | ||
550 | if (first && owner != current) { | 503 | if (!owner) |
504 | return; | ||
505 | |||
506 | if (first) { | ||
551 | 507 | ||
552 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | 508 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
553 | 509 | ||
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
614 | * or TASK_UNINTERRUPTIBLE) | 570 | * or TASK_UNINTERRUPTIBLE) |
615 | * @timeout: the pre-initialized and started timer, or NULL for none | 571 | * @timeout: the pre-initialized and started timer, or NULL for none |
616 | * @waiter: the pre-initialized rt_mutex_waiter | 572 | * @waiter: the pre-initialized rt_mutex_waiter |
617 | * @detect_deadlock: passed to task_blocks_on_rt_mutex | ||
618 | * | 573 | * |
619 | * lock->wait_lock must be held by the caller. | 574 | * lock->wait_lock must be held by the caller. |
620 | */ | 575 | */ |
621 | static int __sched | 576 | static int __sched |
622 | __rt_mutex_slowlock(struct rt_mutex *lock, int state, | 577 | __rt_mutex_slowlock(struct rt_mutex *lock, int state, |
623 | struct hrtimer_sleeper *timeout, | 578 | struct hrtimer_sleeper *timeout, |
624 | struct rt_mutex_waiter *waiter, | 579 | struct rt_mutex_waiter *waiter) |
625 | int detect_deadlock) | ||
626 | { | 580 | { |
627 | int ret = 0; | 581 | int ret = 0; |
628 | 582 | ||
629 | for (;;) { | 583 | for (;;) { |
630 | /* Try to acquire the lock: */ | 584 | /* Try to acquire the lock: */ |
631 | if (try_to_take_rt_mutex(lock)) | 585 | if (try_to_take_rt_mutex(lock, current, waiter)) |
632 | break; | 586 | break; |
633 | 587 | ||
634 | /* | 588 | /* |
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
645 | break; | 599 | break; |
646 | } | 600 | } |
647 | 601 | ||
648 | /* | ||
649 | * waiter->task is NULL the first time we come here and | ||
650 | * when we have been woken up by the previous owner | ||
651 | * but the lock got stolen by a higher prio task. | ||
652 | */ | ||
653 | if (!waiter->task) { | ||
654 | ret = task_blocks_on_rt_mutex(lock, waiter, current, | ||
655 | detect_deadlock); | ||
656 | /* | ||
657 | * If we got woken up by the owner then start loop | ||
658 | * all over without going into schedule to try | ||
659 | * to get the lock now: | ||
660 | */ | ||
661 | if (unlikely(!waiter->task)) { | ||
662 | /* | ||
663 | * Reset the return value. We might | ||
664 | * have returned with -EDEADLK and the | ||
665 | * owner released the lock while we | ||
666 | * were walking the pi chain. | ||
667 | */ | ||
668 | ret = 0; | ||
669 | continue; | ||
670 | } | ||
671 | if (unlikely(ret)) | ||
672 | break; | ||
673 | } | ||
674 | |||
675 | raw_spin_unlock(&lock->wait_lock); | 602 | raw_spin_unlock(&lock->wait_lock); |
676 | 603 | ||
677 | debug_rt_mutex_print_deadlock(waiter); | 604 | debug_rt_mutex_print_deadlock(waiter); |
678 | 605 | ||
679 | if (waiter->task) | 606 | schedule_rt_mutex(lock); |
680 | schedule_rt_mutex(lock); | ||
681 | 607 | ||
682 | raw_spin_lock(&lock->wait_lock); | 608 | raw_spin_lock(&lock->wait_lock); |
683 | set_current_state(state); | 609 | set_current_state(state); |
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
698 | int ret = 0; | 624 | int ret = 0; |
699 | 625 | ||
700 | debug_rt_mutex_init_waiter(&waiter); | 626 | debug_rt_mutex_init_waiter(&waiter); |
701 | waiter.task = NULL; | ||
702 | 627 | ||
703 | raw_spin_lock(&lock->wait_lock); | 628 | raw_spin_lock(&lock->wait_lock); |
704 | 629 | ||
705 | /* Try to acquire the lock again: */ | 630 | /* Try to acquire the lock again: */ |
706 | if (try_to_take_rt_mutex(lock)) { | 631 | if (try_to_take_rt_mutex(lock, current, NULL)) { |
707 | raw_spin_unlock(&lock->wait_lock); | 632 | raw_spin_unlock(&lock->wait_lock); |
708 | return 0; | 633 | return 0; |
709 | } | 634 | } |
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
717 | timeout->task = NULL; | 642 | timeout->task = NULL; |
718 | } | 643 | } |
719 | 644 | ||
720 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, | 645 | ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); |
721 | detect_deadlock); | 646 | |
647 | if (likely(!ret)) | ||
648 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); | ||
722 | 649 | ||
723 | set_current_state(TASK_RUNNING); | 650 | set_current_state(TASK_RUNNING); |
724 | 651 | ||
725 | if (unlikely(waiter.task)) | 652 | if (unlikely(ret)) |
726 | remove_waiter(lock, &waiter); | 653 | remove_waiter(lock, &waiter); |
727 | 654 | ||
728 | /* | 655 | /* |
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
737 | if (unlikely(timeout)) | 664 | if (unlikely(timeout)) |
738 | hrtimer_cancel(&timeout->timer); | 665 | hrtimer_cancel(&timeout->timer); |
739 | 666 | ||
740 | /* | ||
741 | * Readjust priority, when we did not get the lock. We might | ||
742 | * have been the pending owner and boosted. Since we did not | ||
743 | * take the lock, the PI boost has to go. | ||
744 | */ | ||
745 | if (unlikely(ret)) | ||
746 | rt_mutex_adjust_prio(current); | ||
747 | |||
748 | debug_rt_mutex_free_waiter(&waiter); | 667 | debug_rt_mutex_free_waiter(&waiter); |
749 | 668 | ||
750 | return ret; | 669 | return ret; |
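With the pending-owner handoff gone, the slow path reduces to "enqueue as a waiter once, then loop: try, check for signal/timeout, sleep". The following is a control-flow sketch only; try_take(), block_until_woken() and the other stubs are invented stand-ins for try_to_take_rt_mutex(), schedule_rt_mutex() and the signal/timeout checks.

#include <stdbool.h>
#include <stdio.h>

/* Stubs standing in for the kernel primitives used by the slow path. */
static int attempts;
static bool try_take(void)		{ return ++attempts >= 3; }	/* wins on the 3rd try */
static bool signal_pending(void)	{ return false; }
static bool timed_out(void)		{ return false; }
static void block_until_woken(void)	{ printf("blocking (attempt %d)\n", attempts); }

/*
 * Shape of the simplified slow path: the task is enqueued as a waiter once,
 * up front, and each wakeup just retries the take; no more waiter->task
 * bookkeeping or lock stealing inside the loop.
 */
static int slowlock(void)
{
	for (;;) {
		if (try_take())
			return 0;		/* got the lock */
		if (signal_pending())
			return -4;		/* -EINTR */
		if (timed_out())
			return -110;		/* -ETIMEDOUT */
		block_until_woken();		/* drop wait_lock, schedule(), re-take it */
	}
}

int main(void)
{
	printf("slowlock() = %d after %d attempts\n", slowlock(), attempts);
	return 0;
}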
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) | |||
762 | 681 | ||
763 | if (likely(rt_mutex_owner(lock) != current)) { | 682 | if (likely(rt_mutex_owner(lock) != current)) { |
764 | 683 | ||
765 | ret = try_to_take_rt_mutex(lock); | 684 | ret = try_to_take_rt_mutex(lock, current, NULL); |
766 | /* | 685 | /* |
767 | * try_to_take_rt_mutex() sets the lock waiters | 686 | * try_to_take_rt_mutex() sets the lock waiters |
768 | * bit unconditionally. Clean this up. | 687 | * bit unconditionally. Clean this up. |
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | |||
992 | { | 911 | { |
993 | __rt_mutex_init(lock, NULL); | 912 | __rt_mutex_init(lock, NULL); |
994 | debug_rt_mutex_proxy_lock(lock, proxy_owner); | 913 | debug_rt_mutex_proxy_lock(lock, proxy_owner); |
995 | rt_mutex_set_owner(lock, proxy_owner, 0); | 914 | rt_mutex_set_owner(lock, proxy_owner); |
996 | rt_mutex_deadlock_account_lock(lock, proxy_owner); | 915 | rt_mutex_deadlock_account_lock(lock, proxy_owner); |
997 | } | 916 | } |
998 | 917 | ||
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |||
1008 | struct task_struct *proxy_owner) | 927 | struct task_struct *proxy_owner) |
1009 | { | 928 | { |
1010 | debug_rt_mutex_proxy_unlock(lock); | 929 | debug_rt_mutex_proxy_unlock(lock); |
1011 | rt_mutex_set_owner(lock, NULL, 0); | 930 | rt_mutex_set_owner(lock, NULL); |
1012 | rt_mutex_deadlock_account_unlock(proxy_owner); | 931 | rt_mutex_deadlock_account_unlock(proxy_owner); |
1013 | } | 932 | } |
1014 | 933 | ||
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
1034 | 953 | ||
1035 | raw_spin_lock(&lock->wait_lock); | 954 | raw_spin_lock(&lock->wait_lock); |
1036 | 955 | ||
1037 | mark_rt_mutex_waiters(lock); | 956 | if (try_to_take_rt_mutex(lock, task, NULL)) { |
1038 | |||
1039 | if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { | ||
1040 | /* We got the lock for task. */ | ||
1041 | debug_rt_mutex_lock(lock); | ||
1042 | rt_mutex_set_owner(lock, task, 0); | ||
1043 | raw_spin_unlock(&lock->wait_lock); | 957 | raw_spin_unlock(&lock->wait_lock); |
1044 | rt_mutex_deadlock_account_lock(lock, task); | ||
1045 | return 1; | 958 | return 1; |
1046 | } | 959 | } |
1047 | 960 | ||
1048 | ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); | 961 | ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); |
1049 | 962 | ||
1050 | if (ret && !waiter->task) { | 963 | if (ret && !rt_mutex_owner(lock)) { |
1051 | /* | 964 | /* |
1052 | * Reset the return value. We might have | 965 | * Reset the return value. We might have |
1053 | * returned with -EDEADLK and the owner | 966 | * returned with -EDEADLK and the owner |
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
1056 | */ | 969 | */ |
1057 | ret = 0; | 970 | ret = 0; |
1058 | } | 971 | } |
972 | |||
973 | if (unlikely(ret)) | ||
974 | remove_waiter(lock, waiter); | ||
975 | |||
1059 | raw_spin_unlock(&lock->wait_lock); | 976 | raw_spin_unlock(&lock->wait_lock); |
1060 | 977 | ||
1061 | debug_rt_mutex_print_deadlock(waiter); | 978 | debug_rt_mutex_print_deadlock(waiter); |
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | |||
1110 | 1027 | ||
1111 | set_current_state(TASK_INTERRUPTIBLE); | 1028 | set_current_state(TASK_INTERRUPTIBLE); |
1112 | 1029 | ||
1113 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, | 1030 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); |
1114 | detect_deadlock); | ||
1115 | 1031 | ||
1116 | set_current_state(TASK_RUNNING); | 1032 | set_current_state(TASK_RUNNING); |
1117 | 1033 | ||
1118 | if (unlikely(waiter->task)) | 1034 | if (unlikely(ret)) |
1119 | remove_waiter(lock, waiter); | 1035 | remove_waiter(lock, waiter); |
1120 | 1036 | ||
1121 | /* | 1037 | /* |
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | |||
1126 | 1042 | ||
1127 | raw_spin_unlock(&lock->wait_lock); | 1043 | raw_spin_unlock(&lock->wait_lock); |
1128 | 1044 | ||
1129 | /* | ||
1130 | * Readjust priority, when we did not get the lock. We might have been | ||
1131 | * the pending owner and boosted. Since we did not take the lock, the | ||
1132 | * PI boost has to go. | ||
1133 | */ | ||
1134 | if (unlikely(ret)) | ||
1135 | rt_mutex_adjust_prio(current); | ||
1136 | |||
1137 | return ret; | 1045 | return ret; |
1138 | } | 1046 | } |
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 97a2f81866af..53a66c85261b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h | |||
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p) | |||
91 | /* | 91 | /* |
92 | * lock->owner state tracking: | 92 | * lock->owner state tracking: |
93 | */ | 93 | */ |
94 | #define RT_MUTEX_OWNER_PENDING 1UL | 94 | #define RT_MUTEX_HAS_WAITERS 1UL |
95 | #define RT_MUTEX_HAS_WAITERS 2UL | 95 | #define RT_MUTEX_OWNER_MASKALL 1UL |
96 | #define RT_MUTEX_OWNER_MASKALL 3UL | ||
97 | 96 | ||
98 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | 97 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) |
99 | { | 98 | { |
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | |||
101 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); | 100 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); |
102 | } | 101 | } |
103 | 102 | ||
104 | static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) | ||
105 | { | ||
106 | return (struct task_struct *) | ||
107 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
108 | } | ||
109 | |||
110 | static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) | ||
111 | { | ||
112 | return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; | ||
113 | } | ||
114 | |||
115 | /* | 103 | /* |
116 | * PI-futex support (proxy locking functions, etc.): | 104 | * PI-futex support (proxy locking functions, etc.): |
117 | */ | 105 | */ |
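The header now leaves a single state bit, so the lock word can only be NULL or a task pointer, each with or without bit 0. A sketch of why the fast path works only while bit 0 is clear, using C11 atomics in place of the architecture cmpxchg helper (owner, fast_acquire() and fast_release() are invented names; roughly speaking, the kernel enables its cmpxchg fast path only when the architecture supports it and rt-mutex debugging is off).

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define HAS_WAITERS 1UL			/* the single remaining state bit */

static _Atomic uintptr_t owner;		/* stand-in for lock->owner */

/* Fast acquire: only legal when the word is completely clear (no owner, no waiters). */
static int fast_acquire(uintptr_t me)
{
	uintptr_t expected = 0;

	return atomic_compare_exchange_strong(&owner, &expected, me);
}

/* Fast release: only legal while no waiter bit is set, i.e. the word is exactly "me". */
static int fast_release(uintptr_t me)
{
	uintptr_t expected = me;

	return atomic_compare_exchange_strong(&owner, &expected, 0);
}

int main(void)
{
	uintptr_t me = 0x1000;		/* dummy "task pointer" */

	printf("acquire: %d\n", fast_acquire(me));	/* 1: word was 0 */
	atomic_fetch_or(&owner, HAS_WAITERS);		/* a waiter shows up */
	printf("release: %d\n", fast_release(me));	/* 0: must take the slow path */
	return 0;
}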
diff --git a/kernel/sched.c b/kernel/sched.c index c5d775079027..935f8e8e6160 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -32,7 +32,6 @@ | |||
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/uaccess.h> | 33 | #include <linux/uaccess.h> |
34 | #include <linux/highmem.h> | 34 | #include <linux/highmem.h> |
35 | #include <linux/smp_lock.h> | ||
36 | #include <asm/mmu_context.h> | 35 | #include <asm/mmu_context.h> |
37 | #include <linux/interrupt.h> | 36 | #include <linux/interrupt.h> |
38 | #include <linux/capability.h> | 37 | #include <linux/capability.h> |
@@ -75,9 +74,11 @@ | |||
75 | 74 | ||
76 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
77 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
77 | #include <asm/mutex.h> | ||
78 | 78 | ||
79 | #include "sched_cpupri.h" | 79 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | 80 | #include "workqueue_sched.h" |
81 | #include "sched_autogroup.h" | ||
81 | 82 | ||
82 | #include <litmus/sched_trace.h> | 83 | #include <litmus/sched_trace.h> |
83 | #include <litmus/trace.h> | 84 | #include <litmus/trace.h> |
@@ -235,7 +236,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
235 | #endif | 236 | #endif |
236 | 237 | ||
237 | /* | 238 | /* |
238 | * sched_domains_mutex serializes calls to arch_init_sched_domains, | 239 | * sched_domains_mutex serializes calls to init_sched_domains, |
239 | * detach_destroy_domains and partition_sched_domains. | 240 | * detach_destroy_domains and partition_sched_domains. |
240 | */ | 241 | */ |
241 | static DEFINE_MUTEX(sched_domains_mutex); | 242 | static DEFINE_MUTEX(sched_domains_mutex); |
@@ -258,6 +259,8 @@ struct task_group { | |||
258 | /* runqueue "owned" by this group on each cpu */ | 259 | /* runqueue "owned" by this group on each cpu */ |
259 | struct cfs_rq **cfs_rq; | 260 | struct cfs_rq **cfs_rq; |
260 | unsigned long shares; | 261 | unsigned long shares; |
262 | |||
263 | atomic_t load_weight; | ||
261 | #endif | 264 | #endif |
262 | 265 | ||
263 | #ifdef CONFIG_RT_GROUP_SCHED | 266 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -273,25 +276,18 @@ struct task_group { | |||
273 | struct task_group *parent; | 276 | struct task_group *parent; |
274 | struct list_head siblings; | 277 | struct list_head siblings; |
275 | struct list_head children; | 278 | struct list_head children; |
276 | }; | ||
277 | 279 | ||
278 | #define root_task_group init_task_group | 280 | #ifdef CONFIG_SCHED_AUTOGROUP |
281 | struct autogroup *autogroup; | ||
282 | #endif | ||
283 | }; | ||
279 | 284 | ||
280 | /* task_group_lock serializes add/remove of task groups and also changes to | 285 | /* task_group_lock serializes the addition/removal of task groups */ |
281 | * a task group's cpu shares. | ||
282 | */ | ||
283 | static DEFINE_SPINLOCK(task_group_lock); | 286 | static DEFINE_SPINLOCK(task_group_lock); |
284 | 287 | ||
285 | #ifdef CONFIG_FAIR_GROUP_SCHED | 288 | #ifdef CONFIG_FAIR_GROUP_SCHED |
286 | 289 | ||
287 | #ifdef CONFIG_SMP | 290 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD |
288 | static int root_task_group_empty(void) | ||
289 | { | ||
290 | return list_empty(&root_task_group.children); | ||
291 | } | ||
292 | #endif | ||
293 | |||
294 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | ||
295 | 291 | ||
296 | /* | 292 | /* |
297 | * A weight of 0 or 1 can cause arithmetic problems. | 293 | * A weight of 0 or 1 can cause arithmetic problems. |
@@ -301,16 +297,16 @@ static int root_task_group_empty(void) | |||
301 | * (The default weight is 1024 - so there's no practical | 297 | * (The default weight is 1024 - so there's no practical |
302 | * limitation from this.) | 298 | * limitation from this.) |
303 | */ | 299 | */ |
304 | #define MIN_SHARES 2 | 300 | #define MIN_SHARES (1UL << 1) |
305 | #define MAX_SHARES (1UL << 18) | 301 | #define MAX_SHARES (1UL << 18) |
306 | 302 | ||
307 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 303 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
308 | #endif | 304 | #endif |
309 | 305 | ||
310 | /* Default task group. | 306 | /* Default task group. |
311 | * Every task in the system belongs to this group at bootup. | 307 | * Every task in the system belongs to this group at bootup. |
312 | */ | 308 | */ |
313 | struct task_group init_task_group; | 309 | struct task_group root_task_group; |
314 | 310 | ||
315 | #endif /* CONFIG_CGROUP_SCHED */ | 311 | #endif /* CONFIG_CGROUP_SCHED */ |
316 | 312 | ||
@@ -321,6 +317,9 @@ struct cfs_rq { | |||
321 | 317 | ||
322 | u64 exec_clock; | 318 | u64 exec_clock; |
323 | u64 min_vruntime; | 319 | u64 min_vruntime; |
320 | #ifndef CONFIG_64BIT | ||
321 | u64 min_vruntime_copy; | ||
322 | #endif | ||
324 | 323 | ||
325 | struct rb_root tasks_timeline; | 324 | struct rb_root tasks_timeline; |
326 | struct rb_node *rb_leftmost; | 325 | struct rb_node *rb_leftmost; |
@@ -332,9 +331,11 @@ struct cfs_rq { | |||
332 | * 'curr' points to currently running entity on this cfs_rq. | 331 | * 'curr' points to currently running entity on this cfs_rq. |
333 | * It is set to NULL otherwise (i.e when none are currently running). | 332 | * It is set to NULL otherwise (i.e when none are currently running). |
334 | */ | 333 | */ |
335 | struct sched_entity *curr, *next, *last; | 334 | struct sched_entity *curr, *next, *last, *skip; |
336 | 335 | ||
336 | #ifdef CONFIG_SCHED_DEBUG | ||
337 | unsigned int nr_spread_over; | 337 | unsigned int nr_spread_over; |
338 | #endif | ||
338 | 339 | ||
339 | #ifdef CONFIG_FAIR_GROUP_SCHED | 340 | #ifdef CONFIG_FAIR_GROUP_SCHED |
340 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 341 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
@@ -347,6 +348,7 @@ struct cfs_rq { | |||
347 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 348 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
348 | * list is used during load balance. | 349 | * list is used during load balance. |
349 | */ | 350 | */ |
351 | int on_list; | ||
350 | struct list_head leaf_cfs_rq_list; | 352 | struct list_head leaf_cfs_rq_list; |
351 | struct task_group *tg; /* group that "owns" this runqueue */ | 353 | struct task_group *tg; /* group that "owns" this runqueue */ |
352 | 354 | ||
@@ -365,14 +367,17 @@ struct cfs_rq { | |||
365 | unsigned long h_load; | 367 | unsigned long h_load; |
366 | 368 | ||
367 | /* | 369 | /* |
368 | * this cpu's part of tg->shares | 370 | * Maintaining per-cpu shares distribution for group scheduling |
371 | * | ||
372 | * load_stamp is the last time we updated the load average | ||
373 | * load_last is the last time we updated the load average and saw load | ||
374 | * load_unacc_exec_time is currently unaccounted execution time | ||
369 | */ | 375 | */ |
370 | unsigned long shares; | 376 | u64 load_avg; |
377 | u64 load_period; | ||
378 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
371 | 379 | ||
372 | /* | 380 | unsigned long load_contribution; |
373 | * load.weight at the time we set shares | ||
374 | */ | ||
375 | unsigned long rq_weight; | ||
376 | #endif | 381 | #endif |
377 | #endif | 382 | #endif |
378 | }; | 383 | }; |
@@ -428,6 +433,7 @@ struct litmus_rq { | |||
428 | */ | 433 | */ |
429 | struct root_domain { | 434 | struct root_domain { |
430 | atomic_t refcount; | 435 | atomic_t refcount; |
436 | struct rcu_head rcu; | ||
431 | cpumask_var_t span; | 437 | cpumask_var_t span; |
432 | cpumask_var_t online; | 438 | cpumask_var_t online; |
433 | 439 | ||
@@ -437,9 +443,7 @@ struct root_domain { | |||
437 | */ | 443 | */ |
438 | cpumask_var_t rto_mask; | 444 | cpumask_var_t rto_mask; |
439 | atomic_t rto_count; | 445 | atomic_t rto_count; |
440 | #ifdef CONFIG_SMP | ||
441 | struct cpupri cpupri; | 446 | struct cpupri cpupri; |
442 | #endif | ||
443 | }; | 447 | }; |
444 | 448 | ||
445 | /* | 449 | /* |
@@ -448,7 +452,7 @@ struct root_domain { | |||
448 | */ | 452 | */ |
449 | static struct root_domain def_root_domain; | 453 | static struct root_domain def_root_domain; |
450 | 454 | ||
451 | #endif | 455 | #endif /* CONFIG_SMP */ |
452 | 456 | ||
453 | /* | 457 | /* |
454 | * This is the main, per-CPU runqueue data structure. | 458 | * This is the main, per-CPU runqueue data structure. |
@@ -473,7 +477,7 @@ struct rq { | |||
473 | u64 nohz_stamp; | 477 | u64 nohz_stamp; |
474 | unsigned char nohz_balance_kick; | 478 | unsigned char nohz_balance_kick; |
475 | #endif | 479 | #endif |
476 | unsigned int skip_clock_update; | 480 | int skip_clock_update; |
477 | 481 | ||
478 | /* capture load from *all* tasks on this cpu: */ | 482 | /* capture load from *all* tasks on this cpu: */ |
479 | struct load_weight load; | 483 | struct load_weight load; |
@@ -500,11 +504,12 @@ struct rq { | |||
500 | */ | 504 | */ |
501 | unsigned long nr_uninterruptible; | 505 | unsigned long nr_uninterruptible; |
502 | 506 | ||
503 | struct task_struct *curr, *idle; | 507 | struct task_struct *curr, *idle, *stop; |
504 | unsigned long next_balance; | 508 | unsigned long next_balance; |
505 | struct mm_struct *prev_mm; | 509 | struct mm_struct *prev_mm; |
506 | 510 | ||
507 | u64 clock; | 511 | u64 clock; |
512 | u64 clock_task; | ||
508 | 513 | ||
509 | atomic_t nr_iowait; | 514 | atomic_t nr_iowait; |
510 | 515 | ||
@@ -532,6 +537,10 @@ struct rq { | |||
532 | u64 avg_idle; | 537 | u64 avg_idle; |
533 | #endif | 538 | #endif |
534 | 539 | ||
540 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
541 | u64 prev_irq_time; | ||
542 | #endif | ||
543 | |||
535 | /* calc_load related fields */ | 544 | /* calc_load related fields */ |
536 | unsigned long calc_load_update; | 545 | unsigned long calc_load_update; |
537 | long calc_load_active; | 546 | long calc_load_active; |
@@ -561,32 +570,17 @@ struct rq { | |||
561 | /* try_to_wake_up() stats */ | 570 | /* try_to_wake_up() stats */ |
562 | unsigned int ttwu_count; | 571 | unsigned int ttwu_count; |
563 | unsigned int ttwu_local; | 572 | unsigned int ttwu_local; |
573 | #endif | ||
564 | 574 | ||
565 | /* BKL stats */ | 575 | #ifdef CONFIG_SMP |
566 | unsigned int bkl_count; | 576 | struct task_struct *wake_list; |
567 | #endif | 577 | #endif |
568 | }; | 578 | }; |
569 | 579 | ||
570 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 580 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
571 | 581 | ||
572 | static inline | ||
573 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
574 | { | ||
575 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
576 | 582 | ||
577 | /* | 583 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); |
578 | * A queue event has occurred, and we're going to schedule. In | ||
579 | * this case, we can save a useless back to back clock update. | ||
580 | */ | ||
581 | /* LITMUS^RT: turning off the clock update is buggy in Linux 2.6.36; | ||
582 | * the scheduler can "forget" to renable the runqueue clock in some | ||
583 | * cases. LITMUS^RT amplifies the effects of this problem. Hence, we | ||
584 | * turn it off to avoid stalling clocks. */ | ||
585 | /* | ||
586 | if (test_tsk_need_resched(p)) | ||
587 | rq->skip_clock_update = 1; | ||
588 | */ | ||
589 | } | ||
590 | 584 | ||
591 | static inline int cpu_of(struct rq *rq) | 585 | static inline int cpu_of(struct rq *rq) |
592 | { | 586 | { |
@@ -599,7 +593,7 @@ static inline int cpu_of(struct rq *rq) | |||
599 | 593 | ||
600 | #define rcu_dereference_check_sched_domain(p) \ | 594 | #define rcu_dereference_check_sched_domain(p) \ |
601 | rcu_dereference_check((p), \ | 595 | rcu_dereference_check((p), \ |
602 | rcu_read_lock_sched_held() || \ | 596 | rcu_read_lock_held() || \ |
603 | lockdep_is_held(&sched_domains_mutex)) | 597 | lockdep_is_held(&sched_domains_mutex)) |
604 | 598 | ||
605 | /* | 599 | /* |
@@ -623,18 +617,22 @@ static inline int cpu_of(struct rq *rq) | |||
623 | /* | 617 | /* |
624 | * Return the group to which this task belongs. | 618 | * Return the group to which this task belongs. |
625 | * | 619 | * |
626 | * We use task_subsys_state_check() and extend the RCU verification | 620 | * We use task_subsys_state_check() and extend the RCU verification with |
627 | * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() | 621 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each |
628 | * holds that lock for each task it moves into the cgroup. Therefore | 622 | * task it moves into the cgroup. Therefore by holding either of those locks, |
629 | * by holding that lock, we pin the task to the current cgroup. | 623 | * we pin the task to the current cgroup. |
630 | */ | 624 | */ |
631 | static inline struct task_group *task_group(struct task_struct *p) | 625 | static inline struct task_group *task_group(struct task_struct *p) |
632 | { | 626 | { |
627 | struct task_group *tg; | ||
633 | struct cgroup_subsys_state *css; | 628 | struct cgroup_subsys_state *css; |
634 | 629 | ||
635 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 630 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
631 | lockdep_is_held(&p->pi_lock) || | ||
636 | lockdep_is_held(&task_rq(p)->lock)); | 632 | lockdep_is_held(&task_rq(p)->lock)); |
637 | return container_of(css, struct task_group, css); | 633 | tg = container_of(css, struct task_group, css); |
634 | |||
635 | return autogroup_task_group(p, tg); | ||
638 | } | 636 | } |
639 | 637 | ||
640 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 638 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
@@ -661,10 +659,18 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
661 | 659 | ||
662 | #endif /* CONFIG_CGROUP_SCHED */ | 660 | #endif /* CONFIG_CGROUP_SCHED */ |
663 | 661 | ||
664 | inline void update_rq_clock(struct rq *rq) | 662 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
663 | |||
664 | static void update_rq_clock(struct rq *rq) | ||
665 | { | 665 | { |
666 | if (!rq->skip_clock_update) | 666 | s64 delta; |
667 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 667 | |
668 | if (rq->skip_clock_update > 0) | ||
669 | return; | ||
670 | |||
671 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | ||
672 | rq->clock += delta; | ||
673 | update_rq_clock_task(rq, delta); | ||
668 | } | 674 | } |
669 | 675 | ||
670 | /* | 676 | /* |
@@ -677,10 +683,9 @@ inline void update_rq_clock(struct rq *rq) | |||
677 | #endif | 683 | #endif |
678 | 684 | ||
679 | /** | 685 | /** |
680 | * runqueue_is_locked | 686 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked |
681 | * @cpu: the processor in question. | 687 | * @cpu: the processor in question. |
682 | * | 688 | * |
683 | * Returns true if the current cpu runqueue is locked. | ||
684 | * This interface allows printk to be called with the runqueue lock | 689 | * This interface allows printk to be called with the runqueue lock |
685 | * held and know whether or not it is OK to wake up the klogd. | 690 | * held and know whether or not it is OK to wake up the klogd. |
686 | */ | 691 | */ |
@@ -741,7 +746,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
741 | size_t cnt, loff_t *ppos) | 746 | size_t cnt, loff_t *ppos) |
742 | { | 747 | { |
743 | char buf[64]; | 748 | char buf[64]; |
744 | char *cmp = buf; | 749 | char *cmp; |
745 | int neg = 0; | 750 | int neg = 0; |
746 | int i; | 751 | int i; |
747 | 752 | ||
@@ -752,16 +757,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
752 | return -EFAULT; | 757 | return -EFAULT; |
753 | 758 | ||
754 | buf[cnt] = 0; | 759 | buf[cnt] = 0; |
760 | cmp = strstrip(buf); | ||
755 | 761 | ||
756 | if (strncmp(buf, "NO_", 3) == 0) { | 762 | if (strncmp(cmp, "NO_", 3) == 0) { |
757 | neg = 1; | 763 | neg = 1; |
758 | cmp += 3; | 764 | cmp += 3; |
759 | } | 765 | } |
760 | 766 | ||
761 | for (i = 0; sched_feat_names[i]; i++) { | 767 | for (i = 0; sched_feat_names[i]; i++) { |
762 | int len = strlen(sched_feat_names[i]); | 768 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
763 | |||
764 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
765 | if (neg) | 769 | if (neg) |
766 | sysctl_sched_features &= ~(1UL << i); | 770 | sysctl_sched_features &= ~(1UL << i); |
767 | else | 771 | else |
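The parser change above switches from prefix matching on the raw buffer to an exact strcmp() after strstrip(). A small sketch of the fixed flow; features[], strip() and find_feature() are invented stand-ins for sched_feat_names[], strstrip() and the write handler.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static const char *features[] = { "GENTLE_FAIR_SLEEPERS", "START_DEBIT", NULL };

/* Trim trailing whitespace in place, loosely like the kernel's strstrip(). */
static char *strip(char *s)
{
	size_t len = strlen(s);

	while (len && isspace((unsigned char)s[len - 1]))
		s[--len] = '\0';
	return s;
}

/* Exact matching after stripping, as in the fixed sched_feat_write(). */
static int find_feature(char *buf)
{
	char *cmp = strip(buf);
	int neg = 0;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}
	for (int i = 0; features[i]; i++) {
		/* The old code compared only strlen(features[i]) characters,
		 * so trailing junk after a valid name was silently accepted. */
		if (strcmp(cmp, features[i]) == 0)
			return neg ? -(i + 1) : i + 1;
	}
	return 0;
}

int main(void)
{
	char a[] = "NO_START_DEBIT\n", b[] = "START_DEBITXXX";

	printf("%d %d\n", find_feature(a), find_feature(b));	/* -2 0 */
	return 0;
}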
@@ -811,20 +815,6 @@ late_initcall(sched_init_debug); | |||
811 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 815 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
812 | 816 | ||
813 | /* | 817 | /* |
814 | * ratelimit for updating the group shares. | ||
815 | * default: 0.25ms | ||
816 | */ | ||
817 | unsigned int sysctl_sched_shares_ratelimit = 250000; | ||
818 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
819 | |||
820 | /* | ||
821 | * Inject some fuzzyness into changing the per-cpu group shares | ||
822 | * this avoids remote rq-locks at the expense of fairness. | ||
823 | * default: 4 | ||
824 | */ | ||
825 | unsigned int sysctl_sched_shares_thresh = 4; | ||
826 | |||
827 | /* | ||
828 | * period over which we average the RT time consumption, measured | 818 | * period over which we average the RT time consumption, measured |
829 | * in ms. | 819 | * in ms. |
830 | * | 820 | * |
@@ -871,18 +861,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) | |||
871 | return rq->curr == p; | 861 | return rq->curr == p; |
872 | } | 862 | } |
873 | 863 | ||
874 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
875 | static inline int task_running(struct rq *rq, struct task_struct *p) | 864 | static inline int task_running(struct rq *rq, struct task_struct *p) |
876 | { | 865 | { |
866 | #ifdef CONFIG_SMP | ||
867 | return p->on_cpu; | ||
868 | #else | ||
877 | return task_current(rq, p); | 869 | return task_current(rq, p); |
870 | #endif | ||
878 | } | 871 | } |
879 | 872 | ||
873 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
880 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 874 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
881 | { | 875 | { |
876 | #ifdef CONFIG_SMP | ||
877 | /* | ||
878 | * We can optimise this out completely for !SMP, because the | ||
879 | * SMP rebalancing from interrupt is the only thing that cares | ||
880 | * here. | ||
881 | */ | ||
882 | next->on_cpu = 1; | ||
883 | #endif | ||
882 | } | 884 | } |
883 | 885 | ||
884 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 886 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
885 | { | 887 | { |
888 | #ifdef CONFIG_SMP | ||
889 | /* | ||
890 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
891 | * We must ensure this doesn't happen until the switch is completely | ||
892 | * finished. | ||
893 | */ | ||
894 | smp_wmb(); | ||
895 | prev->on_cpu = 0; | ||
896 | #endif | ||
886 | #ifdef CONFIG_DEBUG_SPINLOCK | 897 | #ifdef CONFIG_DEBUG_SPINLOCK |
887 | /* this is a valid case when another task releases the spinlock */ | 898 | /* this is a valid case when another task releases the spinlock */ |
888 | rq->lock.owner = current; | 899 | rq->lock.owner = current; |
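task_running() now reads p->on_cpu, and the switch-out path publishes "no longer on this CPU" only after a barrier. A rough userspace analogue using C11 atomics, where the release store plays the role of the kernel's smp_wmb() before clearing on_cpu (on_cpu, prepare_switch() and finish_switch() are invented names, not kernel API).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the per-task flag that replaces the old ->oncpu field. */
static _Atomic int on_cpu;

static void prepare_switch(void)
{
	atomic_store_explicit(&on_cpu, 1, memory_order_relaxed);
}

/*
 * The clearing side must order everything the outgoing task did on this CPU
 * before the flag reads 0, because another CPU may pick the task up as soon
 * as it sees on_cpu == 0; the release store here stands in for the kernel's
 * smp_wmb() before "prev->on_cpu = 0".
 */
static void finish_switch(void)
{
	atomic_store_explicit(&on_cpu, 0, memory_order_release);
}

/* What task_running() now boils down to on SMP. */
static bool task_running(void)
{
	return atomic_load_explicit(&on_cpu, memory_order_acquire);
}

int main(void)
{
	prepare_switch();
	printf("running=%d\n", task_running());
	finish_switch();
	printf("running=%d\n", task_running());
	return 0;
}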
@@ -898,15 +909,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
898 | } | 909 | } |
899 | 910 | ||
900 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 911 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
901 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
902 | { | ||
903 | #ifdef CONFIG_SMP | ||
904 | return p->oncpu; | ||
905 | #else | ||
906 | return task_current(rq, p); | ||
907 | #endif | ||
908 | } | ||
909 | |||
910 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 912 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
911 | { | 913 | { |
912 | #ifdef CONFIG_SMP | 914 | #ifdef CONFIG_SMP |
@@ -915,7 +917,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
915 | * SMP rebalancing from interrupt is the only thing that cares | 917 | * SMP rebalancing from interrupt is the only thing that cares |
916 | * here. | 918 | * here. |
917 | */ | 919 | */ |
918 | next->oncpu = 1; | 920 | next->on_cpu = 1; |
919 | #endif | 921 | #endif |
920 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 922 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
921 | raw_spin_unlock_irq(&rq->lock); | 923 | raw_spin_unlock_irq(&rq->lock); |
@@ -928,12 +930,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
928 | { | 930 | { |
929 | #ifdef CONFIG_SMP | 931 | #ifdef CONFIG_SMP |
930 | /* | 932 | /* |
931 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 933 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
932 | * We must ensure this doesn't happen until the switch is completely | 934 | * We must ensure this doesn't happen until the switch is completely |
933 | * finished. | 935 | * finished. |
934 | */ | 936 | */ |
935 | smp_wmb(); | 937 | smp_wmb(); |
936 | prev->oncpu = 0; | 938 | prev->on_cpu = 0; |
937 | #endif | 939 | #endif |
938 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 940 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
939 | local_irq_enable(); | 941 | local_irq_enable(); |
@@ -942,23 +944,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
942 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 944 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
943 | 945 | ||
944 | /* | 946 | /* |
945 | * Check whether the task is waking, we use this to synchronize ->cpus_allowed | 947 | * __task_rq_lock - lock the rq @p resides on. |
946 | * against ttwu(). | ||
947 | */ | ||
948 | static inline int task_is_waking(struct task_struct *p) | ||
949 | { | ||
950 | return unlikely(p->state == TASK_WAKING); | ||
951 | } | ||
952 | |||
953 | /* | ||
954 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
955 | * Must be called interrupts disabled. | ||
956 | */ | 948 | */ |
957 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 949 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
958 | __acquires(rq->lock) | 950 | __acquires(rq->lock) |
959 | { | 951 | { |
960 | struct rq *rq; | 952 | struct rq *rq; |
961 | 953 | ||
954 | lockdep_assert_held(&p->pi_lock); | ||
955 | |||
962 | for (;;) { | 956 | for (;;) { |
963 | rq = task_rq(p); | 957 | rq = task_rq(p); |
964 | raw_spin_lock(&rq->lock); | 958 | raw_spin_lock(&rq->lock); |
@@ -969,22 +963,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
969 | } | 963 | } |
970 | 964 | ||
971 | /* | 965 | /* |
972 | * task_rq_lock - lock the runqueue a given task resides on and disable | 966 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. |
973 | * interrupts. Note the ordering: we can safely lookup the task_rq without | ||
974 | * explicitly disabling preemption. | ||
975 | */ | 967 | */ |
976 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 968 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
969 | __acquires(p->pi_lock) | ||
977 | __acquires(rq->lock) | 970 | __acquires(rq->lock) |
978 | { | 971 | { |
979 | struct rq *rq; | 972 | struct rq *rq; |
980 | 973 | ||
981 | for (;;) { | 974 | for (;;) { |
982 | local_irq_save(*flags); | 975 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
983 | rq = task_rq(p); | 976 | rq = task_rq(p); |
984 | raw_spin_lock(&rq->lock); | 977 | raw_spin_lock(&rq->lock); |
985 | if (likely(rq == task_rq(p))) | 978 | if (likely(rq == task_rq(p))) |
986 | return rq; | 979 | return rq; |
987 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 980 | raw_spin_unlock(&rq->lock); |
981 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
988 | } | 982 | } |
989 | } | 983 | } |
990 | 984 | ||
@@ -994,10 +988,13 @@ static void __task_rq_unlock(struct rq *rq) | |||
994 | raw_spin_unlock(&rq->lock); | 988 | raw_spin_unlock(&rq->lock); |
995 | } | 989 | } |
996 | 990 | ||
997 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 991 | static inline void |
992 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | ||
998 | __releases(rq->lock) | 993 | __releases(rq->lock) |
994 | __releases(p->pi_lock) | ||
999 | { | 995 | { |
1000 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 996 | raw_spin_unlock(&rq->lock); |
997 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
1001 | } | 998 | } |
1002 | 999 | ||
1003 | /* | 1000 | /* |
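task_rq_lock() now nests p->pi_lock outside the runqueue lock and retries if the task migrated between the lookup and the lock acquisition. A userspace sketch of that lock/recheck/retry shape with pthread mutexes; fake_task, lock_task_rq() and unlock_task_rq() are invented, and the kernel's interrupt disabling is omitted.

#include <pthread.h>
#include <stdio.h>

/* Two "runqueue" locks and a task whose home runqueue can change. */
static pthread_mutex_t rq_lock[2] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

struct fake_task {
	pthread_mutex_t pi_lock;
	int cpu;			/* which rq_lock currently protects the task */
};

/*
 * Same shape as the new task_rq_lock(): take p->pi_lock, then the runqueue
 * lock the task appears to be on, and retry from scratch if the task moved
 * in between the lookup and the lock acquisition.
 */
static int lock_task_rq(struct fake_task *p)
{
	for (;;) {
		pthread_mutex_lock(&p->pi_lock);
		int cpu = p->cpu;

		pthread_mutex_lock(&rq_lock[cpu]);
		if (cpu == p->cpu)
			return cpu;		/* stable: both locks held */
		pthread_mutex_unlock(&rq_lock[cpu]);
		pthread_mutex_unlock(&p->pi_lock);
	}
}

static void unlock_task_rq(struct fake_task *p, int cpu)
{
	pthread_mutex_unlock(&rq_lock[cpu]);
	pthread_mutex_unlock(&p->pi_lock);
}

int main(void)
{
	struct fake_task t = { .pi_lock = PTHREAD_MUTEX_INITIALIZER, .cpu = 1 };
	int cpu = lock_task_rq(&t);

	printf("task locked on cpu %d\n", cpu);
	unlock_task_rq(&t, cpu);
	return 0;
}

Build with -pthread; since nothing migrates the task here, the loop succeeds on the first pass.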
@@ -1227,11 +1224,17 @@ int get_nohz_timer_target(void) | |||
1227 | int i; | 1224 | int i; |
1228 | struct sched_domain *sd; | 1225 | struct sched_domain *sd; |
1229 | 1226 | ||
1227 | rcu_read_lock(); | ||
1230 | for_each_domain(cpu, sd) { | 1228 | for_each_domain(cpu, sd) { |
1231 | for_each_cpu(i, sched_domain_span(sd)) | 1229 | for_each_cpu(i, sched_domain_span(sd)) { |
1232 | if (!idle_cpu(i)) | 1230 | if (!idle_cpu(i)) { |
1233 | return i; | 1231 | cpu = i; |
1232 | goto unlock; | ||
1233 | } | ||
1234 | } | ||
1234 | } | 1235 | } |
1236 | unlock: | ||
1237 | rcu_read_unlock(); | ||
1235 | return cpu; | 1238 | return cpu; |
1236 | } | 1239 | } |
1237 | /* | 1240 | /* |
@@ -1341,15 +1344,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1341 | { | 1344 | { |
1342 | u64 tmp; | 1345 | u64 tmp; |
1343 | 1346 | ||
1347 | /* | ||
1348 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
1349 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
1350 | * 2^SCHED_LOAD_RESOLUTION. | ||
1351 | */ | ||
1352 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
1353 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
1354 | else | ||
1355 | tmp = (u64)delta_exec; | ||
1356 | |||
1344 | if (!lw->inv_weight) { | 1357 | if (!lw->inv_weight) { |
1345 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) | 1358 | unsigned long w = scale_load_down(lw->weight); |
1359 | |||
1360 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
1346 | lw->inv_weight = 1; | 1361 | lw->inv_weight = 1; |
1362 | else if (unlikely(!w)) | ||
1363 | lw->inv_weight = WMULT_CONST; | ||
1347 | else | 1364 | else |
1348 | lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) | 1365 | lw->inv_weight = WMULT_CONST / w; |
1349 | / (lw->weight+1); | ||
1350 | } | 1366 | } |
1351 | 1367 | ||
1352 | tmp = (u64)delta_exec * weight; | ||
1353 | /* | 1368 | /* |
1354 | * Check whether we'd overflow the 64-bit multiplication: | 1369 | * Check whether we'd overflow the 64-bit multiplication: |
1355 | */ | 1370 | */ |
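calc_delta_mine() still computes delta_exec * weight / lw->weight, but via a cached fixed-point inverse of the (scaled-down) queue weight. A simplified sketch under those assumptions; calc_delta() is an invented name, and the kernel's 64-bit overflow guard is left out.

#include <stdint.h>
#include <stdio.h>

#define WMULT_SHIFT 32
#define WMULT_CONST (~0U)		/* ~2^32, as in the scheduler */

/*
 * delta ~= delta_exec * weight / lw_weight, done as a fixed-point multiply
 * by a cached inverse weight instead of dividing on every invocation.
 */
static uint64_t calc_delta(uint64_t delta_exec, unsigned long weight,
			   unsigned long lw_weight, uint32_t *inv_weight)
{
	if (!*inv_weight)
		*inv_weight = WMULT_CONST / lw_weight;	/* cache ~2^32 / lw_weight */

	return (delta_exec * weight * (uint64_t)(*inv_weight)) >> WMULT_SHIFT;
}

int main(void)
{
	uint32_t inv = 0;

	/* 10ms of runtime, entity weight 1024, queue weight 3072 -> about a third. */
	printf("%llu\n",
	       (unsigned long long)calc_delta(10000000ULL, 1024, 3072, &inv));
	return 0;
}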
@@ -1374,6 +1389,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
1374 | lw->inv_weight = 0; | 1389 | lw->inv_weight = 0; |
1375 | } | 1390 | } |
1376 | 1391 | ||
1392 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1393 | { | ||
1394 | lw->weight = w; | ||
1395 | lw->inv_weight = 0; | ||
1396 | } | ||
1397 | |||
1377 | /* | 1398 | /* |
1378 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1399 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
1379 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1400 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -1562,101 +1583,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1562 | 1583 | ||
1563 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1584 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1564 | 1585 | ||
1565 | static __read_mostly unsigned long __percpu *update_shares_data; | ||
1566 | |||
1567 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1568 | |||
1569 | /* | ||
1570 | * Calculate and set the cpu's group shares. | ||
1571 | */ | ||
1572 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1573 | unsigned long sd_shares, | ||
1574 | unsigned long sd_rq_weight, | ||
1575 | unsigned long *usd_rq_weight) | ||
1576 | { | ||
1577 | unsigned long shares, rq_weight; | ||
1578 | int boost = 0; | ||
1579 | |||
1580 | rq_weight = usd_rq_weight[cpu]; | ||
1581 | if (!rq_weight) { | ||
1582 | boost = 1; | ||
1583 | rq_weight = NICE_0_LOAD; | ||
1584 | } | ||
1585 | |||
1586 | /* | ||
1587 | * \Sum_j shares_j * rq_weight_i | ||
1588 | * shares_i = ----------------------------- | ||
1589 | * \Sum_j rq_weight_j | ||
1590 | */ | ||
1591 | shares = (sd_shares * rq_weight) / sd_rq_weight; | ||
1592 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
1593 | |||
1594 | if (abs(shares - tg->se[cpu]->load.weight) > | ||
1595 | sysctl_sched_shares_thresh) { | ||
1596 | struct rq *rq = cpu_rq(cpu); | ||
1597 | unsigned long flags; | ||
1598 | |||
1599 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1600 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; | ||
1601 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1602 | __set_se_shares(tg->se[cpu], shares); | ||
1603 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1604 | } | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * Re-compute the task group their per cpu shares over the given domain. | ||
1609 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1610 | * parent group depends on the shares of its child groups. | ||
1611 | */ | ||
1612 | static int tg_shares_up(struct task_group *tg, void *data) | ||
1613 | { | ||
1614 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; | ||
1615 | unsigned long *usd_rq_weight; | ||
1616 | struct sched_domain *sd = data; | ||
1617 | unsigned long flags; | ||
1618 | int i; | ||
1619 | |||
1620 | if (!tg->se[0]) | ||
1621 | return 0; | ||
1622 | |||
1623 | local_irq_save(flags); | ||
1624 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); | ||
1625 | |||
1626 | for_each_cpu(i, sched_domain_span(sd)) { | ||
1627 | weight = tg->cfs_rq[i]->load.weight; | ||
1628 | usd_rq_weight[i] = weight; | ||
1629 | |||
1630 | rq_weight += weight; | ||
1631 | /* | ||
1632 | * If there are currently no tasks on the cpu pretend there | ||
1633 | * is one of average load so that when a new task gets to | ||
1634 | * run here it will not get delayed by group starvation. | ||
1635 | */ | ||
1636 | if (!weight) | ||
1637 | weight = NICE_0_LOAD; | ||
1638 | |||
1639 | sum_weight += weight; | ||
1640 | shares += tg->cfs_rq[i]->shares; | ||
1641 | } | ||
1642 | |||
1643 | if (!rq_weight) | ||
1644 | rq_weight = sum_weight; | ||
1645 | |||
1646 | if ((!shares && rq_weight) || shares > tg->shares) | ||
1647 | shares = tg->shares; | ||
1648 | |||
1649 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
1650 | shares = tg->shares; | ||
1651 | |||
1652 | for_each_cpu(i, sched_domain_span(sd)) | ||
1653 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); | ||
1654 | |||
1655 | local_irq_restore(flags); | ||
1656 | |||
1657 | return 0; | ||
1658 | } | ||
1659 | |||
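For reference, the per-domain rule being deleted here distributed a group's shares in proportion to each CPU's runqueue weight, shares_i = sd_shares * rq_weight_i / sum_j rq_weight_j. A tiny worked example of just that formula (all values invented):

#include <stdio.h>

/* shares_i = sd_shares * rq_weight_i / sum_j rq_weight_j  (the removed rule). */
int main(void)
{
	unsigned long sd_shares = 1024;				/* group shares in this domain */
	unsigned long rq_weight[3] = { 2048, 1024, 1024 };	/* per-cpu queue weight */
	unsigned long sum = 0;

	for (int i = 0; i < 3; i++)
		sum += rq_weight[i];

	for (int i = 0; i < 3; i++)
		printf("cpu%d shares = %lu\n", i,
		       sd_shares * rq_weight[i] / sum);		/* 512, 256, 256 */
	return 0;
}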
1660 | /* | 1586 | /* |
1661 | * Compute the cpu's hierarchical load factor for each task group. | 1587 | * Compute the cpu's hierarchical load factor for each task group. |
1662 | * This needs to be done in a top-down fashion because the load of a child | 1588 | * This needs to be done in a top-down fashion because the load of a child |
@@ -1671,7 +1597,7 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1671 | load = cpu_rq(cpu)->load.weight; | 1597 | load = cpu_rq(cpu)->load.weight; |
1672 | } else { | 1598 | } else { |
1673 | load = tg->parent->cfs_rq[cpu]->h_load; | 1599 | load = tg->parent->cfs_rq[cpu]->h_load; |
1674 | load *= tg->cfs_rq[cpu]->shares; | 1600 | load *= tg->se[cpu]->load.weight; |
1675 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 1601 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
1676 | } | 1602 | } |
1677 | 1603 | ||
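The retained hierarchical load computation scales the parent's h_load by this group's entity weight relative to the parent queue's total weight, with the "+ 1" guarding against an empty parent queue. A small sketch (h_load() here is an invented helper, not the kernel function):

#include <stdio.h>

/*
 * Hierarchical CFS load on one CPU: a group's h_load is its parent's h_load
 * scaled by this group's entity weight relative to the parent runqueue's
 * weight; the "+ 1" avoids dividing by zero for an empty parent queue.
 */
static unsigned long h_load(unsigned long parent_h_load,
			    unsigned long se_weight,
			    unsigned long parent_cfs_weight)
{
	return parent_h_load * se_weight / (parent_cfs_weight + 1);
}

int main(void)
{
	/* Parent load 2048; this group's entity weighs 1024 of the parent's 4096. */
	printf("h_load = %lu\n", h_load(2048, 1024, 4096));
	return 0;
}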
@@ -1680,34 +1606,11 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1680 | return 0; | 1606 | return 0; |
1681 | } | 1607 | } |
1682 | 1608 | ||
1683 | static void update_shares(struct sched_domain *sd) | ||
1684 | { | ||
1685 | s64 elapsed; | ||
1686 | u64 now; | ||
1687 | |||
1688 | if (root_task_group_empty()) | ||
1689 | return; | ||
1690 | |||
1691 | now = local_clock(); | ||
1692 | elapsed = now - sd->last_update; | ||
1693 | |||
1694 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
1695 | sd->last_update = now; | ||
1696 | walk_tg_tree(tg_nop, tg_shares_up, sd); | ||
1697 | } | ||
1698 | } | ||
1699 | |||
1700 | static void update_h_load(long cpu) | 1609 | static void update_h_load(long cpu) |
1701 | { | 1610 | { |
1702 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1611 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1703 | } | 1612 | } |
1704 | 1613 | ||
1705 | #else | ||
1706 | |||
1707 | static inline void update_shares(struct sched_domain *sd) | ||
1708 | { | ||
1709 | } | ||
1710 | |||
1711 | #endif | 1614 | #endif |
1712 | 1615 | ||
1713 | #ifdef CONFIG_PREEMPT | 1616 | #ifdef CONFIG_PREEMPT |
@@ -1827,15 +1730,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1827 | __release(rq2->lock); | 1730 | __release(rq2->lock); |
1828 | } | 1731 | } |
1829 | 1732 | ||
1830 | #endif | 1733 | #else /* CONFIG_SMP */ |
1831 | 1734 | ||
1832 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1735 | /* |
1833 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | 1736 | * double_rq_lock - safely lock two runqueues |
1737 | * | ||
1738 | * Note this does not disable interrupts like task_rq_lock, | ||
1739 | * you need to do so manually before calling. | ||
1740 | */ | ||
1741 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1742 | __acquires(rq1->lock) | ||
1743 | __acquires(rq2->lock) | ||
1834 | { | 1744 | { |
1835 | #ifdef CONFIG_SMP | 1745 | BUG_ON(!irqs_disabled()); |
1836 | cfs_rq->shares = shares; | 1746 | BUG_ON(rq1 != rq2); |
1837 | #endif | 1747 | raw_spin_lock(&rq1->lock); |
1748 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1838 | } | 1749 | } |
1750 | |||
1751 | /* | ||
1752 | * double_rq_unlock - safely unlock two runqueues | ||
1753 | * | ||
1754 | * Note this does not restore interrupts like task_rq_unlock, | ||
1755 | * you need to do so manually after calling. | ||
1756 | */ | ||
1757 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1758 | __releases(rq1->lock) | ||
1759 | __releases(rq2->lock) | ||
1760 | { | ||
1761 | BUG_ON(rq1 != rq2); | ||
1762 | raw_spin_unlock(&rq1->lock); | ||
1763 | __release(rq2->lock); | ||
1764 | } | ||
1765 | |||
1839 | #endif | 1766 | #endif |
1840 | 1767 | ||
1841 | static void calc_load_account_idle(struct rq *this_rq); | 1768 | static void calc_load_account_idle(struct rq *this_rq); |
@@ -1877,23 +1804,20 @@ static void dec_nr_running(struct rq *rq) | |||
1877 | 1804 | ||
1878 | static void set_load_weight(struct task_struct *p) | 1805 | static void set_load_weight(struct task_struct *p) |
1879 | { | 1806 | { |
1880 | if (task_has_rt_policy(p)) { | 1807 | int prio = p->static_prio - MAX_RT_PRIO; |
1881 | p->se.load.weight = 0; | 1808 | struct load_weight *load = &p->se.load; |
1882 | p->se.load.inv_weight = WMULT_CONST; | ||
1883 | return; | ||
1884 | } | ||
1885 | 1809 | ||
1886 | /* | 1810 | /* |
1887 | * SCHED_IDLE tasks get minimal weight: | 1811 | * SCHED_IDLE tasks get minimal weight: |
1888 | */ | 1812 | */ |
1889 | if (p->policy == SCHED_IDLE) { | 1813 | if (p->policy == SCHED_IDLE) { |
1890 | p->se.load.weight = WEIGHT_IDLEPRIO; | 1814 | load->weight = scale_load(WEIGHT_IDLEPRIO); |
1891 | p->se.load.inv_weight = WMULT_IDLEPRIO; | 1815 | load->inv_weight = WMULT_IDLEPRIO; |
1892 | return; | 1816 | return; |
1893 | } | 1817 | } |
1894 | 1818 | ||
1895 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; | 1819 | load->weight = scale_load(prio_to_weight[prio]); |
1896 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1820 | load->inv_weight = prio_to_wmult[prio]; |
1897 | } | 1821 | } |
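set_load_weight() resolves a task's static priority to a fixed-point load weight via prio_to_weight[], and CFS then divides CPU time in proportion to those weights (roughly a 1.25x step per nice level, with nice 0 mapping to 1024). A minimal standalone sketch of that proportionality follows; the table slice and names are restated here purely for illustration and are not part of the diff above.

#include <stdio.h>

/* Illustrative subset of the nice-to-weight mapping (nice -5 .. +5).
 * Each step changes the weight by roughly 1.25x; nice 0 maps to 1024. */
static const unsigned int nice_to_weight[] = {
	3121, 2501, 1991, 1586, 1277,   /* nice -5 .. -1 */
	1024,                           /* nice  0       */
	 820,  655,  526,  423,  335,   /* nice +1 .. +5 */
};

static unsigned int weight_of(int nice)
{
	return nice_to_weight[nice + 5];
}

int main(void)
{
	/* Two CFS tasks sharing one CPU: each gets a slice proportional
	 * to its weight. */
	unsigned int wa = weight_of(0), wb = weight_of(5);

	printf("nice 0 vs nice 5: %.1f%% vs %.1f%% of the CPU\n",
	       100.0 * wa / (wa + wb), 100.0 * wb / (wa + wb));
	return 0;
}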
1898 | 1822 | ||
1899 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 1823 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1901,7 +1825,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1901 | update_rq_clock(rq); | 1825 | update_rq_clock(rq); |
1902 | sched_info_queued(p); | 1826 | sched_info_queued(p); |
1903 | p->sched_class->enqueue_task(rq, p, flags); | 1827 | p->sched_class->enqueue_task(rq, p, flags); |
1904 | p->se.on_rq = 1; | ||
1905 | } | 1828 | } |
1906 | 1829 | ||
1907 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 1830 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1909,7 +1832,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1909 | update_rq_clock(rq); | 1832 | update_rq_clock(rq); |
1910 | sched_info_dequeued(p); | 1833 | sched_info_dequeued(p); |
1911 | p->sched_class->dequeue_task(rq, p, flags); | 1834 | p->sched_class->dequeue_task(rq, p, flags); |
1912 | p->se.on_rq = 0; | ||
1913 | } | 1835 | } |
1914 | 1836 | ||
1915 | /* | 1837 | /* |
@@ -1936,14 +1858,227 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1936 | dec_nr_running(rq); | 1858 | dec_nr_running(rq); |
1937 | } | 1859 | } |
1938 | 1860 | ||
1861 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1862 | |||
1863 | /* | ||
1864 | * There are no locks covering percpu hardirq/softirq time. | ||
1865 | * They are only modified in account_system_vtime, on the corresponding CPU | ||
1866 | * with interrupts disabled, so writes are safe. | ||
1867 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
1868 | * This may result in another CPU reading this CPU's irq time and can | ||
1869 | * race with irq/account_system_vtime on this CPU. We would either get the old | ||
1870 | * or the new value, with the side effect of accounting a slice of irq time to the | ||
1871 | * wrong task when an irq is in progress while we read rq->clock. That is a worthy | ||
1872 | * compromise in place of having locks on each irq in account_system_time. | ||
1873 | */ | ||
1874 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
1875 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
1876 | |||
1877 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
1878 | static int sched_clock_irqtime; | ||
1879 | |||
1880 | void enable_sched_clock_irqtime(void) | ||
1881 | { | ||
1882 | sched_clock_irqtime = 1; | ||
1883 | } | ||
1884 | |||
1885 | void disable_sched_clock_irqtime(void) | ||
1886 | { | ||
1887 | sched_clock_irqtime = 0; | ||
1888 | } | ||
1889 | |||
1890 | #ifndef CONFIG_64BIT | ||
1891 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
1892 | |||
1893 | static inline void irq_time_write_begin(void) | ||
1894 | { | ||
1895 | __this_cpu_inc(irq_time_seq.sequence); | ||
1896 | smp_wmb(); | ||
1897 | } | ||
1898 | |||
1899 | static inline void irq_time_write_end(void) | ||
1900 | { | ||
1901 | smp_wmb(); | ||
1902 | __this_cpu_inc(irq_time_seq.sequence); | ||
1903 | } | ||
1904 | |||
1905 | static inline u64 irq_time_read(int cpu) | ||
1906 | { | ||
1907 | u64 irq_time; | ||
1908 | unsigned seq; | ||
1909 | |||
1910 | do { | ||
1911 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
1912 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
1913 | per_cpu(cpu_hardirq_time, cpu); | ||
1914 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1915 | |||
1916 | return irq_time; | ||
1917 | } | ||
1918 | #else /* CONFIG_64BIT */ | ||
1919 | static inline void irq_time_write_begin(void) | ||
1920 | { | ||
1921 | } | ||
1922 | |||
1923 | static inline void irq_time_write_end(void) | ||
1924 | { | ||
1925 | } | ||
1926 | |||
1927 | static inline u64 irq_time_read(int cpu) | ||
1928 | { | ||
1929 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
1930 | } | ||
1931 | #endif /* CONFIG_64BIT */ | ||
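The #ifndef CONFIG_64BIT branch above exists because a 64-bit per-cpu counter cannot be loaded atomically on a 32-bit machine, so readers retry around a sequence counter instead of taking a lock. A minimal userspace sketch of the same write-begin/write-end/read-retry pattern, assuming C11 atomics; the struct and function names are illustrative and only mirror the kernel's barrier placement, they are not its API.

#include <stdatomic.h>
#include <stdint.h>

struct irqtime {
	atomic_uint seq;                 /* odd while an update is in flight */
	_Atomic uint64_t hardirq_ns;
	_Atomic uint64_t softirq_ns;
};

/* Writer: only ever runs on the owning CPU with interrupts off, so writers
 * never race each other; only readers have to retry. */
static void irqtime_add_hardirq(struct irqtime *t, uint64_t delta)
{
	uint64_t v = atomic_load_explicit(&t->hardirq_ns, memory_order_relaxed);

	atomic_fetch_add_explicit(&t->seq, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);        /* like smp_wmb() */
	atomic_store_explicit(&t->hardirq_ns, v + delta, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);        /* like smp_wmb() */
	atomic_fetch_add_explicit(&t->seq, 1, memory_order_relaxed);
}

/* Reader: retry until a stable, even sequence number bracketed both loads. */
static uint64_t irqtime_read(struct irqtime *t)
{
	unsigned int s1, s2;
	uint64_t sum;

	do {
		s1 = atomic_load_explicit(&t->seq, memory_order_acquire);
		sum = atomic_load_explicit(&t->hardirq_ns, memory_order_relaxed) +
		      atomic_load_explicit(&t->softirq_ns, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);  /* like smp_rmb() */
		s2 = atomic_load_explicit(&t->seq, memory_order_relaxed);
	} while ((s1 & 1) || s1 != s2);

	return sum;
}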
1932 | |||
1933 | /* | ||
1934 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
1935 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
1936 | */ | ||
1937 | void account_system_vtime(struct task_struct *curr) | ||
1938 | { | ||
1939 | unsigned long flags; | ||
1940 | s64 delta; | ||
1941 | int cpu; | ||
1942 | |||
1943 | if (!sched_clock_irqtime) | ||
1944 | return; | ||
1945 | |||
1946 | local_irq_save(flags); | ||
1947 | |||
1948 | cpu = smp_processor_id(); | ||
1949 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
1950 | __this_cpu_add(irq_start_time, delta); | ||
1951 | |||
1952 | irq_time_write_begin(); | ||
1953 | /* | ||
1954 | * We do not account for softirq time from ksoftirqd here. | ||
1955 | * We want to continue accounting softirq time to ksoftirqd thread | ||
1956 | * in that case, so as not to confuse scheduler with a special task | ||
1957 | * that do not consume any time, but still wants to run. | ||
1958 | */ | ||
1959 | if (hardirq_count()) | ||
1960 | __this_cpu_add(cpu_hardirq_time, delta); | ||
1961 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
1962 | __this_cpu_add(cpu_softirq_time, delta); | ||
1963 | |||
1964 | irq_time_write_end(); | ||
1965 | local_irq_restore(flags); | ||
1966 | } | ||
1967 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
1968 | |||
1969 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
1970 | { | ||
1971 | s64 irq_delta; | ||
1972 | |||
1973 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | ||
1974 | |||
1975 | /* | ||
1976 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
1977 | * this case when a previous update_rq_clock() happened inside a | ||
1978 | * {soft,}irq region. | ||
1979 | * | ||
1980 | * When this happens, we stop ->clock_task and only update the | ||
1981 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
1982 | * update will consume the rest. This ensures ->clock_task is | ||
1983 | * monotonic. | ||
1984 | * | ||
1985 | * It does, however, cause some slight misattribution of {soft,}irq | ||
1986 | * time; a more accurate solution would be to update the irq_time using | ||
1987 | * the current rq->clock timestamp, except that would require using | ||
1988 | * atomic ops. | ||
1989 | */ | ||
1990 | if (irq_delta > delta) | ||
1991 | irq_delta = delta; | ||
1992 | |||
1993 | rq->prev_irq_time += irq_delta; | ||
1994 | delta -= irq_delta; | ||
1995 | rq->clock_task += delta; | ||
1996 | |||
1997 | if (irq_delta && sched_feat(NONIRQ_POWER)) | ||
1998 | sched_rt_avg_update(rq, irq_delta); | ||
1999 | } | ||
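The clamp in update_rq_clock_task() is worth reading in isolation: delta is the raw clock advance, irq_delta the irq time observed since the last update, and capping irq_delta at delta keeps clock_task monotonic while deferring the unconsumed remainder to the next update. A minimal sketch of just that arithmetic, with hypothetical field names rather than the real struct rq:

#include <stdint.h>

struct clocks {
	uint64_t clock_task;     /* time the task actually ran   */
	uint64_t prev_irq_time;  /* irq time already subtracted  */
};

/* Advance clock_task by 'delta' ns of raw clock while subtracting any newly
 * observed irq time. If more irq time showed up than the clock advanced
 * (irq time is only folded in at irq exit), consume only the part that fits
 * now and leave the rest for the next update, so clock_task never moves
 * backwards. */
static void clock_task_advance(struct clocks *c, int64_t delta,
			       uint64_t total_irq_time)
{
	int64_t irq_delta = (int64_t)(total_irq_time - c->prev_irq_time);

	if (irq_delta > delta)
		irq_delta = delta;

	c->prev_irq_time += irq_delta;
	c->clock_task += delta - irq_delta;
}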
2000 | |||
2001 | static int irqtime_account_hi_update(void) | ||
2002 | { | ||
2003 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
2004 | unsigned long flags; | ||
2005 | u64 latest_ns; | ||
2006 | int ret = 0; | ||
2007 | |||
2008 | local_irq_save(flags); | ||
2009 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
2010 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | ||
2011 | ret = 1; | ||
2012 | local_irq_restore(flags); | ||
2013 | return ret; | ||
2014 | } | ||
2015 | |||
2016 | static int irqtime_account_si_update(void) | ||
2017 | { | ||
2018 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
2019 | unsigned long flags; | ||
2020 | u64 latest_ns; | ||
2021 | int ret = 0; | ||
2022 | |||
2023 | local_irq_save(flags); | ||
2024 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
2025 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | ||
2026 | ret = 1; | ||
2027 | local_irq_restore(flags); | ||
2028 | return ret; | ||
2029 | } | ||
2030 | |||
2031 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
2032 | |||
2033 | #define sched_clock_irqtime (0) | ||
2034 | |||
2035 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
2036 | { | ||
2037 | rq->clock_task += delta; | ||
2038 | } | ||
2039 | |||
2040 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
2041 | |||
1939 | #include "sched_idletask.c" | 2042 | #include "sched_idletask.c" |
1940 | #include "sched_fair.c" | 2043 | #include "sched_fair.c" |
1941 | #include "sched_rt.c" | 2044 | #include "sched_rt.c" |
2045 | #include "sched_autogroup.c" | ||
2046 | #include "sched_stoptask.c" | ||
1942 | #include "../litmus/sched_litmus.c" | 2047 | #include "../litmus/sched_litmus.c" |
1943 | #ifdef CONFIG_SCHED_DEBUG | 2048 | #ifdef CONFIG_SCHED_DEBUG |
1944 | # include "sched_debug.c" | 2049 | # include "sched_debug.c" |
1945 | #endif | 2050 | #endif |
1946 | 2051 | ||
2052 | void sched_set_stop_task(int cpu, struct task_struct *stop) | ||
2053 | { | ||
2054 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
2055 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | ||
2056 | |||
2057 | if (stop) { | ||
2058 | /* | ||
2059 | * Make it appear like a SCHED_FIFO task; it's something | ||
2060 | * userspace knows about and won't get confused about. | ||
2061 | * | ||
2062 | * Also, it will make PI more or less work without too | ||
2063 | * much confusion -- but then, stop work should not | ||
2064 | * rely on PI working anyway. | ||
2065 | */ | ||
2066 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | ||
2067 | |||
2068 | stop->sched_class = &stop_sched_class; | ||
2069 | } | ||
2070 | |||
2071 | cpu_rq(cpu)->stop = stop; | ||
2072 | |||
2073 | if (old_stop) { | ||
2074 | /* | ||
2075 | * Reset it back to a normal scheduling class so that | ||
2076 | * it can die in pieces. | ||
2077 | */ | ||
2078 | old_stop->sched_class = &rt_sched_class; | ||
2079 | } | ||
2080 | } | ||
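The stop task above is only dressed up as SCHED_FIFO so that anything inspecting it sees a policy it understands; the actual scheduling is done by stop_sched_class. The userspace-visible half of that convention is the ordinary sched_setscheduler() interface; a small sketch follows (requires CAP_SYS_NICE or root, error handling trimmed).

#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct sched_param param = {
		/* highest FIFO priority the kernel will accept */
		.sched_priority = sched_get_priority_max(SCHED_FIFO),
	};

	/* Switch the calling process to SCHED_FIFO. */
	if (sched_setscheduler(0, SCHED_FIFO, &param) != 0) {
		perror("sched_setscheduler");
		return 1;
	}

	printf("pid %d now runs as SCHED_FIFO prio %d\n",
	       (int)getpid(), param.sched_priority);
	return 0;
}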
2081 | |||
1947 | /* | 2082 | /* |
1948 | * __normal_prio - return the priority that is based on the static prio | 2083 | * __normal_prio - return the priority that is based on the static prio |
1949 | */ | 2084 | */ |
@@ -2001,14 +2136,43 @@ inline int task_curr(const struct task_struct *p) | |||
2001 | 2136 | ||
2002 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 2137 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
2003 | const struct sched_class *prev_class, | 2138 | const struct sched_class *prev_class, |
2004 | int oldprio, int running) | 2139 | int oldprio) |
2005 | { | 2140 | { |
2006 | if (prev_class != p->sched_class) { | 2141 | if (prev_class != p->sched_class) { |
2007 | if (prev_class->switched_from) | 2142 | if (prev_class->switched_from) |
2008 | prev_class->switched_from(rq, p, running); | 2143 | prev_class->switched_from(rq, p); |
2009 | p->sched_class->switched_to(rq, p, running); | 2144 | p->sched_class->switched_to(rq, p); |
2010 | } else | 2145 | } else if (oldprio != p->prio) |
2011 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2146 | p->sched_class->prio_changed(rq, p, oldprio); |
2147 | } | ||
2148 | |||
2149 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
2150 | { | ||
2151 | const struct sched_class *class; | ||
2152 | |||
2153 | if (p->sched_class == rq->curr->sched_class) { | ||
2154 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
2155 | } else { | ||
2156 | for_each_class(class) { | ||
2157 | if (class == rq->curr->sched_class) | ||
2158 | break; | ||
2159 | if (class == p->sched_class) { | ||
2160 | resched_task(rq->curr); | ||
2161 | break; | ||
2162 | } | ||
2163 | } | ||
2164 | } | ||
2165 | |||
2166 | /* | ||
2167 | * A queue event has occurred, and we're going to schedule. In | ||
2168 | * this case, we can save a useless back to back clock update. | ||
2169 | */ | ||
2170 | /* LITMUS^RT: | ||
2171 | * The "disable-clock-update" approach was buggy in Linux 2.6.36. | ||
2172 | * The issue has been solved in 2.6.37. | ||
2173 | */ | ||
2174 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) | ||
2175 | rq->skip_clock_update = 1; | ||
2012 | } | 2176 | } |
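In the cross-class branch of check_preempt_curr() the decision is purely positional: the class list is walked from the highest class down, and the waking task preempts only if its class is reached before the current task's class. A tiny sketch of that rule with an illustrative ordering; the class names below are examples, not the exact list compiled into this tree.

#include <stdio.h>
#include <string.h>

/* Illustrative class ordering, highest priority first. */
static const char *const classes[] = { "stop", "rt", "fair", "idle" };

static int class_index(const char *name)
{
	for (unsigned int i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
		if (strcmp(classes[i], name) == 0)
			return (int)i;
	return -1;
}

/* Mirror of the loop above: a waking task preempts the current one iff its
 * class comes strictly earlier in the ordering; if the classes match, the
 * decision is delegated to that class's own check_preempt_curr hook. */
static int cross_class_preempts(const char *waking, const char *curr)
{
	return class_index(waking) < class_index(curr);
}

int main(void)
{
	printf("rt wakes over fair curr: %d\n", cross_class_preempts("rt", "fair"));
	printf("fair wakes over rt curr: %d\n", cross_class_preempts("fair", "rt"));
	return 0;
}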
2013 | 2177 | ||
2014 | #ifdef CONFIG_SMP | 2178 | #ifdef CONFIG_SMP |
@@ -2023,6 +2187,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
2023 | if (p->sched_class != &fair_sched_class) | 2187 | if (p->sched_class != &fair_sched_class) |
2024 | return 0; | 2188 | return 0; |
2025 | 2189 | ||
2190 | if (unlikely(p->policy == SCHED_IDLE)) | ||
2191 | return 0; | ||
2192 | |||
2026 | /* | 2193 | /* |
2027 | * Buddy candidates are cache hot: | 2194 | * Buddy candidates are cache hot: |
2028 | */ | 2195 | */ |
@@ -2050,6 +2217,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
2050 | */ | 2217 | */ |
2051 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 2218 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
2052 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 2219 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); |
2220 | |||
2221 | #ifdef CONFIG_LOCKDEP | ||
2222 | /* | ||
2223 | * The caller should hold either p->pi_lock or rq->lock, when changing | ||
2224 | * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. | ||
2225 | * | ||
2226 | * sched_move_task() holds both and thus holding either pins the cgroup, | ||
2227 | * see set_task_rq(). | ||
2228 | * | ||
2229 | * Furthermore, all task_rq users should acquire both locks, see | ||
2230 | * task_rq_lock(). | ||
2231 | */ | ||
2232 | WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || | ||
2233 | lockdep_is_held(&task_rq(p)->lock))); | ||
2234 | #endif | ||
2053 | #endif | 2235 | #endif |
2054 | 2236 | ||
2055 | trace_sched_migrate_task(p, new_cpu); | 2237 | trace_sched_migrate_task(p, new_cpu); |
@@ -2070,21 +2252,6 @@ struct migration_arg { | |||
2070 | static int migration_cpu_stop(void *data); | 2252 | static int migration_cpu_stop(void *data); |
2071 | 2253 | ||
2072 | /* | 2254 | /* |
2073 | * The task's runqueue lock must be held. | ||
2074 | * Returns true if you have to wait for migration thread. | ||
2075 | */ | ||
2076 | static bool migrate_task(struct task_struct *p, int dest_cpu) | ||
2077 | { | ||
2078 | struct rq *rq = task_rq(p); | ||
2079 | |||
2080 | /* | ||
2081 | * If the task is not on a runqueue (and not running), then | ||
2082 | * the next wake-up will properly place the task. | ||
2083 | */ | ||
2084 | return p->se.on_rq || task_running(rq, p); | ||
2085 | } | ||
2086 | |||
2087 | /* | ||
2088 | * wait_task_inactive - wait for a thread to unschedule. | 2255 | * wait_task_inactive - wait for a thread to unschedule. |
2089 | * | 2256 | * |
2090 | * If @match_state is nonzero, it's the @p->state value just checked and | 2257 | * If @match_state is nonzero, it's the @p->state value just checked and |
@@ -2141,11 +2308,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2141 | rq = task_rq_lock(p, &flags); | 2308 | rq = task_rq_lock(p, &flags); |
2142 | trace_sched_wait_task(p); | 2309 | trace_sched_wait_task(p); |
2143 | running = task_running(rq, p); | 2310 | running = task_running(rq, p); |
2144 | on_rq = p->se.on_rq; | 2311 | on_rq = p->on_rq; |
2145 | ncsw = 0; | 2312 | ncsw = 0; |
2146 | if (!match_state || p->state == match_state) | 2313 | if (!match_state || p->state == match_state) |
2147 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 2314 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
2148 | task_rq_unlock(rq, &flags); | 2315 | task_rq_unlock(rq, p, &flags); |
2149 | 2316 | ||
2150 | /* | 2317 | /* |
2151 | * If it changed from the expected state, bail out now. | 2318 | * If it changed from the expected state, bail out now. |
@@ -2174,7 +2341,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2174 | * yield - it could be a while. | 2341 | * yield - it could be a while. |
2175 | */ | 2342 | */ |
2176 | if (unlikely(on_rq)) { | 2343 | if (unlikely(on_rq)) { |
2177 | schedule_timeout_uninterruptible(1); | 2344 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
2345 | |||
2346 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
2347 | schedule_hrtimeout(&to, HRTIMER_MODE_REL); | ||
2178 | continue; | 2348 | continue; |
2179 | } | 2349 | } |
2180 | 2350 | ||
@@ -2196,7 +2366,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2196 | * Cause a process which is running on another CPU to enter | 2366 | * Cause a process which is running on another CPU to enter |
2197 | * kernel-mode, without any delay. (to get signals handled.) | 2367 | * kernel-mode, without any delay. (to get signals handled.) |
2198 | * | 2368 | * |
2199 | * NOTE: this function doesnt have to take the runqueue lock, | 2369 | * NOTE: this function doesn't have to take the runqueue lock, |
2200 | * because all it wants to ensure is that the remote task enters | 2370 | * because all it wants to ensure is that the remote task enters |
2201 | * the kernel. If the IPI races and the task has been migrated | 2371 | * the kernel. If the IPI races and the task has been migrated |
2202 | * to another CPU then no harm is done and the purpose has been | 2372 | * to another CPU then no harm is done and the purpose has been |
@@ -2215,30 +2385,9 @@ void kick_process(struct task_struct *p) | |||
2215 | EXPORT_SYMBOL_GPL(kick_process); | 2385 | EXPORT_SYMBOL_GPL(kick_process); |
2216 | #endif /* CONFIG_SMP */ | 2386 | #endif /* CONFIG_SMP */ |
2217 | 2387 | ||
2218 | /** | ||
2219 | * task_oncpu_function_call - call a function on the cpu on which a task runs | ||
2220 | * @p: the task to evaluate | ||
2221 | * @func: the function to be called | ||
2222 | * @info: the function call argument | ||
2223 | * | ||
2224 | * Calls the function @func when the task is currently running. This might | ||
2225 | * be on the current CPU, which just calls the function directly | ||
2226 | */ | ||
2227 | void task_oncpu_function_call(struct task_struct *p, | ||
2228 | void (*func) (void *info), void *info) | ||
2229 | { | ||
2230 | int cpu; | ||
2231 | |||
2232 | preempt_disable(); | ||
2233 | cpu = task_cpu(p); | ||
2234 | if (task_curr(p)) | ||
2235 | smp_call_function_single(cpu, func, info, 1); | ||
2236 | preempt_enable(); | ||
2237 | } | ||
2238 | |||
2239 | #ifdef CONFIG_SMP | 2388 | #ifdef CONFIG_SMP |
2240 | /* | 2389 | /* |
2241 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2390 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock |
2242 | */ | 2391 | */ |
2243 | static int select_fallback_rq(int cpu, struct task_struct *p) | 2392 | static int select_fallback_rq(int cpu, struct task_struct *p) |
2244 | { | 2393 | { |
@@ -2256,30 +2405,27 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2256 | return dest_cpu; | 2405 | return dest_cpu; |
2257 | 2406 | ||
2258 | /* No more Mr. Nice Guy. */ | 2407 | /* No more Mr. Nice Guy. */ |
2259 | if (unlikely(dest_cpu >= nr_cpu_ids)) { | 2408 | dest_cpu = cpuset_cpus_allowed_fallback(p); |
2260 | dest_cpu = cpuset_cpus_allowed_fallback(p); | 2409 | /* |
2261 | /* | 2410 | * Don't tell them about moving exiting tasks or |
2262 | * Don't tell them about moving exiting tasks or | 2411 | * kernel threads (both mm NULL), since they never |
2263 | * kernel threads (both mm NULL), since they never | 2412 | * leave kernel. |
2264 | * leave kernel. | 2413 | */ |
2265 | */ | 2414 | if (p->mm && printk_ratelimit()) { |
2266 | if (p->mm && printk_ratelimit()) { | 2415 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", |
2267 | printk(KERN_INFO "process %d (%s) no " | 2416 | task_pid_nr(p), p->comm, cpu); |
2268 | "longer affine to cpu%d\n", | ||
2269 | task_pid_nr(p), p->comm, cpu); | ||
2270 | } | ||
2271 | } | 2417 | } |
2272 | 2418 | ||
2273 | return dest_cpu; | 2419 | return dest_cpu; |
2274 | } | 2420 | } |
2275 | 2421 | ||
2276 | /* | 2422 | /* |
2277 | * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. | 2423 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
2278 | */ | 2424 | */ |
2279 | static inline | 2425 | static inline |
2280 | int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) | 2426 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
2281 | { | 2427 | { |
2282 | int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); | 2428 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); |
2283 | 2429 | ||
2284 | /* | 2430 | /* |
2285 | * In order not to call set_task_cpu() on a blocking task we need | 2431 | * In order not to call set_task_cpu() on a blocking task we need |
@@ -2305,27 +2451,63 @@ static void update_avg(u64 *avg, u64 sample) | |||
2305 | } | 2451 | } |
2306 | #endif | 2452 | #endif |
2307 | 2453 | ||
2308 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, | 2454 | static void |
2309 | bool is_sync, bool is_migrate, bool is_local, | 2455 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
2310 | unsigned long en_flags) | ||
2311 | { | 2456 | { |
2312 | schedstat_inc(p, se.statistics.nr_wakeups); | 2457 | #ifdef CONFIG_SCHEDSTATS |
2313 | if (is_sync) | 2458 | struct rq *rq = this_rq(); |
2314 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 2459 | |
2315 | if (is_migrate) | 2460 | #ifdef CONFIG_SMP |
2316 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 2461 | int this_cpu = smp_processor_id(); |
2317 | if (is_local) | 2462 | |
2463 | if (cpu == this_cpu) { | ||
2464 | schedstat_inc(rq, ttwu_local); | ||
2318 | schedstat_inc(p, se.statistics.nr_wakeups_local); | 2465 | schedstat_inc(p, se.statistics.nr_wakeups_local); |
2319 | else | 2466 | } else { |
2467 | struct sched_domain *sd; | ||
2468 | |||
2320 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | 2469 | schedstat_inc(p, se.statistics.nr_wakeups_remote); |
2470 | rcu_read_lock(); | ||
2471 | for_each_domain(this_cpu, sd) { | ||
2472 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2473 | schedstat_inc(sd, ttwu_wake_remote); | ||
2474 | break; | ||
2475 | } | ||
2476 | } | ||
2477 | rcu_read_unlock(); | ||
2478 | } | ||
2479 | |||
2480 | if (wake_flags & WF_MIGRATED) | ||
2481 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
2482 | |||
2483 | #endif /* CONFIG_SMP */ | ||
2484 | |||
2485 | schedstat_inc(rq, ttwu_count); | ||
2486 | schedstat_inc(p, se.statistics.nr_wakeups); | ||
2487 | |||
2488 | if (wake_flags & WF_SYNC) | ||
2489 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
2321 | 2490 | ||
2491 | #endif /* CONFIG_SCHEDSTATS */ | ||
2492 | } | ||
2493 | |||
2494 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | ||
2495 | { | ||
2322 | activate_task(rq, p, en_flags); | 2496 | activate_task(rq, p, en_flags); |
2497 | p->on_rq = 1; | ||
2498 | |||
2499 | /* if a worker is waking up, notify workqueue */ | ||
2500 | if (p->flags & PF_WQ_WORKER) | ||
2501 | wq_worker_waking_up(p, cpu_of(rq)); | ||
2323 | } | 2502 | } |
2324 | 2503 | ||
2325 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | 2504 | /* |
2326 | int wake_flags, bool success) | 2505 | * Mark the task runnable and perform wakeup-preemption. |
2506 | */ | ||
2507 | static void | ||
2508 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2327 | { | 2509 | { |
2328 | trace_sched_wakeup(p, success); | 2510 | trace_sched_wakeup(p, true); |
2329 | check_preempt_curr(rq, p, wake_flags); | 2511 | check_preempt_curr(rq, p, wake_flags); |
2330 | 2512 | ||
2331 | p->state = TASK_RUNNING; | 2513 | p->state = TASK_RUNNING; |
@@ -2344,9 +2526,151 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2344 | rq->idle_stamp = 0; | 2526 | rq->idle_stamp = 0; |
2345 | } | 2527 | } |
2346 | #endif | 2528 | #endif |
2347 | /* if a worker is waking up, notify workqueue */ | 2529 | } |
2348 | if ((p->flags & PF_WQ_WORKER) && success) | 2530 | |
2349 | wq_worker_waking_up(p, cpu_of(rq)); | 2531 | static void |
2532 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2533 | { | ||
2534 | #ifdef CONFIG_SMP | ||
2535 | if (p->sched_contributes_to_load) | ||
2536 | rq->nr_uninterruptible--; | ||
2537 | #endif | ||
2538 | |||
2539 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); | ||
2540 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2541 | } | ||
2542 | |||
2543 | /* | ||
2544 | * Called in case the task @p isn't fully descheduled from its runqueue; | ||
2545 | * in this case we must do a remote wakeup. It's a 'light' wakeup though, | ||
2546 | * since all we need to do is flip p->state to TASK_RUNNING, because | ||
2547 | * the task is still ->on_rq. | ||
2548 | */ | ||
2549 | static int ttwu_remote(struct task_struct *p, int wake_flags) | ||
2550 | { | ||
2551 | struct rq *rq; | ||
2552 | int ret = 0; | ||
2553 | |||
2554 | rq = __task_rq_lock(p); | ||
2555 | if (p->on_rq) { | ||
2556 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2557 | ret = 1; | ||
2558 | } | ||
2559 | __task_rq_unlock(rq); | ||
2560 | |||
2561 | return ret; | ||
2562 | } | ||
2563 | |||
2564 | #ifdef CONFIG_SMP | ||
2565 | static void sched_ttwu_do_pending(struct task_struct *list) | ||
2566 | { | ||
2567 | struct rq *rq = this_rq(); | ||
2568 | |||
2569 | raw_spin_lock(&rq->lock); | ||
2570 | |||
2571 | while (list) { | ||
2572 | struct task_struct *p = list; | ||
2573 | list = list->wake_entry; | ||
2574 | ttwu_do_activate(rq, p, 0); | ||
2575 | } | ||
2576 | |||
2577 | raw_spin_unlock(&rq->lock); | ||
2578 | } | ||
2579 | |||
2580 | #ifdef CONFIG_HOTPLUG_CPU | ||
2581 | |||
2582 | static void sched_ttwu_pending(void) | ||
2583 | { | ||
2584 | struct rq *rq = this_rq(); | ||
2585 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2586 | |||
2587 | if (!list) | ||
2588 | return; | ||
2589 | |||
2590 | sched_ttwu_do_pending(list); | ||
2591 | } | ||
2592 | |||
2593 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2594 | |||
2595 | void scheduler_ipi(void) | ||
2596 | { | ||
2597 | struct rq *rq = this_rq(); | ||
2598 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2599 | |||
2600 | if (!list) | ||
2601 | return; | ||
2602 | |||
2603 | /* | ||
2604 | * Not all reschedule IPI handlers call irq_enter/irq_exit, since | ||
2605 | * traditionally all their work was done from the interrupt return | ||
2606 | * path. Now that we actually do some work, we need to make sure | ||
2607 | * we do call them. | ||
2608 | * | ||
2609 | * Some archs already do call them; luckily, irq_enter/exit nest | ||
2610 | * properly. | ||
2611 | * | ||
2612 | * Arguably we should visit all archs and update all handlers; | ||
2613 | * however, a fair share of IPIs are still resched-only, so this would | ||
2614 | * somewhat pessimize the simple resched case. | ||
2615 | */ | ||
2616 | irq_enter(); | ||
2617 | sched_ttwu_do_pending(list); | ||
2618 | irq_exit(); | ||
2619 | } | ||
2620 | |||
2621 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | ||
2622 | { | ||
2623 | struct rq *rq = cpu_rq(cpu); | ||
2624 | struct task_struct *next = rq->wake_list; | ||
2625 | |||
2626 | for (;;) { | ||
2627 | struct task_struct *old = next; | ||
2628 | |||
2629 | p->wake_entry = next; | ||
2630 | next = cmpxchg(&rq->wake_list, old, p); | ||
2631 | if (next == old) | ||
2632 | break; | ||
2633 | } | ||
2634 | |||
2635 | if (!next) | ||
2636 | smp_send_reschedule(cpu); | ||
2637 | } | ||
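ttwu_queue_remote() above is a lock-free push onto the target CPU's wake_list: the cmpxchg loop links the task in, and the reschedule IPI is sent only when the list transitions from empty to non-empty; the receiving side then detaches the whole list with xchg and activates each entry (see scheduler_ipi() and sched_ttwu_do_pending() above). The same push/drain idea in a standalone sketch with C11 atomics; the names are illustrative.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int id;
};

static _Atomic(struct node *) wake_list;   /* one per "cpu" in the kernel */

/* Producer: lock-free push. Returns 1 if the list was empty, i.e. the
 * caller should kick the consumer (the kernel sends a reschedule IPI). */
static int wake_list_push(struct node *n)
{
	struct node *old = atomic_load_explicit(&wake_list, memory_order_relaxed);

	do {
		n->next = old;   /* 'old' is refreshed on each failed CAS */
	} while (!atomic_compare_exchange_weak_explicit(&wake_list, &old, n,
			memory_order_release, memory_order_relaxed));

	return old == NULL;
}

/* Consumer: detach the whole list at once and process it (LIFO order). */
static void wake_list_drain(void)
{
	struct node *n = atomic_exchange_explicit(&wake_list, NULL,
						  memory_order_acquire);

	while (n) {
		struct node *next = n->next;
		printf("waking task %d\n", n->id);
		n = next;
	}
}

int main(void)
{
	struct node a = { .id = 1 }, b = { .id = 2 };

	if (wake_list_push(&a))
		puts("list was empty -> would send IPI");
	wake_list_push(&b);
	wake_list_drain();
	return 0;
}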
2638 | |||
2639 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
2640 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | ||
2641 | { | ||
2642 | struct rq *rq; | ||
2643 | int ret = 0; | ||
2644 | |||
2645 | rq = __task_rq_lock(p); | ||
2646 | if (p->on_cpu) { | ||
2647 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | ||
2648 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2649 | ret = 1; | ||
2650 | } | ||
2651 | __task_rq_unlock(rq); | ||
2652 | |||
2653 | return ret; | ||
2654 | |||
2655 | } | ||
2656 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
2657 | #endif /* CONFIG_SMP */ | ||
2658 | |||
2659 | static void ttwu_queue(struct task_struct *p, int cpu) | ||
2660 | { | ||
2661 | struct rq *rq = cpu_rq(cpu); | ||
2662 | |||
2663 | #if defined(CONFIG_SMP) | ||
2664 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | ||
2665 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | ||
2666 | ttwu_queue_remote(p, cpu); | ||
2667 | return; | ||
2668 | } | ||
2669 | #endif | ||
2670 | |||
2671 | raw_spin_lock(&rq->lock); | ||
2672 | ttwu_do_activate(rq, p, 0); | ||
2673 | raw_spin_unlock(&rq->lock); | ||
2350 | } | 2674 | } |
2351 | 2675 | ||
2352 | /** | 2676 | /** |
@@ -2364,97 +2688,79 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2364 | * Returns %true if @p was woken up, %false if it was already running | 2688 | * Returns %true if @p was woken up, %false if it was already running |
2365 | * or @state didn't match @p's state. | 2689 | * or @state didn't match @p's state. |
2366 | */ | 2690 | */ |
2367 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2691 | static int |
2368 | int wake_flags) | 2692 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
2369 | { | 2693 | { |
2370 | int cpu, orig_cpu, this_cpu, success = 0; | ||
2371 | unsigned long flags; | 2694 | unsigned long flags; |
2372 | unsigned long en_flags = ENQUEUE_WAKEUP; | 2695 | int cpu, success = 0; |
2373 | struct rq *rq; | ||
2374 | 2696 | ||
2375 | if (is_realtime(p)) | 2697 | if (is_realtime(p)) |
2376 | TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); | 2698 | TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); |
2377 | 2699 | ||
2378 | this_cpu = get_cpu(); | ||
2379 | |||
2380 | smp_wmb(); | 2700 | smp_wmb(); |
2381 | rq = task_rq_lock(p, &flags); | 2701 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2382 | if (!(p->state & state)) | 2702 | if (!(p->state & state)) |
2383 | goto out; | 2703 | goto out; |
2384 | 2704 | ||
2385 | if (p->se.on_rq) | 2705 | success = 1; /* we're going to change ->state */ |
2386 | goto out_running; | ||
2387 | |||
2388 | cpu = task_cpu(p); | 2706 | cpu = task_cpu(p); |
2389 | orig_cpu = cpu; | ||
2390 | 2707 | ||
2391 | #ifdef CONFIG_SMP | 2708 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
2392 | if (unlikely(task_running(rq, p)) || is_realtime(p)) | 2709 | goto stat; |
2393 | goto out_activate; | ||
2394 | 2710 | ||
2711 | #ifdef CONFIG_SMP | ||
2395 | /* | 2712 | /* |
2396 | * In order to handle concurrent wakeups and release the rq->lock | 2713 | * If the owning (remote) cpu is still in the middle of schedule() with |
2397 | * we put the task in TASK_WAKING state. | 2714 | * this task as prev, wait until its done referencing the task. |
2398 | * | ||
2399 | * First fix up the nr_uninterruptible count: | ||
2400 | */ | 2715 | */ |
2401 | if (task_contributes_to_load(p)) { | 2716 | while (p->on_cpu) { |
2402 | if (likely(cpu_online(orig_cpu))) | 2717 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
2403 | rq->nr_uninterruptible--; | 2718 | /* |
2404 | else | 2719 | * In case the architecture enables interrupts in |
2405 | this_rq()->nr_uninterruptible--; | 2720 | * context_switch(), we cannot busy wait, since that |
2406 | } | 2721 | * would lead to deadlocks when an interrupt hits and |
2407 | p->state = TASK_WAKING; | 2722 | * tries to wake up @prev. So bail and do a complete |
2408 | 2723 | * remote wakeup. | |
2409 | if (p->sched_class->task_waking) { | 2724 | */ |
2410 | p->sched_class->task_waking(rq, p); | 2725 | if (ttwu_activate_remote(p, wake_flags)) |
2411 | en_flags |= ENQUEUE_WAKING; | 2726 | goto stat; |
2727 | #else | ||
2728 | cpu_relax(); | ||
2729 | #endif | ||
2412 | } | 2730 | } |
2731 | /* | ||
2732 | * Pairs with the smp_wmb() in finish_lock_switch(). | ||
2733 | */ | ||
2734 | smp_rmb(); | ||
2413 | 2735 | ||
2414 | cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); | 2736 | /* LITMUS^RT: once the task can be safely referenced by this |
2415 | if (cpu != orig_cpu) | 2737 | * CPU, don't interfere with the Linux load-balancing code. |
2416 | set_task_cpu(p, cpu); | 2738 | */ |
2417 | __task_rq_unlock(rq); | 2739 | if (is_realtime(p)) |
2740 | goto litmus_out_activate; | ||
2418 | 2741 | ||
2419 | rq = cpu_rq(cpu); | 2742 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2420 | raw_spin_lock(&rq->lock); | 2743 | p->state = TASK_WAKING; |
2421 | 2744 | ||
2422 | /* | 2745 | if (p->sched_class->task_waking) |
2423 | * We migrated the task without holding either rq->lock, however | 2746 | p->sched_class->task_waking(p); |
2424 | * since the task is not on the task list itself, nobody else | ||
2425 | * will try and migrate the task, hence the rq should match the | ||
2426 | * cpu we just moved it to. | ||
2427 | */ | ||
2428 | WARN_ON(task_cpu(p) != cpu); | ||
2429 | WARN_ON(p->state != TASK_WAKING); | ||
2430 | 2747 | ||
2431 | #ifdef CONFIG_SCHEDSTATS | 2748 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
2432 | schedstat_inc(rq, ttwu_count); | 2749 | if (task_cpu(p) != cpu) { |
2433 | if (cpu == this_cpu) | 2750 | wake_flags |= WF_MIGRATED; |
2434 | schedstat_inc(rq, ttwu_local); | 2751 | set_task_cpu(p, cpu); |
2435 | else { | ||
2436 | struct sched_domain *sd; | ||
2437 | for_each_domain(this_cpu, sd) { | ||
2438 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2439 | schedstat_inc(sd, ttwu_wake_remote); | ||
2440 | break; | ||
2441 | } | ||
2442 | } | ||
2443 | } | 2752 | } |
2444 | #endif /* CONFIG_SCHEDSTATS */ | ||
2445 | 2753 | ||
2446 | out_activate: | 2754 | litmus_out_activate: |
2447 | #endif /* CONFIG_SMP */ | 2755 | #endif /* CONFIG_SMP */ |
2448 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, | 2756 | |
2449 | cpu == this_cpu, en_flags); | 2757 | ttwu_queue(p, cpu); |
2450 | success = 1; | 2758 | stat: |
2451 | out_running: | 2759 | ttwu_stat(p, cpu, wake_flags); |
2452 | ttwu_post_activation(p, rq, wake_flags, success); | ||
2453 | out: | 2760 | out: |
2454 | if (is_realtime(p)) | 2761 | if (is_realtime(p)) |
2455 | TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); | 2762 | TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); |
2456 | task_rq_unlock(rq, &flags); | 2763 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2457 | put_cpu(); | ||
2458 | 2764 | ||
2459 | return success; | 2765 | return success; |
2460 | } | 2766 | } |
@@ -2463,31 +2769,34 @@ out: | |||
2463 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2769 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2464 | * @p: the thread to be awakened | 2770 | * @p: the thread to be awakened |
2465 | * | 2771 | * |
2466 | * Put @p on the run-queue if it's not alredy there. The caller must | 2772 | * Put @p on the run-queue if it's not already there. The caller must |
2467 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2773 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2468 | * the current task. this_rq() stays locked over invocation. | 2774 | * the current task. |
2469 | */ | 2775 | */ |
2470 | static void try_to_wake_up_local(struct task_struct *p) | 2776 | static void try_to_wake_up_local(struct task_struct *p) |
2471 | { | 2777 | { |
2472 | struct rq *rq = task_rq(p); | 2778 | struct rq *rq = task_rq(p); |
2473 | bool success = false; | ||
2474 | 2779 | ||
2475 | BUG_ON(rq != this_rq()); | 2780 | BUG_ON(rq != this_rq()); |
2476 | BUG_ON(p == current); | 2781 | BUG_ON(p == current); |
2477 | lockdep_assert_held(&rq->lock); | 2782 | lockdep_assert_held(&rq->lock); |
2478 | 2783 | ||
2784 | if (!raw_spin_trylock(&p->pi_lock)) { | ||
2785 | raw_spin_unlock(&rq->lock); | ||
2786 | raw_spin_lock(&p->pi_lock); | ||
2787 | raw_spin_lock(&rq->lock); | ||
2788 | } | ||
2789 | |||
2479 | if (!(p->state & TASK_NORMAL)) | 2790 | if (!(p->state & TASK_NORMAL)) |
2480 | return; | 2791 | goto out; |
2481 | 2792 | ||
2482 | if (!p->se.on_rq) { | 2793 | if (!p->on_rq) |
2483 | if (likely(!task_running(rq, p))) { | 2794 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2484 | schedstat_inc(rq, ttwu_count); | 2795 | |
2485 | schedstat_inc(rq, ttwu_local); | 2796 | ttwu_do_wakeup(rq, p, 0); |
2486 | } | 2797 | ttwu_stat(p, smp_processor_id(), 0); |
2487 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | 2798 | out: |
2488 | success = true; | 2799 | raw_spin_unlock(&p->pi_lock); |
2489 | } | ||
2490 | ttwu_post_activation(p, rq, 0, success); | ||
2491 | } | 2800 | } |
2492 | 2801 | ||
2493 | /** | 2802 | /** |
@@ -2520,18 +2829,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
2520 | */ | 2829 | */ |
2521 | static void __sched_fork(struct task_struct *p) | 2830 | static void __sched_fork(struct task_struct *p) |
2522 | { | 2831 | { |
2832 | p->on_rq = 0; | ||
2833 | |||
2834 | p->se.on_rq = 0; | ||
2523 | p->se.exec_start = 0; | 2835 | p->se.exec_start = 0; |
2524 | p->se.sum_exec_runtime = 0; | 2836 | p->se.sum_exec_runtime = 0; |
2525 | p->se.prev_sum_exec_runtime = 0; | 2837 | p->se.prev_sum_exec_runtime = 0; |
2526 | p->se.nr_migrations = 0; | 2838 | p->se.nr_migrations = 0; |
2839 | p->se.vruntime = 0; | ||
2840 | INIT_LIST_HEAD(&p->se.group_node); | ||
2527 | 2841 | ||
2528 | #ifdef CONFIG_SCHEDSTATS | 2842 | #ifdef CONFIG_SCHEDSTATS |
2529 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2843 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
2530 | #endif | 2844 | #endif |
2531 | 2845 | ||
2532 | INIT_LIST_HEAD(&p->rt.run_list); | 2846 | INIT_LIST_HEAD(&p->rt.run_list); |
2533 | p->se.on_rq = 0; | ||
2534 | INIT_LIST_HEAD(&p->se.group_node); | ||
2535 | 2847 | ||
2536 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2848 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2537 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2849 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
@@ -2541,8 +2853,9 @@ static void __sched_fork(struct task_struct *p) | |||
2541 | /* | 2853 | /* |
2542 | * fork()/clone()-time setup: | 2854 | * fork()/clone()-time setup: |
2543 | */ | 2855 | */ |
2544 | void sched_fork(struct task_struct *p, int clone_flags) | 2856 | void sched_fork(struct task_struct *p) |
2545 | { | 2857 | { |
2858 | unsigned long flags; | ||
2546 | int cpu = get_cpu(); | 2859 | int cpu = get_cpu(); |
2547 | 2860 | ||
2548 | __sched_fork(p); | 2861 | __sched_fork(p); |
@@ -2594,22 +2907,24 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2594 | * | 2907 | * |
2595 | * Silence PROVE_RCU. | 2908 | * Silence PROVE_RCU. |
2596 | */ | 2909 | */ |
2597 | rcu_read_lock(); | 2910 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2598 | set_task_cpu(p, cpu); | 2911 | set_task_cpu(p, cpu); |
2599 | rcu_read_unlock(); | 2912 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2600 | 2913 | ||
2601 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2914 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2602 | if (likely(sched_info_on())) | 2915 | if (likely(sched_info_on())) |
2603 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2916 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
2604 | #endif | 2917 | #endif |
2605 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 2918 | #if defined(CONFIG_SMP) |
2606 | p->oncpu = 0; | 2919 | p->on_cpu = 0; |
2607 | #endif | 2920 | #endif |
2608 | #ifdef CONFIG_PREEMPT | 2921 | #ifdef CONFIG_PREEMPT |
2609 | /* Want to start with kernel preemption disabled. */ | 2922 | /* Want to start with kernel preemption disabled. */ |
2610 | task_thread_info(p)->preempt_count = 1; | 2923 | task_thread_info(p)->preempt_count = 1; |
2611 | #endif | 2924 | #endif |
2925 | #ifdef CONFIG_SMP | ||
2612 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 2926 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
2927 | #endif | ||
2613 | 2928 | ||
2614 | put_cpu(); | 2929 | put_cpu(); |
2615 | } | 2930 | } |
@@ -2621,41 +2936,31 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2621 | * that must be done for every newly created context, then puts the task | 2936 | * that must be done for every newly created context, then puts the task |
2622 | * on the runqueue and wakes it. | 2937 | * on the runqueue and wakes it. |
2623 | */ | 2938 | */ |
2624 | void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 2939 | void wake_up_new_task(struct task_struct *p) |
2625 | { | 2940 | { |
2626 | unsigned long flags; | 2941 | unsigned long flags; |
2627 | struct rq *rq; | 2942 | struct rq *rq; |
2628 | int cpu __maybe_unused = get_cpu(); | ||
2629 | 2943 | ||
2944 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
2630 | #ifdef CONFIG_SMP | 2945 | #ifdef CONFIG_SMP |
2631 | rq = task_rq_lock(p, &flags); | ||
2632 | p->state = TASK_WAKING; | ||
2633 | |||
2634 | /* | 2946 | /* |
2635 | * Fork balancing, do it here and not earlier because: | 2947 | * Fork balancing, do it here and not earlier because: |
2636 | * - cpus_allowed can change in the fork path | 2948 | * - cpus_allowed can change in the fork path |
2637 | * - any previously selected cpu might disappear through hotplug | 2949 | * - any previously selected cpu might disappear through hotplug |
2638 | * | ||
2639 | * We set TASK_WAKING so that select_task_rq() can drop rq->lock | ||
2640 | * without people poking at ->cpus_allowed. | ||
2641 | */ | 2950 | */ |
2642 | cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); | 2951 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
2643 | set_task_cpu(p, cpu); | ||
2644 | |||
2645 | p->state = TASK_RUNNING; | ||
2646 | task_rq_unlock(rq, &flags); | ||
2647 | #endif | 2952 | #endif |
2648 | 2953 | ||
2649 | rq = task_rq_lock(p, &flags); | 2954 | rq = __task_rq_lock(p); |
2650 | activate_task(rq, p, 0); | 2955 | activate_task(rq, p, 0); |
2651 | trace_sched_wakeup_new(p, 1); | 2956 | p->on_rq = 1; |
2957 | trace_sched_wakeup_new(p, true); | ||
2652 | check_preempt_curr(rq, p, WF_FORK); | 2958 | check_preempt_curr(rq, p, WF_FORK); |
2653 | #ifdef CONFIG_SMP | 2959 | #ifdef CONFIG_SMP |
2654 | if (p->sched_class->task_woken) | 2960 | if (p->sched_class->task_woken) |
2655 | p->sched_class->task_woken(rq, p); | 2961 | p->sched_class->task_woken(rq, p); |
2656 | #endif | 2962 | #endif |
2657 | task_rq_unlock(rq, &flags); | 2963 | task_rq_unlock(rq, p, &flags); |
2658 | put_cpu(); | ||
2659 | } | 2964 | } |
2660 | 2965 | ||
2661 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2966 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -2733,9 +3038,12 @@ static inline void | |||
2733 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 3038 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
2734 | struct task_struct *next) | 3039 | struct task_struct *next) |
2735 | { | 3040 | { |
3041 | sched_info_switch(prev, next); | ||
3042 | perf_event_task_sched_out(prev, next); | ||
2736 | fire_sched_out_preempt_notifiers(prev, next); | 3043 | fire_sched_out_preempt_notifiers(prev, next); |
2737 | prepare_lock_switch(rq, next); | 3044 | prepare_lock_switch(rq, next); |
2738 | prepare_arch_switch(next); | 3045 | prepare_arch_switch(next); |
3046 | trace_sched_switch(prev, next); | ||
2739 | } | 3047 | } |
2740 | 3048 | ||
2741 | /** | 3049 | /** |
@@ -2879,7 +3187,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2879 | struct mm_struct *mm, *oldmm; | 3187 | struct mm_struct *mm, *oldmm; |
2880 | 3188 | ||
2881 | prepare_task_switch(rq, prev, next); | 3189 | prepare_task_switch(rq, prev, next); |
2882 | trace_sched_switch(prev, next); | 3190 | |
2883 | mm = next->mm; | 3191 | mm = next->mm; |
2884 | oldmm = prev->active_mm; | 3192 | oldmm = prev->active_mm; |
2885 | /* | 3193 | /* |
@@ -2889,14 +3197,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2889 | */ | 3197 | */ |
2890 | arch_start_context_switch(prev); | 3198 | arch_start_context_switch(prev); |
2891 | 3199 | ||
2892 | if (likely(!mm)) { | 3200 | if (!mm) { |
2893 | next->active_mm = oldmm; | 3201 | next->active_mm = oldmm; |
2894 | atomic_inc(&oldmm->mm_count); | 3202 | atomic_inc(&oldmm->mm_count); |
2895 | enter_lazy_tlb(oldmm, next); | 3203 | enter_lazy_tlb(oldmm, next); |
2896 | } else | 3204 | } else |
2897 | switch_mm(oldmm, mm, next); | 3205 | switch_mm(oldmm, mm, next); |
2898 | 3206 | ||
2899 | if (likely(!prev->mm)) { | 3207 | if (!prev->mm) { |
2900 | prev->active_mm = NULL; | 3208 | prev->active_mm = NULL; |
2901 | rq->prev_mm = oldmm; | 3209 | rq->prev_mm = oldmm; |
2902 | } | 3210 | } |
@@ -3011,6 +3319,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
3011 | return delta; | 3319 | return delta; |
3012 | } | 3320 | } |
3013 | 3321 | ||
3322 | static unsigned long | ||
3323 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3324 | { | ||
3325 | load *= exp; | ||
3326 | load += active * (FIXED_1 - exp); | ||
3327 | load += 1UL << (FSHIFT - 1); | ||
3328 | return load >> FSHIFT; | ||
3329 | } | ||
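calc_load() is a fixed-point exponentially weighted moving average: avg' = avg*e + active*(1 - e), with FSHIFT fractional bits and rounding to the nearest unit. A standalone sketch using the conventional constants (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884 for the 1-minute average sampled roughly every 5 s); the values mirror include/linux/sched.h but are restated here purely for illustration.

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)   /* 1.0 in fixed point       */
#define EXP_1    1884              /* ~ 2048 / exp(5s / 60s)   */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);        /* round to nearest */
	return load >> FSHIFT;
}

int main(void)
{
	/* Start from an idle system and keep 3 tasks runnable: the
	 * 1-minute average climbs towards 3.0. */
	unsigned long avg = 0;                  /* fixed-point load average */
	unsigned long active = 3 * FIXED_1;     /* 3 runnable tasks         */

	for (int tick = 1; tick <= 12; tick++) {   /* 12 x 5s = one minute  */
		avg = calc_load(avg, EXP_1, active);
		printf("after %2ds: %lu.%02lu\n", tick * 5,
		       avg >> FSHIFT, (avg & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}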
3330 | |||
3014 | #ifdef CONFIG_NO_HZ | 3331 | #ifdef CONFIG_NO_HZ |
3015 | /* | 3332 | /* |
3016 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3333 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
@@ -3040,6 +3357,128 @@ static long calc_load_fold_idle(void) | |||
3040 | 3357 | ||
3041 | return delta; | 3358 | return delta; |
3042 | } | 3359 | } |
3360 | |||
3361 | /** | ||
3362 | * fixed_power_int - compute: x^n, in O(log n) time | ||
3363 | * | ||
3364 | * @x: base of the power | ||
3365 | * @frac_bits: fractional bits of @x | ||
3366 | * @n: power to raise @x to. | ||
3367 | * | ||
3368 | * By exploiting the relation between the definition of the natural power | ||
3369 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
3370 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
3371 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
3372 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
3373 | * of course trivially computable in O(log_2 n), the length of our binary | ||
3374 | * vector. | ||
3375 | */ | ||
3376 | static unsigned long | ||
3377 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
3378 | { | ||
3379 | unsigned long result = 1UL << frac_bits; | ||
3380 | |||
3381 | if (n) for (;;) { | ||
3382 | if (n & 1) { | ||
3383 | result *= x; | ||
3384 | result += 1UL << (frac_bits - 1); | ||
3385 | result >>= frac_bits; | ||
3386 | } | ||
3387 | n >>= 1; | ||
3388 | if (!n) | ||
3389 | break; | ||
3390 | x *= x; | ||
3391 | x += 1UL << (frac_bits - 1); | ||
3392 | x >>= frac_bits; | ||
3393 | } | ||
3394 | |||
3395 | return result; | ||
3396 | } | ||
3397 | |||
3398 | /* | ||
3399 | * a1 = a0 * e + a * (1 - e) | ||
3400 | * | ||
3401 | * a2 = a1 * e + a * (1 - e) | ||
3402 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
3403 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
3404 | * | ||
3405 | * a3 = a2 * e + a * (1 - e) | ||
3406 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
3407 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
3408 | * | ||
3409 | * ... | ||
3410 | * | ||
3411 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
3412 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
3413 | * = a0 * e^n + a * (1 - e^n) | ||
3414 | * | ||
3415 | * [1] application of the geometric series: | ||
3416 | * | ||
3417 | * n 1 - x^(n+1) | ||
3418 | * S_n := \Sum x^i = ------------- | ||
3419 | * i=0 1 - x | ||
3420 | */ | ||
3421 | static unsigned long | ||
3422 | calc_load_n(unsigned long load, unsigned long exp, | ||
3423 | unsigned long active, unsigned int n) | ||
3424 | { | ||
3425 | |||
3426 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
3427 | } | ||
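calc_load_n() applies the closed form derived in the comment above, avg_n = avg_0*e^n + active*(1 - e^n), with e^n computed by fixed_power_int() through binary exponentiation: square e repeatedly and multiply it in whenever the corresponding bit of n is set. A quick standalone check that one calc_load_n() step matches n iterated calc_load() steps up to fixed-point rounding; constants are restated as in the previous sketch.

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)
#define EXP_1    1884

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

/* x^n in fixed point with 'frac_bits' fractional bits, O(log n) multiplies. */
static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
				     unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	while (n) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

static unsigned long calc_load_n(unsigned long load, unsigned long exp,
				 unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
	unsigned long step = 2 * FIXED_1;       /* average starts at 2.0     */
	unsigned long once = 2 * FIXED_1;
	unsigned long active = 0;               /* CPU idle for 4 periods    */

	for (int i = 0; i < 4; i++)
		step = calc_load(step, EXP_1, active);
	once = calc_load_n(once, EXP_1, active, 4);

	/* The two results agree up to fixed-point rounding. */
	printf("iterated: %lu  closed form: %lu\n", step, once);
	return 0;
}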
3428 | |||
3429 | /* | ||
3430 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
3431 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
3432 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
3433 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
3434 | * | ||
3435 | * Once we've updated the global active value, we need to apply the exponential | ||
3436 | * weights adjusted to the number of cycles missed. | ||
3437 | */ | ||
3438 | static void calc_global_nohz(unsigned long ticks) | ||
3439 | { | ||
3440 | long delta, active, n; | ||
3441 | |||
3442 | if (time_before(jiffies, calc_load_update)) | ||
3443 | return; | ||
3444 | |||
3445 | /* | ||
3446 | * If we crossed a calc_load_update boundary, make sure to fold | ||
3447 | * any pending idle changes, the respective CPUs might have | ||
3448 | * missed the tick driven calc_load_account_active() update | ||
3449 | * due to NO_HZ. | ||
3450 | */ | ||
3451 | delta = calc_load_fold_idle(); | ||
3452 | if (delta) | ||
3453 | atomic_long_add(delta, &calc_load_tasks); | ||
3454 | |||
3455 | /* | ||
3456 | * If we were idle for multiple load cycles, apply them. | ||
3457 | */ | ||
3458 | if (ticks >= LOAD_FREQ) { | ||
3459 | n = ticks / LOAD_FREQ; | ||
3460 | |||
3461 | active = atomic_long_read(&calc_load_tasks); | ||
3462 | active = active > 0 ? active * FIXED_1 : 0; | ||
3463 | |||
3464 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
3465 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
3466 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
3467 | |||
3468 | calc_load_update += n * LOAD_FREQ; | ||
3469 | } | ||
3470 | |||
3471 | /* | ||
3472 | * It's possible the remainder of the above division also crosses | ||
3473 | * a LOAD_FREQ period; the regular check in calc_global_load(), | ||
3474 | * which comes after this, will take care of that. | ||
3475 | * | ||
3476 | * Consider us being 11 ticks before a cycle completion, and us | ||
3477 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
3478 | * age us 4 cycles, and the test in calc_global_load() will | ||
3479 | * pick up the final one. | ||
3480 | */ | ||
3481 | } | ||
3043 | #else | 3482 | #else |
3044 | static void calc_load_account_idle(struct rq *this_rq) | 3483 | static void calc_load_account_idle(struct rq *this_rq) |
3045 | { | 3484 | { |
@@ -3049,6 +3488,10 @@ static inline long calc_load_fold_idle(void) | |||
3049 | { | 3488 | { |
3050 | return 0; | 3489 | return 0; |
3051 | } | 3490 | } |
3491 | |||
3492 | static void calc_global_nohz(unsigned long ticks) | ||
3493 | { | ||
3494 | } | ||
3052 | #endif | 3495 | #endif |
3053 | 3496 | ||
3054 | /** | 3497 | /** |
@@ -3066,24 +3509,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
3066 | loads[2] = (avenrun[2] + offset) << shift; | 3509 | loads[2] = (avenrun[2] + offset) << shift; |
3067 | } | 3510 | } |
3068 | 3511 | ||
3069 | static unsigned long | ||
3070 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3071 | { | ||
3072 | load *= exp; | ||
3073 | load += active * (FIXED_1 - exp); | ||
3074 | return load >> FSHIFT; | ||
3075 | } | ||
3076 | |||
3077 | /* | 3512 | /* |
3078 | * calc_load - update the avenrun load estimates 10 ticks after the | 3513 | * calc_load - update the avenrun load estimates 10 ticks after the |
3079 | * CPUs have updated calc_load_tasks. | 3514 | * CPUs have updated calc_load_tasks. |
3080 | */ | 3515 | */ |
3081 | void calc_global_load(void) | 3516 | void calc_global_load(unsigned long ticks) |
3082 | { | 3517 | { |
3083 | unsigned long upd = calc_load_update + 10; | ||
3084 | long active; | 3518 | long active; |
3085 | 3519 | ||
3086 | if (time_before(jiffies, upd)) | 3520 | calc_global_nohz(ticks); |
3521 | |||
3522 | if (time_before(jiffies, calc_load_update + 10)) | ||
3087 | return; | 3523 | return; |
3088 | 3524 | ||
3089 | active = atomic_long_read(&calc_load_tasks); | 3525 | active = atomic_long_read(&calc_load_tasks); |
@@ -3244,27 +3680,22 @@ void sched_exec(void) | |||
3244 | { | 3680 | { |
3245 | struct task_struct *p = current; | 3681 | struct task_struct *p = current; |
3246 | unsigned long flags; | 3682 | unsigned long flags; |
3247 | struct rq *rq; | ||
3248 | int dest_cpu; | 3683 | int dest_cpu; |
3249 | 3684 | ||
3250 | rq = task_rq_lock(p, &flags); | 3685 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
3251 | dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); | 3686 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); |
3252 | if (dest_cpu == smp_processor_id()) | 3687 | if (dest_cpu == smp_processor_id()) |
3253 | goto unlock; | 3688 | goto unlock; |
3254 | 3689 | ||
3255 | /* | 3690 | if (likely(cpu_active(dest_cpu))) { |
3256 | * select_task_rq() can race against ->cpus_allowed | ||
3257 | */ | ||
3258 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | ||
3259 | likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { | ||
3260 | struct migration_arg arg = { p, dest_cpu }; | 3691 | struct migration_arg arg = { p, dest_cpu }; |
3261 | 3692 | ||
3262 | task_rq_unlock(rq, &flags); | 3693 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3263 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 3694 | stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); |
3264 | return; | 3695 | return; |
3265 | } | 3696 | } |
3266 | unlock: | 3697 | unlock: |
3267 | task_rq_unlock(rq, &flags); | 3698 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3268 | } | 3699 | } |
3269 | 3700 | ||
3270 | #endif | 3701 | #endif |
@@ -3285,7 +3716,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
3285 | 3716 | ||
3286 | if (task_current(rq, p)) { | 3717 | if (task_current(rq, p)) { |
3287 | update_rq_clock(rq); | 3718 | update_rq_clock(rq); |
3288 | ns = rq->clock - p->se.exec_start; | 3719 | ns = rq->clock_task - p->se.exec_start; |
3289 | if ((s64)ns < 0) | 3720 | if ((s64)ns < 0) |
3290 | ns = 0; | 3721 | ns = 0; |
3291 | } | 3722 | } |
@@ -3301,7 +3732,7 @@ unsigned long long task_delta_exec(struct task_struct *p) | |||
3301 | 3732 | ||
3302 | rq = task_rq_lock(p, &flags); | 3733 | rq = task_rq_lock(p, &flags); |
3303 | ns = do_task_delta_exec(p, rq); | 3734 | ns = do_task_delta_exec(p, rq); |
3304 | task_rq_unlock(rq, &flags); | 3735 | task_rq_unlock(rq, p, &flags); |
3305 | 3736 | ||
3306 | return ns; | 3737 | return ns; |
3307 | } | 3738 | } |
@@ -3319,7 +3750,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3319 | 3750 | ||
3320 | rq = task_rq_lock(p, &flags); | 3751 | rq = task_rq_lock(p, &flags); |
3321 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 3752 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); |
3322 | task_rq_unlock(rq, &flags); | 3753 | task_rq_unlock(rq, p, &flags); |
3323 | 3754 | ||
3324 | return ns; | 3755 | return ns; |
3325 | } | 3756 | } |
@@ -3343,7 +3774,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) | |||
3343 | rq = task_rq_lock(p, &flags); | 3774 | rq = task_rq_lock(p, &flags); |
3344 | thread_group_cputime(p, &totals); | 3775 | thread_group_cputime(p, &totals); |
3345 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | 3776 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); |
3346 | task_rq_unlock(rq, &flags); | 3777 | task_rq_unlock(rq, p, &flags); |
3347 | 3778 | ||
3348 | return ns; | 3779 | return ns; |
3349 | } | 3780 | } |
@@ -3408,6 +3839,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
3408 | } | 3839 | } |
3409 | 3840 | ||
3410 | /* | 3841 | /* |
3842 | * Account system cpu time to a process and desired cpustat field | ||
3843 | * @p: the process that the cpu time gets accounted to | ||
3844 | * @cputime: the cpu time spent in kernel space since the last update | ||
3845 | * @cputime_scaled: cputime scaled by cpu frequency | ||
3846 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
3847 | */ | ||
3848 | static inline | ||
3849 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
3850 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | ||
3851 | { | ||
3852 | cputime64_t tmp = cputime_to_cputime64(cputime); | ||
3853 | |||
3854 | /* Add system time to process. */ | ||
3855 | p->stime = cputime_add(p->stime, cputime); | ||
3856 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3857 | account_group_system_time(p, cputime); | ||
3858 | |||
3859 | /* Add system time to cpustat. */ | ||
3860 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | ||
3861 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3862 | |||
3863 | /* Account for system time used */ | ||
3864 | acct_update_integrals(p); | ||
3865 | } | ||
3866 | |||
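The new __account_system_time() helper takes a pointer to the cpustat field, so the caller decides which bucket (irq, softirq or system) a slice lands in while the per-task stime, group time and cpuacct updates stay on one shared path. A stripped-down model of that "pick the bucket, then share one accounting routine" shape; the struct layout and helper names are simplified stand-ins.

/* Sketch of routing one accounting path into different buckets via a
 * field pointer. Types and field names are simplified stand-ins. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct cpustat { uint64_t irq, softirq, system; };
struct task    { uint64_t stime; };

static void account_system(struct task *p, uint64_t ns, uint64_t *bucket)
{
        p->stime += ns;     /* per-task system time */
        *bucket  += ns;     /* chosen per-cpu bucket */
}

static void tick(struct task *p, struct cpustat *cs, uint64_t ns,
                 bool in_hardirq, bool in_softirq)
{
        uint64_t *bucket;

        if (in_hardirq)
                bucket = &cs->irq;
        else if (in_softirq)
                bucket = &cs->softirq;
        else
                bucket = &cs->system;
        account_system(p, ns, bucket);
}

int main(void)
{
        struct cpustat cs = { 0 };
        struct task p = { 0 };
        tick(&p, &cs, 1000, false, true);
        printf("stime=%llu softirq=%llu\n",
               (unsigned long long)p.stime,
               (unsigned long long)cs.softirq);
        return 0;
}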
3867 | /* | ||
3411 | * Account system cpu time to a process. | 3868 | * Account system cpu time to a process. |
3412 | * @p: the process that the cpu time gets accounted to | 3869 | * @p: the process that the cpu time gets accounted to |
3413 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3870 | * @hardirq_offset: the offset to subtract from hardirq_count() |
@@ -3418,36 +3875,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3418 | cputime_t cputime, cputime_t cputime_scaled) | 3875 | cputime_t cputime, cputime_t cputime_scaled) |
3419 | { | 3876 | { |
3420 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3877 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3421 | cputime64_t tmp; | 3878 | cputime64_t *target_cputime64; |
3422 | 3879 | ||
3423 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 3880 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
3424 | account_guest_time(p, cputime, cputime_scaled); | 3881 | account_guest_time(p, cputime, cputime_scaled); |
3425 | return; | 3882 | return; |
3426 | } | 3883 | } |
3427 | 3884 | ||
3428 | /* Add system time to process. */ | ||
3429 | p->stime = cputime_add(p->stime, cputime); | ||
3430 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3431 | account_group_system_time(p, cputime); | ||
3432 | |||
3433 | /* Add system time to cpustat. */ | ||
3434 | tmp = cputime_to_cputime64(cputime); | ||
3435 | if (hardirq_count() - hardirq_offset) | 3885 | if (hardirq_count() - hardirq_offset) |
3436 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3886 | target_cputime64 = &cpustat->irq; |
3437 | else if (softirq_count()) | 3887 | else if (in_serving_softirq()) |
3438 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3888 | target_cputime64 = &cpustat->softirq; |
3439 | else | 3889 | else |
3440 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3890 | target_cputime64 = &cpustat->system; |
3441 | |||
3442 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3443 | 3891 | ||
3444 | /* Account for system time used */ | 3892 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); |
3445 | acct_update_integrals(p); | ||
3446 | } | 3893 | } |
3447 | 3894 | ||
3448 | /* | 3895 | /* |
3449 | * Account for involuntary wait time. | 3896 | * Account for involuntary wait time. |
3450 | * @steal: the cpu time spent in involuntary wait | 3897 | * @cputime: the cpu time spent in involuntary wait |
3451 | */ | 3898 | */ |
3452 | void account_steal_time(cputime_t cputime) | 3899 | void account_steal_time(cputime_t cputime) |
3453 | { | 3900 | { |
@@ -3475,6 +3922,73 @@ void account_idle_time(cputime_t cputime) | |||
3475 | 3922 | ||
3476 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3923 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
3477 | 3924 | ||
3925 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
3926 | /* | ||
3927 | * Account a tick to a process and cpustat | ||
3928 | * @p: the process that the cpu time gets accounted to | ||
3929 | * @user_tick: is the tick from userspace | ||
3930 | * @rq: the pointer to rq | ||
3931 | * | ||
3932 | * Tick demultiplexing follows the order | ||
3933 | * - pending hardirq update | ||
3934 | * - pending softirq update | ||
3935 | * - user_time | ||
3936 | * - idle_time | ||
3937 | * - system time | ||
3938 | * - check for guest_time | ||
3939 | * - else account as system_time | ||
3940 | * | ||
3941 | * Check for hardirq is done both for system and user time as there is | ||
3942 | * no timer going off while we are on hardirq and hence we may never get an | ||
3943 | * opportunity to update it solely in system time. | ||
3944 | * p->stime and friends are only updated on system time and not on irq | ||
3945 | * softirq as those do not count in task exec_runtime any more. | ||
3946 | */ | ||
3947 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3948 | struct rq *rq) | ||
3949 | { | ||
3950 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
3951 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | ||
3952 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3953 | |||
3954 | if (irqtime_account_hi_update()) { | ||
3955 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | ||
3956 | } else if (irqtime_account_si_update()) { | ||
3957 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | ||
3958 | } else if (this_cpu_ksoftirqd() == p) { | ||
3959 | /* | ||
3960 | * ksoftirqd time do not get accounted in cpu_softirq_time. | ||
3961 | * So, we have to handle it separately here. | ||
3962 | * Also, p->stime needs to be updated for ksoftirqd. | ||
3963 | */ | ||
3964 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3965 | &cpustat->softirq); | ||
3966 | } else if (user_tick) { | ||
3967 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3968 | } else if (p == rq->idle) { | ||
3969 | account_idle_time(cputime_one_jiffy); | ||
3970 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
3971 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3972 | } else { | ||
3973 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3974 | &cpustat->system); | ||
3975 | } | ||
3976 | } | ||
3977 | |||
3978 | static void irqtime_account_idle_ticks(int ticks) | ||
3979 | { | ||
3980 | int i; | ||
3981 | struct rq *rq = this_rq(); | ||
3982 | |||
3983 | for (i = 0; i < ticks; i++) | ||
3984 | irqtime_account_process_tick(current, 0, rq); | ||
3985 | } | ||
3986 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3987 | static void irqtime_account_idle_ticks(int ticks) {} | ||
3988 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3989 | struct rq *rq) {} | ||
3990 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3991 | |||
3478 | /* | 3992 | /* |
3479 | * Account a single tick of cpu time. | 3993 | * Account a single tick of cpu time. |
3480 | * @p: the process that the cpu time gets accounted to | 3994 | * @p: the process that the cpu time gets accounted to |
@@ -3485,6 +3999,11 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
3485 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 3999 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
3486 | struct rq *rq = this_rq(); | 4000 | struct rq *rq = this_rq(); |
3487 | 4001 | ||
4002 | if (sched_clock_irqtime) { | ||
4003 | irqtime_account_process_tick(p, user_tick, rq); | ||
4004 | return; | ||
4005 | } | ||
4006 | |||
3488 | if (user_tick) | 4007 | if (user_tick) |
3489 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 4008 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3490 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 4009 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
@@ -3510,6 +4029,12 @@ void account_steal_ticks(unsigned long ticks) | |||
3510 | */ | 4029 | */ |
3511 | void account_idle_ticks(unsigned long ticks) | 4030 | void account_idle_ticks(unsigned long ticks) |
3512 | { | 4031 | { |
4032 | |||
4033 | if (sched_clock_irqtime) { | ||
4034 | irqtime_account_idle_ticks(ticks); | ||
4035 | return; | ||
4036 | } | ||
4037 | |||
3513 | account_idle_time(jiffies_to_cputime(ticks)); | 4038 | account_idle_time(jiffies_to_cputime(ticks)); |
3514 | } | 4039 | } |
3515 | 4040 | ||
@@ -3603,9 +4128,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3603 | /* | 4128 | /* |
3604 | * This function gets called by the timer code, with HZ frequency. | 4129 | * This function gets called by the timer code, with HZ frequency. |
3605 | * We call it with interrupts disabled. | 4130 | * We call it with interrupts disabled. |
3606 | * | ||
3607 | * It also gets called by the fork code, when changing the parent's | ||
3608 | * timeslices. | ||
3609 | */ | 4131 | */ |
3610 | void scheduler_tick(void) | 4132 | void scheduler_tick(void) |
3611 | { | 4133 | { |
@@ -3627,7 +4149,7 @@ void scheduler_tick(void) | |||
3627 | 4149 | ||
3628 | raw_spin_unlock(&rq->lock); | 4150 | raw_spin_unlock(&rq->lock); |
3629 | 4151 | ||
3630 | perf_event_task_tick(curr); | 4152 | perf_event_task_tick(); |
3631 | 4153 | ||
3632 | #ifdef CONFIG_SMP | 4154 | #ifdef CONFIG_SMP |
3633 | rq->idle_at_tick = idle_cpu(cpu); | 4155 | rq->idle_at_tick = idle_cpu(cpu); |
@@ -3733,19 +4255,12 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3733 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4255 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3734 | 4256 | ||
3735 | schedstat_inc(this_rq(), sched_count); | 4257 | schedstat_inc(this_rq(), sched_count); |
3736 | #ifdef CONFIG_SCHEDSTATS | ||
3737 | if (unlikely(prev->lock_depth >= 0)) { | ||
3738 | schedstat_inc(this_rq(), bkl_count); | ||
3739 | schedstat_inc(prev, sched_info.bkl_count); | ||
3740 | } | ||
3741 | #endif | ||
3742 | } | 4258 | } |
3743 | 4259 | ||
3744 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 4260 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
3745 | { | 4261 | { |
3746 | if (prev->se.on_rq) | 4262 | if (prev->on_rq || rq->skip_clock_update < 0) |
3747 | update_rq_clock(rq); | 4263 | update_rq_clock(rq); |
3748 | rq->skip_clock_update = 0; | ||
3749 | prev->sched_class->put_prev_task(rq, prev); | 4264 | prev->sched_class->put_prev_task(rq, prev); |
3750 | } | 4265 | } |
3751 | 4266 | ||
@@ -3776,17 +4291,13 @@ pick_next_task(struct rq *rq) | |||
3776 | } | 4291 | } |
3777 | */ | 4292 | */ |
3778 | 4293 | ||
3779 | class = sched_class_highest; | 4294 | for_each_class(class) { |
3780 | for ( ; ; ) { | ||
3781 | p = class->pick_next_task(rq); | 4295 | p = class->pick_next_task(rq); |
3782 | if (p) | 4296 | if (p) |
3783 | return p; | 4297 | return p; |
3784 | /* | ||
3785 | * Will never be NULL as the idle class always | ||
3786 | * returns a non-NULL p: | ||
3787 | */ | ||
3788 | class = class->next; | ||
3789 | } | 4298 | } |
4299 | |||
4300 | BUG(); /* the idle class will always have a runnable task */ | ||
3790 | } | 4301 | } |
3791 | 4302 | ||
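pick_next_task() now walks the class list with for_each_class() instead of the old open-coded endless loop, relying on the idle class always returning a task and keeping a BUG() backstop after the walk. A standalone sketch of walking a priority-ordered list of "classes" until one yields work; the three classes and their pick functions are invented for the example.

/* Walking scheduling classes in priority order until one has work.
 * The classes and pick functions below are stand-ins. */
#include <stdio.h>
#include <stddef.h>
#include <assert.h>

struct rq;      /* opaque in this sketch */

struct sched_class {
        const char *name;
        int (*pick_next)(struct rq *rq);        /* >0: task id, 0: none */
        const struct sched_class *next;
};

static int pick_rt(struct rq *rq)   { (void)rq; return 0; }  /* empty  */
static int pick_fair(struct rq *rq) { (void)rq; return 0; }  /* empty  */
static int pick_idle(struct rq *rq) { (void)rq; return 99; } /* always */

static const struct sched_class idle_class = { "idle", pick_idle, NULL };
static const struct sched_class fair_class = { "fair", pick_fair, &idle_class };
static const struct sched_class rt_class   = { "rt",   pick_rt,   &fair_class };

#define for_each_class(class) \
        for (class = &rt_class; class; class = class->next)

static int pick_next_task(struct rq *rq)
{
        const struct sched_class *class;
        int task;

        for_each_class(class) {
                task = class->pick_next(rq);
                if (task)
                        return task;
        }
        assert(0);      /* the idle class always has a runnable task */
        return -1;      /* unreachable */
}

int main(void)
{
        printf("picked task %d\n", pick_next_task(NULL));
        return 0;
}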
3792 | /* | 4303 | /* |
@@ -3807,8 +4318,10 @@ need_resched: | |||
3807 | rcu_note_context_switch(cpu); | 4318 | rcu_note_context_switch(cpu); |
3808 | prev = rq->curr; | 4319 | prev = rq->curr; |
3809 | 4320 | ||
3810 | release_kernel_lock(prev); | 4321 | /* LITMUS^RT: quickly re-evaluate the scheduling decision |
3811 | need_resched_nonpreemptible: | 4322 | * if the previous one is no longer valid after CTX. |
4323 | */ | ||
4324 | litmus_need_resched_nonpreemptible: | ||
3812 | TS_SCHED_START; | 4325 | TS_SCHED_START; |
3813 | sched_trace_task_switch_away(prev); | 4326 | sched_trace_task_switch_away(prev); |
3814 | 4327 | ||
@@ -3818,18 +4331,19 @@ need_resched_nonpreemptible: | |||
3818 | hrtick_clear(rq); | 4331 | hrtick_clear(rq); |
3819 | 4332 | ||
3820 | raw_spin_lock_irq(&rq->lock); | 4333 | raw_spin_lock_irq(&rq->lock); |
3821 | clear_tsk_need_resched(prev); | ||
3822 | 4334 | ||
3823 | switch_count = &prev->nivcsw; | 4335 | switch_count = &prev->nivcsw; |
3824 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 4336 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3825 | if (unlikely(signal_pending_state(prev->state, prev))) { | 4337 | if (unlikely(signal_pending_state(prev->state, prev))) { |
3826 | prev->state = TASK_RUNNING; | 4338 | prev->state = TASK_RUNNING; |
3827 | } else { | 4339 | } else { |
4340 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
4341 | prev->on_rq = 0; | ||
4342 | |||
3828 | /* | 4343 | /* |
3829 | * If a worker is going to sleep, notify and | 4344 | * If a worker went to sleep, notify and ask workqueue |
3830 | * ask workqueue whether it wants to wake up a | 4345 | * whether it wants to wake up a task to maintain |
3831 | * task to maintain concurrency. If so, wake | 4346 | * concurrency. |
3832 | * up the task. | ||
3833 | */ | 4347 | */ |
3834 | if (prev->flags & PF_WQ_WORKER) { | 4348 | if (prev->flags & PF_WQ_WORKER) { |
3835 | struct task_struct *to_wakeup; | 4349 | struct task_struct *to_wakeup; |
@@ -3838,7 +4352,16 @@ need_resched_nonpreemptible: | |||
3838 | if (to_wakeup) | 4352 | if (to_wakeup) |
3839 | try_to_wake_up_local(to_wakeup); | 4353 | try_to_wake_up_local(to_wakeup); |
3840 | } | 4354 | } |
3841 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 4355 | |
4356 | /* | ||
4357 | * If we are going to sleep and we have plugged IO | ||
4358 | * queued, make sure to submit it to avoid deadlocks. | ||
4359 | */ | ||
4360 | if (blk_needs_flush_plug(prev)) { | ||
4361 | raw_spin_unlock(&rq->lock); | ||
4362 | blk_schedule_flush_plug(prev); | ||
4363 | raw_spin_lock(&rq->lock); | ||
4364 | } | ||
3842 | } | 4365 | } |
3843 | switch_count = &prev->nvcsw; | 4366 | switch_count = &prev->nvcsw; |
3844 | } | 4367 | } |
@@ -3850,11 +4373,10 @@ need_resched_nonpreemptible: | |||
3850 | 4373 | ||
3851 | put_prev_task(rq, prev); | 4374 | put_prev_task(rq, prev); |
3852 | next = pick_next_task(rq); | 4375 | next = pick_next_task(rq); |
4376 | clear_tsk_need_resched(prev); | ||
4377 | rq->skip_clock_update = 0; | ||
3853 | 4378 | ||
3854 | if (likely(prev != next)) { | 4379 | if (likely(prev != next)) { |
3855 | sched_info_switch(prev, next); | ||
3856 | perf_event_task_sched_out(prev, next); | ||
3857 | |||
3858 | rq->nr_switches++; | 4380 | rq->nr_switches++; |
3859 | rq->curr = next; | 4381 | rq->curr = next; |
3860 | ++*switch_count; | 4382 | ++*switch_count; |
@@ -3880,8 +4402,8 @@ need_resched_nonpreemptible: | |||
3880 | 4402 | ||
3881 | post_schedule(rq); | 4403 | post_schedule(rq); |
3882 | 4404 | ||
3883 | if (sched_state_validate_switch() || unlikely(reacquire_kernel_lock(prev))) | 4405 | if (sched_state_validate_switch()) |
3884 | goto need_resched_nonpreemptible; | 4406 | goto litmus_need_resched_nonpreemptible; |
3885 | 4407 | ||
3886 | preempt_enable_no_resched(); | 4408 | preempt_enable_no_resched(); |
3887 | if (need_resched()) | 4409 | if (need_resched()) |
@@ -3892,70 +4414,53 @@ need_resched_nonpreemptible: | |||
3892 | EXPORT_SYMBOL(schedule); | 4414 | EXPORT_SYMBOL(schedule); |
3893 | 4415 | ||
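After the BKL removal, the LITMUS^RT port keeps a label inside schedule() (litmus_need_resched_nonpreemptible) so a decision that went stale during the context switch can be redone without becoming preemptible again; sched_state_validate_switch() decides whether to loop. A schematic of that "pick, switch, validate, maybe redo" control shape with stubbed decisions; this is only the control flow, not the kernel's schedule().

/* Schematic of "pick, switch, validate, maybe redo" (stubs only). */
#include <stdio.h>

static int stale_once = 1;              /* pretend the first decision goes stale */

static int pick_next(void)              { static int t; return ++t; }
static void context_switch(int task)    { printf("switch to %d\n", task); }
static int switch_went_stale(void)      { return stale_once-- > 0; }

static void schedule_once(void)
{
        int next;
resched:
        next = pick_next();
        context_switch(next);
        if (switch_went_stale())        /* decision invalidated during switch */
                goto resched;           /* redo without becoming preemptible */
}

int main(void)
{
        schedule_once();
        return 0;
}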
3894 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4416 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
4417 | |||
4418 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | ||
4419 | { | ||
4420 | bool ret = false; | ||
4421 | |||
4422 | rcu_read_lock(); | ||
4423 | if (lock->owner != owner) | ||
4424 | goto fail; | ||
4425 | |||
4426 | /* | ||
4427 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
4428 | * lock->owner still matches owner, if that fails, owner might | ||
4429 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
4430 | * ensures the memory stays valid. | ||
4431 | */ | ||
4432 | barrier(); | ||
4433 | |||
4434 | ret = owner->on_cpu; | ||
4435 | fail: | ||
4436 | rcu_read_unlock(); | ||
4437 | |||
4438 | return ret; | ||
4439 | } | ||
4440 | |||
3895 | /* | 4441 | /* |
3896 | * Look out! "owner" is an entirely speculative pointer | 4442 | * Look out! "owner" is an entirely speculative pointer |
3897 | * access and not reliable. | 4443 | * access and not reliable. |
3898 | */ | 4444 | */ |
3899 | int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | 4445 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) |
3900 | { | 4446 | { |
3901 | unsigned int cpu; | ||
3902 | struct rq *rq; | ||
3903 | |||
3904 | if (!sched_feat(OWNER_SPIN)) | 4447 | if (!sched_feat(OWNER_SPIN)) |
3905 | return 0; | 4448 | return 0; |
3906 | 4449 | ||
3907 | #ifdef CONFIG_DEBUG_PAGEALLOC | 4450 | while (owner_running(lock, owner)) { |
3908 | /* | 4451 | if (need_resched()) |
3909 | * Need to access the cpu field knowing that | 4452 | return 0; |
3910 | * DEBUG_PAGEALLOC could have unmapped it if | ||
3911 | * the mutex owner just released it and exited. | ||
3912 | */ | ||
3913 | if (probe_kernel_address(&owner->cpu, cpu)) | ||
3914 | return 0; | ||
3915 | #else | ||
3916 | cpu = owner->cpu; | ||
3917 | #endif | ||
3918 | 4453 | ||
3919 | /* | 4454 | arch_mutex_cpu_relax(); |
3920 | * Even if the access succeeded (likely case), | 4455 | } |
3921 | * the cpu field may no longer be valid. | ||
3922 | */ | ||
3923 | if (cpu >= nr_cpumask_bits) | ||
3924 | return 0; | ||
3925 | 4456 | ||
3926 | /* | 4457 | /* |
3927 | * We need to validate that we can do a | 4458 | * If the owner changed to another task there is likely |
3928 | * get_cpu() and that we have the percpu area. | 4459 | * heavy contention, stop spinning. |
3929 | */ | 4460 | */ |
3930 | if (!cpu_online(cpu)) | 4461 | if (lock->owner) |
3931 | return 0; | 4462 | return 0; |
3932 | 4463 | ||
3933 | rq = cpu_rq(cpu); | ||
3934 | |||
3935 | for (;;) { | ||
3936 | /* | ||
3937 | * Owner changed, break to re-assess state. | ||
3938 | */ | ||
3939 | if (lock->owner != owner) { | ||
3940 | /* | ||
3941 | * If the lock has switched to a different owner, | ||
3942 | * we likely have heavy contention. Return 0 to quit | ||
3943 | * optimistic spinning and not contend further: | ||
3944 | */ | ||
3945 | if (lock->owner) | ||
3946 | return 0; | ||
3947 | break; | ||
3948 | } | ||
3949 | |||
3950 | /* | ||
3951 | * Is that owner really running on that cpu? | ||
3952 | */ | ||
3953 | if (task_thread_info(rq->curr) != owner || need_resched()) | ||
3954 | return 0; | ||
3955 | |||
3956 | cpu_relax(); | ||
3957 | } | ||
3958 | |||
3959 | return 1; | 4464 | return 1; |
3960 | } | 4465 | } |
3961 | #endif | 4466 | #endif |
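The rewritten mutex_spin_on_owner() spins only while the recorded owner is unchanged and still on a CPU, and gives up as soon as the spinner itself needs to reschedule or the lock changes hands. A user-space approximation of that adaptive-spin loop using C11 atomics; the structure mirrors the loop above, but the RCU protection, need_resched() and the arch relax hint are stubbed out.

/* Adaptive "spin while the owner is running" loop, approximated with C11
 * atomics. RCU, need_resched() and arch_mutex_cpu_relax() are stubbed. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct owner { atomic_bool on_cpu; };
struct lock  { _Atomic(struct owner *) owner; };

static bool need_resched(void) { return false; }    /* stub */
static void cpu_relax(void)    { /* pause hint would go here */ }

static bool owner_running(struct lock *l, struct owner *owner)
{
        if (atomic_load(&l->owner) != owner)    /* owner changed under us */
                return false;
        return atomic_load(&owner->on_cpu);
}

static int spin_on_owner(struct lock *l, struct owner *owner)
{
        while (owner_running(l, owner)) {
                if (need_resched())
                        return 0;               /* stop: we should yield */
                cpu_relax();
        }
        /* Owner changed to another task: likely heavy contention, stop. */
        if (atomic_load(&l->owner))
                return 0;
        return 1;                               /* lock looks free, keep trying */
}

int main(void)
{
        struct owner o = { .on_cpu = false };
        struct lock  l = { .owner = &o };
        /* Owner off-CPU and lock still held -> give up spinning (prints 0). */
        printf("spin result: %d\n", spin_on_owner(&l, &o));
        return 0;
}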
@@ -4085,6 +4590,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | |||
4085 | { | 4590 | { |
4086 | __wake_up_common(q, mode, 1, 0, key); | 4591 | __wake_up_common(q, mode, 1, 0, key); |
4087 | } | 4592 | } |
4593 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
4088 | 4594 | ||
4089 | /** | 4595 | /** |
4090 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | 4596 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. |
@@ -4276,7 +4782,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
4276 | * This waits for either a completion of a specific task to be signaled or for a | 4782 | * This waits for either a completion of a specific task to be signaled or for a |
4277 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 4783 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
4278 | */ | 4784 | */ |
4279 | unsigned long __sched | 4785 | long __sched |
4280 | wait_for_completion_interruptible_timeout(struct completion *x, | 4786 | wait_for_completion_interruptible_timeout(struct completion *x, |
4281 | unsigned long timeout) | 4787 | unsigned long timeout) |
4282 | { | 4788 | { |
@@ -4309,7 +4815,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
4309 | * signaled or for a specified timeout to expire. It can be | 4815 | * signaled or for a specified timeout to expire. It can be |
4310 | * interrupted by a kill signal. The timeout is in jiffies. | 4816 | * interrupted by a kill signal. The timeout is in jiffies. |
4311 | */ | 4817 | */ |
4312 | unsigned long __sched | 4818 | long __sched |
4313 | wait_for_completion_killable_timeout(struct completion *x, | 4819 | wait_for_completion_killable_timeout(struct completion *x, |
4314 | unsigned long timeout) | 4820 | unsigned long timeout) |
4315 | { | 4821 | { |
@@ -4425,18 +4931,18 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
4425 | */ | 4931 | */ |
4426 | void rt_mutex_setprio(struct task_struct *p, int prio) | 4932 | void rt_mutex_setprio(struct task_struct *p, int prio) |
4427 | { | 4933 | { |
4428 | unsigned long flags; | ||
4429 | int oldprio, on_rq, running; | 4934 | int oldprio, on_rq, running; |
4430 | struct rq *rq; | 4935 | struct rq *rq; |
4431 | const struct sched_class *prev_class; | 4936 | const struct sched_class *prev_class; |
4432 | 4937 | ||
4433 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4938 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4434 | 4939 | ||
4435 | rq = task_rq_lock(p, &flags); | 4940 | rq = __task_rq_lock(p); |
4436 | 4941 | ||
4942 | trace_sched_pi_setprio(p, prio); | ||
4437 | oldprio = p->prio; | 4943 | oldprio = p->prio; |
4438 | prev_class = p->sched_class; | 4944 | prev_class = p->sched_class; |
4439 | on_rq = p->se.on_rq; | 4945 | on_rq = p->on_rq; |
4440 | running = task_current(rq, p); | 4946 | running = task_current(rq, p); |
4441 | if (on_rq) | 4947 | if (on_rq) |
4442 | dequeue_task(rq, p, 0); | 4948 | dequeue_task(rq, p, 0); |
@@ -4452,12 +4958,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4452 | 4958 | ||
4453 | if (running) | 4959 | if (running) |
4454 | p->sched_class->set_curr_task(rq); | 4960 | p->sched_class->set_curr_task(rq); |
4455 | if (on_rq) { | 4961 | if (on_rq) |
4456 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4962 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
4457 | 4963 | ||
4458 | check_class_changed(rq, p, prev_class, oldprio, running); | 4964 | check_class_changed(rq, p, prev_class, oldprio); |
4459 | } | 4965 | __task_rq_unlock(rq); |
4460 | task_rq_unlock(rq, &flags); | ||
4461 | } | 4966 | } |
4462 | 4967 | ||
4463 | #endif | 4968 | #endif |
@@ -4485,7 +4990,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4485 | p->static_prio = NICE_TO_PRIO(nice); | 4990 | p->static_prio = NICE_TO_PRIO(nice); |
4486 | goto out_unlock; | 4991 | goto out_unlock; |
4487 | } | 4992 | } |
4488 | on_rq = p->se.on_rq; | 4993 | on_rq = p->on_rq; |
4489 | if (on_rq) | 4994 | if (on_rq) |
4490 | dequeue_task(rq, p, 0); | 4995 | dequeue_task(rq, p, 0); |
4491 | 4996 | ||
@@ -4505,7 +5010,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4505 | resched_task(rq->curr); | 5010 | resched_task(rq->curr); |
4506 | } | 5011 | } |
4507 | out_unlock: | 5012 | out_unlock: |
4508 | task_rq_unlock(rq, &flags); | 5013 | task_rq_unlock(rq, p, &flags); |
4509 | } | 5014 | } |
4510 | EXPORT_SYMBOL(set_user_nice); | 5015 | EXPORT_SYMBOL(set_user_nice); |
4511 | 5016 | ||
@@ -4619,8 +5124,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
4619 | static void | 5124 | static void |
4620 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 5125 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
4621 | { | 5126 | { |
4622 | BUG_ON(p->se.on_rq); | ||
4623 | |||
4624 | p->policy = policy; | 5127 | p->policy = policy; |
4625 | p->rt_priority = prio; | 5128 | p->rt_priority = prio; |
4626 | p->normal_prio = normal_prio(p); | 5129 | p->normal_prio = normal_prio(p); |
@@ -4645,14 +5148,17 @@ static bool check_same_owner(struct task_struct *p) | |||
4645 | 5148 | ||
4646 | rcu_read_lock(); | 5149 | rcu_read_lock(); |
4647 | pcred = __task_cred(p); | 5150 | pcred = __task_cred(p); |
4648 | match = (cred->euid == pcred->euid || | 5151 | if (cred->user->user_ns == pcred->user->user_ns) |
4649 | cred->euid == pcred->uid); | 5152 | match = (cred->euid == pcred->euid || |
5153 | cred->euid == pcred->uid); | ||
5154 | else | ||
5155 | match = false; | ||
4650 | rcu_read_unlock(); | 5156 | rcu_read_unlock(); |
4651 | return match; | 5157 | return match; |
4652 | } | 5158 | } |
4653 | 5159 | ||
4654 | static int __sched_setscheduler(struct task_struct *p, int policy, | 5160 | static int __sched_setscheduler(struct task_struct *p, int policy, |
4655 | struct sched_param *param, bool user) | 5161 | const struct sched_param *param, bool user) |
4656 | { | 5162 | { |
4657 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 5163 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4658 | unsigned long flags; | 5164 | unsigned long flags; |
@@ -4708,12 +5214,15 @@ recheck: | |||
4708 | param->sched_priority > rlim_rtprio) | 5214 | param->sched_priority > rlim_rtprio) |
4709 | return -EPERM; | 5215 | return -EPERM; |
4710 | } | 5216 | } |
5217 | |||
4711 | /* | 5218 | /* |
4712 | * Like positive nice levels, dont allow tasks to | 5219 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
4713 | * move out of SCHED_IDLE either: | 5220 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
4714 | */ | 5221 | */ |
4715 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | 5222 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { |
4716 | return -EPERM; | 5223 | if (!can_nice(p, TASK_NICE(p))) |
5224 | return -EPERM; | ||
5225 | } | ||
4717 | 5226 | ||
4718 | /* can't change other user's priorities */ | 5227 | /* can't change other user's priorities */ |
4719 | if (!check_same_owner(p)) | 5228 | if (!check_same_owner(p)) |
@@ -4725,7 +5234,7 @@ recheck: | |||
4725 | } | 5234 | } |
4726 | 5235 | ||
4727 | if (user) { | 5236 | if (user) { |
4728 | retval = security_task_setscheduler(p, policy, param); | 5237 | retval = security_task_setscheduler(p); |
4729 | if (retval) | 5238 | if (retval) |
4730 | return retval; | 5239 | return retval; |
4731 | } | 5240 | } |
@@ -4739,13 +5248,30 @@ recheck: | |||
4739 | /* | 5248 | /* |
4740 | * make sure no PI-waiters arrive (or leave) while we are | 5249 | * make sure no PI-waiters arrive (or leave) while we are |
4741 | * changing the priority of the task: | 5250 | * changing the priority of the task: |
5251 | * | ||
5252 | * To be able to change p->policy safely, the appropriate | ||
5253 | * runqueue lock must be held. | ||
4742 | */ | 5254 | */ |
4743 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 5255 | rq = task_rq_lock(p, &flags); |
5256 | |||
4744 | /* | 5257 | /* |
4745 | * To be able to change p->policy safely, the apropriate | 5258 | * Changing the policy of the stop threads is a very bad idea |
4746 | * runqueue lock must be held. | ||
4747 | */ | 5259 | */ |
4748 | rq = __task_rq_lock(p); | 5260 | if (p == rq->stop) { |
5261 | task_rq_unlock(rq, p, &flags); | ||
5262 | return -EINVAL; | ||
5263 | } | ||
5264 | |||
5265 | /* | ||
5266 | * If not changing anything there's no need to proceed further: | ||
5267 | */ | ||
5268 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | ||
5269 | param->sched_priority == p->rt_priority))) { | ||
5270 | |||
5271 | __task_rq_unlock(rq); | ||
5272 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5273 | return 0; | ||
5274 | } | ||
4749 | 5275 | ||
4750 | #ifdef CONFIG_RT_GROUP_SCHED | 5276 | #ifdef CONFIG_RT_GROUP_SCHED |
4751 | if (user) { | 5277 | if (user) { |
@@ -4754,9 +5280,9 @@ recheck: | |||
4754 | * assigned. | 5280 | * assigned. |
4755 | */ | 5281 | */ |
4756 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 5282 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
4757 | task_group(p)->rt_bandwidth.rt_runtime == 0) { | 5283 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
4758 | __task_rq_unlock(rq); | 5284 | !task_group_is_autogroup(task_group(p))) { |
4759 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5285 | task_rq_unlock(rq, p, &flags); |
4760 | return -EPERM; | 5286 | return -EPERM; |
4761 | } | 5287 | } |
4762 | } | 5288 | } |
@@ -4765,11 +5291,10 @@ recheck: | |||
4765 | /* recheck policy now with rq lock held */ | 5291 | /* recheck policy now with rq lock held */ |
4766 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 5292 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4767 | policy = oldpolicy = -1; | 5293 | policy = oldpolicy = -1; |
4768 | __task_rq_unlock(rq); | 5294 | task_rq_unlock(rq, p, &flags); |
4769 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4770 | goto recheck; | 5295 | goto recheck; |
4771 | } | 5296 | } |
4772 | on_rq = p->se.on_rq; | 5297 | on_rq = p->on_rq; |
4773 | running = task_current(rq, p); | 5298 | running = task_current(rq, p); |
4774 | if (on_rq) | 5299 | if (on_rq) |
4775 | deactivate_task(rq, p, 0); | 5300 | deactivate_task(rq, p, 0); |
@@ -4793,13 +5318,11 @@ recheck: | |||
4793 | 5318 | ||
4794 | if (running) | 5319 | if (running) |
4795 | p->sched_class->set_curr_task(rq); | 5320 | p->sched_class->set_curr_task(rq); |
4796 | if (on_rq) { | 5321 | if (on_rq) |
4797 | activate_task(rq, p, 0); | 5322 | activate_task(rq, p, 0); |
4798 | 5323 | ||
4799 | check_class_changed(rq, p, prev_class, oldprio, running); | 5324 | check_class_changed(rq, p, prev_class, oldprio); |
4800 | } | 5325 | task_rq_unlock(rq, p, &flags); |
4801 | __task_rq_unlock(rq); | ||
4802 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4803 | 5326 | ||
4804 | rt_mutex_adjust_pi(p); | 5327 | rt_mutex_adjust_pi(p); |
4805 | 5328 | ||
@@ -4815,7 +5338,7 @@ recheck: | |||
4815 | * NOTE that the task may be already dead. | 5338 | * NOTE that the task may be already dead. |
4816 | */ | 5339 | */ |
4817 | int sched_setscheduler(struct task_struct *p, int policy, | 5340 | int sched_setscheduler(struct task_struct *p, int policy, |
4818 | struct sched_param *param) | 5341 | const struct sched_param *param) |
4819 | { | 5342 | { |
4820 | return __sched_setscheduler(p, policy, param, true); | 5343 | return __sched_setscheduler(p, policy, param, true); |
4821 | } | 5344 | } |
@@ -4833,7 +5356,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
4833 | * but our caller might not have that capability. | 5356 | * but our caller might not have that capability. |
4834 | */ | 5357 | */ |
4835 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 5358 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
4836 | struct sched_param *param) | 5359 | const struct sched_param *param) |
4837 | { | 5360 | { |
4838 | return __sched_setscheduler(p, policy, param, false); | 5361 | return __sched_setscheduler(p, policy, param, false); |
4839 | } | 5362 | } |
@@ -4980,16 +5503,16 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4980 | goto out_free_cpus_allowed; | 5503 | goto out_free_cpus_allowed; |
4981 | } | 5504 | } |
4982 | retval = -EPERM; | 5505 | retval = -EPERM; |
4983 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5506 | if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) |
4984 | goto out_unlock; | 5507 | goto out_unlock; |
4985 | 5508 | ||
4986 | retval = security_task_setscheduler(p, 0, NULL); | 5509 | retval = security_task_setscheduler(p); |
4987 | if (retval) | 5510 | if (retval) |
4988 | goto out_unlock; | 5511 | goto out_unlock; |
4989 | 5512 | ||
4990 | cpuset_cpus_allowed(p, cpus_allowed); | 5513 | cpuset_cpus_allowed(p, cpus_allowed); |
4991 | cpumask_and(new_mask, in_mask, cpus_allowed); | 5514 | cpumask_and(new_mask, in_mask, cpus_allowed); |
4992 | again: | 5515 | again: |
4993 | retval = set_cpus_allowed_ptr(p, new_mask); | 5516 | retval = set_cpus_allowed_ptr(p, new_mask); |
4994 | 5517 | ||
4995 | if (!retval) { | 5518 | if (!retval) { |
@@ -5051,7 +5574,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5051 | { | 5574 | { |
5052 | struct task_struct *p; | 5575 | struct task_struct *p; |
5053 | unsigned long flags; | 5576 | unsigned long flags; |
5054 | struct rq *rq; | ||
5055 | int retval; | 5577 | int retval; |
5056 | 5578 | ||
5057 | get_online_cpus(); | 5579 | get_online_cpus(); |
@@ -5066,9 +5588,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5066 | if (retval) | 5588 | if (retval) |
5067 | goto out_unlock; | 5589 | goto out_unlock; |
5068 | 5590 | ||
5069 | rq = task_rq_lock(p, &flags); | 5591 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
5070 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 5592 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
5071 | task_rq_unlock(rq, &flags); | 5593 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
5072 | 5594 | ||
5073 | out_unlock: | 5595 | out_unlock: |
5074 | rcu_read_unlock(); | 5596 | rcu_read_unlock(); |
@@ -5215,6 +5737,67 @@ void __sched yield(void) | |||
5215 | } | 5737 | } |
5216 | EXPORT_SYMBOL(yield); | 5738 | EXPORT_SYMBOL(yield); |
5217 | 5739 | ||
5740 | /** | ||
5741 | * yield_to - yield the current processor to another thread in | ||
5742 | * your thread group, or accelerate that thread toward the | ||
5743 | * processor it's on. | ||
5744 | * @p: target task | ||
5745 | * @preempt: whether task preemption is allowed or not | ||
5746 | * | ||
5747 | * It's the caller's job to ensure that the target task struct | ||
5748 | * can't go away on us before we can do any checks. | ||
5749 | * | ||
5750 | * Returns true if we indeed boosted the target task. | ||
5751 | */ | ||
5752 | bool __sched yield_to(struct task_struct *p, bool preempt) | ||
5753 | { | ||
5754 | struct task_struct *curr = current; | ||
5755 | struct rq *rq, *p_rq; | ||
5756 | unsigned long flags; | ||
5757 | bool yielded = 0; | ||
5758 | |||
5759 | local_irq_save(flags); | ||
5760 | rq = this_rq(); | ||
5761 | |||
5762 | again: | ||
5763 | p_rq = task_rq(p); | ||
5764 | double_rq_lock(rq, p_rq); | ||
5765 | while (task_rq(p) != p_rq) { | ||
5766 | double_rq_unlock(rq, p_rq); | ||
5767 | goto again; | ||
5768 | } | ||
5769 | |||
5770 | if (!curr->sched_class->yield_to_task) | ||
5771 | goto out; | ||
5772 | |||
5773 | if (curr->sched_class != p->sched_class) | ||
5774 | goto out; | ||
5775 | |||
5776 | if (task_running(p_rq, p) || p->state) | ||
5777 | goto out; | ||
5778 | |||
5779 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | ||
5780 | if (yielded) { | ||
5781 | schedstat_inc(rq, yld_count); | ||
5782 | /* | ||
5783 | * Make p's CPU reschedule; pick_next_entity takes care of | ||
5784 | * fairness. | ||
5785 | */ | ||
5786 | if (preempt && rq != p_rq) | ||
5787 | resched_task(p_rq->curr); | ||
5788 | } | ||
5789 | |||
5790 | out: | ||
5791 | double_rq_unlock(rq, p_rq); | ||
5792 | local_irq_restore(flags); | ||
5793 | |||
5794 | if (yielded) | ||
5795 | schedule(); | ||
5796 | |||
5797 | return yielded; | ||
5798 | } | ||
5799 | EXPORT_SYMBOL_GPL(yield_to); | ||
5800 | |||
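The interesting part of yield_to() is its locking dance: take both runqueue locks, then re-check that the target did not migrate while we were acquiring them, and retry if it did. A sketch of that retry pattern with ordinary mutexes and a fake task_rq(); everything here is a stand-in, and the fixed lock ordering plays the role of double_rq_lock()'s deadlock avoidance.

/* The "lock both runqueues, then re-check the target didn't move" retry
 * from yield_to(), modelled with plain mutexes and a fake task_rq(). */
#include <pthread.h>
#include <stdio.h>

struct rq { pthread_mutex_t lock; };
static struct rq rqs[2] = {
        { PTHREAD_MUTEX_INITIALIZER }, { PTHREAD_MUTEX_INITIALIZER }
};

struct task { int cpu; };
static struct rq *task_rq(struct task *p) { return &rqs[p->cpu]; }

static void double_lock(struct rq *a, struct rq *b)
{
        if (a == b) { pthread_mutex_lock(&a->lock); return; }
        /* lock in a fixed order to avoid ABBA deadlocks */
        if (a < b) { pthread_mutex_lock(&a->lock); pthread_mutex_lock(&b->lock); }
        else       { pthread_mutex_lock(&b->lock); pthread_mutex_lock(&a->lock); }
}

static void double_unlock(struct rq *a, struct rq *b)
{
        pthread_mutex_unlock(&a->lock);
        if (a != b)
                pthread_mutex_unlock(&b->lock);
}

static void yield_to_sketch(struct rq *rq, struct task *p)
{
        struct rq *p_rq;
again:
        p_rq = task_rq(p);
        double_lock(rq, p_rq);
        if (task_rq(p) != p_rq) {       /* p migrated before we got the locks */
                double_unlock(rq, p_rq);
                goto again;
        }
        printf("both locks held, target still on its rq\n");
        double_unlock(rq, p_rq);
}

int main(void)
{
        struct task p = { 1 };
        yield_to_sketch(&rqs[0], &p);
        return 0;
}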
5218 | /* | 5801 | /* |
5219 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5802 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
5220 | * that process accounting knows that this is a task in IO wait state. | 5803 | * that process accounting knows that this is a task in IO wait state. |
@@ -5225,6 +5808,7 @@ void __sched io_schedule(void) | |||
5225 | 5808 | ||
5226 | delayacct_blkio_start(); | 5809 | delayacct_blkio_start(); |
5227 | atomic_inc(&rq->nr_iowait); | 5810 | atomic_inc(&rq->nr_iowait); |
5811 | blk_flush_plug(current); | ||
5228 | current->in_iowait = 1; | 5812 | current->in_iowait = 1; |
5229 | schedule(); | 5813 | schedule(); |
5230 | current->in_iowait = 0; | 5814 | current->in_iowait = 0; |
@@ -5240,6 +5824,7 @@ long __sched io_schedule_timeout(long timeout) | |||
5240 | 5824 | ||
5241 | delayacct_blkio_start(); | 5825 | delayacct_blkio_start(); |
5242 | atomic_inc(&rq->nr_iowait); | 5826 | atomic_inc(&rq->nr_iowait); |
5827 | blk_flush_plug(current); | ||
5243 | current->in_iowait = 1; | 5828 | current->in_iowait = 1; |
5244 | ret = schedule_timeout(timeout); | 5829 | ret = schedule_timeout(timeout); |
5245 | current->in_iowait = 0; | 5830 | current->in_iowait = 0; |
@@ -5330,7 +5915,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
5330 | 5915 | ||
5331 | rq = task_rq_lock(p, &flags); | 5916 | rq = task_rq_lock(p, &flags); |
5332 | time_slice = p->sched_class->get_rr_interval(rq, p); | 5917 | time_slice = p->sched_class->get_rr_interval(rq, p); |
5333 | task_rq_unlock(rq, &flags); | 5918 | task_rq_unlock(rq, p, &flags); |
5334 | 5919 | ||
5335 | rcu_read_unlock(); | 5920 | rcu_read_unlock(); |
5336 | jiffies_to_timespec(time_slice, &t); | 5921 | jiffies_to_timespec(time_slice, &t); |
@@ -5350,7 +5935,7 @@ void sched_show_task(struct task_struct *p) | |||
5350 | unsigned state; | 5935 | unsigned state; |
5351 | 5936 | ||
5352 | state = p->state ? __ffs(p->state) + 1 : 0; | 5937 | state = p->state ? __ffs(p->state) + 1 : 0; |
5353 | printk(KERN_INFO "%-13.13s %c", p->comm, | 5938 | printk(KERN_INFO "%-15.15s %c", p->comm, |
5354 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5939 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5355 | #if BITS_PER_LONG == 32 | 5940 | #if BITS_PER_LONG == 32 |
5356 | if (state == TASK_RUNNING) | 5941 | if (state == TASK_RUNNING) |
@@ -5388,7 +5973,7 @@ void show_state_filter(unsigned long state_filter) | |||
5388 | do_each_thread(g, p) { | 5973 | do_each_thread(g, p) { |
5389 | /* | 5974 | /* |
5390 | * reset the NMI-timeout, listing all files on a slow | 5975 | * reset the NMI-timeout, listing all files on a slow |
5391 | * console might take alot of time: | 5976 | * console might take a lot of time: |
5392 | */ | 5977 | */ |
5393 | touch_nmi_watchdog(); | 5978 | touch_nmi_watchdog(); |
5394 | if (!state_filter || (p->state & state_filter)) | 5979 | if (!state_filter || (p->state & state_filter)) |
@@ -5432,26 +6017,35 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5432 | idle->state = TASK_RUNNING; | 6017 | idle->state = TASK_RUNNING; |
5433 | idle->se.exec_start = sched_clock(); | 6018 | idle->se.exec_start = sched_clock(); |
5434 | 6019 | ||
5435 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 6020 | do_set_cpus_allowed(idle, cpumask_of(cpu)); |
6021 | /* | ||
6022 | * We're having a chicken and egg problem, even though we are | ||
6023 | * holding rq->lock, the cpu isn't yet set to this cpu so the | ||
6024 | * lockdep check in task_group() will fail. | ||
6025 | * | ||
6026 | * Similar case to sched_fork(). / Alternatively we could | ||
6027 | * use task_rq_lock() here and obtain the other rq->lock. | ||
6028 | * | ||
6029 | * Silence PROVE_RCU | ||
6030 | */ | ||
6031 | rcu_read_lock(); | ||
5436 | __set_task_cpu(idle, cpu); | 6032 | __set_task_cpu(idle, cpu); |
6033 | rcu_read_unlock(); | ||
5437 | 6034 | ||
5438 | rq->curr = rq->idle = idle; | 6035 | rq->curr = rq->idle = idle; |
5439 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 6036 | #if defined(CONFIG_SMP) |
5440 | idle->oncpu = 1; | 6037 | idle->on_cpu = 1; |
5441 | #endif | 6038 | #endif |
5442 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6039 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5443 | 6040 | ||
5444 | /* Set the preempt count _outside_ the spinlocks! */ | 6041 | /* Set the preempt count _outside_ the spinlocks! */ |
5445 | #if defined(CONFIG_PREEMPT) | ||
5446 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
5447 | #else | ||
5448 | task_thread_info(idle)->preempt_count = 0; | 6042 | task_thread_info(idle)->preempt_count = 0; |
5449 | #endif | 6043 | |
5450 | /* | 6044 | /* |
5451 | * The idle tasks have their own, simple scheduling class: | 6045 | * The idle tasks have their own, simple scheduling class: |
5452 | */ | 6046 | */ |
5453 | idle->sched_class = &idle_sched_class; | 6047 | idle->sched_class = &idle_sched_class; |
5454 | ftrace_graph_init_task(idle); | 6048 | ftrace_graph_init_idle_task(idle, cpu); |
5455 | } | 6049 | } |
5456 | 6050 | ||
5457 | /* | 6051 | /* |
@@ -5502,7 +6096,6 @@ static void update_sysctl(void) | |||
5502 | SET_SYSCTL(sched_min_granularity); | 6096 | SET_SYSCTL(sched_min_granularity); |
5503 | SET_SYSCTL(sched_latency); | 6097 | SET_SYSCTL(sched_latency); |
5504 | SET_SYSCTL(sched_wakeup_granularity); | 6098 | SET_SYSCTL(sched_wakeup_granularity); |
5505 | SET_SYSCTL(sched_shares_ratelimit); | ||
5506 | #undef SET_SYSCTL | 6099 | #undef SET_SYSCTL |
5507 | } | 6100 | } |
5508 | 6101 | ||
@@ -5512,6 +6105,16 @@ static inline void sched_init_granularity(void) | |||
5512 | } | 6105 | } |
5513 | 6106 | ||
5514 | #ifdef CONFIG_SMP | 6107 | #ifdef CONFIG_SMP |
6108 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | ||
6109 | { | ||
6110 | if (p->sched_class && p->sched_class->set_cpus_allowed) | ||
6111 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
6112 | else { | ||
6113 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
6114 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | ||
6115 | } | ||
6116 | } | ||
6117 | |||
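The affinity update itself is factored into do_set_cpus_allowed() so that init_idle() and set_cpus_allowed_ptr() share one path: defer to the class hook when it exists (the RT class needs to update its push/pull state), otherwise copy the mask and cache its weight. A small standalone model of that "optional per-class override, else generic update" shape; the mask type and class layout are toy versions.

/* "Use the class hook if present, else the generic update" shape of
 * do_set_cpus_allowed(); the mask type and classes are stand-ins. */
#include <stdio.h>
#include <stddef.h>

typedef unsigned long cpumask_t;            /* one bit per CPU, toy version */

struct task;
struct sched_class {
        void (*set_cpus_allowed)(struct task *p, cpumask_t new_mask);
};
struct task {
        const struct sched_class *class;
        cpumask_t cpus_allowed;
        int nr_cpus_allowed;
};

static int weight(cpumask_t m) { return __builtin_popcountl(m); }

static void do_set_cpus_allowed(struct task *p, cpumask_t new_mask)
{
        if (p->class && p->class->set_cpus_allowed) {
                p->class->set_cpus_allowed(p, new_mask);  /* class-specific */
                return;
        }
        p->cpus_allowed = new_mask;
        p->nr_cpus_allowed = weight(new_mask);
}

int main(void)
{
        struct task p = { NULL, ~0UL, 64 };
        do_set_cpus_allowed(&p, 0x5UL);     /* CPUs 0 and 2 */
        printf("allowed on %d cpus\n", p.nr_cpus_allowed);
        return 0;
}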
5515 | /* | 6118 | /* |
5516 | * This is how migration works: | 6119 | * This is how migration works: |
5517 | * | 6120 | * |
@@ -5542,52 +6145,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
5542 | unsigned int dest_cpu; | 6145 | unsigned int dest_cpu; |
5543 | int ret = 0; | 6146 | int ret = 0; |
5544 | 6147 | ||
5545 | /* | ||
5546 | * Serialize against TASK_WAKING so that ttwu() and wunt() can | ||
5547 | * drop the rq->lock and still rely on ->cpus_allowed. | ||
5548 | */ | ||
5549 | again: | ||
5550 | while (task_is_waking(p)) | ||
5551 | cpu_relax(); | ||
5552 | rq = task_rq_lock(p, &flags); | 6148 | rq = task_rq_lock(p, &flags); |
5553 | if (task_is_waking(p)) { | 6149 | |
5554 | task_rq_unlock(rq, &flags); | 6150 | if (cpumask_equal(&p->cpus_allowed, new_mask)) |
5555 | goto again; | 6151 | goto out; |
5556 | } | ||
5557 | 6152 | ||
5558 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 6153 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
5559 | ret = -EINVAL; | 6154 | ret = -EINVAL; |
5560 | goto out; | 6155 | goto out; |
5561 | } | 6156 | } |
5562 | 6157 | ||
5563 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | 6158 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { |
5564 | !cpumask_equal(&p->cpus_allowed, new_mask))) { | ||
5565 | ret = -EINVAL; | 6159 | ret = -EINVAL; |
5566 | goto out; | 6160 | goto out; |
5567 | } | 6161 | } |
5568 | 6162 | ||
5569 | if (p->sched_class->set_cpus_allowed) | 6163 | do_set_cpus_allowed(p, new_mask); |
5570 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
5571 | else { | ||
5572 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
5573 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | ||
5574 | } | ||
5575 | 6164 | ||
5576 | /* Can the task run on the task's current CPU? If so, we're done */ | 6165 | /* Can the task run on the task's current CPU? If so, we're done */ |
5577 | if (cpumask_test_cpu(task_cpu(p), new_mask)) | 6166 | if (cpumask_test_cpu(task_cpu(p), new_mask)) |
5578 | goto out; | 6167 | goto out; |
5579 | 6168 | ||
5580 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 6169 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5581 | if (migrate_task(p, dest_cpu)) { | 6170 | if (p->on_rq) { |
5582 | struct migration_arg arg = { p, dest_cpu }; | 6171 | struct migration_arg arg = { p, dest_cpu }; |
5583 | /* Need help from migration thread: drop lock and wait. */ | 6172 | /* Need help from migration thread: drop lock and wait. */ |
5584 | task_rq_unlock(rq, &flags); | 6173 | task_rq_unlock(rq, p, &flags); |
5585 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 6174 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
5586 | tlb_migrate_finish(p->mm); | 6175 | tlb_migrate_finish(p->mm); |
5587 | return 0; | 6176 | return 0; |
5588 | } | 6177 | } |
5589 | out: | 6178 | out: |
5590 | task_rq_unlock(rq, &flags); | 6179 | task_rq_unlock(rq, p, &flags); |
5591 | 6180 | ||
5592 | return ret; | 6181 | return ret; |
5593 | } | 6182 | } |
@@ -5615,6 +6204,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5615 | rq_src = cpu_rq(src_cpu); | 6204 | rq_src = cpu_rq(src_cpu); |
5616 | rq_dest = cpu_rq(dest_cpu); | 6205 | rq_dest = cpu_rq(dest_cpu); |
5617 | 6206 | ||
6207 | raw_spin_lock(&p->pi_lock); | ||
5618 | double_rq_lock(rq_src, rq_dest); | 6208 | double_rq_lock(rq_src, rq_dest); |
5619 | /* Already moved. */ | 6209 | /* Already moved. */ |
5620 | if (task_cpu(p) != src_cpu) | 6210 | if (task_cpu(p) != src_cpu) |
@@ -5627,7 +6217,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5627 | * If we're not on a rq, the next wake-up will ensure we're | 6217 | * If we're not on a rq, the next wake-up will ensure we're |
5628 | * placed properly. | 6218 | * placed properly. |
5629 | */ | 6219 | */ |
5630 | if (p->se.on_rq) { | 6220 | if (p->on_rq) { |
5631 | deactivate_task(rq_src, p, 0); | 6221 | deactivate_task(rq_src, p, 0); |
5632 | set_task_cpu(p, dest_cpu); | 6222 | set_task_cpu(p, dest_cpu); |
5633 | activate_task(rq_dest, p, 0); | 6223 | activate_task(rq_dest, p, 0); |
@@ -5637,6 +6227,7 @@ done: | |||
5637 | ret = 1; | 6227 | ret = 1; |
5638 | fail: | 6228 | fail: |
5639 | double_rq_unlock(rq_src, rq_dest); | 6229 | double_rq_unlock(rq_src, rq_dest); |
6230 | raw_spin_unlock(&p->pi_lock); | ||
5640 | return ret; | 6231 | return ret; |
5641 | } | 6232 | } |
5642 | 6233 | ||
@@ -5660,29 +6251,20 @@ static int migration_cpu_stop(void *data) | |||
5660 | } | 6251 | } |
5661 | 6252 | ||
5662 | #ifdef CONFIG_HOTPLUG_CPU | 6253 | #ifdef CONFIG_HOTPLUG_CPU |
6254 | |||
5663 | /* | 6255 | /* |
5664 | * Figure out where task on dead CPU should go, use force if necessary. | 6256 | * Ensures that the idle task is using init_mm right before its cpu goes |
6257 | * offline. | ||
5665 | */ | 6258 | */ |
5666 | void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 6259 | void idle_task_exit(void) |
5667 | { | 6260 | { |
5668 | struct rq *rq = cpu_rq(dead_cpu); | 6261 | struct mm_struct *mm = current->active_mm; |
5669 | int needs_cpu, uninitialized_var(dest_cpu); | ||
5670 | unsigned long flags; | ||
5671 | 6262 | ||
5672 | local_irq_save(flags); | 6263 | BUG_ON(cpu_online(smp_processor_id())); |
5673 | 6264 | ||
5674 | raw_spin_lock(&rq->lock); | 6265 | if (mm != &init_mm) |
5675 | needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); | 6266 | switch_mm(mm, &init_mm, current); |
5676 | if (needs_cpu) | 6267 | mmdrop(mm); |
5677 | dest_cpu = select_fallback_rq(dead_cpu, p); | ||
5678 | raw_spin_unlock(&rq->lock); | ||
5679 | /* | ||
5680 | * It can only fail if we race with set_cpus_allowed(), | ||
5681 | * in the racer should migrate the task anyway. | ||
5682 | */ | ||
5683 | if (needs_cpu) | ||
5684 | __migrate_task(p, dead_cpu, dest_cpu); | ||
5685 | local_irq_restore(flags); | ||
5686 | } | 6268 | } |
5687 | 6269 | ||
5688 | /* | 6270 | /* |
@@ -5695,128 +6277,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5695 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 6277 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5696 | { | 6278 | { |
5697 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | 6279 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
5698 | unsigned long flags; | ||
5699 | 6280 | ||
5700 | local_irq_save(flags); | ||
5701 | double_rq_lock(rq_src, rq_dest); | ||
5702 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 6281 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
5703 | rq_src->nr_uninterruptible = 0; | 6282 | rq_src->nr_uninterruptible = 0; |
5704 | double_rq_unlock(rq_src, rq_dest); | ||
5705 | local_irq_restore(flags); | ||
5706 | } | ||
5707 | |||
5708 | /* Run through task list and migrate tasks from the dead cpu. */ | ||
5709 | static void migrate_live_tasks(int src_cpu) | ||
5710 | { | ||
5711 | struct task_struct *p, *t; | ||
5712 | |||
5713 | read_lock(&tasklist_lock); | ||
5714 | |||
5715 | do_each_thread(t, p) { | ||
5716 | if (p == current) | ||
5717 | continue; | ||
5718 | |||
5719 | if (task_cpu(p) == src_cpu) | ||
5720 | move_task_off_dead_cpu(src_cpu, p); | ||
5721 | } while_each_thread(t, p); | ||
5722 | |||
5723 | read_unlock(&tasklist_lock); | ||
5724 | } | 6283 | } |
5725 | 6284 | ||
5726 | /* | 6285 | /* |
5727 | * Schedules idle task to be the next runnable task on current CPU. | 6286 | * remove the tasks which were accounted by rq from calc_load_tasks. |
5728 | * It does so by boosting its priority to highest possible. | ||
5729 | * Used by CPU offline code. | ||
5730 | */ | 6287 | */ |
5731 | void sched_idle_next(void) | 6288 | static void calc_global_load_remove(struct rq *rq) |
5732 | { | 6289 | { |
5733 | int this_cpu = smp_processor_id(); | 6290 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); |
5734 | struct rq *rq = cpu_rq(this_cpu); | 6291 | rq->calc_load_active = 0; |
5735 | struct task_struct *p = rq->idle; | ||
5736 | unsigned long flags; | ||
5737 | |||
5738 | /* cpu has to be offline */ | ||
5739 | BUG_ON(cpu_online(this_cpu)); | ||
5740 | |||
5741 | /* | ||
5742 | * Strictly not necessary since rest of the CPUs are stopped by now | ||
5743 | * and interrupts disabled on the current cpu. | ||
5744 | */ | ||
5745 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5746 | |||
5747 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
5748 | |||
5749 | activate_task(rq, p, 0); | ||
5750 | |||
5751 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5752 | } | 6292 | } |
5753 | 6293 | ||
5754 | /* | 6294 | /* |
5755 | * Ensures that the idle task is using init_mm right before its cpu goes | 6295 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
5756 | * offline. | 6296 | * try_to_wake_up()->select_task_rq(). |
6297 | * | ||
6298 | * Called with rq->lock held even though we're in stop_machine() and | ||
6299 | * there's no concurrency possible, we hold the required locks anyway | ||
6300 | * because of lock validation efforts. | ||
5757 | */ | 6301 | */ |
5758 | void idle_task_exit(void) | 6302 | static void migrate_tasks(unsigned int dead_cpu) |
5759 | { | ||
5760 | struct mm_struct *mm = current->active_mm; | ||
5761 | |||
5762 | BUG_ON(cpu_online(smp_processor_id())); | ||
5763 | |||
5764 | if (mm != &init_mm) | ||
5765 | switch_mm(mm, &init_mm, current); | ||
5766 | mmdrop(mm); | ||
5767 | } | ||
5768 | |||
5769 | /* called under rq->lock with disabled interrupts */ | ||
5770 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | ||
5771 | { | 6303 | { |
5772 | struct rq *rq = cpu_rq(dead_cpu); | 6304 | struct rq *rq = cpu_rq(dead_cpu); |
5773 | 6305 | struct task_struct *next, *stop = rq->stop; | |
5774 | /* Must be exiting, otherwise would be on tasklist. */ | 6306 | int dest_cpu; |
5775 | BUG_ON(!p->exit_state); | ||
5776 | |||
5777 | /* Cannot have done final schedule yet: would have vanished. */ | ||
5778 | BUG_ON(p->state == TASK_DEAD); | ||
5779 | |||
5780 | get_task_struct(p); | ||
5781 | 6307 | ||
5782 | /* | 6308 | /* |
5783 | * Drop lock around migration; if someone else moves it, | 6309 | * Fudge the rq selection such that the below task selection loop |
5784 | * that's OK. No task can be added to this CPU, so iteration is | 6310 | * doesn't get stuck on the currently eligible stop task. |
5785 | * fine. | 6311 | * |
6312 | * We're currently inside stop_machine() and the rq is either stuck | ||
6313 | * in the stop_machine_cpu_stop() loop, or we're executing this code, | ||
6314 | * either way we should never end up calling schedule() until we're | ||
6315 | * done here. | ||
5786 | */ | 6316 | */ |
5787 | raw_spin_unlock_irq(&rq->lock); | 6317 | rq->stop = NULL; |
5788 | move_task_off_dead_cpu(dead_cpu, p); | ||
5789 | raw_spin_lock_irq(&rq->lock); | ||
5790 | |||
5791 | put_task_struct(p); | ||
5792 | } | ||
5793 | |||
5794 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | ||
5795 | static void migrate_dead_tasks(unsigned int dead_cpu) | ||
5796 | { | ||
5797 | struct rq *rq = cpu_rq(dead_cpu); | ||
5798 | struct task_struct *next; | ||
5799 | 6318 | ||
5800 | for ( ; ; ) { | 6319 | for ( ; ; ) { |
5801 | if (!rq->nr_running) | 6320 | /* |
6321 | * There's this thread running, bail when that's the only | ||
6322 | * remaining thread. | ||
6323 | */ | ||
6324 | if (rq->nr_running == 1) | ||
5802 | break; | 6325 | break; |
6326 | |||
5803 | next = pick_next_task(rq); | 6327 | next = pick_next_task(rq); |
5804 | if (!next) | 6328 | BUG_ON(!next); |
5805 | break; | ||
5806 | next->sched_class->put_prev_task(rq, next); | 6329 | next->sched_class->put_prev_task(rq, next); |
5807 | migrate_dead(dead_cpu, next); | ||
5808 | 6330 | ||
6331 | /* Find suitable destination for @next, with force if needed. */ | ||
6332 | dest_cpu = select_fallback_rq(dead_cpu, next); | ||
6333 | raw_spin_unlock(&rq->lock); | ||
6334 | |||
6335 | __migrate_task(next, dead_cpu, dest_cpu); | ||
6336 | |||
6337 | raw_spin_lock(&rq->lock); | ||
5809 | } | 6338 | } |
5810 | } | ||
5811 | 6339 | ||
5812 | /* | 6340 | rq->stop = stop; |
5813 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
5814 | */ | ||
5815 | static void calc_global_load_remove(struct rq *rq) | ||
5816 | { | ||
5817 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | ||
5818 | rq->calc_load_active = 0; | ||
5819 | } | 6341 | } |
6342 | |||
5820 | #endif /* CONFIG_HOTPLUG_CPU */ | 6343 | #endif /* CONFIG_HOTPLUG_CPU */ |
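On CPU_DYING the new migrate_tasks() drains the dead runqueue by repeatedly picking the next task and pushing it to a fallback CPU, after temporarily hiding the stop task so the loop cannot keep selecting it; sleeping tasks are handled later by try_to_wake_up()->select_task_rq(). A toy drain loop over an array "runqueue"; the selection policy and fallback choice are placeholders, not the kernel's.

/* Toy version of draining a dead CPU's runqueue: keep picking a task and
 * pushing it elsewhere until only the current thread remains. */
#include <stdio.h>

#define MAX_TASKS 8

struct rq { int tasks[MAX_TASKS]; int nr_running; };

static int pick_next_task(struct rq *rq)       /* last entry, for simplicity */
{
        return rq->tasks[rq->nr_running - 1];
}

static void migrate_tasks(struct rq *dead, struct rq *fallback)
{
        while (dead->nr_running > 1) {          /* slot 0 is "current" */
                int next = pick_next_task(dead);
                dead->nr_running--;
                fallback->tasks[fallback->nr_running++] = next;
                printf("migrated task %d\n", next);
        }
}

int main(void)
{
        struct rq dead  = { { 0, 11, 12, 13 }, 4 };
        struct rq other = { { 0 }, 0 };
        migrate_tasks(&dead, &other);
        printf("left on dead cpu: %d\n", dead.nr_running);
        return 0;
}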
5821 | 6344 | ||
5822 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 6345 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
@@ -6026,15 +6549,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6026 | unsigned long flags; | 6549 | unsigned long flags; |
6027 | struct rq *rq = cpu_rq(cpu); | 6550 | struct rq *rq = cpu_rq(cpu); |
6028 | 6551 | ||
6029 | switch (action) { | 6552 | switch (action & ~CPU_TASKS_FROZEN) { |
6030 | 6553 | ||
6031 | case CPU_UP_PREPARE: | 6554 | case CPU_UP_PREPARE: |
6032 | case CPU_UP_PREPARE_FROZEN: | ||
6033 | rq->calc_load_update = calc_load_update; | 6555 | rq->calc_load_update = calc_load_update; |
6034 | break; | 6556 | break; |
6035 | 6557 | ||
6036 | case CPU_ONLINE: | 6558 | case CPU_ONLINE: |
6037 | case CPU_ONLINE_FROZEN: | ||
6038 | /* Update our root-domain */ | 6559 | /* Update our root-domain */ |
6039 | raw_spin_lock_irqsave(&rq->lock, flags); | 6560 | raw_spin_lock_irqsave(&rq->lock, flags); |
6040 | if (rq->rd) { | 6561 | if (rq->rd) { |
@@ -6046,33 +6567,26 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6046 | break; | 6567 | break; |
6047 | 6568 | ||
6048 | #ifdef CONFIG_HOTPLUG_CPU | 6569 | #ifdef CONFIG_HOTPLUG_CPU |
6049 | case CPU_DEAD: | ||
6050 | case CPU_DEAD_FROZEN: | ||
6051 | migrate_live_tasks(cpu); | ||
6052 | /* Idle task back to normal (off runqueue, low prio) */ | ||
6053 | raw_spin_lock_irq(&rq->lock); | ||
6054 | deactivate_task(rq, rq->idle, 0); | ||
6055 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | ||
6056 | rq->idle->sched_class = &idle_sched_class; | ||
6057 | migrate_dead_tasks(cpu); | ||
6058 | raw_spin_unlock_irq(&rq->lock); | ||
6059 | migrate_nr_uninterruptible(rq); | ||
6060 | BUG_ON(rq->nr_running != 0); | ||
6061 | calc_global_load_remove(rq); | ||
6062 | break; | ||
6063 | |||
6064 | case CPU_DYING: | 6570 | case CPU_DYING: |
6065 | case CPU_DYING_FROZEN: | 6571 | sched_ttwu_pending(); |
6066 | /* Update our root-domain */ | 6572 | /* Update our root-domain */ |
6067 | raw_spin_lock_irqsave(&rq->lock, flags); | 6573 | raw_spin_lock_irqsave(&rq->lock, flags); |
6068 | if (rq->rd) { | 6574 | if (rq->rd) { |
6069 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6575 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
6070 | set_rq_offline(rq); | 6576 | set_rq_offline(rq); |
6071 | } | 6577 | } |
6578 | migrate_tasks(cpu); | ||
6579 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | ||
6072 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6580 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6581 | |||
6582 | migrate_nr_uninterruptible(rq); | ||
6583 | calc_global_load_remove(rq); | ||
6073 | break; | 6584 | break; |
6074 | #endif | 6585 | #endif |
6075 | } | 6586 | } |
6587 | |||
6588 | update_max_interval(); | ||
6589 | |||
6076 | return NOTIFY_OK; | 6590 | return NOTIFY_OK; |
6077 | } | 6591 | } |
6078 | 6592 | ||
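Masking out CPU_TASKS_FROZEN lets a single case label cover both the normal and the suspend/resume ("frozen") variant of each notifier event, which is what allowed the *_FROZEN labels to be dropped above. A two-minute demonstration of the masking; the constant values below mirror what include/linux/cpu.h defined in that era and should be treated as assumptions of the sketch.

/* Why (action & ~CPU_TASKS_FROZEN) folds the _FROZEN cases into the plain
 * ones. Values assumed from that era's include/linux/cpu.h. */
#include <stdio.h>

#define CPU_ONLINE        0x0002
#define CPU_TASKS_FROZEN  0x0010
#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN)

int main(void)
{
        unsigned long action = CPU_ONLINE_FROZEN;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                puts("handled as CPU_ONLINE");   /* frozen or not */
                break;
        default:
                puts("ignored");
        }
        return 0;
}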
@@ -6133,6 +6647,8 @@ early_initcall(migration_init); | |||
6133 | 6647 | ||
6134 | #ifdef CONFIG_SMP | 6648 | #ifdef CONFIG_SMP |
6135 | 6649 | ||
6650 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
6651 | |||
6136 | #ifdef CONFIG_SCHED_DEBUG | 6652 | #ifdef CONFIG_SCHED_DEBUG |
6137 | 6653 | ||
6138 | static __read_mostly int sched_domain_debug_enabled; | 6654 | static __read_mostly int sched_domain_debug_enabled; |
@@ -6183,7 +6699,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6183 | break; | 6699 | break; |
6184 | } | 6700 | } |
6185 | 6701 | ||
6186 | if (!group->cpu_power) { | 6702 | if (!group->sgp->power) { |
6187 | printk(KERN_CONT "\n"); | 6703 | printk(KERN_CONT "\n"); |
6188 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 6704 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
6189 | "set\n"); | 6705 | "set\n"); |
@@ -6207,9 +6723,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6207 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 6723 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
6208 | 6724 | ||
6209 | printk(KERN_CONT " %s", str); | 6725 | printk(KERN_CONT " %s", str); |
6210 | if (group->cpu_power != SCHED_LOAD_SCALE) { | 6726 | if (group->sgp->power != SCHED_POWER_SCALE) { |
6211 | printk(KERN_CONT " (cpu_power = %d)", | 6727 | printk(KERN_CONT " (cpu_power = %d)", |
6212 | group->cpu_power); | 6728 | group->sgp->power); |
6213 | } | 6729 | } |
6214 | 6730 | ||
6215 | group = group->next; | 6731 | group = group->next; |
@@ -6228,7 +6744,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6228 | 6744 | ||
6229 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6745 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6230 | { | 6746 | { |
6231 | cpumask_var_t groupmask; | ||
6232 | int level = 0; | 6747 | int level = 0; |
6233 | 6748 | ||
6234 | if (!sched_domain_debug_enabled) | 6749 | if (!sched_domain_debug_enabled) |
@@ -6241,20 +6756,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6241 | 6756 | ||
6242 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6757 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6243 | 6758 | ||
6244 | if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { | ||
6245 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
6246 | return; | ||
6247 | } | ||
6248 | |||
6249 | for (;;) { | 6759 | for (;;) { |
6250 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) | 6760 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) |
6251 | break; | 6761 | break; |
6252 | level++; | 6762 | level++; |
6253 | sd = sd->parent; | 6763 | sd = sd->parent; |
6254 | if (!sd) | 6764 | if (!sd) |
6255 | break; | 6765 | break; |
6256 | } | 6766 | } |
6257 | free_cpumask_var(groupmask); | ||
6258 | } | 6767 | } |
6259 | #else /* !CONFIG_SCHED_DEBUG */ | 6768 | #else /* !CONFIG_SCHED_DEBUG */ |
6260 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6769 | # define sched_domain_debug(sd, cpu) do { } while (0) |
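The debug walk above no longer allocates a cpumask on every call; it reuses the file-scope sched_domains_tmpmask, which is safe only because all users run under sched_domains_mutex. A rough userspace analogue of that shared-scratch-under-a-lock idea (all names here are invented):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t topo_lock = PTHREAD_MUTEX_INITIALIZER;
static char scratch[128];            /* shared scratch buffer, valid only under topo_lock */

static void debug_one_level(int level)
{
    /* the caller holds topo_lock, so scratch cannot be clobbered underneath us */
    snprintf(scratch, sizeof(scratch), "domain level %d", level);
    printf("%s\n", scratch);
}

int main(void)
{
    pthread_mutex_lock(&topo_lock);
    for (int level = 0; level < 3; level++)
        debug_one_level(level);
    pthread_mutex_unlock(&topo_lock);
    return 0;
}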
@@ -6311,12 +6820,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6311 | return 1; | 6820 | return 1; |
6312 | } | 6821 | } |
6313 | 6822 | ||
6314 | static void free_rootdomain(struct root_domain *rd) | 6823 | static void free_rootdomain(struct rcu_head *rcu) |
6315 | { | 6824 | { |
6316 | synchronize_sched(); | 6825 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
6317 | 6826 | ||
6318 | cpupri_cleanup(&rd->cpupri); | 6827 | cpupri_cleanup(&rd->cpupri); |
6319 | |||
6320 | free_cpumask_var(rd->rto_mask); | 6828 | free_cpumask_var(rd->rto_mask); |
6321 | free_cpumask_var(rd->online); | 6829 | free_cpumask_var(rd->online); |
6322 | free_cpumask_var(rd->span); | 6830 | free_cpumask_var(rd->span); |
@@ -6357,7 +6865,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6357 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6865 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6358 | 6866 | ||
6359 | if (old_rd) | 6867 | if (old_rd) |
6360 | free_rootdomain(old_rd); | 6868 | call_rcu_sched(&old_rd->rcu, free_rootdomain); |
6361 | } | 6869 | } |
6362 | 6870 | ||
6363 | static int init_rootdomain(struct root_domain *rd) | 6871 | static int init_rootdomain(struct root_domain *rd) |
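free_rootdomain now receives the embedded rcu_head and recovers the enclosing root_domain with container_of, so rq_attach_root can hand the old domain to call_rcu_sched instead of blocking in synchronize_sched. A small userspace sketch of that embed-a-callback-node pattern (struct names are made up; the callback is invoked directly instead of after a grace period):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct cb_head {                      /* stand-in for struct rcu_head */
    void (*func)(struct cb_head *);
};

struct domain {
    int id;
    struct cb_head cb;                /* embedded, like root_domain::rcu */
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

static void domain_free(struct cb_head *cb)
{
    struct domain *d = container_of(cb, struct domain, cb);

    printf("freeing domain %d\n", d->id);
    free(d);
}

int main(void)
{
    struct domain *d = malloc(sizeof(*d));

    if (!d)
        return 1;
    d->id = 7;
    d->cb.func = domain_free;
    /* a real deferral mechanism would run this later; invoke it directly here */
    d->cb.func(&d->cb);
    return 0;
}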
@@ -6408,6 +6916,53 @@ static struct root_domain *alloc_rootdomain(void) | |||
6408 | return rd; | 6916 | return rd; |
6409 | } | 6917 | } |
6410 | 6918 | ||
6919 | static void free_sched_groups(struct sched_group *sg, int free_sgp) | ||
6920 | { | ||
6921 | struct sched_group *tmp, *first; | ||
6922 | |||
6923 | if (!sg) | ||
6924 | return; | ||
6925 | |||
6926 | first = sg; | ||
6927 | do { | ||
6928 | tmp = sg->next; | ||
6929 | |||
6930 | if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) | ||
6931 | kfree(sg->sgp); | ||
6932 | |||
6933 | kfree(sg); | ||
6934 | sg = tmp; | ||
6935 | } while (sg != first); | ||
6936 | } | ||
6937 | |||
6938 | static void free_sched_domain(struct rcu_head *rcu) | ||
6939 | { | ||
6940 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
6941 | |||
6942 | /* | ||
6943 | * If it's an overlapping domain it has private groups, iterate and | ||
6944 | * nuke them all. | ||
6945 | */ | ||
6946 | if (sd->flags & SD_OVERLAP) { | ||
6947 | free_sched_groups(sd->groups, 1); | ||
6948 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
6949 | kfree(sd->groups->sgp); | ||
6950 | kfree(sd->groups); | ||
6951 | } | ||
6952 | kfree(sd); | ||
6953 | } | ||
6954 | |||
6955 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | ||
6956 | { | ||
6957 | call_rcu(&sd->rcu, free_sched_domain); | ||
6958 | } | ||
6959 | |||
6960 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | ||
6961 | { | ||
6962 | for (; sd; sd = sd->parent) | ||
6963 | destroy_sched_domain(sd, cpu); | ||
6964 | } | ||
6965 | |||
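free_sched_groups above walks the circular group ring and drops each group's shared sched_group_power only when its reference count reaches zero. A single-threaded userspace sketch of freeing a ring with a refcounted shared payload (plain integers stand in for the kernel's atomic_t; all names are invented):

#include <stdio.h>
#include <stdlib.h>

struct power {                        /* shared payload, like sched_group_power */
    int ref;
    long value;
};

struct group {                        /* node in a circular singly linked ring */
    struct group *next;
    struct power *pwr;
};

static void free_groups(struct group *first, int free_pwr)
{
    struct group *g, *tmp;

    if (!first)
        return;
    g = first;
    do {
        tmp = g->next;
        if (free_pwr && --g->pwr->ref == 0)
            free(g->pwr);             /* last reference releases the payload */
        free(g);
        g = tmp;
    } while (g != first);
}

int main(void)
{
    struct power *p = calloc(1, sizeof(*p));
    struct group *a = calloc(1, sizeof(*a));
    struct group *b = calloc(1, sizeof(*b));

    if (!p || !a || !b)
        return 1;
    p->ref = 2;                       /* both groups share one payload */
    a->pwr = b->pwr = p;
    a->next = b;
    b->next = a;                      /* close the ring */
    free_groups(a, 1);
    return 0;
}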
6411 | /* | 6966 | /* |
6412 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6967 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
6413 | * hold the hotplug lock. | 6968 | * hold the hotplug lock. |
@@ -6418,9 +6973,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6418 | struct rq *rq = cpu_rq(cpu); | 6973 | struct rq *rq = cpu_rq(cpu); |
6419 | struct sched_domain *tmp; | 6974 | struct sched_domain *tmp; |
6420 | 6975 | ||
6421 | for (tmp = sd; tmp; tmp = tmp->parent) | ||
6422 | tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); | ||
6423 | |||
6424 | /* Remove the sched domains which do not contribute to scheduling. */ | 6976 | /* Remove the sched domains which do not contribute to scheduling. */ |
6425 | for (tmp = sd; tmp; ) { | 6977 | for (tmp = sd; tmp; ) { |
6426 | struct sched_domain *parent = tmp->parent; | 6978 | struct sched_domain *parent = tmp->parent; |
@@ -6431,12 +6983,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6431 | tmp->parent = parent->parent; | 6983 | tmp->parent = parent->parent; |
6432 | if (parent->parent) | 6984 | if (parent->parent) |
6433 | parent->parent->child = tmp; | 6985 | parent->parent->child = tmp; |
6986 | destroy_sched_domain(parent, cpu); | ||
6434 | } else | 6987 | } else |
6435 | tmp = tmp->parent; | 6988 | tmp = tmp->parent; |
6436 | } | 6989 | } |
6437 | 6990 | ||
6438 | if (sd && sd_degenerate(sd)) { | 6991 | if (sd && sd_degenerate(sd)) { |
6992 | tmp = sd; | ||
6439 | sd = sd->parent; | 6993 | sd = sd->parent; |
6994 | destroy_sched_domain(tmp, cpu); | ||
6440 | if (sd) | 6995 | if (sd) |
6441 | sd->child = NULL; | 6996 | sd->child = NULL; |
6442 | } | 6997 | } |
@@ -6444,7 +6999,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6444 | sched_domain_debug(sd, cpu); | 6999 | sched_domain_debug(sd, cpu); |
6445 | 7000 | ||
6446 | rq_attach_root(rq, rd); | 7001 | rq_attach_root(rq, rd); |
7002 | tmp = rq->sd; | ||
6447 | rcu_assign_pointer(rq->sd, sd); | 7003 | rcu_assign_pointer(rq->sd, sd); |
7004 | destroy_sched_domains(tmp, cpu); | ||
6448 | } | 7005 | } |
6449 | 7006 | ||
6450 | /* cpus with isolated domains */ | 7007 | /* cpus with isolated domains */ |
@@ -6460,56 +7017,6 @@ static int __init isolated_cpu_setup(char *str) | |||
6460 | 7017 | ||
6461 | __setup("isolcpus=", isolated_cpu_setup); | 7018 | __setup("isolcpus=", isolated_cpu_setup); |
6462 | 7019 | ||
6463 | /* | ||
6464 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | ||
6465 | * to a function which identifies what group(along with sched group) a CPU | ||
6466 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids | ||
6467 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6468 | * | ||
6469 | * init_sched_build_groups will build a circular linked list of the groups | ||
6470 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6471 | * and ->cpu_power to 0. | ||
6472 | */ | ||
6473 | static void | ||
6474 | init_sched_build_groups(const struct cpumask *span, | ||
6475 | const struct cpumask *cpu_map, | ||
6476 | int (*group_fn)(int cpu, const struct cpumask *cpu_map, | ||
6477 | struct sched_group **sg, | ||
6478 | struct cpumask *tmpmask), | ||
6479 | struct cpumask *covered, struct cpumask *tmpmask) | ||
6480 | { | ||
6481 | struct sched_group *first = NULL, *last = NULL; | ||
6482 | int i; | ||
6483 | |||
6484 | cpumask_clear(covered); | ||
6485 | |||
6486 | for_each_cpu(i, span) { | ||
6487 | struct sched_group *sg; | ||
6488 | int group = group_fn(i, cpu_map, &sg, tmpmask); | ||
6489 | int j; | ||
6490 | |||
6491 | if (cpumask_test_cpu(i, covered)) | ||
6492 | continue; | ||
6493 | |||
6494 | cpumask_clear(sched_group_cpus(sg)); | ||
6495 | sg->cpu_power = 0; | ||
6496 | |||
6497 | for_each_cpu(j, span) { | ||
6498 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | ||
6499 | continue; | ||
6500 | |||
6501 | cpumask_set_cpu(j, covered); | ||
6502 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
6503 | } | ||
6504 | if (!first) | ||
6505 | first = sg; | ||
6506 | if (last) | ||
6507 | last->next = sg; | ||
6508 | last = sg; | ||
6509 | } | ||
6510 | last->next = first; | ||
6511 | } | ||
6512 | |||
6513 | #define SD_NODES_PER_DOMAIN 16 | 7020 | #define SD_NODES_PER_DOMAIN 16 |
6514 | 7021 | ||
6515 | #ifdef CONFIG_NUMA | 7022 | #ifdef CONFIG_NUMA |
@@ -6526,7 +7033,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
6526 | */ | 7033 | */ |
6527 | static int find_next_best_node(int node, nodemask_t *used_nodes) | 7034 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
6528 | { | 7035 | { |
6529 | int i, n, val, min_val, best_node = 0; | 7036 | int i, n, val, min_val, best_node = -1; |
6530 | 7037 | ||
6531 | min_val = INT_MAX; | 7038 | min_val = INT_MAX; |
6532 | 7039 | ||
@@ -6550,7 +7057,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6550 | } | 7057 | } |
6551 | } | 7058 | } |
6552 | 7059 | ||
6553 | node_set(best_node, *used_nodes); | 7060 | if (best_node != -1) |
7061 | node_set(best_node, *used_nodes); | ||
6554 | return best_node; | 7062 | return best_node; |
6555 | } | 7063 | } |
6556 | 7064 | ||
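find_next_best_node now reports exhaustion with -1 and only marks a node as used when one was actually found, and sched_domain_node_span breaks out of its loop on that sentinel. A tiny userspace sketch of that greedy pick-the-cheapest-unused-item loop (array size and costs are arbitrary):

#include <limits.h>
#include <stdio.h>

#define NODES 4

/* return the cheapest unused node, or -1 when every node has been taken */
static int next_best(const int cost[NODES], int used[NODES])
{
    int best = -1, min = INT_MAX;

    for (int i = 0; i < NODES; i++) {
        if (used[i] || cost[i] >= min)
            continue;
        min = cost[i];
        best = i;
    }
    if (best != -1)
        used[best] = 1;               /* only consume a node when one was found */
    return best;
}

int main(void)
{
    int cost[NODES] = { 30, 10, 20, 40 };
    int used[NODES] = { 0 };

    for (int n = next_best(cost, used); n >= 0; n = next_best(cost, used))
        printf("next node: %d\n", n);
    return 0;
}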
@@ -6576,293 +7084,197 @@ static void sched_domain_node_span(int node, struct cpumask *span) | |||
6576 | 7084 | ||
6577 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 7085 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6578 | int next_node = find_next_best_node(node, &used_nodes); | 7086 | int next_node = find_next_best_node(node, &used_nodes); |
6579 | 7087 | if (next_node < 0) | |
7088 | break; | ||
6580 | cpumask_or(span, span, cpumask_of_node(next_node)); | 7089 | cpumask_or(span, span, cpumask_of_node(next_node)); |
6581 | } | 7090 | } |
6582 | } | 7091 | } |
7092 | |||
7093 | static const struct cpumask *cpu_node_mask(int cpu) | ||
7094 | { | ||
7095 | lockdep_assert_held(&sched_domains_mutex); | ||
7096 | |||
7097 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
7098 | |||
7099 | return sched_domains_tmpmask; | ||
7100 | } | ||
7101 | |||
7102 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
7103 | { | ||
7104 | return cpu_possible_mask; | ||
7105 | } | ||
6583 | #endif /* CONFIG_NUMA */ | 7106 | #endif /* CONFIG_NUMA */ |
6584 | 7107 | ||
6585 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 7108 | static const struct cpumask *cpu_cpu_mask(int cpu) |
7109 | { | ||
7110 | return cpumask_of_node(cpu_to_node(cpu)); | ||
7111 | } | ||
6586 | 7112 | ||
6587 | /* | 7113 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6588 | * The cpus mask in sched_group and sched_domain hangs off the end. | ||
6589 | * | ||
6590 | * ( See the comments in include/linux/sched.h:struct sched_group | ||
6591 | * and struct sched_domain. ) | ||
6592 | */ | ||
6593 | struct static_sched_group { | ||
6594 | struct sched_group sg; | ||
6595 | DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); | ||
6596 | }; | ||
6597 | 7114 | ||
6598 | struct static_sched_domain { | 7115 | struct sd_data { |
6599 | struct sched_domain sd; | 7116 | struct sched_domain **__percpu sd; |
6600 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 7117 | struct sched_group **__percpu sg; |
7118 | struct sched_group_power **__percpu sgp; | ||
6601 | }; | 7119 | }; |
6602 | 7120 | ||
6603 | struct s_data { | 7121 | struct s_data { |
6604 | #ifdef CONFIG_NUMA | 7122 | struct sched_domain ** __percpu sd; |
6605 | int sd_allnodes; | ||
6606 | cpumask_var_t domainspan; | ||
6607 | cpumask_var_t covered; | ||
6608 | cpumask_var_t notcovered; | ||
6609 | #endif | ||
6610 | cpumask_var_t nodemask; | ||
6611 | cpumask_var_t this_sibling_map; | ||
6612 | cpumask_var_t this_core_map; | ||
6613 | cpumask_var_t send_covered; | ||
6614 | cpumask_var_t tmpmask; | ||
6615 | struct sched_group **sched_group_nodes; | ||
6616 | struct root_domain *rd; | 7123 | struct root_domain *rd; |
6617 | }; | 7124 | }; |
6618 | 7125 | ||
6619 | enum s_alloc { | 7126 | enum s_alloc { |
6620 | sa_sched_groups = 0, | ||
6621 | sa_rootdomain, | 7127 | sa_rootdomain, |
6622 | sa_tmpmask, | 7128 | sa_sd, |
6623 | sa_send_covered, | 7129 | sa_sd_storage, |
6624 | sa_this_core_map, | ||
6625 | sa_this_sibling_map, | ||
6626 | sa_nodemask, | ||
6627 | sa_sched_group_nodes, | ||
6628 | #ifdef CONFIG_NUMA | ||
6629 | sa_notcovered, | ||
6630 | sa_covered, | ||
6631 | sa_domainspan, | ||
6632 | #endif | ||
6633 | sa_none, | 7130 | sa_none, |
6634 | }; | 7131 | }; |
6635 | 7132 | ||
6636 | /* | 7133 | struct sched_domain_topology_level; |
6637 | * SMT sched-domains: | ||
6638 | */ | ||
6639 | #ifdef CONFIG_SCHED_SMT | ||
6640 | static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); | ||
6641 | static DEFINE_PER_CPU(struct static_sched_group, sched_groups); | ||
6642 | 7134 | ||
6643 | static int | 7135 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); |
6644 | cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | 7136 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); |
6645 | struct sched_group **sg, struct cpumask *unused) | ||
6646 | { | ||
6647 | if (sg) | ||
6648 | *sg = &per_cpu(sched_groups, cpu).sg; | ||
6649 | return cpu; | ||
6650 | } | ||
6651 | #endif /* CONFIG_SCHED_SMT */ | ||
6652 | 7137 | ||
6653 | /* | 7138 | #define SDTL_OVERLAP 0x01 |
6654 | * multi-core sched-domains: | ||
6655 | */ | ||
6656 | #ifdef CONFIG_SCHED_MC | ||
6657 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | ||
6658 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | ||
6659 | #endif /* CONFIG_SCHED_MC */ | ||
6660 | 7139 | ||
6661 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 7140 | struct sched_domain_topology_level { |
6662 | static int | 7141 | sched_domain_init_f init; |
6663 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 7142 | sched_domain_mask_f mask; |
6664 | struct sched_group **sg, struct cpumask *mask) | 7143 | int flags; |
6665 | { | 7144 | struct sd_data data; |
6666 | int group; | 7145 | }; |
6667 | 7146 | ||
6668 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6669 | group = cpumask_first(mask); | ||
6670 | if (sg) | ||
6671 | *sg = &per_cpu(sched_group_core, group).sg; | ||
6672 | return group; | ||
6673 | } | ||
6674 | #elif defined(CONFIG_SCHED_MC) | ||
6675 | static int | 7147 | static int |
6676 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 7148 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
6677 | struct sched_group **sg, struct cpumask *unused) | ||
6678 | { | 7149 | { |
6679 | if (sg) | 7150 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; |
6680 | *sg = &per_cpu(sched_group_core, cpu).sg; | 7151 | const struct cpumask *span = sched_domain_span(sd); |
6681 | return cpu; | 7152 | struct cpumask *covered = sched_domains_tmpmask; |
6682 | } | 7153 | struct sd_data *sdd = sd->private; |
6683 | #endif | 7154 | struct sched_domain *child; |
7155 | int i; | ||
6684 | 7156 | ||
6685 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 7157 | cpumask_clear(covered); |
6686 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | ||
6687 | 7158 | ||
6688 | static int | 7159 | for_each_cpu(i, span) { |
6689 | cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | 7160 | struct cpumask *sg_span; |
6690 | struct sched_group **sg, struct cpumask *mask) | ||
6691 | { | ||
6692 | int group; | ||
6693 | #ifdef CONFIG_SCHED_MC | ||
6694 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6695 | group = cpumask_first(mask); | ||
6696 | #elif defined(CONFIG_SCHED_SMT) | ||
6697 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6698 | group = cpumask_first(mask); | ||
6699 | #else | ||
6700 | group = cpu; | ||
6701 | #endif | ||
6702 | if (sg) | ||
6703 | *sg = &per_cpu(sched_group_phys, group).sg; | ||
6704 | return group; | ||
6705 | } | ||
6706 | 7161 | ||
6707 | #ifdef CONFIG_NUMA | 7162 | if (cpumask_test_cpu(i, covered)) |
6708 | /* | 7163 | continue; |
6709 | * The init_sched_build_groups can't handle what we want to do with node | ||
6710 | * groups, so roll our own. Now each node has its own list of groups which | ||
6711 | * gets dynamically allocated. | ||
6712 | */ | ||
6713 | static DEFINE_PER_CPU(struct static_sched_domain, node_domains); | ||
6714 | static struct sched_group ***sched_group_nodes_bycpu; | ||
6715 | 7164 | ||
6716 | static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); | 7165 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
6717 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); | 7166 | GFP_KERNEL, cpu_to_node(i)); |
6718 | 7167 | ||
6719 | static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, | 7168 | if (!sg) |
6720 | struct sched_group **sg, | 7169 | goto fail; |
6721 | struct cpumask *nodemask) | ||
6722 | { | ||
6723 | int group; | ||
6724 | 7170 | ||
6725 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); | 7171 | sg_span = sched_group_cpus(sg); |
6726 | group = cpumask_first(nodemask); | ||
6727 | 7172 | ||
6728 | if (sg) | 7173 | child = *per_cpu_ptr(sdd->sd, i); |
6729 | *sg = &per_cpu(sched_group_allnodes, group).sg; | 7174 | if (child->child) { |
6730 | return group; | 7175 | child = child->child; |
6731 | } | 7176 | cpumask_copy(sg_span, sched_domain_span(child)); |
7177 | } else | ||
7178 | cpumask_set_cpu(i, sg_span); | ||
6732 | 7179 | ||
6733 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 7180 | cpumask_or(covered, covered, sg_span); |
6734 | { | ||
6735 | struct sched_group *sg = group_head; | ||
6736 | int j; | ||
6737 | 7181 | ||
6738 | if (!sg) | 7182 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); |
6739 | return; | 7183 | atomic_inc(&sg->sgp->ref); |
6740 | do { | ||
6741 | for_each_cpu(j, sched_group_cpus(sg)) { | ||
6742 | struct sched_domain *sd; | ||
6743 | 7184 | ||
6744 | sd = &per_cpu(phys_domains, j).sd; | 7185 | if (cpumask_test_cpu(cpu, sg_span)) |
6745 | if (j != group_first_cpu(sd->groups)) { | 7186 | groups = sg; |
6746 | /* | ||
6747 | * Only add "power" once for each | ||
6748 | * physical package. | ||
6749 | */ | ||
6750 | continue; | ||
6751 | } | ||
6752 | 7187 | ||
6753 | sg->cpu_power += sd->groups->cpu_power; | 7188 | if (!first) |
6754 | } | 7189 | first = sg; |
6755 | sg = sg->next; | 7190 | if (last) |
6756 | } while (sg != group_head); | 7191 | last->next = sg; |
7192 | last = sg; | ||
7193 | last->next = first; | ||
7194 | } | ||
7195 | sd->groups = groups; | ||
7196 | |||
7197 | return 0; | ||
7198 | |||
7199 | fail: | ||
7200 | free_sched_groups(first, 0); | ||
7201 | |||
7202 | return -ENOMEM; | ||
6757 | } | 7203 | } |
6758 | 7204 | ||
6759 | static int build_numa_sched_groups(struct s_data *d, | 7205 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) |
6760 | const struct cpumask *cpu_map, int num) | ||
6761 | { | 7206 | { |
6762 | struct sched_domain *sd; | 7207 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
6763 | struct sched_group *sg, *prev; | 7208 | struct sched_domain *child = sd->child; |
6764 | int n, j; | ||
6765 | 7209 | ||
6766 | cpumask_clear(d->covered); | 7210 | if (child) |
6767 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | 7211 | cpu = cpumask_first(sched_domain_span(child)); |
6768 | if (cpumask_empty(d->nodemask)) { | 7212 | |
6769 | d->sched_group_nodes[num] = NULL; | 7213 | if (sg) { |
6770 | goto out; | 7214 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
7215 | (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); | ||
7216 | atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ | ||
6771 | } | 7217 | } |
6772 | 7218 | ||
6773 | sched_domain_node_span(num, d->domainspan); | 7219 | return cpu; |
6774 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | 7220 | } |
6775 | 7221 | ||
6776 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 7222 | /* |
6777 | GFP_KERNEL, num); | 7223 | * build_sched_groups will build a circular linked list of the groups |
6778 | if (!sg) { | 7224 | * covered by the given span, and will set each group's ->cpumask correctly, |
6779 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | 7225 | * and ->cpu_power to 0. |
6780 | num); | 7226 | * |
6781 | return -ENOMEM; | 7227 | * Assumes the sched_domain tree is fully constructed |
6782 | } | 7228 | */ |
6783 | d->sched_group_nodes[num] = sg; | 7229 | static int |
7230 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
7231 | { | ||
7232 | struct sched_group *first = NULL, *last = NULL; | ||
7233 | struct sd_data *sdd = sd->private; | ||
7234 | const struct cpumask *span = sched_domain_span(sd); | ||
7235 | struct cpumask *covered; | ||
7236 | int i; | ||
6784 | 7237 | ||
6785 | for_each_cpu(j, d->nodemask) { | 7238 | get_group(cpu, sdd, &sd->groups); |
6786 | sd = &per_cpu(node_domains, j).sd; | 7239 | atomic_inc(&sd->groups->ref); |
6787 | sd->groups = sg; | ||
6788 | } | ||
6789 | 7240 | ||
6790 | sg->cpu_power = 0; | 7241 | if (cpu != cpumask_first(sched_domain_span(sd))) |
6791 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | 7242 | return 0; |
6792 | sg->next = sg; | ||
6793 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
6794 | 7243 | ||
6795 | prev = sg; | 7244 | lockdep_assert_held(&sched_domains_mutex); |
6796 | for (j = 0; j < nr_node_ids; j++) { | 7245 | covered = sched_domains_tmpmask; |
6797 | n = (num + j) % nr_node_ids; | ||
6798 | cpumask_complement(d->notcovered, d->covered); | ||
6799 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
6800 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
6801 | if (cpumask_empty(d->tmpmask)) | ||
6802 | break; | ||
6803 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
6804 | if (cpumask_empty(d->tmpmask)) | ||
6805 | continue; | ||
6806 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6807 | GFP_KERNEL, num); | ||
6808 | if (!sg) { | ||
6809 | printk(KERN_WARNING | ||
6810 | "Can not alloc domain group for node %d\n", j); | ||
6811 | return -ENOMEM; | ||
6812 | } | ||
6813 | sg->cpu_power = 0; | ||
6814 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
6815 | sg->next = prev->next; | ||
6816 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
6817 | prev->next = sg; | ||
6818 | prev = sg; | ||
6819 | } | ||
6820 | out: | ||
6821 | return 0; | ||
6822 | } | ||
6823 | #endif /* CONFIG_NUMA */ | ||
6824 | 7246 | ||
6825 | #ifdef CONFIG_NUMA | 7247 | cpumask_clear(covered); |
6826 | /* Free memory allocated for various sched_group structures */ | ||
6827 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
6828 | struct cpumask *nodemask) | ||
6829 | { | ||
6830 | int cpu, i; | ||
6831 | 7248 | ||
6832 | for_each_cpu(cpu, cpu_map) { | 7249 | for_each_cpu(i, span) { |
6833 | struct sched_group **sched_group_nodes | 7250 | struct sched_group *sg; |
6834 | = sched_group_nodes_bycpu[cpu]; | 7251 | int group = get_group(i, sdd, &sg); |
7252 | int j; | ||
6835 | 7253 | ||
6836 | if (!sched_group_nodes) | 7254 | if (cpumask_test_cpu(i, covered)) |
6837 | continue; | 7255 | continue; |
6838 | 7256 | ||
6839 | for (i = 0; i < nr_node_ids; i++) { | 7257 | cpumask_clear(sched_group_cpus(sg)); |
6840 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7258 | sg->sgp->power = 0; |
6841 | 7259 | ||
6842 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 7260 | for_each_cpu(j, span) { |
6843 | if (cpumask_empty(nodemask)) | 7261 | if (get_group(j, sdd, NULL) != group) |
6844 | continue; | 7262 | continue; |
6845 | 7263 | ||
6846 | if (sg == NULL) | 7264 | cpumask_set_cpu(j, covered); |
6847 | continue; | 7265 | cpumask_set_cpu(j, sched_group_cpus(sg)); |
6848 | sg = sg->next; | ||
6849 | next_sg: | ||
6850 | oldsg = sg; | ||
6851 | sg = sg->next; | ||
6852 | kfree(oldsg); | ||
6853 | if (oldsg != sched_group_nodes[i]) | ||
6854 | goto next_sg; | ||
6855 | } | 7266 | } |
6856 | kfree(sched_group_nodes); | 7267 | |
6857 | sched_group_nodes_bycpu[cpu] = NULL; | 7268 | if (!first) |
7269 | first = sg; | ||
7270 | if (last) | ||
7271 | last->next = sg; | ||
7272 | last = sg; | ||
6858 | } | 7273 | } |
7274 | last->next = first; | ||
7275 | |||
7276 | return 0; | ||
6859 | } | 7277 | } |
6860 | #else /* !CONFIG_NUMA */ | ||
6861 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
6862 | struct cpumask *nodemask) | ||
6863 | { | ||
6864 | } | ||
6865 | #endif /* CONFIG_NUMA */ | ||
6866 | 7278 | ||
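Both group builders above walk the domain's span, skip CPUs that an earlier group already covers, and link the groups into a circular list. A compact userspace sketch of that cover-and-link loop, using a plain bitmask for the span, a hard-coded "pair of sibling CPUs" grouping rule, and invented structures in place of sched_group:

#include <stdio.h>
#include <stdlib.h>

struct group {
    unsigned int cpus;                /* member CPUs as a bitmask */
    struct group *next;
};

/* group CPUs of `span` in sibling pairs (cpu, cpu^1), linking the groups into a ring */
static struct group *build_groups(unsigned int span)
{
    struct group *first = NULL, *last = NULL;
    unsigned int covered = 0;

    for (int cpu = 0; cpu < 32; cpu++) {
        unsigned int bit = 1u << cpu;
        struct group *g;

        if (!(span & bit) || (covered & bit))
            continue;                 /* not in the span, or already covered */

        g = calloc(1, sizeof(*g));
        if (!g)
            exit(1);
        g->cpus = span & (bit | (1u << (cpu ^ 1)));  /* this CPU plus its sibling */
        covered |= g->cpus;

        if (!first)
            first = g;
        if (last)
            last->next = g;
        last = g;
    }
    if (last)
        last->next = first;           /* close the ring */
    return first;
}

int main(void)
{
    struct group *g, *first = build_groups(0x2f);    /* CPUs 0,1,2,3,5 */

    g = first;
    do {
        printf("group mask 0x%x\n", g->cpus);
        g = g->next;
    } while (g != first);
    return 0;
}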
6867 | /* | 7279 | /* |
6868 | * Initialize sched groups cpu_power. | 7280 | * Initialize sched groups cpu_power. |
@@ -6876,46 +7288,19 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
6876 | */ | 7288 | */ |
6877 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 7289 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
6878 | { | 7290 | { |
6879 | struct sched_domain *child; | 7291 | struct sched_group *sg = sd->groups; |
6880 | struct sched_group *group; | ||
6881 | long power; | ||
6882 | int weight; | ||
6883 | |||
6884 | WARN_ON(!sd || !sd->groups); | ||
6885 | |||
6886 | if (cpu != group_first_cpu(sd->groups)) | ||
6887 | return; | ||
6888 | 7292 | ||
6889 | child = sd->child; | 7293 | WARN_ON(!sd || !sg); |
6890 | 7294 | ||
6891 | sd->groups->cpu_power = 0; | 7295 | do { |
7296 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
7297 | sg = sg->next; | ||
7298 | } while (sg != sd->groups); | ||
6892 | 7299 | ||
6893 | if (!child) { | 7300 | if (cpu != group_first_cpu(sg)) |
6894 | power = SCHED_LOAD_SCALE; | ||
6895 | weight = cpumask_weight(sched_domain_span(sd)); | ||
6896 | /* | ||
6897 | * SMT siblings share the power of a single core. | ||
6898 | * Usually multiple threads get a better yield out of | ||
6899 | * that one core than a single thread would have, | ||
6900 | * reflect that in sd->smt_gain. | ||
6901 | */ | ||
6902 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
6903 | power *= sd->smt_gain; | ||
6904 | power /= weight; | ||
6905 | power >>= SCHED_LOAD_SHIFT; | ||
6906 | } | ||
6907 | sd->groups->cpu_power += power; | ||
6908 | return; | 7301 | return; |
6909 | } | ||
6910 | 7302 | ||
6911 | /* | 7303 | update_group_power(sd, cpu); |
6912 | * Add cpu_power of each child group to this groups cpu_power. | ||
6913 | */ | ||
6914 | group = child->groups; | ||
6915 | do { | ||
6916 | sd->groups->cpu_power += group->cpu_power; | ||
6917 | group = group->next; | ||
6918 | } while (group != child->groups); | ||
6919 | } | 7304 | } |
6920 | 7305 | ||
6921 | /* | 7306 | /* |
@@ -6929,15 +7314,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6929 | # define SD_INIT_NAME(sd, type) do { } while (0) | 7314 | # define SD_INIT_NAME(sd, type) do { } while (0) |
6930 | #endif | 7315 | #endif |
6931 | 7316 | ||
6932 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7317 | #define SD_INIT_FUNC(type) \ |
6933 | 7318 | static noinline struct sched_domain * \ | |
6934 | #define SD_INIT_FUNC(type) \ | 7319 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ |
6935 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7320 | { \ |
6936 | { \ | 7321 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ |
6937 | memset(sd, 0, sizeof(*sd)); \ | 7322 | *sd = SD_##type##_INIT; \ |
6938 | *sd = SD_##type##_INIT; \ | 7323 | SD_INIT_NAME(sd, type); \ |
6939 | sd->level = SD_LV_##type; \ | 7324 | sd->private = &tl->data; \ |
6940 | SD_INIT_NAME(sd, type); \ | 7325 | return sd; \ |
6941 | } | 7326 | } |
6942 | 7327 | ||
6943 | SD_INIT_FUNC(CPU) | 7328 | SD_INIT_FUNC(CPU) |
@@ -6951,15 +7336,19 @@ SD_INIT_FUNC(CPU) | |||
6951 | #ifdef CONFIG_SCHED_MC | 7336 | #ifdef CONFIG_SCHED_MC |
6952 | SD_INIT_FUNC(MC) | 7337 | SD_INIT_FUNC(MC) |
6953 | #endif | 7338 | #endif |
7339 | #ifdef CONFIG_SCHED_BOOK | ||
7340 | SD_INIT_FUNC(BOOK) | ||
7341 | #endif | ||
6954 | 7342 | ||
6955 | static int default_relax_domain_level = -1; | 7343 | static int default_relax_domain_level = -1; |
7344 | int sched_domain_level_max; | ||
6956 | 7345 | ||
6957 | static int __init setup_relax_domain_level(char *str) | 7346 | static int __init setup_relax_domain_level(char *str) |
6958 | { | 7347 | { |
6959 | unsigned long val; | 7348 | unsigned long val; |
6960 | 7349 | ||
6961 | val = simple_strtoul(str, NULL, 0); | 7350 | val = simple_strtoul(str, NULL, 0); |
6962 | if (val < SD_LV_MAX) | 7351 | if (val < sched_domain_level_max) |
6963 | default_relax_domain_level = val; | 7352 | default_relax_domain_level = val; |
6964 | 7353 | ||
6965 | return 1; | 7354 | return 1; |
@@ -6987,35 +7376,20 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
6987 | } | 7376 | } |
6988 | } | 7377 | } |
6989 | 7378 | ||
7379 | static void __sdt_free(const struct cpumask *cpu_map); | ||
7380 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
7381 | |||
6990 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | 7382 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
6991 | const struct cpumask *cpu_map) | 7383 | const struct cpumask *cpu_map) |
6992 | { | 7384 | { |
6993 | switch (what) { | 7385 | switch (what) { |
6994 | case sa_sched_groups: | ||
6995 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
6996 | d->sched_group_nodes = NULL; | ||
6997 | case sa_rootdomain: | 7386 | case sa_rootdomain: |
6998 | free_rootdomain(d->rd); /* fall through */ | 7387 | if (!atomic_read(&d->rd->refcount)) |
6999 | case sa_tmpmask: | 7388 | free_rootdomain(&d->rd->rcu); /* fall through */ |
7000 | free_cpumask_var(d->tmpmask); /* fall through */ | 7389 | case sa_sd: |
7001 | case sa_send_covered: | 7390 | free_percpu(d->sd); /* fall through */ |
7002 | free_cpumask_var(d->send_covered); /* fall through */ | 7391 | case sa_sd_storage: |
7003 | case sa_this_core_map: | 7392 | __sdt_free(cpu_map); /* fall through */ |
7004 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
7005 | case sa_this_sibling_map: | ||
7006 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
7007 | case sa_nodemask: | ||
7008 | free_cpumask_var(d->nodemask); /* fall through */ | ||
7009 | case sa_sched_group_nodes: | ||
7010 | #ifdef CONFIG_NUMA | ||
7011 | kfree(d->sched_group_nodes); /* fall through */ | ||
7012 | case sa_notcovered: | ||
7013 | free_cpumask_var(d->notcovered); /* fall through */ | ||
7014 | case sa_covered: | ||
7015 | free_cpumask_var(d->covered); /* fall through */ | ||
7016 | case sa_domainspan: | ||
7017 | free_cpumask_var(d->domainspan); /* fall through */ | ||
7018 | #endif | ||
7019 | case sa_none: | 7393 | case sa_none: |
7020 | break; | 7394 | break; |
7021 | } | 7395 | } |
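__free_domain_allocs keeps the "how far did allocation get" enum and relies on switch fall-through so that each failure point unwinds exactly the stages that completed. A minimal userspace sketch of that staged setup/teardown idiom with three invented stages:

#include <stdio.h>
#include <stdlib.h>

enum stage { st_third, st_second, st_first, st_none };   /* most-complete first */

struct ctx {
    int *a, *b, *c;
};

static void unwind(struct ctx *x, enum stage how_far)
{
    switch (how_far) {
    case st_third:
        free(x->c);                   /* fall through */
    case st_second:
        free(x->b);                   /* fall through */
    case st_first:
        free(x->a);                   /* fall through */
    case st_none:
        break;
    }
}

static enum stage setup(struct ctx *x)
{
    if (!(x->a = malloc(sizeof(int))))
        return st_none;
    if (!(x->b = malloc(sizeof(int))))
        return st_first;              /* caller unwinds only the first stage */
    if (!(x->c = malloc(sizeof(int))))
        return st_second;
    return st_third;
}

int main(void)
{
    struct ctx x = { 0 };
    enum stage got = setup(&x);

    if (got != st_third)
        printf("allocation failed part-way, unwinding what completed\n");
    unwind(&x, got);                  /* frees exactly what was allocated */
    return 0;
}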
@@ -7024,270 +7398,233 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
7024 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | 7398 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
7025 | const struct cpumask *cpu_map) | 7399 | const struct cpumask *cpu_map) |
7026 | { | 7400 | { |
7027 | #ifdef CONFIG_NUMA | 7401 | memset(d, 0, sizeof(*d)); |
7028 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) | 7402 | |
7029 | return sa_none; | 7403 | if (__sdt_alloc(cpu_map)) |
7030 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) | 7404 | return sa_sd_storage; |
7031 | return sa_domainspan; | 7405 | d->sd = alloc_percpu(struct sched_domain *); |
7032 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) | 7406 | if (!d->sd) |
7033 | return sa_covered; | 7407 | return sa_sd_storage; |
7034 | /* Allocate the per-node list of sched groups */ | ||
7035 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
7036 | sizeof(struct sched_group *), GFP_KERNEL); | ||
7037 | if (!d->sched_group_nodes) { | ||
7038 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
7039 | return sa_notcovered; | ||
7040 | } | ||
7041 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; | ||
7042 | #endif | ||
7043 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) | ||
7044 | return sa_sched_group_nodes; | ||
7045 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
7046 | return sa_nodemask; | ||
7047 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
7048 | return sa_this_sibling_map; | ||
7049 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
7050 | return sa_this_core_map; | ||
7051 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
7052 | return sa_send_covered; | ||
7053 | d->rd = alloc_rootdomain(); | 7408 | d->rd = alloc_rootdomain(); |
7054 | if (!d->rd) { | 7409 | if (!d->rd) |
7055 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7410 | return sa_sd; |
7056 | return sa_tmpmask; | ||
7057 | } | ||
7058 | return sa_rootdomain; | 7411 | return sa_rootdomain; |
7059 | } | 7412 | } |
7060 | 7413 | ||
7061 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | 7414 | /* |
7062 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | 7415 | * NULL the sd_data elements we've used to build the sched_domain and |
7416 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
7417 | * will not free the data we're using. | ||
7418 | */ | ||
7419 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
7063 | { | 7420 | { |
7064 | struct sched_domain *sd = NULL; | 7421 | struct sd_data *sdd = sd->private; |
7065 | #ifdef CONFIG_NUMA | ||
7066 | struct sched_domain *parent; | ||
7067 | |||
7068 | d->sd_allnodes = 0; | ||
7069 | if (cpumask_weight(cpu_map) > | ||
7070 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
7071 | sd = &per_cpu(allnodes_domains, i).sd; | ||
7072 | SD_INIT(sd, ALLNODES); | ||
7073 | set_domain_attribute(sd, attr); | ||
7074 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
7075 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7076 | d->sd_allnodes = 1; | ||
7077 | } | ||
7078 | parent = sd; | ||
7079 | |||
7080 | sd = &per_cpu(node_domains, i).sd; | ||
7081 | SD_INIT(sd, NODE); | ||
7082 | set_domain_attribute(sd, attr); | ||
7083 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
7084 | sd->parent = parent; | ||
7085 | if (parent) | ||
7086 | parent->child = sd; | ||
7087 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
7088 | #endif | ||
7089 | return sd; | ||
7090 | } | ||
7091 | 7422 | ||
7092 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | 7423 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
7093 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7424 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
7094 | struct sched_domain *parent, int i) | ||
7095 | { | ||
7096 | struct sched_domain *sd; | ||
7097 | sd = &per_cpu(phys_domains, i).sd; | ||
7098 | SD_INIT(sd, CPU); | ||
7099 | set_domain_attribute(sd, attr); | ||
7100 | cpumask_copy(sched_domain_span(sd), d->nodemask); | ||
7101 | sd->parent = parent; | ||
7102 | if (parent) | ||
7103 | parent->child = sd; | ||
7104 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7105 | return sd; | ||
7106 | } | ||
7107 | 7425 | ||
7108 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7426 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
7109 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7427 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
7110 | struct sched_domain *parent, int i) | 7428 | |
7111 | { | 7429 | if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) |
7112 | struct sched_domain *sd = parent; | 7430 | *per_cpu_ptr(sdd->sgp, cpu) = NULL; |
7113 | #ifdef CONFIG_SCHED_MC | ||
7114 | sd = &per_cpu(core_domains, i).sd; | ||
7115 | SD_INIT(sd, MC); | ||
7116 | set_domain_attribute(sd, attr); | ||
7117 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); | ||
7118 | sd->parent = parent; | ||
7119 | parent->child = sd; | ||
7120 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7121 | #endif | ||
7122 | return sd; | ||
7123 | } | 7431 | } |
7124 | 7432 | ||
7125 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
7126 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7127 | struct sched_domain *parent, int i) | ||
7128 | { | ||
7129 | struct sched_domain *sd = parent; | ||
7130 | #ifdef CONFIG_SCHED_SMT | 7433 | #ifdef CONFIG_SCHED_SMT |
7131 | sd = &per_cpu(cpu_domains, i).sd; | 7434 | static const struct cpumask *cpu_smt_mask(int cpu) |
7132 | SD_INIT(sd, SIBLING); | 7435 | { |
7133 | set_domain_attribute(sd, attr); | 7436 | return topology_thread_cpumask(cpu); |
7134 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); | ||
7135 | sd->parent = parent; | ||
7136 | parent->child = sd; | ||
7137 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7138 | #endif | ||
7139 | return sd; | ||
7140 | } | 7437 | } |
7438 | #endif | ||
7141 | 7439 | ||
7142 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | 7440 | /* |
7143 | const struct cpumask *cpu_map, int cpu) | 7441 | * Topology list, bottom-up. |
7144 | { | 7442 | */ |
7145 | switch (l) { | 7443 | static struct sched_domain_topology_level default_topology[] = { |
7146 | #ifdef CONFIG_SCHED_SMT | 7444 | #ifdef CONFIG_SCHED_SMT |
7147 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ | 7445 | { sd_init_SIBLING, cpu_smt_mask, }, |
7148 | cpumask_and(d->this_sibling_map, cpu_map, | ||
7149 | topology_thread_cpumask(cpu)); | ||
7150 | if (cpu == cpumask_first(d->this_sibling_map)) | ||
7151 | init_sched_build_groups(d->this_sibling_map, cpu_map, | ||
7152 | &cpu_to_cpu_group, | ||
7153 | d->send_covered, d->tmpmask); | ||
7154 | break; | ||
7155 | #endif | 7446 | #endif |
7156 | #ifdef CONFIG_SCHED_MC | 7447 | #ifdef CONFIG_SCHED_MC |
7157 | case SD_LV_MC: /* set up multi-core groups */ | 7448 | { sd_init_MC, cpu_coregroup_mask, }, |
7158 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); | ||
7159 | if (cpu == cpumask_first(d->this_core_map)) | ||
7160 | init_sched_build_groups(d->this_core_map, cpu_map, | ||
7161 | &cpu_to_core_group, | ||
7162 | d->send_covered, d->tmpmask); | ||
7163 | break; | ||
7164 | #endif | 7449 | #endif |
7165 | case SD_LV_CPU: /* set up physical groups */ | 7450 | #ifdef CONFIG_SCHED_BOOK |
7166 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | 7451 | { sd_init_BOOK, cpu_book_mask, }, |
7167 | if (!cpumask_empty(d->nodemask)) | 7452 | #endif |
7168 | init_sched_build_groups(d->nodemask, cpu_map, | 7453 | { sd_init_CPU, cpu_cpu_mask, }, |
7169 | &cpu_to_phys_group, | ||
7170 | d->send_covered, d->tmpmask); | ||
7171 | break; | ||
7172 | #ifdef CONFIG_NUMA | 7454 | #ifdef CONFIG_NUMA |
7173 | case SD_LV_ALLNODES: | 7455 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, |
7174 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, | 7456 | { sd_init_ALLNODES, cpu_allnodes_mask, }, |
7175 | d->send_covered, d->tmpmask); | ||
7176 | break; | ||
7177 | #endif | 7457 | #endif |
7178 | default: | 7458 | { NULL, }, |
7179 | break; | 7459 | }; |
7460 | |||
7461 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
7462 | |||
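default_topology above is a NULL-terminated table of {init, mask} pairs, bottom-up, and the allocators below walk it with "for (tl = sched_domain_topology; tl->init; tl++)". A small userspace sketch of that data-driven level table, with invented level names and trivial handlers:

#include <stdio.h>

struct level {
    const char *name;
    int (*width)(int cpu);            /* how many CPUs this level spans (invented) */
};

static int smt_width(int cpu)  { (void)cpu; return 2; }
static int core_width(int cpu) { (void)cpu; return 8; }
static int node_width(int cpu) { (void)cpu; return 64; }

/* bottom-up, terminated by an entry whose handler is NULL */
static const struct level topology[] = {
    { "smt",  smt_width  },
    { "core", core_width },
    { "node", node_width },
    { NULL,   NULL       },
};

int main(void)
{
    for (const struct level *tl = topology; tl->width; tl++)
        printf("%s level spans %d cpus around cpu0\n", tl->name, tl->width(0));
    return 0;
}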
7463 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
7464 | { | ||
7465 | struct sched_domain_topology_level *tl; | ||
7466 | int j; | ||
7467 | |||
7468 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7469 | struct sd_data *sdd = &tl->data; | ||
7470 | |||
7471 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
7472 | if (!sdd->sd) | ||
7473 | return -ENOMEM; | ||
7474 | |||
7475 | sdd->sg = alloc_percpu(struct sched_group *); | ||
7476 | if (!sdd->sg) | ||
7477 | return -ENOMEM; | ||
7478 | |||
7479 | sdd->sgp = alloc_percpu(struct sched_group_power *); | ||
7480 | if (!sdd->sgp) | ||
7481 | return -ENOMEM; | ||
7482 | |||
7483 | for_each_cpu(j, cpu_map) { | ||
7484 | struct sched_domain *sd; | ||
7485 | struct sched_group *sg; | ||
7486 | struct sched_group_power *sgp; | ||
7487 | |||
7488 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
7489 | GFP_KERNEL, cpu_to_node(j)); | ||
7490 | if (!sd) | ||
7491 | return -ENOMEM; | ||
7492 | |||
7493 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
7494 | |||
7495 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7496 | GFP_KERNEL, cpu_to_node(j)); | ||
7497 | if (!sg) | ||
7498 | return -ENOMEM; | ||
7499 | |||
7500 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
7501 | |||
7502 | sgp = kzalloc_node(sizeof(struct sched_group_power), | ||
7503 | GFP_KERNEL, cpu_to_node(j)); | ||
7504 | if (!sgp) | ||
7505 | return -ENOMEM; | ||
7506 | |||
7507 | *per_cpu_ptr(sdd->sgp, j) = sgp; | ||
7508 | } | ||
7509 | } | ||
7510 | |||
7511 | return 0; | ||
7512 | } | ||
7513 | |||
7514 | static void __sdt_free(const struct cpumask *cpu_map) | ||
7515 | { | ||
7516 | struct sched_domain_topology_level *tl; | ||
7517 | int j; | ||
7518 | |||
7519 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7520 | struct sd_data *sdd = &tl->data; | ||
7521 | |||
7522 | for_each_cpu(j, cpu_map) { | ||
7523 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); | ||
7524 | if (sd && (sd->flags & SD_OVERLAP)) | ||
7525 | free_sched_groups(sd->groups, 0); | ||
7526 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
7527 | kfree(*per_cpu_ptr(sdd->sgp, j)); | ||
7528 | } | ||
7529 | free_percpu(sdd->sd); | ||
7530 | free_percpu(sdd->sg); | ||
7531 | free_percpu(sdd->sgp); | ||
7532 | } | ||
7533 | } | ||
7534 | |||
7535 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
7536 | struct s_data *d, const struct cpumask *cpu_map, | ||
7537 | struct sched_domain_attr *attr, struct sched_domain *child, | ||
7538 | int cpu) | ||
7539 | { | ||
7540 | struct sched_domain *sd = tl->init(tl, cpu); | ||
7541 | if (!sd) | ||
7542 | return child; | ||
7543 | |||
7544 | set_domain_attribute(sd, attr); | ||
7545 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
7546 | if (child) { | ||
7547 | sd->level = child->level + 1; | ||
7548 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
7549 | child->parent = sd; | ||
7180 | } | 7550 | } |
7551 | sd->child = child; | ||
7552 | |||
7553 | return sd; | ||
7181 | } | 7554 | } |
7182 | 7555 | ||
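build_sched_domain links each new level above the previous one: the freshly built domain becomes the child's parent, the running level counter feeds sched_domain_level_max, and the caller below stops stacking once a level's span already covers the whole cpu_map. A userspace sketch of that chaining step (structures and the span test are simplified inventions):

#include <stdio.h>
#include <stdlib.h>

struct domain {
    int level;
    unsigned int span;                /* CPUs covered, as a bitmask */
    struct domain *parent, *child;
};

static int level_max;

/* attach a new domain above `child`, mirroring build_sched_domain's chaining */
static struct domain *stack_level(unsigned int span, struct domain *child)
{
    struct domain *d = calloc(1, sizeof(*d));

    if (!d)
        exit(1);
    d->span = span;
    if (child) {
        d->level = child->level + 1;
        if (d->level > level_max)
            level_max = d->level;
        child->parent = d;
    }
    d->child = child;
    return d;
}

int main(void)
{
    unsigned int cpu_map = 0xff;                    /* 8 CPUs */
    unsigned int spans[] = { 0x03, 0x0f, 0xff };    /* smt, core, whole machine */
    struct domain *sd = NULL;

    for (unsigned int i = 0; i < sizeof(spans) / sizeof(spans[0]); i++) {
        sd = stack_level(spans[i], sd);
        if (sd->span == cpu_map)      /* span covers everything: stop stacking */
            break;
    }
    for (struct domain *d = sd; d; d = d->child)
        printf("level %d spans 0x%x\n", d->level, d->span);
    printf("max level seen: %d\n", level_max);
    return 0;
}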
7183 | /* | 7556 | /* |
7184 | * Build sched domains for a given set of cpus and attach the sched domains | 7557 | * Build sched domains for a given set of cpus and attach the sched domains |
7185 | * to the individual cpus | 7558 | * to the individual cpus |
7186 | */ | 7559 | */ |
7187 | static int __build_sched_domains(const struct cpumask *cpu_map, | 7560 | static int build_sched_domains(const struct cpumask *cpu_map, |
7188 | struct sched_domain_attr *attr) | 7561 | struct sched_domain_attr *attr) |
7189 | { | 7562 | { |
7190 | enum s_alloc alloc_state = sa_none; | 7563 | enum s_alloc alloc_state = sa_none; |
7191 | struct s_data d; | ||
7192 | struct sched_domain *sd; | 7564 | struct sched_domain *sd; |
7193 | int i; | 7565 | struct s_data d; |
7194 | #ifdef CONFIG_NUMA | 7566 | int i, ret = -ENOMEM; |
7195 | d.sd_allnodes = 0; | ||
7196 | #endif | ||
7197 | 7567 | ||
7198 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7568 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
7199 | if (alloc_state != sa_rootdomain) | 7569 | if (alloc_state != sa_rootdomain) |
7200 | goto error; | 7570 | goto error; |
7201 | alloc_state = sa_sched_groups; | ||
7202 | |||
7203 | /* | ||
7204 | * Set up domains for cpus specified by the cpu_map. | ||
7205 | */ | ||
7206 | for_each_cpu(i, cpu_map) { | ||
7207 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), | ||
7208 | cpu_map); | ||
7209 | |||
7210 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | ||
7211 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | ||
7212 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | ||
7213 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | ||
7214 | } | ||
7215 | 7571 | ||
7572 | /* Set up domains for cpus specified by the cpu_map. */ | ||
7216 | for_each_cpu(i, cpu_map) { | 7573 | for_each_cpu(i, cpu_map) { |
7217 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | 7574 | struct sched_domain_topology_level *tl; |
7218 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | 7575 | |
7219 | } | 7576 | sd = NULL; |
7220 | 7577 | for (tl = sched_domain_topology; tl->init; tl++) { | |
7221 | /* Set up physical groups */ | 7578 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); |
7222 | for (i = 0; i < nr_node_ids; i++) | 7579 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) |
7223 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); | 7580 | sd->flags |= SD_OVERLAP; |
7224 | 7581 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | |
7225 | #ifdef CONFIG_NUMA | 7582 | break; |
7226 | /* Set up node groups */ | 7583 | } |
7227 | if (d.sd_allnodes) | ||
7228 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
7229 | 7584 | ||
7230 | for (i = 0; i < nr_node_ids; i++) | 7585 | while (sd->child) |
7231 | if (build_numa_sched_groups(&d, cpu_map, i)) | 7586 | sd = sd->child; |
7232 | goto error; | ||
7233 | #endif | ||
7234 | 7587 | ||
7235 | /* Calculate CPU power for physical packages and nodes */ | 7588 | *per_cpu_ptr(d.sd, i) = sd; |
7236 | #ifdef CONFIG_SCHED_SMT | ||
7237 | for_each_cpu(i, cpu_map) { | ||
7238 | sd = &per_cpu(cpu_domains, i).sd; | ||
7239 | init_sched_groups_power(i, sd); | ||
7240 | } | ||
7241 | #endif | ||
7242 | #ifdef CONFIG_SCHED_MC | ||
7243 | for_each_cpu(i, cpu_map) { | ||
7244 | sd = &per_cpu(core_domains, i).sd; | ||
7245 | init_sched_groups_power(i, sd); | ||
7246 | } | 7589 | } |
7247 | #endif | ||
7248 | 7590 | ||
7591 | /* Build the groups for the domains */ | ||
7249 | for_each_cpu(i, cpu_map) { | 7592 | for_each_cpu(i, cpu_map) { |
7250 | sd = &per_cpu(phys_domains, i).sd; | 7593 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7251 | init_sched_groups_power(i, sd); | 7594 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); |
7595 | if (sd->flags & SD_OVERLAP) { | ||
7596 | if (build_overlap_sched_groups(sd, i)) | ||
7597 | goto error; | ||
7598 | } else { | ||
7599 | if (build_sched_groups(sd, i)) | ||
7600 | goto error; | ||
7601 | } | ||
7602 | } | ||
7252 | } | 7603 | } |
7253 | 7604 | ||
7254 | #ifdef CONFIG_NUMA | 7605 | /* Calculate CPU power for physical packages and nodes */ |
7255 | for (i = 0; i < nr_node_ids; i++) | 7606 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
7256 | init_numa_sched_groups_power(d.sched_group_nodes[i]); | 7607 | if (!cpumask_test_cpu(i, cpu_map)) |
7257 | 7608 | continue; | |
7258 | if (d.sd_allnodes) { | ||
7259 | struct sched_group *sg; | ||
7260 | 7609 | ||
7261 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 7610 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7262 | d.tmpmask); | 7611 | claim_allocations(i, sd); |
7263 | init_numa_sched_groups_power(sg); | 7612 | init_sched_groups_power(i, sd); |
7613 | } | ||
7264 | } | 7614 | } |
7265 | #endif | ||
7266 | 7615 | ||
7267 | /* Attach the domains */ | 7616 | /* Attach the domains */ |
7617 | rcu_read_lock(); | ||
7268 | for_each_cpu(i, cpu_map) { | 7618 | for_each_cpu(i, cpu_map) { |
7269 | #ifdef CONFIG_SCHED_SMT | 7619 | sd = *per_cpu_ptr(d.sd, i); |
7270 | sd = &per_cpu(cpu_domains, i).sd; | ||
7271 | #elif defined(CONFIG_SCHED_MC) | ||
7272 | sd = &per_cpu(core_domains, i).sd; | ||
7273 | #else | ||
7274 | sd = &per_cpu(phys_domains, i).sd; | ||
7275 | #endif | ||
7276 | cpu_attach_domain(sd, d.rd, i); | 7620 | cpu_attach_domain(sd, d.rd, i); |
7277 | } | 7621 | } |
7622 | rcu_read_unlock(); | ||
7278 | 7623 | ||
7279 | d.sched_group_nodes = NULL; /* don't free this we still need it */ | 7624 | ret = 0; |
7280 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | ||
7281 | return 0; | ||
7282 | |||
7283 | error: | 7625 | error: |
7284 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7626 | __free_domain_allocs(&d, alloc_state, cpu_map); |
7285 | return -ENOMEM; | 7627 | return ret; |
7286 | } | ||
7287 | |||
7288 | static int build_sched_domains(const struct cpumask *cpu_map) | ||
7289 | { | ||
7290 | return __build_sched_domains(cpu_map, NULL); | ||
7291 | } | 7628 | } |
7292 | 7629 | ||
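build_sched_domains now initializes ret to -ENOMEM, sets it to 0 only after every stage succeeded, and lets the success path fall through the error: label so the temporary allocations are released on both paths. A tiny sketch of that single-exit idiom (the work and the failure condition are placeholders):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int do_build(int fail_step)
{
    int ret = -ENOMEM;
    char *tmp = malloc(64);           /* temporary needed on success and failure */

    if (!tmp)
        goto error;
    if (fail_step == 1)
        goto error;                   /* some stage failed: ret is still -ENOMEM */

    ret = 0;                          /* success also falls through to the label */
error:
    free(tmp);
    return ret;
}

int main(void)
{
    printf("success path: %d\n", do_build(0));
    printf("failure path: %d\n", do_build(1));
    return 0;
}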
7293 | static cpumask_var_t *doms_cur; /* current sched domains */ | 7630 | static cpumask_var_t *doms_cur; /* current sched domains */ |
@@ -7342,7 +7679,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | |||
7342 | * For now this just excludes isolated cpus, but could be used to | 7679 | * For now this just excludes isolated cpus, but could be used to |
7343 | * exclude other special cases in the future. | 7680 | * exclude other special cases in the future. |
7344 | */ | 7681 | */ |
7345 | static int arch_init_sched_domains(const struct cpumask *cpu_map) | 7682 | static int init_sched_domains(const struct cpumask *cpu_map) |
7346 | { | 7683 | { |
7347 | int err; | 7684 | int err; |
7348 | 7685 | ||
@@ -7353,32 +7690,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
7353 | doms_cur = &fallback_doms; | 7690 | doms_cur = &fallback_doms; |
7354 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 7691 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
7355 | dattr_cur = NULL; | 7692 | dattr_cur = NULL; |
7356 | err = build_sched_domains(doms_cur[0]); | 7693 | err = build_sched_domains(doms_cur[0], NULL); |
7357 | register_sched_domain_sysctl(); | 7694 | register_sched_domain_sysctl(); |
7358 | 7695 | ||
7359 | return err; | 7696 | return err; |
7360 | } | 7697 | } |
7361 | 7698 | ||
7362 | static void arch_destroy_sched_domains(const struct cpumask *cpu_map, | ||
7363 | struct cpumask *tmpmask) | ||
7364 | { | ||
7365 | free_sched_groups(cpu_map, tmpmask); | ||
7366 | } | ||
7367 | |||
7368 | /* | 7699 | /* |
7369 | * Detach sched domains from a group of cpus specified in cpu_map | 7700 | * Detach sched domains from a group of cpus specified in cpu_map |
7370 | * These cpus will now be attached to the NULL domain | 7701 | * These cpus will now be attached to the NULL domain |
7371 | */ | 7702 | */ |
7372 | static void detach_destroy_domains(const struct cpumask *cpu_map) | 7703 | static void detach_destroy_domains(const struct cpumask *cpu_map) |
7373 | { | 7704 | { |
7374 | /* Save because hotplug lock held. */ | ||
7375 | static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); | ||
7376 | int i; | 7705 | int i; |
7377 | 7706 | ||
7707 | rcu_read_lock(); | ||
7378 | for_each_cpu(i, cpu_map) | 7708 | for_each_cpu(i, cpu_map) |
7379 | cpu_attach_domain(NULL, &def_root_domain, i); | 7709 | cpu_attach_domain(NULL, &def_root_domain, i); |
7380 | synchronize_sched(); | 7710 | rcu_read_unlock(); |
7381 | arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); | ||
7382 | } | 7711 | } |
7383 | 7712 | ||
7384 | /* handle null as "default" */ | 7713 | /* handle null as "default" */ |
@@ -7467,8 +7796,7 @@ match1: | |||
7467 | goto match2; | 7796 | goto match2; |
7468 | } | 7797 | } |
7469 | /* no match - add a new doms_new */ | 7798 | /* no match - add a new doms_new */ |
7470 | __build_sched_domains(doms_new[i], | 7799 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); |
7471 | dattr_new ? dattr_new + i : NULL); | ||
7472 | match2: | 7800 | match2: |
7473 | ; | 7801 | ; |
7474 | } | 7802 | } |
@@ -7487,7 +7815,7 @@ match2: | |||
7487 | } | 7815 | } |
7488 | 7816 | ||
7489 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7817 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7490 | static void arch_reinit_sched_domains(void) | 7818 | static void reinit_sched_domains(void) |
7491 | { | 7819 | { |
7492 | get_online_cpus(); | 7820 | get_online_cpus(); |
7493 | 7821 | ||
@@ -7520,7 +7848,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
7520 | else | 7848 | else |
7521 | sched_mc_power_savings = level; | 7849 | sched_mc_power_savings = level; |
7522 | 7850 | ||
7523 | arch_reinit_sched_domains(); | 7851 | reinit_sched_domains(); |
7524 | 7852 | ||
7525 | return count; | 7853 | return count; |
7526 | } | 7854 | } |
@@ -7639,14 +7967,9 @@ void __init sched_init_smp(void) | |||
7639 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7967 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
7640 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7968 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
7641 | 7969 | ||
7642 | #if defined(CONFIG_NUMA) | ||
7643 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7644 | GFP_KERNEL); | ||
7645 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7646 | #endif | ||
7647 | get_online_cpus(); | 7970 | get_online_cpus(); |
7648 | mutex_lock(&sched_domains_mutex); | 7971 | mutex_lock(&sched_domains_mutex); |
7649 | arch_init_sched_domains(cpu_active_mask); | 7972 | init_sched_domains(cpu_active_mask); |
7650 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 7973 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
7651 | if (cpumask_empty(non_isolated_cpus)) | 7974 | if (cpumask_empty(non_isolated_cpus)) |
7652 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 7975 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
@@ -7691,8 +8014,15 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
7691 | INIT_LIST_HEAD(&cfs_rq->tasks); | 8014 | INIT_LIST_HEAD(&cfs_rq->tasks); |
7692 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8015 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7693 | cfs_rq->rq = rq; | 8016 | cfs_rq->rq = rq; |
8017 | /* allow initial update_cfs_load() to truncate */ | ||
8018 | #ifdef CONFIG_SMP | ||
8019 | cfs_rq->load_stamp = 1; | ||
8020 | #endif | ||
7694 | #endif | 8021 | #endif |
7695 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 8022 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
8023 | #ifndef CONFIG_64BIT | ||
8024 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
8025 | #endif | ||
7696 | } | 8026 | } |
7697 | 8027 | ||
7698 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 8028 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) |
@@ -7733,18 +8063,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7733 | 8063 | ||
7734 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8064 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7735 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 8065 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
7736 | struct sched_entity *se, int cpu, int add, | 8066 | struct sched_entity *se, int cpu, |
7737 | struct sched_entity *parent) | 8067 | struct sched_entity *parent) |
7738 | { | 8068 | { |
7739 | struct rq *rq = cpu_rq(cpu); | 8069 | struct rq *rq = cpu_rq(cpu); |
7740 | tg->cfs_rq[cpu] = cfs_rq; | 8070 | tg->cfs_rq[cpu] = cfs_rq; |
7741 | init_cfs_rq(cfs_rq, rq); | 8071 | init_cfs_rq(cfs_rq, rq); |
7742 | cfs_rq->tg = tg; | 8072 | cfs_rq->tg = tg; |
7743 | if (add) | ||
7744 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7745 | 8073 | ||
7746 | tg->se[cpu] = se; | 8074 | tg->se[cpu] = se; |
7747 | /* se could be NULL for init_task_group */ | 8075 | /* se could be NULL for root_task_group */ |
7748 | if (!se) | 8076 | if (!se) |
7749 | return; | 8077 | return; |
7750 | 8078 | ||
@@ -7754,15 +8082,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7754 | se->cfs_rq = parent->my_q; | 8082 | se->cfs_rq = parent->my_q; |
7755 | 8083 | ||
7756 | se->my_q = cfs_rq; | 8084 | se->my_q = cfs_rq; |
7757 | se->load.weight = tg->shares; | 8085 | update_load_set(&se->load, 0); |
7758 | se->load.inv_weight = 0; | ||
7759 | se->parent = parent; | 8086 | se->parent = parent; |
7760 | } | 8087 | } |
7761 | #endif | 8088 | #endif |
7762 | 8089 | ||
7763 | #ifdef CONFIG_RT_GROUP_SCHED | 8090 | #ifdef CONFIG_RT_GROUP_SCHED |
7764 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | 8091 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
7765 | struct sched_rt_entity *rt_se, int cpu, int add, | 8092 | struct sched_rt_entity *rt_se, int cpu, |
7766 | struct sched_rt_entity *parent) | 8093 | struct sched_rt_entity *parent) |
7767 | { | 8094 | { |
7768 | struct rq *rq = cpu_rq(cpu); | 8095 | struct rq *rq = cpu_rq(cpu); |
@@ -7771,8 +8098,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
7771 | init_rt_rq(rt_rq, rq); | 8098 | init_rt_rq(rt_rq, rq); |
7772 | rt_rq->tg = tg; | 8099 | rt_rq->tg = tg; |
7773 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 8100 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
7774 | if (add) | ||
7775 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7776 | 8101 | ||
7777 | tg->rt_se[cpu] = rt_se; | 8102 | tg->rt_se[cpu] = rt_se; |
7778 | if (!rt_se) | 8103 | if (!rt_se) |
@@ -7807,18 +8132,18 @@ void __init sched_init(void) | |||
7807 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 8132 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
7808 | 8133 | ||
7809 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8134 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7810 | init_task_group.se = (struct sched_entity **)ptr; | 8135 | root_task_group.se = (struct sched_entity **)ptr; |
7811 | ptr += nr_cpu_ids * sizeof(void **); | 8136 | ptr += nr_cpu_ids * sizeof(void **); |
7812 | 8137 | ||
7813 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 8138 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
7814 | ptr += nr_cpu_ids * sizeof(void **); | 8139 | ptr += nr_cpu_ids * sizeof(void **); |
7815 | 8140 | ||
7816 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8141 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7817 | #ifdef CONFIG_RT_GROUP_SCHED | 8142 | #ifdef CONFIG_RT_GROUP_SCHED |
7818 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 8143 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; |
7819 | ptr += nr_cpu_ids * sizeof(void **); | 8144 | ptr += nr_cpu_ids * sizeof(void **); |
7820 | 8145 | ||
7821 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 8146 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
7822 | ptr += nr_cpu_ids * sizeof(void **); | 8147 | ptr += nr_cpu_ids * sizeof(void **); |
7823 | 8148 | ||
7824 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8149 | #endif /* CONFIG_RT_GROUP_SCHED */ |
@@ -7838,20 +8163,16 @@ void __init sched_init(void) | |||
7838 | global_rt_period(), global_rt_runtime()); | 8163 | global_rt_period(), global_rt_runtime()); |
7839 | 8164 | ||
7840 | #ifdef CONFIG_RT_GROUP_SCHED | 8165 | #ifdef CONFIG_RT_GROUP_SCHED |
7841 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 8166 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
7842 | global_rt_period(), global_rt_runtime()); | 8167 | global_rt_period(), global_rt_runtime()); |
7843 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8168 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7844 | 8169 | ||
7845 | #ifdef CONFIG_CGROUP_SCHED | 8170 | #ifdef CONFIG_CGROUP_SCHED |
7846 | list_add(&init_task_group.list, &task_groups); | 8171 | list_add(&root_task_group.list, &task_groups); |
7847 | INIT_LIST_HEAD(&init_task_group.children); | 8172 | INIT_LIST_HEAD(&root_task_group.children); |
7848 | 8173 | autogroup_init(&init_task); | |
7849 | #endif /* CONFIG_CGROUP_SCHED */ | 8174 | #endif /* CONFIG_CGROUP_SCHED */ |
7850 | 8175 | ||
7851 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
7852 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
7853 | __alignof__(unsigned long)); | ||
7854 | #endif | ||
7855 | for_each_possible_cpu(i) { | 8176 | for_each_possible_cpu(i) { |
7856 | struct rq *rq; | 8177 | struct rq *rq; |
7857 | 8178 | ||
@@ -7863,38 +8184,34 @@ void __init sched_init(void) | |||
7863 | init_cfs_rq(&rq->cfs, rq); | 8184 | init_cfs_rq(&rq->cfs, rq); |
7864 | init_rt_rq(&rq->rt, rq); | 8185 | init_rt_rq(&rq->rt, rq); |
7865 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8186 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7866 | init_task_group.shares = init_task_group_load; | 8187 | root_task_group.shares = root_task_group_load; |
7867 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 8188 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7868 | #ifdef CONFIG_CGROUP_SCHED | ||
7869 | /* | 8189 | /* |
7870 | * How much cpu bandwidth does init_task_group get? | 8190 | * How much cpu bandwidth does root_task_group get? |
7871 | * | 8191 | * |
7872 | * In case of task-groups formed thr' the cgroup filesystem, it | 8192 | * In case of task-groups formed thr' the cgroup filesystem, it |
7873 | * gets 100% of the cpu resources in the system. This overall | 8193 | * gets 100% of the cpu resources in the system. This overall |
7874 | * system cpu resource is divided among the tasks of | 8194 | * system cpu resource is divided among the tasks of |
7875 | * init_task_group and its child task-groups in a fair manner, | 8195 | * root_task_group and its child task-groups in a fair manner, |
7876 | * based on each entity's (task or task-group's) weight | 8196 | * based on each entity's (task or task-group's) weight |
7877 | * (se->load.weight). | 8197 | * (se->load.weight). |
7878 | * | 8198 | * |
7879 | * In other words, if init_task_group has 10 tasks of weight | 8199 | * In other words, if root_task_group has 10 tasks of weight |
7880 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 8200 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
7881 | * then A0's share of the cpu resource is: | 8201 | * then A0's share of the cpu resource is: |
7882 | * | 8202 | * |
7883 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 8203 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
7884 | * | 8204 | * |
7885 | * We achieve this by letting init_task_group's tasks sit | 8205 | * We achieve this by letting root_task_group's tasks sit |
7886 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 8206 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
7887 | */ | 8207 | */ |
7888 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 8208 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
7889 | #endif | ||
7890 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8209 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7891 | 8210 | ||
7892 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | 8211 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; |
7893 | #ifdef CONFIG_RT_GROUP_SCHED | 8212 | #ifdef CONFIG_RT_GROUP_SCHED |
7894 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 8213 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
7895 | #ifdef CONFIG_CGROUP_SCHED | 8214 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
7896 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
7897 | #endif | ||
7898 | #endif | 8215 | #endif |
7899 | 8216 | ||
7900 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 8217 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
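The share arithmetic quoted in the comment above (ten tasks of weight 1024 plus two child groups A0 and A1 of weight 1024 each) can be checked with a small stand-alone calculation. The sketch below is illustrative only; the weights are the ones named in the comment, not values read from a running kernel.

#include <stdio.h>

int main(void)
{
	const unsigned long task_weight  = 1024;  /* per-task weight (NICE_0 level) */
	const unsigned long ntasks       = 10;    /* tasks sitting directly in rq->cfs */
	const unsigned long group_weight = 1024;  /* weight of each child group */
	const unsigned long ngroups      = 2;     /* A0 and A1 */

	unsigned long total = ntasks * task_weight + ngroups * group_weight;
	double a0_share = (double)group_weight / total;

	/* Prints "A0's bandwidth = 1024 / 12288 = 8.33%", matching the comment. */
	printf("A0's bandwidth = %lu / %lu = %.2f%%\n",
	       group_weight, total, a0_share * 100.0);
	return 0;
}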
@@ -7905,7 +8222,7 @@ void __init sched_init(void) | |||
7905 | #ifdef CONFIG_SMP | 8222 | #ifdef CONFIG_SMP |
7906 | rq->sd = NULL; | 8223 | rq->sd = NULL; |
7907 | rq->rd = NULL; | 8224 | rq->rd = NULL; |
7908 | rq->cpu_power = SCHED_LOAD_SCALE; | 8225 | rq->cpu_power = SCHED_POWER_SCALE; |
7909 | rq->post_schedule = 0; | 8226 | rq->post_schedule = 0; |
7910 | rq->active_balance = 0; | 8227 | rq->active_balance = 0; |
7911 | rq->next_balance = jiffies; | 8228 | rq->next_balance = jiffies; |
@@ -7962,6 +8279,7 @@ void __init sched_init(void) | |||
7962 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 8279 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
7963 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 8280 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
7964 | #ifdef CONFIG_SMP | 8281 | #ifdef CONFIG_SMP |
8282 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | ||
7965 | #ifdef CONFIG_NO_HZ | 8283 | #ifdef CONFIG_NO_HZ |
7966 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 8284 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
7967 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | 8285 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
@@ -7974,8 +8292,6 @@ void __init sched_init(void) | |||
7974 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 8292 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7975 | #endif /* SMP */ | 8293 | #endif /* SMP */ |
7976 | 8294 | ||
7977 | perf_event_init(); | ||
7978 | |||
7979 | scheduler_running = 1; | 8295 | scheduler_running = 1; |
7980 | } | 8296 | } |
7981 | 8297 | ||
@@ -7984,7 +8300,7 @@ static inline int preempt_count_equals(int preempt_offset) | |||
7984 | { | 8300 | { |
7985 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 8301 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); |
7986 | 8302 | ||
7987 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | 8303 | return (nested == preempt_offset); |
7988 | } | 8304 | } |
7989 | 8305 | ||
7990 | void __might_sleep(const char *file, int line, int preempt_offset) | 8306 | void __might_sleep(const char *file, int line, int preempt_offset) |
@@ -8019,9 +8335,11 @@ EXPORT_SYMBOL(__might_sleep); | |||
8019 | #ifdef CONFIG_MAGIC_SYSRQ | 8335 | #ifdef CONFIG_MAGIC_SYSRQ |
8020 | static void normalize_task(struct rq *rq, struct task_struct *p) | 8336 | static void normalize_task(struct rq *rq, struct task_struct *p) |
8021 | { | 8337 | { |
8338 | const struct sched_class *prev_class = p->sched_class; | ||
8339 | int old_prio = p->prio; | ||
8022 | int on_rq; | 8340 | int on_rq; |
8023 | 8341 | ||
8024 | on_rq = p->se.on_rq; | 8342 | on_rq = p->on_rq; |
8025 | if (on_rq) | 8343 | if (on_rq) |
8026 | deactivate_task(rq, p, 0); | 8344 | deactivate_task(rq, p, 0); |
8027 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 8345 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
@@ -8029,6 +8347,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
8029 | activate_task(rq, p, 0); | 8347 | activate_task(rq, p, 0); |
8030 | resched_task(rq->curr); | 8348 | resched_task(rq->curr); |
8031 | } | 8349 | } |
8350 | |||
8351 | check_class_changed(rq, p, prev_class, old_prio); | ||
8032 | } | 8352 | } |
8033 | 8353 | ||
8034 | void normalize_rt_tasks(void) | 8354 | void normalize_rt_tasks(void) |
@@ -8144,7 +8464,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8144 | { | 8464 | { |
8145 | struct cfs_rq *cfs_rq; | 8465 | struct cfs_rq *cfs_rq; |
8146 | struct sched_entity *se; | 8466 | struct sched_entity *se; |
8147 | struct rq *rq; | ||
8148 | int i; | 8467 | int i; |
8149 | 8468 | ||
8150 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8469 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8157,8 +8476,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8157 | tg->shares = NICE_0_LOAD; | 8476 | tg->shares = NICE_0_LOAD; |
8158 | 8477 | ||
8159 | for_each_possible_cpu(i) { | 8478 | for_each_possible_cpu(i) { |
8160 | rq = cpu_rq(i); | ||
8161 | |||
8162 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8479 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8163 | GFP_KERNEL, cpu_to_node(i)); | 8480 | GFP_KERNEL, cpu_to_node(i)); |
8164 | if (!cfs_rq) | 8481 | if (!cfs_rq) |
@@ -8169,26 +8486,32 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8169 | if (!se) | 8486 | if (!se) |
8170 | goto err_free_rq; | 8487 | goto err_free_rq; |
8171 | 8488 | ||
8172 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); | 8489 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8173 | } | 8490 | } |
8174 | 8491 | ||
8175 | return 1; | 8492 | return 1; |
8176 | 8493 | ||
8177 | err_free_rq: | 8494 | err_free_rq: |
8178 | kfree(cfs_rq); | 8495 | kfree(cfs_rq); |
8179 | err: | 8496 | err: |
8180 | return 0; | 8497 | return 0; |
8181 | } | 8498 | } |
8182 | 8499 | ||
8183 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8184 | { | ||
8185 | list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, | ||
8186 | &cpu_rq(cpu)->leaf_cfs_rq_list); | ||
8187 | } | ||
8188 | |||
8189 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8500 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8190 | { | 8501 | { |
8191 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8502 | struct rq *rq = cpu_rq(cpu); |
8503 | unsigned long flags; | ||
8504 | |||
8505 | /* | ||
8506 | * Only empty task groups can be destroyed; so we can speculatively | ||
8507 | * check on_list without danger of it being re-added. | ||
8508 | */ | ||
8509 | if (!tg->cfs_rq[cpu]->on_list) | ||
8510 | return; | ||
8511 | |||
8512 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8513 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8514 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8192 | } | 8515 | } |
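The comment in the new unregister_fair_sched_group() above relies on a speculative, unlocked read of on_list that is safe only because an empty task group can no longer have its cfs_rq re-added. Below is a generic, hedged sketch of that check-then-lock pattern; the names are hypothetical and the code is userspace pthreads, not kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct item {
	bool on_list;		/* cleared at most once, during teardown */
	struct item *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *list_head;

static void unregister_item(struct item *it)
{
	/* Fast path: never linked (or already unlinked), skip the lock. */
	if (!it->on_list)
		return;

	/* Slow path: unlink under the lock that protects the list. */
	pthread_mutex_lock(&list_lock);
	if (it->on_list) {
		struct item **pp = &list_head;

		while (*pp && *pp != it)
			pp = &(*pp)->next;
		if (*pp)
			*pp = it->next;
		it->on_list = false;
	}
	pthread_mutex_unlock(&list_lock);
}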
8193 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 8516 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8194 | static inline void free_fair_sched_group(struct task_group *tg) | 8517 | static inline void free_fair_sched_group(struct task_group *tg) |
@@ -8201,10 +8524,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8201 | return 1; | 8524 | return 1; |
8202 | } | 8525 | } |
8203 | 8526 | ||
8204 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8205 | { | ||
8206 | } | ||
8207 | |||
8208 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8527 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8209 | { | 8528 | { |
8210 | } | 8529 | } |
@@ -8233,7 +8552,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8233 | { | 8552 | { |
8234 | struct rt_rq *rt_rq; | 8553 | struct rt_rq *rt_rq; |
8235 | struct sched_rt_entity *rt_se; | 8554 | struct sched_rt_entity *rt_se; |
8236 | struct rq *rq; | ||
8237 | int i; | 8555 | int i; |
8238 | 8556 | ||
8239 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | 8557 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8247,8 +8565,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8247 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | 8565 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); |
8248 | 8566 | ||
8249 | for_each_possible_cpu(i) { | 8567 | for_each_possible_cpu(i) { |
8250 | rq = cpu_rq(i); | ||
8251 | |||
8252 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | 8568 | rt_rq = kzalloc_node(sizeof(struct rt_rq), |
8253 | GFP_KERNEL, cpu_to_node(i)); | 8569 | GFP_KERNEL, cpu_to_node(i)); |
8254 | if (!rt_rq) | 8570 | if (!rt_rq) |
@@ -8259,27 +8575,16 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8259 | if (!rt_se) | 8575 | if (!rt_se) |
8260 | goto err_free_rq; | 8576 | goto err_free_rq; |
8261 | 8577 | ||
8262 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); | 8578 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
8263 | } | 8579 | } |
8264 | 8580 | ||
8265 | return 1; | 8581 | return 1; |
8266 | 8582 | ||
8267 | err_free_rq: | 8583 | err_free_rq: |
8268 | kfree(rt_rq); | 8584 | kfree(rt_rq); |
8269 | err: | 8585 | err: |
8270 | return 0; | 8586 | return 0; |
8271 | } | 8587 | } |
8272 | |||
8273 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8274 | { | ||
8275 | list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, | ||
8276 | &cpu_rq(cpu)->leaf_rt_rq_list); | ||
8277 | } | ||
8278 | |||
8279 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8280 | { | ||
8281 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | ||
8282 | } | ||
8283 | #else /* !CONFIG_RT_GROUP_SCHED */ | 8588 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8284 | static inline void free_rt_sched_group(struct task_group *tg) | 8589 | static inline void free_rt_sched_group(struct task_group *tg) |
8285 | { | 8590 | { |
@@ -8290,14 +8595,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8290 | { | 8595 | { |
8291 | return 1; | 8596 | return 1; |
8292 | } | 8597 | } |
8293 | |||
8294 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8295 | { | ||
8296 | } | ||
8297 | |||
8298 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8299 | { | ||
8300 | } | ||
8301 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8598 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8302 | 8599 | ||
8303 | #ifdef CONFIG_CGROUP_SCHED | 8600 | #ifdef CONFIG_CGROUP_SCHED |
@@ -8305,6 +8602,7 @@ static void free_sched_group(struct task_group *tg) | |||
8305 | { | 8602 | { |
8306 | free_fair_sched_group(tg); | 8603 | free_fair_sched_group(tg); |
8307 | free_rt_sched_group(tg); | 8604 | free_rt_sched_group(tg); |
8605 | autogroup_free(tg); | ||
8308 | kfree(tg); | 8606 | kfree(tg); |
8309 | } | 8607 | } |
8310 | 8608 | ||
@@ -8313,7 +8611,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8313 | { | 8611 | { |
8314 | struct task_group *tg; | 8612 | struct task_group *tg; |
8315 | unsigned long flags; | 8613 | unsigned long flags; |
8316 | int i; | ||
8317 | 8614 | ||
8318 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 8615 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
8319 | if (!tg) | 8616 | if (!tg) |
@@ -8326,10 +8623,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8326 | goto err; | 8623 | goto err; |
8327 | 8624 | ||
8328 | spin_lock_irqsave(&task_group_lock, flags); | 8625 | spin_lock_irqsave(&task_group_lock, flags); |
8329 | for_each_possible_cpu(i) { | ||
8330 | register_fair_sched_group(tg, i); | ||
8331 | register_rt_sched_group(tg, i); | ||
8332 | } | ||
8333 | list_add_rcu(&tg->list, &task_groups); | 8626 | list_add_rcu(&tg->list, &task_groups); |
8334 | 8627 | ||
8335 | WARN_ON(!parent); /* root should already exist */ | 8628 | WARN_ON(!parent); /* root should already exist */ |
@@ -8359,11 +8652,11 @@ void sched_destroy_group(struct task_group *tg) | |||
8359 | unsigned long flags; | 8652 | unsigned long flags; |
8360 | int i; | 8653 | int i; |
8361 | 8654 | ||
8362 | spin_lock_irqsave(&task_group_lock, flags); | 8655 | /* end participation in shares distribution */ |
8363 | for_each_possible_cpu(i) { | 8656 | for_each_possible_cpu(i) |
8364 | unregister_fair_sched_group(tg, i); | 8657 | unregister_fair_sched_group(tg, i); |
8365 | unregister_rt_sched_group(tg, i); | 8658 | |
8366 | } | 8659 | spin_lock_irqsave(&task_group_lock, flags); |
8367 | list_del_rcu(&tg->list); | 8660 | list_del_rcu(&tg->list); |
8368 | list_del_rcu(&tg->siblings); | 8661 | list_del_rcu(&tg->siblings); |
8369 | spin_unlock_irqrestore(&task_group_lock, flags); | 8662 | spin_unlock_irqrestore(&task_group_lock, flags); |
@@ -8386,57 +8679,30 @@ void sched_move_task(struct task_struct *tsk) | |||
8386 | rq = task_rq_lock(tsk, &flags); | 8679 | rq = task_rq_lock(tsk, &flags); |
8387 | 8680 | ||
8388 | running = task_current(rq, tsk); | 8681 | running = task_current(rq, tsk); |
8389 | on_rq = tsk->se.on_rq; | 8682 | on_rq = tsk->on_rq; |
8390 | 8683 | ||
8391 | if (on_rq) | 8684 | if (on_rq) |
8392 | dequeue_task(rq, tsk, 0); | 8685 | dequeue_task(rq, tsk, 0); |
8393 | if (unlikely(running)) | 8686 | if (unlikely(running)) |
8394 | tsk->sched_class->put_prev_task(rq, tsk); | 8687 | tsk->sched_class->put_prev_task(rq, tsk); |
8395 | 8688 | ||
8396 | set_task_rq(tsk, task_cpu(tsk)); | ||
8397 | |||
8398 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8689 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8399 | if (tsk->sched_class->moved_group) | 8690 | if (tsk->sched_class->task_move_group) |
8400 | tsk->sched_class->moved_group(tsk, on_rq); | 8691 | tsk->sched_class->task_move_group(tsk, on_rq); |
8692 | else | ||
8401 | #endif | 8693 | #endif |
8694 | set_task_rq(tsk, task_cpu(tsk)); | ||
8402 | 8695 | ||
8403 | if (unlikely(running)) | 8696 | if (unlikely(running)) |
8404 | tsk->sched_class->set_curr_task(rq); | 8697 | tsk->sched_class->set_curr_task(rq); |
8405 | if (on_rq) | 8698 | if (on_rq) |
8406 | enqueue_task(rq, tsk, 0); | 8699 | enqueue_task(rq, tsk, 0); |
8407 | 8700 | ||
8408 | task_rq_unlock(rq, &flags); | 8701 | task_rq_unlock(rq, tsk, &flags); |
8409 | } | 8702 | } |
8410 | #endif /* CONFIG_CGROUP_SCHED */ | 8703 | #endif /* CONFIG_CGROUP_SCHED */ |
8411 | 8704 | ||
8412 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8705 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8413 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8414 | { | ||
8415 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8416 | int on_rq; | ||
8417 | |||
8418 | on_rq = se->on_rq; | ||
8419 | if (on_rq) | ||
8420 | dequeue_entity(cfs_rq, se, 0); | ||
8421 | |||
8422 | se->load.weight = shares; | ||
8423 | se->load.inv_weight = 0; | ||
8424 | |||
8425 | if (on_rq) | ||
8426 | enqueue_entity(cfs_rq, se, 0); | ||
8427 | } | ||
8428 | |||
8429 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8430 | { | ||
8431 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8432 | struct rq *rq = cfs_rq->rq; | ||
8433 | unsigned long flags; | ||
8434 | |||
8435 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8436 | __set_se_shares(se, shares); | ||
8437 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8438 | } | ||
8439 | |||
8440 | static DEFINE_MUTEX(shares_mutex); | 8706 | static DEFINE_MUTEX(shares_mutex); |
8441 | 8707 | ||
8442 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 8708 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
@@ -8450,46 +8716,25 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8450 | if (!tg->se[0]) | 8716 | if (!tg->se[0]) |
8451 | return -EINVAL; | 8717 | return -EINVAL; |
8452 | 8718 | ||
8453 | if (shares < MIN_SHARES) | 8719 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); |
8454 | shares = MIN_SHARES; | ||
8455 | else if (shares > MAX_SHARES) | ||
8456 | shares = MAX_SHARES; | ||
8457 | 8720 | ||
8458 | mutex_lock(&shares_mutex); | 8721 | mutex_lock(&shares_mutex); |
8459 | if (tg->shares == shares) | 8722 | if (tg->shares == shares) |
8460 | goto done; | 8723 | goto done; |
8461 | 8724 | ||
8462 | spin_lock_irqsave(&task_group_lock, flags); | ||
8463 | for_each_possible_cpu(i) | ||
8464 | unregister_fair_sched_group(tg, i); | ||
8465 | list_del_rcu(&tg->siblings); | ||
8466 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8467 | |||
8468 | /* wait for any ongoing reference to this group to finish */ | ||
8469 | synchronize_sched(); | ||
8470 | |||
8471 | /* | ||
8472 | * Now we are free to modify the group's share on each cpu | ||
8473 | * w/o tripping rebalance_share or load_balance_fair. | ||
8474 | */ | ||
8475 | tg->shares = shares; | 8725 | tg->shares = shares; |
8476 | for_each_possible_cpu(i) { | 8726 | for_each_possible_cpu(i) { |
8477 | /* | 8727 | struct rq *rq = cpu_rq(i); |
8478 | * force a rebalance | 8728 | struct sched_entity *se; |
8479 | */ | 8729 | |
8480 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | 8730 | se = tg->se[i]; |
8481 | set_se_shares(tg->se[i], shares); | 8731 | /* Propagate contribution to hierarchy */ |
8732 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8733 | for_each_sched_entity(se) | ||
8734 | update_cfs_shares(group_cfs_rq(se)); | ||
8735 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8482 | } | 8736 | } |
8483 | 8737 | ||
8484 | /* | ||
8485 | * Enable load balance activity on this group, by inserting it back on | ||
8486 | * each cpu's rq->leaf_cfs_rq_list. | ||
8487 | */ | ||
8488 | spin_lock_irqsave(&task_group_lock, flags); | ||
8489 | for_each_possible_cpu(i) | ||
8490 | register_fair_sched_group(tg, i); | ||
8491 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
8492 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8493 | done: | 8738 | done: |
8494 | mutex_unlock(&shares_mutex); | 8739 | mutex_unlock(&shares_mutex); |
8495 | return 0; | 8740 | return 0; |
@@ -8624,7 +8869,7 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
8624 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 8869 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
8625 | } | 8870 | } |
8626 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8871 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8627 | unlock: | 8872 | unlock: |
8628 | read_unlock(&tasklist_lock); | 8873 | read_unlock(&tasklist_lock); |
8629 | mutex_unlock(&rt_constraints_mutex); | 8874 | mutex_unlock(&rt_constraints_mutex); |
8630 | 8875 | ||
@@ -8788,7 +9033,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8788 | 9033 | ||
8789 | if (!cgrp->parent) { | 9034 | if (!cgrp->parent) { |
8790 | /* This is early initialization for the top cgroup */ | 9035 | /* This is early initialization for the top cgroup */ |
8791 | return &init_task_group.css; | 9036 | return &root_task_group.css; |
8792 | } | 9037 | } |
8793 | 9038 | ||
8794 | parent = cgroup_tg(cgrp->parent); | 9039 | parent = cgroup_tg(cgrp->parent); |
@@ -8821,56 +9066,39 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
8821 | return 0; | 9066 | return 0; |
8822 | } | 9067 | } |
8823 | 9068 | ||
8824 | static int | 9069 | static void |
8825 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 9070 | cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
8826 | struct task_struct *tsk, bool threadgroup) | ||
8827 | { | 9071 | { |
8828 | int retval = cpu_cgroup_can_attach_task(cgrp, tsk); | 9072 | sched_move_task(tsk); |
8829 | if (retval) | ||
8830 | return retval; | ||
8831 | if (threadgroup) { | ||
8832 | struct task_struct *c; | ||
8833 | rcu_read_lock(); | ||
8834 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
8835 | retval = cpu_cgroup_can_attach_task(cgrp, c); | ||
8836 | if (retval) { | ||
8837 | rcu_read_unlock(); | ||
8838 | return retval; | ||
8839 | } | ||
8840 | } | ||
8841 | rcu_read_unlock(); | ||
8842 | } | ||
8843 | return 0; | ||
8844 | } | 9073 | } |
8845 | 9074 | ||
8846 | static void | 9075 | static void |
8847 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 9076 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
8848 | struct cgroup *old_cont, struct task_struct *tsk, | 9077 | struct cgroup *old_cgrp, struct task_struct *task) |
8849 | bool threadgroup) | ||
8850 | { | 9078 | { |
8851 | sched_move_task(tsk); | 9079 | /* |
8852 | if (threadgroup) { | 9080 | * cgroup_exit() is called in the copy_process() failure path. |
8853 | struct task_struct *c; | 9081 | * Ignore this case since the task hasn't run yet; this avoids |
8854 | rcu_read_lock(); | 9082 | * trying to poke a half freed task state from generic code. |
8855 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | 9083 | */ |
8856 | sched_move_task(c); | 9084 | if (!(task->flags & PF_EXITING)) |
8857 | } | 9085 | return; |
8858 | rcu_read_unlock(); | 9086 | |
8859 | } | 9087 | sched_move_task(task); |
8860 | } | 9088 | } |
8861 | 9089 | ||
8862 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9090 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8863 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 9091 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
8864 | u64 shareval) | 9092 | u64 shareval) |
8865 | { | 9093 | { |
8866 | return sched_group_set_shares(cgroup_tg(cgrp), shareval); | 9094 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); |
8867 | } | 9095 | } |
8868 | 9096 | ||
8869 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 9097 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) |
8870 | { | 9098 | { |
8871 | struct task_group *tg = cgroup_tg(cgrp); | 9099 | struct task_group *tg = cgroup_tg(cgrp); |
8872 | 9100 | ||
8873 | return (u64) tg->shares; | 9101 | return (u64) scale_load_down(tg->shares); |
8874 | } | 9102 | } |
8875 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 9103 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8876 | 9104 | ||
@@ -8929,8 +9157,9 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8929 | .name = "cpu", | 9157 | .name = "cpu", |
8930 | .create = cpu_cgroup_create, | 9158 | .create = cpu_cgroup_create, |
8931 | .destroy = cpu_cgroup_destroy, | 9159 | .destroy = cpu_cgroup_destroy, |
8932 | .can_attach = cpu_cgroup_can_attach, | 9160 | .can_attach_task = cpu_cgroup_can_attach_task, |
8933 | .attach = cpu_cgroup_attach, | 9161 | .attach_task = cpu_cgroup_attach_task, |
9162 | .exit = cpu_cgroup_exit, | ||
8934 | .populate = cpu_cgroup_populate, | 9163 | .populate = cpu_cgroup_populate, |
8935 | .subsys_id = cpu_cgroup_subsys_id, | 9164 | .subsys_id = cpu_cgroup_subsys_id, |
8936 | .early_init = 1, | 9165 | .early_init = 1, |
@@ -9215,72 +9444,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9215 | }; | 9444 | }; |
9216 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9445 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9217 | 9446 | ||
9218 | #ifndef CONFIG_SMP | ||
9219 | |||
9220 | void synchronize_sched_expedited(void) | ||
9221 | { | ||
9222 | barrier(); | ||
9223 | } | ||
9224 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9225 | |||
9226 | #else /* #ifndef CONFIG_SMP */ | ||
9227 | |||
9228 | static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); | ||
9229 | |||
9230 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
9231 | { | ||
9232 | /* | ||
9233 | * There must be a full memory barrier on each affected CPU | ||
9234 | * between the time that try_stop_cpus() is called and the | ||
9235 | * time that it returns. | ||
9236 | * | ||
9237 | * In the current initial implementation of cpu_stop, the | ||
9238 | * above condition is already met when the control reaches | ||
9239 | * this point and the following smp_mb() is not strictly | ||
9240 | * necessary. Do smp_mb() anyway for documentation and | ||
9241 | * robustness against future implementation changes. | ||
9242 | */ | ||
9243 | smp_mb(); /* See above comment block. */ | ||
9244 | return 0; | ||
9245 | } | ||
9246 | |||
9247 | /* | ||
9248 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
9249 | * approach to force grace period to end quickly. This consumes | ||
9250 | * significant time on all CPUs, and is thus not recommended for | ||
9251 | * any sort of common-case code. | ||
9252 | * | ||
9253 | * Note that it is illegal to call this function while holding any | ||
9254 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
9255 | * observe this restriction will result in deadlock. | ||
9256 | */ | ||
9257 | void synchronize_sched_expedited(void) | ||
9258 | { | ||
9259 | int snap, trycount = 0; | ||
9260 | |||
9261 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
9262 | snap = atomic_read(&synchronize_sched_expedited_count) + 1; | ||
9263 | get_online_cpus(); | ||
9264 | while (try_stop_cpus(cpu_online_mask, | ||
9265 | synchronize_sched_expedited_cpu_stop, | ||
9266 | NULL) == -EAGAIN) { | ||
9267 | put_online_cpus(); | ||
9268 | if (trycount++ < 10) | ||
9269 | udelay(trycount * num_online_cpus()); | ||
9270 | else { | ||
9271 | synchronize_sched(); | ||
9272 | return; | ||
9273 | } | ||
9274 | if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { | ||
9275 | smp_mb(); /* ensure test happens before caller kfree */ | ||
9276 | return; | ||
9277 | } | ||
9278 | get_online_cpus(); | ||
9279 | } | ||
9280 | atomic_inc(&synchronize_sched_expedited_count); | ||
9281 | smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ | ||
9282 | put_online_cpus(); | ||
9283 | } | ||
9284 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9285 | |||
9286 | #endif /* #else #ifndef CONFIG_SMP */ | ||
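The comment in the block removed above spells out how synchronize_sched_expedited() is meant to be used: it trades CPU time on every online CPU for a quickly completed rcu-sched grace period, and it must not be called while holding any lock taken by a CPU-hotplug notifier. A hedged usage sketch follows; struct my_obj and unlink_obj() are hypothetical stand-ins, and only synchronize_sched_expedited() and kfree() are real kernel APIs.

/* Hypothetical writer-side teardown path, shown for illustration only. */
static void remove_and_free(struct my_obj *obj)
{
	unlink_obj(obj);		/* make the object unreachable to new readers */
	synchronize_sched_expedited();	/* wait out preempt-disabled/rcu-sched readers */
	kfree(obj);			/* now nobody can still hold a reference */
}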
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c new file mode 100644 index 000000000000..429242f3c484 --- /dev/null +++ b/kernel/sched_autogroup.c | |||
@@ -0,0 +1,275 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
2 | |||
3 | #include <linux/proc_fs.h> | ||
4 | #include <linux/seq_file.h> | ||
5 | #include <linux/kallsyms.h> | ||
6 | #include <linux/utsname.h> | ||
7 | |||
8 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | ||
9 | static struct autogroup autogroup_default; | ||
10 | static atomic_t autogroup_seq_nr; | ||
11 | |||
12 | static void __init autogroup_init(struct task_struct *init_task) | ||
13 | { | ||
14 | autogroup_default.tg = &root_task_group; | ||
15 | kref_init(&autogroup_default.kref); | ||
16 | init_rwsem(&autogroup_default.lock); | ||
17 | init_task->signal->autogroup = &autogroup_default; | ||
18 | } | ||
19 | |||
20 | static inline void autogroup_free(struct task_group *tg) | ||
21 | { | ||
22 | kfree(tg->autogroup); | ||
23 | } | ||
24 | |||
25 | static inline void autogroup_destroy(struct kref *kref) | ||
26 | { | ||
27 | struct autogroup *ag = container_of(kref, struct autogroup, kref); | ||
28 | |||
29 | #ifdef CONFIG_RT_GROUP_SCHED | ||
30 | /* We've redirected RT tasks to the root task group... */ | ||
31 | ag->tg->rt_se = NULL; | ||
32 | ag->tg->rt_rq = NULL; | ||
33 | #endif | ||
34 | sched_destroy_group(ag->tg); | ||
35 | } | ||
36 | |||
37 | static inline void autogroup_kref_put(struct autogroup *ag) | ||
38 | { | ||
39 | kref_put(&ag->kref, autogroup_destroy); | ||
40 | } | ||
41 | |||
42 | static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) | ||
43 | { | ||
44 | kref_get(&ag->kref); | ||
45 | return ag; | ||
46 | } | ||
47 | |||
48 | static inline struct autogroup *autogroup_task_get(struct task_struct *p) | ||
49 | { | ||
50 | struct autogroup *ag; | ||
51 | unsigned long flags; | ||
52 | |||
53 | if (!lock_task_sighand(p, &flags)) | ||
54 | return autogroup_kref_get(&autogroup_default); | ||
55 | |||
56 | ag = autogroup_kref_get(p->signal->autogroup); | ||
57 | unlock_task_sighand(p, &flags); | ||
58 | |||
59 | return ag; | ||
60 | } | ||
61 | |||
62 | #ifdef CONFIG_RT_GROUP_SCHED | ||
63 | static void free_rt_sched_group(struct task_group *tg); | ||
64 | #endif | ||
65 | |||
66 | static inline struct autogroup *autogroup_create(void) | ||
67 | { | ||
68 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); | ||
69 | struct task_group *tg; | ||
70 | |||
71 | if (!ag) | ||
72 | goto out_fail; | ||
73 | |||
74 | tg = sched_create_group(&root_task_group); | ||
75 | |||
76 | if (IS_ERR(tg)) | ||
77 | goto out_free; | ||
78 | |||
79 | kref_init(&ag->kref); | ||
80 | init_rwsem(&ag->lock); | ||
81 | ag->id = atomic_inc_return(&autogroup_seq_nr); | ||
82 | ag->tg = tg; | ||
83 | #ifdef CONFIG_RT_GROUP_SCHED | ||
84 | /* | ||
85 | * Autogroup RT tasks are redirected to the root task group | ||
86 | * so we don't have to move tasks around upon policy change, | ||
87 | * or flail around trying to allocate bandwidth on the fly. | ||
88 | * A bandwidth exception in __sched_setscheduler() allows | ||
89 | * the policy change to proceed. Thereafter, task_group() | ||
90 | * returns &root_task_group, so zero bandwidth is required. | ||
91 | */ | ||
92 | free_rt_sched_group(tg); | ||
93 | tg->rt_se = root_task_group.rt_se; | ||
94 | tg->rt_rq = root_task_group.rt_rq; | ||
95 | #endif | ||
96 | tg->autogroup = ag; | ||
97 | |||
98 | return ag; | ||
99 | |||
100 | out_free: | ||
101 | kfree(ag); | ||
102 | out_fail: | ||
103 | if (printk_ratelimit()) { | ||
104 | printk(KERN_WARNING "autogroup_create: %s failure.\n", | ||
105 | ag ? "sched_create_group()" : "kmalloc()"); | ||
106 | } | ||
107 | |||
108 | return autogroup_kref_get(&autogroup_default); | ||
109 | } | ||
110 | |||
111 | static inline bool | ||
112 | task_wants_autogroup(struct task_struct *p, struct task_group *tg) | ||
113 | { | ||
114 | if (tg != &root_task_group) | ||
115 | return false; | ||
116 | |||
117 | if (p->sched_class != &fair_sched_class) | ||
118 | return false; | ||
119 | |||
120 | /* | ||
121 | * We can only assume the task group can't go away on us if | ||
122 | * autogroup_move_group() can see us on ->thread_group list. | ||
123 | */ | ||
124 | if (p->flags & PF_EXITING) | ||
125 | return false; | ||
126 | |||
127 | return true; | ||
128 | } | ||
129 | |||
130 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
131 | { | ||
132 | return !!tg->autogroup; | ||
133 | } | ||
134 | |||
135 | static inline struct task_group * | ||
136 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
137 | { | ||
138 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
139 | |||
140 | if (enabled && task_wants_autogroup(p, tg)) | ||
141 | return p->signal->autogroup->tg; | ||
142 | |||
143 | return tg; | ||
144 | } | ||
145 | |||
146 | static void | ||
147 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) | ||
148 | { | ||
149 | struct autogroup *prev; | ||
150 | struct task_struct *t; | ||
151 | unsigned long flags; | ||
152 | |||
153 | BUG_ON(!lock_task_sighand(p, &flags)); | ||
154 | |||
155 | prev = p->signal->autogroup; | ||
156 | if (prev == ag) { | ||
157 | unlock_task_sighand(p, &flags); | ||
158 | return; | ||
159 | } | ||
160 | |||
161 | p->signal->autogroup = autogroup_kref_get(ag); | ||
162 | |||
163 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) | ||
164 | goto out; | ||
165 | |||
166 | t = p; | ||
167 | do { | ||
168 | sched_move_task(t); | ||
169 | } while_each_thread(p, t); | ||
170 | |||
171 | out: | ||
172 | unlock_task_sighand(p, &flags); | ||
173 | autogroup_kref_put(prev); | ||
174 | } | ||
175 | |||
176 | /* Allocates GFP_KERNEL, cannot be called under any spinlock */ | ||
177 | void sched_autogroup_create_attach(struct task_struct *p) | ||
178 | { | ||
179 | struct autogroup *ag = autogroup_create(); | ||
180 | |||
181 | autogroup_move_group(p, ag); | ||
182 | /* drop extra reference added by autogroup_create() */ | ||
183 | autogroup_kref_put(ag); | ||
184 | } | ||
185 | EXPORT_SYMBOL(sched_autogroup_create_attach); | ||
186 | |||
187 | /* Cannot be called under siglock. Currently has no users */ | ||
188 | void sched_autogroup_detach(struct task_struct *p) | ||
189 | { | ||
190 | autogroup_move_group(p, &autogroup_default); | ||
191 | } | ||
192 | EXPORT_SYMBOL(sched_autogroup_detach); | ||
193 | |||
194 | void sched_autogroup_fork(struct signal_struct *sig) | ||
195 | { | ||
196 | sig->autogroup = autogroup_task_get(current); | ||
197 | } | ||
198 | |||
199 | void sched_autogroup_exit(struct signal_struct *sig) | ||
200 | { | ||
201 | autogroup_kref_put(sig->autogroup); | ||
202 | } | ||
203 | |||
204 | static int __init setup_autogroup(char *str) | ||
205 | { | ||
206 | sysctl_sched_autogroup_enabled = 0; | ||
207 | |||
208 | return 1; | ||
209 | } | ||
210 | |||
211 | __setup("noautogroup", setup_autogroup); | ||
212 | |||
213 | #ifdef CONFIG_PROC_FS | ||
214 | |||
215 | int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) | ||
216 | { | ||
217 | static unsigned long next = INITIAL_JIFFIES; | ||
218 | struct autogroup *ag; | ||
219 | int err; | ||
220 | |||
221 | if (*nice < -20 || *nice > 19) | ||
222 | return -EINVAL; | ||
223 | |||
224 | err = security_task_setnice(current, *nice); | ||
225 | if (err) | ||
226 | return err; | ||
227 | |||
228 | if (*nice < 0 && !can_nice(current, *nice)) | ||
229 | return -EPERM; | ||
230 | |||
231 | /* this is a heavy operation taking global locks.. */ | ||
232 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) | ||
233 | return -EAGAIN; | ||
234 | |||
235 | next = HZ / 10 + jiffies; | ||
236 | ag = autogroup_task_get(p); | ||
237 | |||
238 | down_write(&ag->lock); | ||
239 | err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); | ||
240 | if (!err) | ||
241 | ag->nice = *nice; | ||
242 | up_write(&ag->lock); | ||
243 | |||
244 | autogroup_kref_put(ag); | ||
245 | |||
246 | return err; | ||
247 | } | ||
248 | |||
249 | void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | ||
250 | { | ||
251 | struct autogroup *ag = autogroup_task_get(p); | ||
252 | |||
253 | if (!task_group_is_autogroup(ag->tg)) | ||
254 | goto out; | ||
255 | |||
256 | down_read(&ag->lock); | ||
257 | seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); | ||
258 | up_read(&ag->lock); | ||
259 | |||
260 | out: | ||
261 | autogroup_kref_put(ag); | ||
262 | } | ||
263 | #endif /* CONFIG_PROC_FS */ | ||
264 | |||
265 | #ifdef CONFIG_SCHED_DEBUG | ||
266 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | ||
267 | { | ||
268 | if (!task_group_is_autogroup(tg)) | ||
269 | return 0; | ||
270 | |||
271 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | ||
272 | } | ||
273 | #endif /* CONFIG_SCHED_DEBUG */ | ||
274 | |||
275 | #endif /* CONFIG_SCHED_AUTOGROUP */ | ||
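Lifetime in the file above is managed entirely through the kref: autogroup_create() returns holding one reference, autogroup_move_group() takes a reference on behalf of the signal struct and drops the previous group's, and sched_autogroup_exit() releases the signal's reference again. A hedged sketch of a caller tying the exported helpers together is shown below; the wrapper function is illustrative and not part of this patch.

/* Illustrative only: move a task's thread group into a fresh autogroup
 * and, later, send it back to the default group. */
static void demo_autogroup_usage(struct task_struct *p)
{
	/*
	 * Allocates with GFP_KERNEL and takes the siglock internally, so
	 * this must not be called under a spinlock (see the comment on
	 * sched_autogroup_create_attach() above).
	 */
	sched_autogroup_create_attach(p);

	/* ... p and its threads now run in their own task group ... */

	sched_autogroup_detach(p);
}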
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h new file mode 100644 index 000000000000..05577055cfca --- /dev/null +++ b/kernel/sched_autogroup.h | |||
@@ -0,0 +1,41 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
2 | |||
3 | struct autogroup { | ||
4 | /* | ||
5 | * reference doesn't mean how many thread attach to this | ||
6 | * autogroup now. It just stands for the number of task | ||
7 | * could use this autogroup. | ||
8 | */ | ||
9 | struct kref kref; | ||
10 | struct task_group *tg; | ||
11 | struct rw_semaphore lock; | ||
12 | unsigned long id; | ||
13 | int nice; | ||
14 | }; | ||
15 | |||
16 | static inline struct task_group * | ||
17 | autogroup_task_group(struct task_struct *p, struct task_group *tg); | ||
18 | |||
19 | #else /* !CONFIG_SCHED_AUTOGROUP */ | ||
20 | |||
21 | static inline void autogroup_init(struct task_struct *init_task) { } | ||
22 | static inline void autogroup_free(struct task_group *tg) { } | ||
23 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
24 | { | ||
25 | return 0; | ||
26 | } | ||
27 | |||
28 | static inline struct task_group * | ||
29 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
30 | { | ||
31 | return tg; | ||
32 | } | ||
33 | |||
34 | #ifdef CONFIG_SCHED_DEBUG | ||
35 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | ||
36 | { | ||
37 | return 0; | ||
38 | } | ||
39 | #endif | ||
40 | |||
41 | #endif /* CONFIG_SCHED_AUTOGROUP */ | ||
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 52f1a149bfb1..9d8af0b3fb64 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
79 | } | 79 | } |
80 | EXPORT_SYMBOL_GPL(sched_clock); | 80 | EXPORT_SYMBOL_GPL(sched_clock); |
81 | 81 | ||
82 | static __read_mostly int sched_clock_running; | 82 | __read_mostly int sched_clock_running; |
83 | 83 | ||
84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 84 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
85 | __read_mostly int sched_clock_stable; | 85 | __read_mostly int sched_clock_stable; |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d17dd9b..a6710a112b4f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
18 | 18 | ||
19 | static DEFINE_SPINLOCK(sched_debug_lock); | ||
20 | |||
19 | /* | 21 | /* |
20 | * This allows printing both to /proc/sched_debug and | 22 | * This allows printing both to /proc/sched_debug and |
21 | * to the console | 23 | * to the console |
@@ -54,8 +56,7 @@ static unsigned long nsec_low(unsigned long long nsec) | |||
54 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) | 56 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) |
55 | 57 | ||
56 | #ifdef CONFIG_FAIR_GROUP_SCHED | 58 | #ifdef CONFIG_FAIR_GROUP_SCHED |
57 | static void print_cfs_group_stats(struct seq_file *m, int cpu, | 59 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) |
58 | struct task_group *tg) | ||
59 | { | 60 | { |
60 | struct sched_entity *se = tg->se[cpu]; | 61 | struct sched_entity *se = tg->se[cpu]; |
61 | if (!se) | 62 | if (!se) |
@@ -87,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, | |||
87 | } | 88 | } |
88 | #endif | 89 | #endif |
89 | 90 | ||
91 | #ifdef CONFIG_CGROUP_SCHED | ||
92 | static char group_path[PATH_MAX]; | ||
93 | |||
94 | static char *task_group_path(struct task_group *tg) | ||
95 | { | ||
96 | if (autogroup_path(tg, group_path, PATH_MAX)) | ||
97 | return group_path; | ||
98 | |||
99 | /* | ||
100 | * May be NULL if the underlying cgroup isn't fully-created yet | ||
101 | */ | ||
102 | if (!tg->css.cgroup) { | ||
103 | group_path[0] = '\0'; | ||
104 | return group_path; | ||
105 | } | ||
106 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | ||
107 | return group_path; | ||
108 | } | ||
109 | #endif | ||
110 | |||
90 | static void | 111 | static void |
91 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | 112 | print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) |
92 | { | 113 | { |
@@ -109,17 +130,10 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
109 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", | 130 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
110 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 131 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
111 | #endif | 132 | #endif |
112 | |||
113 | #ifdef CONFIG_CGROUP_SCHED | 133 | #ifdef CONFIG_CGROUP_SCHED |
114 | { | 134 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
115 | char path[64]; | ||
116 | |||
117 | rcu_read_lock(); | ||
118 | cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); | ||
119 | rcu_read_unlock(); | ||
120 | SEQ_printf(m, " %s", path); | ||
121 | } | ||
122 | #endif | 135 | #endif |
136 | |||
123 | SEQ_printf(m, "\n"); | 137 | SEQ_printf(m, "\n"); |
124 | } | 138 | } |
125 | 139 | ||
@@ -138,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
138 | read_lock_irqsave(&tasklist_lock, flags); | 152 | read_lock_irqsave(&tasklist_lock, flags); |
139 | 153 | ||
140 | do_each_thread(g, p) { | 154 | do_each_thread(g, p) { |
141 | if (!p->se.on_rq || task_cpu(p) != rq_cpu) | 155 | if (!p->on_rq || task_cpu(p) != rq_cpu) |
142 | continue; | 156 | continue; |
143 | 157 | ||
144 | print_task(m, rq, p); | 158 | print_task(m, rq, p); |
@@ -147,19 +161,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
147 | read_unlock_irqrestore(&tasklist_lock, flags); | 161 | read_unlock_irqrestore(&tasklist_lock, flags); |
148 | } | 162 | } |
149 | 163 | ||
150 | #if defined(CONFIG_CGROUP_SCHED) && \ | ||
151 | (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) | ||
152 | static void task_group_path(struct task_group *tg, char *buf, int buflen) | ||
153 | { | ||
154 | /* may be NULL if the underlying cgroup isn't fully-created yet */ | ||
155 | if (!tg->css.cgroup) { | ||
156 | buf[0] = '\0'; | ||
157 | return; | ||
158 | } | ||
159 | cgroup_path(tg->css.cgroup, buf, buflen); | ||
160 | } | ||
161 | #endif | ||
162 | |||
163 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 164 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
164 | { | 165 | { |
165 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, | 166 | s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, |
@@ -168,13 +169,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
168 | struct sched_entity *last; | 169 | struct sched_entity *last; |
169 | unsigned long flags; | 170 | unsigned long flags; |
170 | 171 | ||
171 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) | 172 | #ifdef CONFIG_FAIR_GROUP_SCHED |
172 | char path[128]; | 173 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); |
173 | struct task_group *tg = cfs_rq->tg; | ||
174 | |||
175 | task_group_path(tg, path, sizeof(path)); | ||
176 | |||
177 | SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); | ||
178 | #else | 174 | #else |
179 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); | 175 | SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); |
180 | #endif | 176 | #endif |
@@ -183,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
183 | 179 | ||
184 | raw_spin_lock_irqsave(&rq->lock, flags); | 180 | raw_spin_lock_irqsave(&rq->lock, flags); |
185 | if (cfs_rq->rb_leftmost) | 181 | if (cfs_rq->rb_leftmost) |
186 | MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; | 182 | MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; |
187 | last = __pick_last_entity(cfs_rq); | 183 | last = __pick_last_entity(cfs_rq); |
188 | if (last) | 184 | if (last) |
189 | max_vruntime = last->vruntime; | 185 | max_vruntime = last->vruntime; |
@@ -202,33 +198,34 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
202 | spread0 = min_vruntime - rq0_min_vruntime; | 198 | spread0 = min_vruntime - rq0_min_vruntime; |
203 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", | 199 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", |
204 | SPLIT_NS(spread0)); | 200 | SPLIT_NS(spread0)); |
205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
207 | |||
208 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 201 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
209 | cfs_rq->nr_spread_over); | 202 | cfs_rq->nr_spread_over); |
203 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
204 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
210 | #ifdef CONFIG_FAIR_GROUP_SCHED | 205 | #ifdef CONFIG_FAIR_GROUP_SCHED |
211 | #ifdef CONFIG_SMP | 206 | #ifdef CONFIG_SMP |
212 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | 207 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", |
208 | SPLIT_NS(cfs_rq->load_avg)); | ||
209 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", | ||
210 | SPLIT_NS(cfs_rq->load_period)); | ||
211 | SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", | ||
212 | cfs_rq->load_contribution); | ||
213 | SEQ_printf(m, " .%-30s: %d\n", "load_tg", | ||
214 | atomic_read(&cfs_rq->tg->load_weight)); | ||
213 | #endif | 215 | #endif |
216 | |||
214 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 217 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
215 | #endif | 218 | #endif |
216 | } | 219 | } |
217 | 220 | ||
218 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | 221 | void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) |
219 | { | 222 | { |
220 | #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) | 223 | #ifdef CONFIG_RT_GROUP_SCHED |
221 | char path[128]; | 224 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); |
222 | struct task_group *tg = rt_rq->tg; | ||
223 | |||
224 | task_group_path(tg, path, sizeof(path)); | ||
225 | |||
226 | SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); | ||
227 | #else | 225 | #else |
228 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); | 226 | SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); |
229 | #endif | 227 | #endif |
230 | 228 | ||
231 | |||
232 | #define P(x) \ | 229 | #define P(x) \ |
233 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) | 230 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) |
234 | #define PN(x) \ | 231 | #define PN(x) \ |
@@ -243,9 +240,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | |||
243 | #undef P | 240 | #undef P |
244 | } | 241 | } |
245 | 242 | ||
243 | extern __read_mostly int sched_clock_running; | ||
244 | |||
246 | static void print_cpu(struct seq_file *m, int cpu) | 245 | static void print_cpu(struct seq_file *m, int cpu) |
247 | { | 246 | { |
248 | struct rq *rq = cpu_rq(cpu); | 247 | struct rq *rq = cpu_rq(cpu); |
248 | unsigned long flags; | ||
249 | 249 | ||
250 | #ifdef CONFIG_X86 | 250 | #ifdef CONFIG_X86 |
251 | { | 251 | { |
@@ -296,14 +296,17 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
296 | P(ttwu_count); | 296 | P(ttwu_count); |
297 | P(ttwu_local); | 297 | P(ttwu_local); |
298 | 298 | ||
299 | P(bkl_count); | ||
300 | |||
301 | #undef P | 299 | #undef P |
300 | #undef P64 | ||
302 | #endif | 301 | #endif |
302 | spin_lock_irqsave(&sched_debug_lock, flags); | ||
303 | print_cfs_stats(m, cpu); | 303 | print_cfs_stats(m, cpu); |
304 | print_rt_stats(m, cpu); | 304 | print_rt_stats(m, cpu); |
305 | 305 | ||
306 | rcu_read_lock(); | ||
306 | print_rq(m, rq, cpu); | 307 | print_rq(m, rq, cpu); |
308 | rcu_read_unlock(); | ||
309 | spin_unlock_irqrestore(&sched_debug_lock, flags); | ||
307 | } | 310 | } |
308 | 311 | ||
309 | static const char *sched_tunable_scaling_names[] = { | 312 | static const char *sched_tunable_scaling_names[] = { |
@@ -314,21 +317,42 @@ static const char *sched_tunable_scaling_names[] = { | |||
314 | 317 | ||
315 | static int sched_debug_show(struct seq_file *m, void *v) | 318 | static int sched_debug_show(struct seq_file *m, void *v) |
316 | { | 319 | { |
317 | u64 now = ktime_to_ns(ktime_get()); | 320 | u64 ktime, sched_clk, cpu_clk; |
321 | unsigned long flags; | ||
318 | int cpu; | 322 | int cpu; |
319 | 323 | ||
320 | SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", | 324 | local_irq_save(flags); |
325 | ktime = ktime_to_ns(ktime_get()); | ||
326 | sched_clk = sched_clock(); | ||
327 | cpu_clk = local_clock(); | ||
328 | local_irq_restore(flags); | ||
329 | |||
330 | SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", | ||
321 | init_utsname()->release, | 331 | init_utsname()->release, |
322 | (int)strcspn(init_utsname()->version, " "), | 332 | (int)strcspn(init_utsname()->version, " "), |
323 | init_utsname()->version); | 333 | init_utsname()->version); |
324 | 334 | ||
325 | SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); | 335 | #define P(x) \ |
336 | SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) | ||
337 | #define PN(x) \ | ||
338 | SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | ||
339 | PN(ktime); | ||
340 | PN(sched_clk); | ||
341 | PN(cpu_clk); | ||
342 | P(jiffies); | ||
343 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | ||
344 | P(sched_clock_stable); | ||
345 | #endif | ||
346 | #undef PN | ||
347 | #undef P | ||
348 | |||
349 | SEQ_printf(m, "\n"); | ||
350 | SEQ_printf(m, "sysctl_sched\n"); | ||
326 | 351 | ||
327 | #define P(x) \ | 352 | #define P(x) \ |
328 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) | 353 | SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) |
329 | #define PN(x) \ | 354 | #define PN(x) \ |
330 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | 355 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) |
331 | P(jiffies); | ||
332 | PN(sysctl_sched_latency); | 356 | PN(sysctl_sched_latency); |
333 | PN(sysctl_sched_min_granularity); | 357 | PN(sysctl_sched_min_granularity); |
334 | PN(sysctl_sched_wakeup_granularity); | 358 | PN(sysctl_sched_wakeup_granularity); |
@@ -414,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
414 | P(se.statistics.wait_count); | 438 | P(se.statistics.wait_count); |
415 | PN(se.statistics.iowait_sum); | 439 | PN(se.statistics.iowait_sum); |
416 | P(se.statistics.iowait_count); | 440 | P(se.statistics.iowait_count); |
417 | P(sched_info.bkl_count); | ||
418 | P(se.nr_migrations); | 441 | P(se.nr_migrations); |
419 | P(se.statistics.nr_migrations_cold); | 442 | P(se.statistics.nr_migrations_cold); |
420 | P(se.statistics.nr_failed_migrations_affine); | 443 | P(se.statistics.nr_failed_migrations_affine); |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e0e8d5ca3c98..334eb474af93 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -22,10 +22,11 @@ | |||
22 | 22 | ||
23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
25 | #include <linux/cpumask.h> | ||
25 | 26 | ||
26 | /* | 27 | /* |
27 | * Targeted preemption latency for CPU-bound tasks: | 28 | * Targeted preemption latency for CPU-bound tasks: |
28 | * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) | 29 | * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) |
29 | * | 30 | * |
30 | * NOTE: this latency value is not the same as the concept of | 31 | * NOTE: this latency value is not the same as the concept of |
31 | * 'timeslice length' - timeslices in CFS are of variable length | 32 | * 'timeslice length' - timeslices in CFS are of variable length |
@@ -52,7 +53,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling | |||
52 | 53 | ||
53 | /* | 54 | /* |
54 | * Minimal preemption granularity for CPU-bound tasks: | 55 | * Minimal preemption granularity for CPU-bound tasks: |
55 | * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) | 56 | * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) |
56 | */ | 57 | */ |
57 | unsigned int sysctl_sched_min_granularity = 750000ULL; | 58 | unsigned int sysctl_sched_min_granularity = 750000ULL; |
58 | unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; | 59 | unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; |
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8; | |||
69 | unsigned int sysctl_sched_child_runs_first __read_mostly; | 70 | unsigned int sysctl_sched_child_runs_first __read_mostly; |
70 | 71 | ||
71 | /* | 72 | /* |
72 | * sys_sched_yield() compat mode | ||
73 | * | ||
74 | * This option switches the aggressive yield implementation of the | ||
75 | * old scheduler back on. | ||
76 | */ | ||
77 | unsigned int __read_mostly sysctl_sched_compat_yield; | ||
78 | |||
79 | /* | ||
80 | * SCHED_OTHER wake-up granularity. | 73 | * SCHED_OTHER wake-up granularity. |
81 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) | 74 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
82 | * | 75 | * |
@@ -89,6 +82,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; | |||
89 | 82 | ||
90 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 83 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
91 | 84 | ||
85 | /* | ||
86 | * The exponential sliding window over which load is averaged for shares | ||
87 | * distribution. | ||
88 | * (default: 10msec) | ||
89 | */ | ||
90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | ||
91 | |||
92 | static const struct sched_class fair_sched_class; | 92 | static const struct sched_class fair_sched_class; |
93 | 93 | ||
94 | /************************************************************** | 94 | /************************************************************** |
@@ -143,6 +143,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
143 | return cfs_rq->tg->cfs_rq[this_cpu]; | 143 | return cfs_rq->tg->cfs_rq[this_cpu]; |
144 | } | 144 | } |
145 | 145 | ||
146 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
147 | { | ||
148 | if (!cfs_rq->on_list) { | ||
149 | /* | ||
150 | * Ensure we either appear before our parent (if already | ||
151 | * enqueued) or force our parent to appear after us when it is | ||
152 | * enqueued. The fact that we always enqueue bottom-up | ||
153 | * reduces this to two cases. | ||
154 | */ | ||
155 | if (cfs_rq->tg->parent && | ||
156 | cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { | ||
157 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
158 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | ||
159 | } else { | ||
160 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
161 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | ||
162 | } | ||
163 | |||
164 | cfs_rq->on_list = 1; | ||
165 | } | ||
166 | } | ||
167 | |||
168 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
169 | { | ||
170 | if (cfs_rq->on_list) { | ||
171 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
172 | cfs_rq->on_list = 0; | ||
173 | } | ||
174 | } | ||
175 | |||
146 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 176 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
147 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 177 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
148 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 178 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
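list_add_leaf_cfs_rq() keeps the per-runqueue leaf list ordered child-before-parent: because entities are enqueued bottom-up, a cfs_rq whose parent is already listed is inserted at the head (ahead of the parent), otherwise it goes to the tail so the parent lands behind it later. A toy, userspace-only model of that insertion rule (the struct and function names below are invented for illustration, not kernel API):

    #include <stdio.h>

    struct toy_cfs_rq {
        const char *name;
        struct toy_cfs_rq *parent;
        int on_list;
        struct toy_cfs_rq *next;      /* singly linked leaf list for the sketch */
    };

    static struct toy_cfs_rq *leaf_head, *leaf_tail;

    static void toy_add_leaf(struct toy_cfs_rq *cfs)
    {
        if (cfs->on_list)
            return;
        if (cfs->parent && cfs->parent->on_list) {
            cfs->next = leaf_head;    /* parent already listed: go ahead of it */
            leaf_head = cfs;
            if (!leaf_tail)
                leaf_tail = cfs;
        } else {
            cfs->next = NULL;         /* parent not listed yet: append at tail */
            if (leaf_tail)
                leaf_tail->next = cfs;
            else
                leaf_head = cfs;
            leaf_tail = cfs;
        }
        cfs->on_list = 1;
    }

    int main(void)
    {
        struct toy_cfs_rq root = { "root", NULL,  0, NULL };
        struct toy_cfs_rq grp  = { "grp",  &root, 0, NULL };
        struct toy_cfs_rq leaf = { "leaf", &grp,  0, NULL };

        /* bottom-up enqueue, as enqueue_entity() does */
        toy_add_leaf(&leaf);
        toy_add_leaf(&grp);
        toy_add_leaf(&root);

        for (struct toy_cfs_rq *p = leaf_head; p; p = p->next)
            printf("%s\n", p->name);  /* prints: leaf, grp, root */
        return 0;
    }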
@@ -246,6 +276,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
246 | return &cpu_rq(this_cpu)->cfs; | 276 | return &cpu_rq(this_cpu)->cfs; |
247 | } | 277 | } |
248 | 278 | ||
279 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
280 | { | ||
281 | } | ||
282 | |||
283 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
284 | { | ||
285 | } | ||
286 | |||
249 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 287 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
250 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 288 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
251 | 289 | ||
@@ -320,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) | |||
320 | } | 358 | } |
321 | 359 | ||
322 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); | 360 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); |
361 | #ifndef CONFIG_64BIT | ||
362 | smp_wmb(); | ||
363 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
364 | #endif | ||
323 | } | 365 | } |
324 | 366 | ||
325 | /* | 367 | /* |
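On 32-bit kernels a 64-bit min_vruntime load can tear, so update_min_vruntime() now republishes the value into min_vruntime_copy behind an smp_wmb(); the matching retry-until-equal reader is added to task_waking_fair() later in this patch. A rough userspace sketch of the pattern, using C11 fences as stand-ins for smp_wmb()/smp_rmb() and glossing over the formal data-race rules:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One writer updates 'val'; a 32-bit reader could see a torn 64-bit load,
     * so the writer republishes into 'copy' after a write barrier and readers
     * retry until the two loads agree. */
    static uint64_t val, copy;

    static void publish(uint64_t v)          /* update_min_vruntime() side */
    {
        val = v;
        atomic_thread_fence(memory_order_release);   /* smp_wmb() */
        copy = val;
    }

    static uint64_t read_stable(void)        /* task_waking_fair() side */
    {
        uint64_t c, v;

        do {
            c = copy;
            atomic_thread_fence(memory_order_acquire);   /* smp_rmb() */
            v = val;
        } while (v != c);
        return v;
    }

    int main(void)
    {
        publish(123456789ULL);
        printf("%llu\n", (unsigned long long)read_stable());
        return 0;
    }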
@@ -374,7 +416,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
374 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 416 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
375 | } | 417 | } |
376 | 418 | ||
377 | static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | 419 | static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) |
378 | { | 420 | { |
379 | struct rb_node *left = cfs_rq->rb_leftmost; | 421 | struct rb_node *left = cfs_rq->rb_leftmost; |
380 | 422 | ||
@@ -384,6 +426,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
384 | return rb_entry(left, struct sched_entity, run_node); | 426 | return rb_entry(left, struct sched_entity, run_node); |
385 | } | 427 | } |
386 | 428 | ||
429 | static struct sched_entity *__pick_next_entity(struct sched_entity *se) | ||
430 | { | ||
431 | struct rb_node *next = rb_next(&se->run_node); | ||
432 | |||
433 | if (!next) | ||
434 | return NULL; | ||
435 | |||
436 | return rb_entry(next, struct sched_entity, run_node); | ||
437 | } | ||
438 | |||
439 | #ifdef CONFIG_SCHED_DEBUG | ||
387 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | 440 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
388 | { | 441 | { |
389 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); | 442 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
@@ -398,7 +451,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | |||
398 | * Scheduling class statistics methods: | 451 | * Scheduling class statistics methods: |
399 | */ | 452 | */ |
400 | 453 | ||
401 | #ifdef CONFIG_SCHED_DEBUG | ||
402 | int sched_proc_update_handler(struct ctl_table *table, int write, | 454 | int sched_proc_update_handler(struct ctl_table *table, int write, |
403 | void __user *buffer, size_t *lenp, | 455 | void __user *buffer, size_t *lenp, |
404 | loff_t *ppos) | 456 | loff_t *ppos) |
@@ -417,7 +469,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, | |||
417 | WRT_SYSCTL(sched_min_granularity); | 469 | WRT_SYSCTL(sched_min_granularity); |
418 | WRT_SYSCTL(sched_latency); | 470 | WRT_SYSCTL(sched_latency); |
419 | WRT_SYSCTL(sched_wakeup_granularity); | 471 | WRT_SYSCTL(sched_wakeup_granularity); |
420 | WRT_SYSCTL(sched_shares_ratelimit); | ||
421 | #undef WRT_SYSCTL | 472 | #undef WRT_SYSCTL |
422 | 473 | ||
423 | return 0; | 474 | return 0; |
@@ -495,6 +546,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
495 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 546 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
496 | } | 547 | } |
497 | 548 | ||
549 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | ||
550 | static void update_cfs_shares(struct cfs_rq *cfs_rq); | ||
551 | |||
498 | /* | 552 | /* |
499 | * Update the current task's runtime statistics. Skip current tasks that | 553 | * Update the current task's runtime statistics. Skip current tasks that |
500 | * are not in our scheduling class. | 554 | * are not in our scheduling class. |
@@ -514,12 +568,16 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
514 | 568 | ||
515 | curr->vruntime += delta_exec_weighted; | 569 | curr->vruntime += delta_exec_weighted; |
516 | update_min_vruntime(cfs_rq); | 570 | update_min_vruntime(cfs_rq); |
571 | |||
572 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
573 | cfs_rq->load_unacc_exec_time += delta_exec; | ||
574 | #endif | ||
517 | } | 575 | } |
518 | 576 | ||
519 | static void update_curr(struct cfs_rq *cfs_rq) | 577 | static void update_curr(struct cfs_rq *cfs_rq) |
520 | { | 578 | { |
521 | struct sched_entity *curr = cfs_rq->curr; | 579 | struct sched_entity *curr = cfs_rq->curr; |
522 | u64 now = rq_of(cfs_rq)->clock; | 580 | u64 now = rq_of(cfs_rq)->clock_task; |
523 | unsigned long delta_exec; | 581 | unsigned long delta_exec; |
524 | 582 | ||
525 | if (unlikely(!curr)) | 583 | if (unlikely(!curr)) |
@@ -602,7 +660,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
602 | /* | 660 | /* |
603 | * We are starting a new run period: | 661 | * We are starting a new run period: |
604 | */ | 662 | */ |
605 | se->exec_start = rq_of(cfs_rq)->clock; | 663 | se->exec_start = rq_of(cfs_rq)->clock_task; |
606 | } | 664 | } |
607 | 665 | ||
608 | /************************************************** | 666 | /************************************************** |
@@ -633,7 +691,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
633 | list_add(&se->group_node, &cfs_rq->tasks); | 691 | list_add(&se->group_node, &cfs_rq->tasks); |
634 | } | 692 | } |
635 | cfs_rq->nr_running++; | 693 | cfs_rq->nr_running++; |
636 | se->on_rq = 1; | ||
637 | } | 694 | } |
638 | 695 | ||
639 | static void | 696 | static void |
@@ -647,9 +704,164 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
647 | list_del_init(&se->group_node); | 704 | list_del_init(&se->group_node); |
648 | } | 705 | } |
649 | cfs_rq->nr_running--; | 706 | cfs_rq->nr_running--; |
650 | se->on_rq = 0; | ||
651 | } | 707 | } |
652 | 708 | ||
709 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
710 | # ifdef CONFIG_SMP | ||
711 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | ||
712 | int global_update) | ||
713 | { | ||
714 | struct task_group *tg = cfs_rq->tg; | ||
715 | long load_avg; | ||
716 | |||
717 | load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); | ||
718 | load_avg -= cfs_rq->load_contribution; | ||
719 | |||
720 | if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { | ||
721 | atomic_add(load_avg, &tg->load_weight); | ||
722 | cfs_rq->load_contribution += load_avg; | ||
723 | } | ||
724 | } | ||
725 | |||
726 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
727 | { | ||
728 | u64 period = sysctl_sched_shares_window; | ||
729 | u64 now, delta; | ||
730 | unsigned long load = cfs_rq->load.weight; | ||
731 | |||
732 | if (cfs_rq->tg == &root_task_group) | ||
733 | return; | ||
734 | |||
735 | now = rq_of(cfs_rq)->clock_task; | ||
736 | delta = now - cfs_rq->load_stamp; | ||
737 | |||
738 | /* truncate load history at 4 idle periods */ | ||
739 | if (cfs_rq->load_stamp > cfs_rq->load_last && | ||
740 | now - cfs_rq->load_last > 4 * period) { | ||
741 | cfs_rq->load_period = 0; | ||
742 | cfs_rq->load_avg = 0; | ||
743 | delta = period - 1; | ||
744 | } | ||
745 | |||
746 | cfs_rq->load_stamp = now; | ||
747 | cfs_rq->load_unacc_exec_time = 0; | ||
748 | cfs_rq->load_period += delta; | ||
749 | if (load) { | ||
750 | cfs_rq->load_last = now; | ||
751 | cfs_rq->load_avg += delta * load; | ||
752 | } | ||
753 | |||
754 | /* consider updating load contribution on each fold or truncate */ | ||
755 | if (global_update || cfs_rq->load_period > period | ||
756 | || !cfs_rq->load_period) | ||
757 | update_cfs_rq_load_contribution(cfs_rq, global_update); | ||
758 | |||
759 | while (cfs_rq->load_period > period) { | ||
760 | /* | ||
761 | * Inline assembly required to prevent the compiler | ||
762 | * optimising this loop into a divmod call. | ||
763 | * See __iter_div_u64_rem() for another example of this. | ||
764 | */ | ||
765 | asm("" : "+rm" (cfs_rq->load_period)); | ||
766 | cfs_rq->load_period /= 2; | ||
767 | cfs_rq->load_avg /= 2; | ||
768 | } | ||
769 | |||
770 | if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) | ||
771 | list_del_leaf_cfs_rq(cfs_rq); | ||
772 | } | ||
773 | |||
774 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | ||
775 | { | ||
776 | long load_weight, load, shares; | ||
777 | |||
778 | load = cfs_rq->load.weight; | ||
779 | |||
780 | load_weight = atomic_read(&tg->load_weight); | ||
781 | load_weight += load; | ||
782 | load_weight -= cfs_rq->load_contribution; | ||
783 | |||
784 | shares = (tg->shares * load); | ||
785 | if (load_weight) | ||
786 | shares /= load_weight; | ||
787 | |||
788 | if (shares < MIN_SHARES) | ||
789 | shares = MIN_SHARES; | ||
790 | if (shares > tg->shares) | ||
791 | shares = tg->shares; | ||
792 | |||
793 | return shares; | ||
794 | } | ||
795 | |||
796 | static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
797 | { | ||
798 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | ||
799 | update_cfs_load(cfs_rq, 0); | ||
800 | update_cfs_shares(cfs_rq); | ||
801 | } | ||
802 | } | ||
803 | # else /* CONFIG_SMP */ | ||
804 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
805 | { | ||
806 | } | ||
807 | |||
808 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | ||
809 | { | ||
810 | return tg->shares; | ||
811 | } | ||
812 | |||
813 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
814 | { | ||
815 | } | ||
816 | # endif /* CONFIG_SMP */ | ||
817 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
818 | unsigned long weight) | ||
819 | { | ||
820 | if (se->on_rq) { | ||
821 | /* commit outstanding execution time */ | ||
822 | if (cfs_rq->curr == se) | ||
823 | update_curr(cfs_rq); | ||
824 | account_entity_dequeue(cfs_rq, se); | ||
825 | } | ||
826 | |||
827 | update_load_set(&se->load, weight); | ||
828 | |||
829 | if (se->on_rq) | ||
830 | account_entity_enqueue(cfs_rq, se); | ||
831 | } | ||
832 | |||
833 | static void update_cfs_shares(struct cfs_rq *cfs_rq) | ||
834 | { | ||
835 | struct task_group *tg; | ||
836 | struct sched_entity *se; | ||
837 | long shares; | ||
838 | |||
839 | tg = cfs_rq->tg; | ||
840 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | ||
841 | if (!se) | ||
842 | return; | ||
843 | #ifndef CONFIG_SMP | ||
844 | if (likely(se->load.weight == tg->shares)) | ||
845 | return; | ||
846 | #endif | ||
847 | shares = calc_cfs_shares(cfs_rq, tg); | ||
848 | |||
849 | reweight_entity(cfs_rq_of(se), se, shares); | ||
850 | } | ||
851 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
852 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
853 | { | ||
854 | } | ||
855 | |||
856 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | ||
857 | { | ||
858 | } | ||
859 | |||
860 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
861 | { | ||
862 | } | ||
863 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
864 | |||
653 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 865 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
654 | { | 866 | { |
655 | #ifdef CONFIG_SCHEDSTATS | 867 | #ifdef CONFIG_SCHEDSTATS |
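calc_cfs_shares() gives each per-CPU cfs_rq a slice of the group's shares proportional to its portion of the group load, after swapping the runqueue's stale load_contribution for its instantaneous weight, and clamps the result to [MIN_SHARES, tg->shares]; update_cfs_load() feeds it by folding runnable load into load_avg over sysctl_sched_shares_window-sized periods, halving both the period and the average when a window rolls over. A standalone rework of the shares arithmetic with invented load figures (MIN_SHARES is assumed to be 2 here):

    #include <stdio.h>

    #define MIN_SHARES 2

    /* shares = tg_shares * rq_load / (group_load - stale_contrib + rq_load) */
    static long calc_cfs_shares(long tg_shares, long group_load_weight,
                                long load_contribution, long rq_load)
    {
        long load_weight = group_load_weight + rq_load - load_contribution;
        long shares = tg_shares * rq_load;

        if (load_weight)
            shares /= load_weight;
        if (shares < MIN_SHARES)
            shares = MIN_SHARES;
        if (shares > tg_shares)
            shares = tg_shares;
        return shares;
    }

    int main(void)
    {
        /* group has 1024 shares; this CPU carries 3072 of ~6144 total load */
        printf("%ld\n", calc_cfs_shares(1024, 6144, 3072, 3072));  /* -> 512 */
        return 0;
    }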
@@ -771,7 +983,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
771 | * Update run-time statistics of the 'current'. | 983 | * Update run-time statistics of the 'current'. |
772 | */ | 984 | */ |
773 | update_curr(cfs_rq); | 985 | update_curr(cfs_rq); |
986 | update_cfs_load(cfs_rq, 0); | ||
774 | account_entity_enqueue(cfs_rq, se); | 987 | account_entity_enqueue(cfs_rq, se); |
988 | update_cfs_shares(cfs_rq); | ||
775 | 989 | ||
776 | if (flags & ENQUEUE_WAKEUP) { | 990 | if (flags & ENQUEUE_WAKEUP) { |
777 | place_entity(cfs_rq, se, 0); | 991 | place_entity(cfs_rq, se, 0); |
@@ -782,21 +996,55 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
782 | check_spread(cfs_rq, se); | 996 | check_spread(cfs_rq, se); |
783 | if (se != cfs_rq->curr) | 997 | if (se != cfs_rq->curr) |
784 | __enqueue_entity(cfs_rq, se); | 998 | __enqueue_entity(cfs_rq, se); |
999 | se->on_rq = 1; | ||
1000 | |||
1001 | if (cfs_rq->nr_running == 1) | ||
1002 | list_add_leaf_cfs_rq(cfs_rq); | ||
1003 | } | ||
1004 | |||
1005 | static void __clear_buddies_last(struct sched_entity *se) | ||
1006 | { | ||
1007 | for_each_sched_entity(se) { | ||
1008 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1009 | if (cfs_rq->last == se) | ||
1010 | cfs_rq->last = NULL; | ||
1011 | else | ||
1012 | break; | ||
1013 | } | ||
785 | } | 1014 | } |
786 | 1015 | ||
787 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1016 | static void __clear_buddies_next(struct sched_entity *se) |
788 | { | 1017 | { |
789 | if (!se || cfs_rq->last == se) | 1018 | for_each_sched_entity(se) { |
790 | cfs_rq->last = NULL; | 1019 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1020 | if (cfs_rq->next == se) | ||
1021 | cfs_rq->next = NULL; | ||
1022 | else | ||
1023 | break; | ||
1024 | } | ||
1025 | } | ||
791 | 1026 | ||
792 | if (!se || cfs_rq->next == se) | 1027 | static void __clear_buddies_skip(struct sched_entity *se) |
793 | cfs_rq->next = NULL; | 1028 | { |
1029 | for_each_sched_entity(se) { | ||
1030 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1031 | if (cfs_rq->skip == se) | ||
1032 | cfs_rq->skip = NULL; | ||
1033 | else | ||
1034 | break; | ||
1035 | } | ||
794 | } | 1036 | } |
795 | 1037 | ||
796 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1038 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
797 | { | 1039 | { |
798 | for_each_sched_entity(se) | 1040 | if (cfs_rq->last == se) |
799 | __clear_buddies(cfs_rq_of(se), se); | 1041 | __clear_buddies_last(se); |
1042 | |||
1043 | if (cfs_rq->next == se) | ||
1044 | __clear_buddies_next(se); | ||
1045 | |||
1046 | if (cfs_rq->skip == se) | ||
1047 | __clear_buddies_skip(se); | ||
800 | } | 1048 | } |
801 | 1049 | ||
802 | static void | 1050 | static void |
@@ -825,8 +1073,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
825 | 1073 | ||
826 | if (se != cfs_rq->curr) | 1074 | if (se != cfs_rq->curr) |
827 | __dequeue_entity(cfs_rq, se); | 1075 | __dequeue_entity(cfs_rq, se); |
1076 | se->on_rq = 0; | ||
1077 | update_cfs_load(cfs_rq, 0); | ||
828 | account_entity_dequeue(cfs_rq, se); | 1078 | account_entity_dequeue(cfs_rq, se); |
829 | update_min_vruntime(cfs_rq); | ||
830 | 1079 | ||
831 | /* | 1080 | /* |
832 | * Normalize the entity after updating the min_vruntime because the | 1081 | * Normalize the entity after updating the min_vruntime because the |
@@ -835,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
835 | */ | 1084 | */ |
836 | if (!(flags & DEQUEUE_SLEEP)) | 1085 | if (!(flags & DEQUEUE_SLEEP)) |
837 | se->vruntime -= cfs_rq->min_vruntime; | 1086 | se->vruntime -= cfs_rq->min_vruntime; |
1087 | |||
1088 | update_min_vruntime(cfs_rq); | ||
1089 | update_cfs_shares(cfs_rq); | ||
838 | } | 1090 | } |
839 | 1091 | ||
840 | /* | 1092 | /* |
@@ -869,9 +1121,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
869 | return; | 1121 | return; |
870 | 1122 | ||
871 | if (cfs_rq->nr_running > 1) { | 1123 | if (cfs_rq->nr_running > 1) { |
872 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1124 | struct sched_entity *se = __pick_first_entity(cfs_rq); |
873 | s64 delta = curr->vruntime - se->vruntime; | 1125 | s64 delta = curr->vruntime - se->vruntime; |
874 | 1126 | ||
1127 | if (delta < 0) | ||
1128 | return; | ||
1129 | |||
875 | if (delta > ideal_runtime) | 1130 | if (delta > ideal_runtime) |
876 | resched_task(rq_of(cfs_rq)->curr); | 1131 | resched_task(rq_of(cfs_rq)->curr); |
877 | } | 1132 | } |
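check_preempt_tick() now returns early when the current task still trails the leftmost entity (delta < 0); only a positive vruntime lead larger than ideal_runtime triggers resched_task(), which also keeps a negative delta from being misread in what is otherwise an unsigned comparison. A small worked sketch of just that check (the helper name and the nanosecond figures are invented):

    #include <stdio.h>
    #include <stdint.h>

    /* Preempt 'curr' only if it leads the leftmost entity by more than its slice. */
    static int should_resched(int64_t curr_vruntime, int64_t left_vruntime,
                              uint64_t ideal_runtime)
    {
        int64_t delta = curr_vruntime - left_vruntime;

        if (delta < 0)                 /* curr is behind the leftmost: keep it */
            return 0;
        /* without the guard above, a negative delta would look huge here */
        return (uint64_t)delta > ideal_runtime;
    }

    int main(void)
    {
        printf("%d\n", should_resched(10000000, 4000000, 3000000)); /* 1: leads by 6ms */
        printf("%d\n", should_resched(4000000, 10000000, 3000000)); /* 0: behind */
        return 0;
    }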
@@ -910,13 +1165,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
910 | static int | 1165 | static int |
911 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | 1166 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); |
912 | 1167 | ||
1168 | /* | ||
1169 | * Pick the next process, keeping these things in mind, in this order: | ||
1170 | * 1) keep things fair between processes/task groups | ||
1171 | * 2) pick the "next" process, since someone really wants that to run | ||
1172 | * 3) pick the "last" process, for cache locality | ||
1173 | * 4) do not run the "skip" process, if something else is available | ||
1174 | */ | ||
913 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 1175 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
914 | { | 1176 | { |
915 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1177 | struct sched_entity *se = __pick_first_entity(cfs_rq); |
916 | struct sched_entity *left = se; | 1178 | struct sched_entity *left = se; |
917 | 1179 | ||
918 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | 1180 | /* |
919 | se = cfs_rq->next; | 1181 | * Avoid running the skip buddy, if running something else can |
1182 | * be done without getting too unfair. | ||
1183 | */ | ||
1184 | if (cfs_rq->skip == se) { | ||
1185 | struct sched_entity *second = __pick_next_entity(se); | ||
1186 | if (second && wakeup_preempt_entity(second, left) < 1) | ||
1187 | se = second; | ||
1188 | } | ||
920 | 1189 | ||
921 | /* | 1190 | /* |
922 | * Prefer last buddy, try to return the CPU to a preempted task. | 1191 | * Prefer last buddy, try to return the CPU to a preempted task. |
@@ -924,6 +1193,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
924 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) | 1193 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) |
925 | se = cfs_rq->last; | 1194 | se = cfs_rq->last; |
926 | 1195 | ||
1196 | /* | ||
1197 | * Someone really wants this to run. If it's not unfair, run it. | ||
1198 | */ | ||
1199 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | ||
1200 | se = cfs_rq->next; | ||
1201 | |||
927 | clear_buddies(cfs_rq, se); | 1202 | clear_buddies(cfs_rq, se); |
928 | 1203 | ||
929 | return se; | 1204 | return se; |
@@ -955,6 +1230,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
955 | */ | 1230 | */ |
956 | update_curr(cfs_rq); | 1231 | update_curr(cfs_rq); |
957 | 1232 | ||
1233 | /* | ||
1234 | * Update share accounting for long-running entities. | ||
1235 | */ | ||
1236 | update_entity_shares_tick(cfs_rq); | ||
1237 | |||
958 | #ifdef CONFIG_SCHED_HRTICK | 1238 | #ifdef CONFIG_SCHED_HRTICK |
959 | /* | 1239 | /* |
960 | * queued ticks are scheduled to match the slice, so don't bother | 1240 | * queued ticks are scheduled to match the slice, so don't bother |
@@ -1055,9 +1335,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1055 | flags = ENQUEUE_WAKEUP; | 1335 | flags = ENQUEUE_WAKEUP; |
1056 | } | 1336 | } |
1057 | 1337 | ||
1338 | for_each_sched_entity(se) { | ||
1339 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1340 | |||
1341 | update_cfs_load(cfs_rq, 0); | ||
1342 | update_cfs_shares(cfs_rq); | ||
1343 | } | ||
1344 | |||
1058 | hrtick_update(rq); | 1345 | hrtick_update(rq); |
1059 | } | 1346 | } |
1060 | 1347 | ||
1348 | static void set_next_buddy(struct sched_entity *se); | ||
1349 | |||
1061 | /* | 1350 | /* |
1062 | * The dequeue_task method is called before nr_running is | 1351 | * The dequeue_task method is called before nr_running is |
1063 | * decreased. We remove the task from the rbtree and | 1352 | * decreased. We remove the task from the rbtree and |
@@ -1067,73 +1356,56 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1067 | { | 1356 | { |
1068 | struct cfs_rq *cfs_rq; | 1357 | struct cfs_rq *cfs_rq; |
1069 | struct sched_entity *se = &p->se; | 1358 | struct sched_entity *se = &p->se; |
1359 | int task_sleep = flags & DEQUEUE_SLEEP; | ||
1070 | 1360 | ||
1071 | for_each_sched_entity(se) { | 1361 | for_each_sched_entity(se) { |
1072 | cfs_rq = cfs_rq_of(se); | 1362 | cfs_rq = cfs_rq_of(se); |
1073 | dequeue_entity(cfs_rq, se, flags); | 1363 | dequeue_entity(cfs_rq, se, flags); |
1364 | |||
1074 | /* Don't dequeue parent if it has other entities besides us */ | 1365 | /* Don't dequeue parent if it has other entities besides us */ |
1075 | if (cfs_rq->load.weight) | 1366 | if (cfs_rq->load.weight) { |
1367 | /* | ||
1368 | * Bias pick_next to pick a task from this cfs_rq, as | ||
1369 | * p is sleeping when it is within its sched_slice. | ||
1370 | */ | ||
1371 | if (task_sleep && parent_entity(se)) | ||
1372 | set_next_buddy(parent_entity(se)); | ||
1076 | break; | 1373 | break; |
1374 | } | ||
1077 | flags |= DEQUEUE_SLEEP; | 1375 | flags |= DEQUEUE_SLEEP; |
1078 | } | 1376 | } |
1079 | 1377 | ||
1080 | hrtick_update(rq); | 1378 | for_each_sched_entity(se) { |
1081 | } | 1379 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1082 | |||
1083 | /* | ||
1084 | * sched_yield() support is very simple - we dequeue and enqueue. | ||
1085 | * | ||
1086 | * If compat_yield is turned on then we requeue to the end of the tree. | ||
1087 | */ | ||
1088 | static void yield_task_fair(struct rq *rq) | ||
1089 | { | ||
1090 | struct task_struct *curr = rq->curr; | ||
1091 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
1092 | struct sched_entity *rightmost, *se = &curr->se; | ||
1093 | |||
1094 | /* | ||
1095 | * Are we the only task in the tree? | ||
1096 | */ | ||
1097 | if (unlikely(cfs_rq->nr_running == 1)) | ||
1098 | return; | ||
1099 | |||
1100 | clear_buddies(cfs_rq, se); | ||
1101 | |||
1102 | if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { | ||
1103 | update_rq_clock(rq); | ||
1104 | /* | ||
1105 | * Update run-time statistics of the 'current'. | ||
1106 | */ | ||
1107 | update_curr(cfs_rq); | ||
1108 | 1380 | ||
1109 | return; | 1381 | update_cfs_load(cfs_rq, 0); |
1382 | update_cfs_shares(cfs_rq); | ||
1110 | } | 1383 | } |
1111 | /* | ||
1112 | * Find the rightmost entry in the rbtree: | ||
1113 | */ | ||
1114 | rightmost = __pick_last_entity(cfs_rq); | ||
1115 | /* | ||
1116 | * Already in the rightmost position? | ||
1117 | */ | ||
1118 | if (unlikely(!rightmost || entity_before(rightmost, se))) | ||
1119 | return; | ||
1120 | 1384 | ||
1121 | /* | 1385 | hrtick_update(rq); |
1122 | * Minimally necessary key value to be last in the tree: | ||
1123 | * Upon rescheduling, sched_class::put_prev_task() will place | ||
1124 | * 'current' within the tree based on its new key value. | ||
1125 | */ | ||
1126 | se->vruntime = rightmost->vruntime + 1; | ||
1127 | } | 1386 | } |
1128 | 1387 | ||
1129 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP |
1130 | 1389 | ||
1131 | static void task_waking_fair(struct rq *rq, struct task_struct *p) | 1390 | static void task_waking_fair(struct task_struct *p) |
1132 | { | 1391 | { |
1133 | struct sched_entity *se = &p->se; | 1392 | struct sched_entity *se = &p->se; |
1134 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1393 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1394 | u64 min_vruntime; | ||
1135 | 1395 | ||
1136 | se->vruntime -= cfs_rq->min_vruntime; | 1396 | #ifndef CONFIG_64BIT |
1397 | u64 min_vruntime_copy; | ||
1398 | |||
1399 | do { | ||
1400 | min_vruntime_copy = cfs_rq->min_vruntime_copy; | ||
1401 | smp_rmb(); | ||
1402 | min_vruntime = cfs_rq->min_vruntime; | ||
1403 | } while (min_vruntime != min_vruntime_copy); | ||
1404 | #else | ||
1405 | min_vruntime = cfs_rq->min_vruntime; | ||
1406 | #endif | ||
1407 | |||
1408 | se->vruntime -= min_vruntime; | ||
1137 | } | 1409 | } |
1138 | 1410 | ||
1139 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1411 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -1143,67 +1415,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) | |||
1143 | * Adding load to a group doesn't make a group heavier, but can cause movement | 1415 | * Adding load to a group doesn't make a group heavier, but can cause movement |
1144 | * of group shares between cpus. Assuming the shares were perfectly aligned one | 1416 | * of group shares between cpus. Assuming the shares were perfectly aligned one |
1145 | * can calculate the shift in shares. | 1417 | * can calculate the shift in shares. |
1146 | * | ||
1147 | * The problem is that perfectly aligning the shares is rather expensive, hence | ||
1148 | * we try to avoid doing that too often - see update_shares(), which ratelimits | ||
1149 | * this change. | ||
1150 | * | ||
1151 | * We compensate this by not only taking the current delta into account, but | ||
1152 | * also considering the delta between when the shares were last adjusted and | ||
1153 | * now. | ||
1154 | * | ||
1155 | * We still saw a performance dip, some tracing learned us that between | ||
1156 | * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased | ||
1157 | * significantly. Therefore try to bias the error in direction of failing | ||
1158 | * the affine wakeup. | ||
1159 | * | ||
1160 | */ | 1418 | */ |
1161 | static long effective_load(struct task_group *tg, int cpu, | 1419 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
1162 | long wl, long wg) | ||
1163 | { | 1420 | { |
1164 | struct sched_entity *se = tg->se[cpu]; | 1421 | struct sched_entity *se = tg->se[cpu]; |
1165 | 1422 | ||
1166 | if (!tg->parent) | 1423 | if (!tg->parent) |
1167 | return wl; | 1424 | return wl; |
1168 | 1425 | ||
1169 | /* | ||
1170 | * By not taking the decrease of shares on the other cpu into | ||
1171 | * account our error leans towards reducing the affine wakeups. | ||
1172 | */ | ||
1173 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | ||
1174 | return wl; | ||
1175 | |||
1176 | for_each_sched_entity(se) { | 1426 | for_each_sched_entity(se) { |
1177 | long S, rw, s, a, b; | 1427 | long lw, w; |
1178 | long more_w; | ||
1179 | 1428 | ||
1180 | /* | 1429 | tg = se->my_q->tg; |
1181 | * Instead of using this increment, also add the difference | 1430 | w = se->my_q->load.weight; |
1182 | * between when the shares were last updated and now. | ||
1183 | */ | ||
1184 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
1185 | wl += more_w; | ||
1186 | wg += more_w; | ||
1187 | 1431 | ||
1188 | S = se->my_q->tg->shares; | 1432 | /* use this cpu's instantaneous contribution */ |
1189 | s = se->my_q->shares; | 1433 | lw = atomic_read(&tg->load_weight); |
1190 | rw = se->my_q->rq_weight; | 1434 | lw -= se->my_q->load_contribution; |
1435 | lw += w + wg; | ||
1191 | 1436 | ||
1192 | a = S*(rw + wl); | 1437 | wl += w; |
1193 | b = S*rw + s*wg; | ||
1194 | 1438 | ||
1195 | wl = s*(a-b); | 1439 | if (lw > 0 && wl < lw) |
1196 | 1440 | wl = (wl * tg->shares) / lw; | |
1197 | if (likely(b)) | 1441 | else |
1198 | wl /= b; | 1442 | wl = tg->shares; |
1199 | 1443 | ||
1200 | /* | 1444 | /* zero point is MIN_SHARES */ |
1201 | * Assume the group is already running and will | 1445 | if (wl < MIN_SHARES) |
1202 | * thus already be accounted for in the weight. | 1446 | wl = MIN_SHARES; |
1203 | * | 1447 | wl -= se->load.weight; |
1204 | * That is, moving shares between CPUs, does not | ||
1205 | * alter the group weight. | ||
1206 | */ | ||
1207 | wg = 0; | 1448 | wg = 0; |
1208 | } | 1449 | } |
1209 | 1450 | ||
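The rewritten effective_load() recomputes, level by level, what this CPU's group entity weight would become if wl extra load arrived here: the new per-CPU load times tg->shares, divided by the group-wide load_weight with this CPU's stale contribution replaced by its instantaneous load, floored at MIN_SHARES. A hedged single-level version with invented numbers (MIN_SHARES again assumed to be 2):

    #include <stdio.h>

    #define MIN_SHARES 2

    /* One level of effective_load(): how much this cpu's entity weight would
     * change if 'wl' load were added here (and 'wg' to the group overall). */
    static long level_delta(long tg_shares, long tg_load_weight,
                            long cpu_load_contribution, long cpu_queue_load,
                            long cur_se_weight, long wl, long wg)
    {
        long w = cpu_queue_load;
        long lw = tg_load_weight - cpu_load_contribution + w + wg;

        wl += w;                                  /* load this cpu would carry */
        if (lw > 0 && wl < lw)
            wl = (wl * tg_shares) / lw;           /* new entity weight */
        else
            wl = tg_shares;
        if (wl < MIN_SHARES)
            wl = MIN_SHARES;
        return wl - cur_se_weight;                /* delta fed to the parent level */
    }

    int main(void)
    {
        /* 1024-share group; group-wide load 4096, 1024 of it on this cpu.
         * Waking a weight-1024 task here lifts the entity from 256 to 512. */
        printf("%ld\n", level_delta(1024, 4096, 1024, 1024, 256, 1024, 0)); /* -> 256 */
        return 0;
    }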
@@ -1222,7 +1463,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
1222 | 1463 | ||
1223 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 1464 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
1224 | { | 1465 | { |
1225 | unsigned long this_load, load; | 1466 | s64 this_load, load; |
1226 | int idx, this_cpu, prev_cpu; | 1467 | int idx, this_cpu, prev_cpu; |
1227 | unsigned long tl_per_task; | 1468 | unsigned long tl_per_task; |
1228 | struct task_group *tg; | 1469 | struct task_group *tg; |
@@ -1261,8 +1502,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
1261 | * Otherwise check if either cpus are near enough in load to allow this | 1502 | * Otherwise check if either cpus are near enough in load to allow this |
1262 | * task to be woken on this_cpu. | 1503 | * task to be woken on this_cpu. |
1263 | */ | 1504 | */ |
1264 | if (this_load) { | 1505 | if (this_load > 0) { |
1265 | unsigned long this_eff_load, prev_eff_load; | 1506 | s64 this_eff_load, prev_eff_load; |
1266 | 1507 | ||
1267 | this_eff_load = 100; | 1508 | this_eff_load = 100; |
1268 | this_eff_load *= power_of(prev_cpu); | 1509 | this_eff_load *= power_of(prev_cpu); |
@@ -1344,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
1344 | } | 1585 | } |
1345 | 1586 | ||
1346 | /* Adjust by relative CPU power of the group */ | 1587 | /* Adjust by relative CPU power of the group */ |
1347 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 1588 | avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; |
1348 | 1589 | ||
1349 | if (local_group) { | 1590 | if (local_group) { |
1350 | this_load = avg_load; | 1591 | this_load = avg_load; |
@@ -1409,6 +1650,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1409 | /* | 1650 | /* |
1410 | * Otherwise, iterate the domains and find an eligible idle cpu. | 1651 | * Otherwise, iterate the domains and find an eligible idle cpu. |
1411 | */ | 1652 | */ |
1653 | rcu_read_lock(); | ||
1412 | for_each_domain(target, sd) { | 1654 | for_each_domain(target, sd) { |
1413 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | 1655 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) |
1414 | break; | 1656 | break; |
@@ -1428,6 +1670,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1428 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) | 1670 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) |
1429 | break; | 1671 | break; |
1430 | } | 1672 | } |
1673 | rcu_read_unlock(); | ||
1431 | 1674 | ||
1432 | return target; | 1675 | return target; |
1433 | } | 1676 | } |
@@ -1444,7 +1687,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1444 | * preempt must be disabled. | 1687 | * preempt must be disabled. |
1445 | */ | 1688 | */ |
1446 | static int | 1689 | static int |
1447 | select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) | 1690 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) |
1448 | { | 1691 | { |
1449 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 1692 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
1450 | int cpu = smp_processor_id(); | 1693 | int cpu = smp_processor_id(); |
@@ -1460,6 +1703,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1460 | new_cpu = prev_cpu; | 1703 | new_cpu = prev_cpu; |
1461 | } | 1704 | } |
1462 | 1705 | ||
1706 | rcu_read_lock(); | ||
1463 | for_each_domain(cpu, tmp) { | 1707 | for_each_domain(cpu, tmp) { |
1464 | if (!(tmp->flags & SD_LOAD_BALANCE)) | 1708 | if (!(tmp->flags & SD_LOAD_BALANCE)) |
1465 | continue; | 1709 | continue; |
@@ -1479,7 +1723,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1479 | nr_running += cpu_rq(i)->cfs.nr_running; | 1723 | nr_running += cpu_rq(i)->cfs.nr_running; |
1480 | } | 1724 | } |
1481 | 1725 | ||
1482 | capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 1726 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); |
1483 | 1727 | ||
1484 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | 1728 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) |
1485 | nr_running /= 2; | 1729 | nr_running /= 2; |
@@ -1508,28 +1752,12 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1508 | sd = tmp; | 1752 | sd = tmp; |
1509 | } | 1753 | } |
1510 | 1754 | ||
1511 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1512 | if (sched_feat(LB_SHARES_UPDATE)) { | ||
1513 | /* | ||
1514 | * Pick the largest domain to update shares over | ||
1515 | */ | ||
1516 | tmp = sd; | ||
1517 | if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) | ||
1518 | tmp = affine_sd; | ||
1519 | |||
1520 | if (tmp) { | ||
1521 | raw_spin_unlock(&rq->lock); | ||
1522 | update_shares(tmp); | ||
1523 | raw_spin_lock(&rq->lock); | ||
1524 | } | ||
1525 | } | ||
1526 | #endif | ||
1527 | |||
1528 | if (affine_sd) { | 1755 | if (affine_sd) { |
1529 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 1756 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
1530 | return select_idle_sibling(p, cpu); | 1757 | prev_cpu = cpu; |
1531 | else | 1758 | |
1532 | return select_idle_sibling(p, prev_cpu); | 1759 | new_cpu = select_idle_sibling(p, prev_cpu); |
1760 | goto unlock; | ||
1533 | } | 1761 | } |
1534 | 1762 | ||
1535 | while (sd) { | 1763 | while (sd) { |
@@ -1570,6 +1798,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1570 | } | 1798 | } |
1571 | /* while loop will break here if sd == NULL */ | 1799 | /* while loop will break here if sd == NULL */ |
1572 | } | 1800 | } |
1801 | unlock: | ||
1802 | rcu_read_unlock(); | ||
1573 | 1803 | ||
1574 | return new_cpu; | 1804 | return new_cpu; |
1575 | } | 1805 | } |
@@ -1593,10 +1823,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) | |||
1593 | * This is especially important for buddies when the leftmost | 1823 | * This is especially important for buddies when the leftmost |
1594 | * task is higher priority than the buddy. | 1824 | * task is higher priority than the buddy. |
1595 | */ | 1825 | */ |
1596 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 1826 | return calc_delta_fair(gran, se); |
1597 | gran = calc_delta_fair(gran, se); | ||
1598 | |||
1599 | return gran; | ||
1600 | } | 1827 | } |
1601 | 1828 | ||
1602 | /* | 1829 | /* |
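wakeup_gran() now unconditionally rescales the granularity into the waking task's own time via calc_delta_fair() instead of special-casing NICE_0_LOAD, so a heavier (lower-nice) waking task sees a smaller effective granularity and preempts sooner, while a lighter one sees a larger granularity. A worked example under the assumption that calc_delta_fair() scales by NICE_0_LOAD/weight and that nice +5 maps to weight 335 in the scheduler's prio_to_weight table:

    #include <stdio.h>
    #include <stdint.h>

    #define NICE_0_LOAD 1024ULL

    /* assumption: delta * NICE_0_LOAD / weight, as calc_delta_fair() does for
     * entities whose weight differs from NICE_0_LOAD */
    static uint64_t calc_delta_fair(uint64_t delta_ns, unsigned long weight)
    {
        return delta_ns * NICE_0_LOAD / weight;
    }

    int main(void)
    {
        uint64_t gran = 1000000;   /* 1 ms sysctl_sched_wakeup_granularity */

        printf("nice  0: %llu ns\n", (unsigned long long)calc_delta_fair(gran, 1024));
        printf("nice +5: %llu ns\n", (unsigned long long)calc_delta_fair(gran, 335));
        return 0;
    }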
@@ -1630,18 +1857,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | |||
1630 | 1857 | ||
1631 | static void set_last_buddy(struct sched_entity *se) | 1858 | static void set_last_buddy(struct sched_entity *se) |
1632 | { | 1859 | { |
1633 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1860 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
1634 | for_each_sched_entity(se) | 1861 | return; |
1635 | cfs_rq_of(se)->last = se; | 1862 | |
1636 | } | 1863 | for_each_sched_entity(se) |
1864 | cfs_rq_of(se)->last = se; | ||
1637 | } | 1865 | } |
1638 | 1866 | ||
1639 | static void set_next_buddy(struct sched_entity *se) | 1867 | static void set_next_buddy(struct sched_entity *se) |
1640 | { | 1868 | { |
1641 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1869 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
1642 | for_each_sched_entity(se) | 1870 | return; |
1643 | cfs_rq_of(se)->next = se; | 1871 | |
1644 | } | 1872 | for_each_sched_entity(se) |
1873 | cfs_rq_of(se)->next = se; | ||
1874 | } | ||
1875 | |||
1876 | static void set_skip_buddy(struct sched_entity *se) | ||
1877 | { | ||
1878 | for_each_sched_entity(se) | ||
1879 | cfs_rq_of(se)->skip = se; | ||
1645 | } | 1880 | } |
1646 | 1881 | ||
1647 | /* | 1882 | /* |
@@ -1653,18 +1888,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1653 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1888 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1654 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1889 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1655 | int scale = cfs_rq->nr_running >= sched_nr_latency; | 1890 | int scale = cfs_rq->nr_running >= sched_nr_latency; |
1891 | int next_buddy_marked = 0; | ||
1656 | 1892 | ||
1657 | if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) | 1893 | if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) |
1658 | goto preempt; | 1894 | goto preempt; |
1659 | 1895 | ||
1660 | if (unlikely(p->sched_class != &fair_sched_class)) | ||
1661 | return; | ||
1662 | |||
1663 | if (unlikely(se == pse)) | 1896 | if (unlikely(se == pse)) |
1664 | return; | 1897 | return; |
1665 | 1898 | ||
1666 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) | 1899 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
1667 | set_next_buddy(pse); | 1900 | set_next_buddy(pse); |
1901 | next_buddy_marked = 1; | ||
1902 | } | ||
1668 | 1903 | ||
1669 | /* | 1904 | /* |
1670 | * We can come here with TIF_NEED_RESCHED already set from new task | 1905 | * We can come here with TIF_NEED_RESCHED already set from new task |
@@ -1673,16 +1908,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1673 | if (test_tsk_need_resched(curr)) | 1908 | if (test_tsk_need_resched(curr)) |
1674 | return; | 1909 | return; |
1675 | 1910 | ||
1911 | /* Idle tasks are by definition preempted by non-idle tasks. */ | ||
1912 | if (unlikely(curr->policy == SCHED_IDLE) && | ||
1913 | likely(p->policy != SCHED_IDLE)) | ||
1914 | goto preempt; | ||
1915 | |||
1676 | /* | 1916 | /* |
1677 | * Batch and idle tasks do not preempt (their preemption is driven by | 1917 | * Batch and idle tasks do not preempt non-idle tasks (their preemption |
1678 | * the tick): | 1918 | * is driven by the tick): |
1679 | */ | 1919 | */ |
1680 | if (unlikely(p->policy != SCHED_NORMAL)) | 1920 | if (unlikely(p->policy != SCHED_NORMAL)) |
1681 | return; | 1921 | return; |
1682 | 1922 | ||
1683 | /* Idle tasks are by definition preempted by everybody. */ | ||
1684 | if (unlikely(curr->policy == SCHED_IDLE)) | ||
1685 | goto preempt; | ||
1686 | 1923 | ||
1687 | if (!sched_feat(WAKEUP_PREEMPT)) | 1924 | if (!sched_feat(WAKEUP_PREEMPT)) |
1688 | return; | 1925 | return; |
@@ -1690,8 +1927,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1690 | update_curr(cfs_rq); | 1927 | update_curr(cfs_rq); |
1691 | find_matching_se(&se, &pse); | 1928 | find_matching_se(&se, &pse); |
1692 | BUG_ON(!pse); | 1929 | BUG_ON(!pse); |
1693 | if (wakeup_preempt_entity(se, pse) == 1) | 1930 | if (wakeup_preempt_entity(se, pse) == 1) { |
1931 | /* | ||
1932 | * Bias pick_next to pick the sched entity that is | ||
1933 | * triggering this preemption. | ||
1934 | */ | ||
1935 | if (!next_buddy_marked) | ||
1936 | set_next_buddy(pse); | ||
1694 | goto preempt; | 1937 | goto preempt; |
1938 | } | ||
1695 | 1939 | ||
1696 | return; | 1940 | return; |
1697 | 1941 | ||
@@ -1748,6 +1992,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1748 | } | 1992 | } |
1749 | } | 1993 | } |
1750 | 1994 | ||
1995 | /* | ||
1996 | * sched_yield() is very simple | ||
1997 | * | ||
1998 | * The magic of dealing with the ->skip buddy is in pick_next_entity. | ||
1999 | */ | ||
2000 | static void yield_task_fair(struct rq *rq) | ||
2001 | { | ||
2002 | struct task_struct *curr = rq->curr; | ||
2003 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
2004 | struct sched_entity *se = &curr->se; | ||
2005 | |||
2006 | /* | ||
2007 | * Are we the only task in the tree? | ||
2008 | */ | ||
2009 | if (unlikely(rq->nr_running == 1)) | ||
2010 | return; | ||
2011 | |||
2012 | clear_buddies(cfs_rq, se); | ||
2013 | |||
2014 | if (curr->policy != SCHED_BATCH) { | ||
2015 | update_rq_clock(rq); | ||
2016 | /* | ||
2017 | * Update run-time statistics of the 'current'. | ||
2018 | */ | ||
2019 | update_curr(cfs_rq); | ||
2020 | } | ||
2021 | |||
2022 | set_skip_buddy(se); | ||
2023 | } | ||
2024 | |||
2025 | static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) | ||
2026 | { | ||
2027 | struct sched_entity *se = &p->se; | ||
2028 | |||
2029 | if (!se->on_rq) | ||
2030 | return false; | ||
2031 | |||
2032 | /* Tell the scheduler that we'd really like pse to run next. */ | ||
2033 | set_next_buddy(se); | ||
2034 | |||
2035 | yield_task_fair(rq); | ||
2036 | |||
2037 | return true; | ||
2038 | } | ||
2039 | |||
1751 | #ifdef CONFIG_SMP | 2040 | #ifdef CONFIG_SMP |
1752 | /************************************************** | 2041 | /************************************************** |
1753 | * Fair scheduling class load-balancing methods: | 2042 | * Fair scheduling class load-balancing methods: |
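CFS sched_yield() is now reduced to marking the caller as the skip buddy and letting pick_next_entity() route around it, while the new yield_to_task_fair() additionally plants a next-buddy hint on the target before yielding. A toy model of how those hints steer the pick (the names and the simplified fairness check below are illustrative, not the kernel code paths):

    #include <stdio.h>
    #include <string.h>

    /* Toy pick_next_entity(): avoid the skip hint, prefer the next hint, then
     * the last hint, else the leftmost task.  fair_enough() stands in for the
     * wakeup_preempt_entity() < 1 bound in the real code. */
    struct toy_rq {
        const char *queue[4];      /* tasks ordered by vruntime, leftmost first */
        int n;
        const char *next, *last, *skip;
    };

    static int fair_enough(const char *cand, const char *leftmost)
    {
        (void)cand; (void)leftmost;
        return 1;                  /* sketch: assume the buddy is within bounds */
    }

    static const char *toy_pick(struct toy_rq *rq)
    {
        const char *se = rq->queue[0];

        if (rq->skip && se && strcmp(se, rq->skip) == 0 && rq->n > 1)
            se = rq->queue[1];                     /* second leftmost */
        if (rq->last && fair_enough(rq->last, rq->queue[0]))
            se = rq->last;
        if (rq->next && fair_enough(rq->next, rq->queue[0]))
            se = rq->next;                         /* strongest hint wins */
        return se;
    }

    int main(void)
    {
        struct toy_rq rq = { { "A", "B", "C" }, 3, NULL, NULL, NULL };

        rq.skip = "A";                             /* A called sched_yield() */
        printf("%s\n", toy_pick(&rq));             /* B */

        rq.next = "C";                             /* A called yield_to(C) */
        printf("%s\n", toy_pick(&rq));             /* C */
        return 0;
    }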
@@ -1798,7 +2087,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
1798 | * 2) too many balance attempts have failed. | 2087 | * 2) too many balance attempts have failed. |
1799 | */ | 2088 | */ |
1800 | 2089 | ||
1801 | tsk_cache_hot = task_hot(p, rq->clock, sd); | 2090 | tsk_cache_hot = task_hot(p, rq->clock_task, sd); |
1802 | if (!tsk_cache_hot || | 2091 | if (!tsk_cache_hot || |
1803 | sd->nr_balance_failed > sd->cache_nice_tries) { | 2092 | sd->nr_balance_failed > sd->cache_nice_tries) { |
1804 | #ifdef CONFIG_SCHEDSTATS | 2093 | #ifdef CONFIG_SCHEDSTATS |
@@ -1857,23 +2146,22 @@ static unsigned long | |||
1857 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2146 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1858 | unsigned long max_load_move, struct sched_domain *sd, | 2147 | unsigned long max_load_move, struct sched_domain *sd, |
1859 | enum cpu_idle_type idle, int *all_pinned, | 2148 | enum cpu_idle_type idle, int *all_pinned, |
1860 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) | 2149 | struct cfs_rq *busiest_cfs_rq) |
1861 | { | 2150 | { |
1862 | int loops = 0, pulled = 0, pinned = 0; | 2151 | int loops = 0, pulled = 0; |
1863 | long rem_load_move = max_load_move; | 2152 | long rem_load_move = max_load_move; |
1864 | struct task_struct *p, *n; | 2153 | struct task_struct *p, *n; |
1865 | 2154 | ||
1866 | if (max_load_move == 0) | 2155 | if (max_load_move == 0) |
1867 | goto out; | 2156 | goto out; |
1868 | 2157 | ||
1869 | pinned = 1; | ||
1870 | |||
1871 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { | 2158 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { |
1872 | if (loops++ > sysctl_sched_nr_migrate) | 2159 | if (loops++ > sysctl_sched_nr_migrate) |
1873 | break; | 2160 | break; |
1874 | 2161 | ||
1875 | if ((p->se.load.weight >> 1) > rem_load_move || | 2162 | if ((p->se.load.weight >> 1) > rem_load_move || |
1876 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) | 2163 | !can_migrate_task(p, busiest, this_cpu, sd, idle, |
2164 | all_pinned)) | ||
1877 | continue; | 2165 | continue; |
1878 | 2166 | ||
1879 | pull_task(busiest, p, this_rq, this_cpu); | 2167 | pull_task(busiest, p, this_rq, this_cpu); |
@@ -1896,9 +2184,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1896 | */ | 2184 | */ |
1897 | if (rem_load_move <= 0) | 2185 | if (rem_load_move <= 0) |
1898 | break; | 2186 | break; |
1899 | |||
1900 | if (p->prio < *this_best_prio) | ||
1901 | *this_best_prio = p->prio; | ||
1902 | } | 2187 | } |
1903 | out: | 2188 | out: |
1904 | /* | 2189 | /* |
@@ -1908,18 +2193,57 @@ out: | |||
1908 | */ | 2193 | */ |
1909 | schedstat_add(sd, lb_gained[idle], pulled); | 2194 | schedstat_add(sd, lb_gained[idle], pulled); |
1910 | 2195 | ||
1911 | if (all_pinned) | ||
1912 | *all_pinned = pinned; | ||
1913 | |||
1914 | return max_load_move - rem_load_move; | 2196 | return max_load_move - rem_load_move; |
1915 | } | 2197 | } |
1916 | 2198 | ||
1917 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2199 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2200 | /* | ||
2201 | * update tg->load_weight by folding this cpu's load_avg | ||
2202 | */ | ||
2203 | static int update_shares_cpu(struct task_group *tg, int cpu) | ||
2204 | { | ||
2205 | struct cfs_rq *cfs_rq; | ||
2206 | unsigned long flags; | ||
2207 | struct rq *rq; | ||
2208 | |||
2209 | if (!tg->se[cpu]) | ||
2210 | return 0; | ||
2211 | |||
2212 | rq = cpu_rq(cpu); | ||
2213 | cfs_rq = tg->cfs_rq[cpu]; | ||
2214 | |||
2215 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
2216 | |||
2217 | update_rq_clock(rq); | ||
2218 | update_cfs_load(cfs_rq, 1); | ||
2219 | |||
2220 | /* | ||
2221 | * We need to update shares after updating tg->load_weight in | ||
2222 | * order to adjust the weight of groups with long running tasks. | ||
2223 | */ | ||
2224 | update_cfs_shares(cfs_rq); | ||
2225 | |||
2226 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
2227 | |||
2228 | return 0; | ||
2229 | } | ||
2230 | |||
2231 | static void update_shares(int cpu) | ||
2232 | { | ||
2233 | struct cfs_rq *cfs_rq; | ||
2234 | struct rq *rq = cpu_rq(cpu); | ||
2235 | |||
2236 | rcu_read_lock(); | ||
2237 | for_each_leaf_cfs_rq(rq, cfs_rq) | ||
2238 | update_shares_cpu(cfs_rq->tg, cpu); | ||
2239 | rcu_read_unlock(); | ||
2240 | } | ||
2241 | |||
1918 | static unsigned long | 2242 | static unsigned long |
1919 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2243 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1920 | unsigned long max_load_move, | 2244 | unsigned long max_load_move, |
1921 | struct sched_domain *sd, enum cpu_idle_type idle, | 2245 | struct sched_domain *sd, enum cpu_idle_type idle, |
1922 | int *all_pinned, int *this_best_prio) | 2246 | int *all_pinned) |
1923 | { | 2247 | { |
1924 | long rem_load_move = max_load_move; | 2248 | long rem_load_move = max_load_move; |
1925 | int busiest_cpu = cpu_of(busiest); | 2249 | int busiest_cpu = cpu_of(busiest); |
@@ -1944,7 +2268,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1944 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 2268 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
1945 | 2269 | ||
1946 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | 2270 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
1947 | rem_load, sd, idle, all_pinned, this_best_prio, | 2271 | rem_load, sd, idle, all_pinned, |
1948 | busiest_cfs_rq); | 2272 | busiest_cfs_rq); |
1949 | 2273 | ||
1950 | if (!moved_load) | 2274 | if (!moved_load) |
@@ -1962,15 +2286,19 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1962 | return max_load_move - rem_load_move; | 2286 | return max_load_move - rem_load_move; |
1963 | } | 2287 | } |
1964 | #else | 2288 | #else |
2289 | static inline void update_shares(int cpu) | ||
2290 | { | ||
2291 | } | ||
2292 | |||
1965 | static unsigned long | 2293 | static unsigned long |
1966 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2294 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1967 | unsigned long max_load_move, | 2295 | unsigned long max_load_move, |
1968 | struct sched_domain *sd, enum cpu_idle_type idle, | 2296 | struct sched_domain *sd, enum cpu_idle_type idle, |
1969 | int *all_pinned, int *this_best_prio) | 2297 | int *all_pinned) |
1970 | { | 2298 | { |
1971 | return balance_tasks(this_rq, this_cpu, busiest, | 2299 | return balance_tasks(this_rq, this_cpu, busiest, |
1972 | max_load_move, sd, idle, all_pinned, | 2300 | max_load_move, sd, idle, all_pinned, |
1973 | this_best_prio, &busiest->cfs); | 2301 | &busiest->cfs); |
1974 | } | 2302 | } |
1975 | #endif | 2303 | #endif |
1976 | 2304 | ||
@@ -1987,12 +2315,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1987 | int *all_pinned) | 2315 | int *all_pinned) |
1988 | { | 2316 | { |
1989 | unsigned long total_load_moved = 0, load_moved; | 2317 | unsigned long total_load_moved = 0, load_moved; |
1990 | int this_best_prio = this_rq->curr->prio; | ||
1991 | 2318 | ||
1992 | do { | 2319 | do { |
1993 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | 2320 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
1994 | max_load_move - total_load_moved, | 2321 | max_load_move - total_load_moved, |
1995 | sd, idle, all_pinned, &this_best_prio); | 2322 | sd, idle, all_pinned); |
1996 | 2323 | ||
1997 | total_load_moved += load_moved; | 2324 | total_load_moved += load_moved; |
1998 | 2325 | ||
@@ -2030,12 +2357,17 @@ struct sd_lb_stats { | |||
2030 | unsigned long this_load; | 2357 | unsigned long this_load; |
2031 | unsigned long this_load_per_task; | 2358 | unsigned long this_load_per_task; |
2032 | unsigned long this_nr_running; | 2359 | unsigned long this_nr_running; |
2360 | unsigned long this_has_capacity; | ||
2361 | unsigned int this_idle_cpus; | ||
2033 | 2362 | ||
2034 | /* Statistics of the busiest group */ | 2363 | /* Statistics of the busiest group */ |
2364 | unsigned int busiest_idle_cpus; | ||
2035 | unsigned long max_load; | 2365 | unsigned long max_load; |
2036 | unsigned long busiest_load_per_task; | 2366 | unsigned long busiest_load_per_task; |
2037 | unsigned long busiest_nr_running; | 2367 | unsigned long busiest_nr_running; |
2038 | unsigned long busiest_group_capacity; | 2368 | unsigned long busiest_group_capacity; |
2369 | unsigned long busiest_has_capacity; | ||
2370 | unsigned int busiest_group_weight; | ||
2039 | 2371 | ||
2040 | int group_imb; /* Is there imbalance in this sd */ | 2372 | int group_imb; /* Is there imbalance in this sd */ |
2041 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2373 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
@@ -2057,7 +2389,10 @@ struct sg_lb_stats { | |||
2057 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | 2389 | unsigned long sum_nr_running; /* Nr tasks running in the group */ |
2058 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 2390 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
2059 | unsigned long group_capacity; | 2391 | unsigned long group_capacity; |
2392 | unsigned long idle_cpus; | ||
2393 | unsigned long group_weight; | ||
2060 | int group_imb; /* Is there an imbalance in the group ? */ | 2394 | int group_imb; /* Is there an imbalance in the group ? */ |
2395 | int group_has_capacity; /* Is there extra capacity in the group? */ | ||
2061 | }; | 2396 | }; |
2062 | 2397 | ||
2063 | /** | 2398 | /** |
@@ -2239,7 +2574,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
2239 | 2574 | ||
2240 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 2575 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
2241 | { | 2576 | { |
2242 | return SCHED_LOAD_SCALE; | 2577 | return SCHED_POWER_SCALE; |
2243 | } | 2578 | } |
2244 | 2579 | ||
2245 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | 2580 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) |
@@ -2268,12 +2603,18 @@ unsigned long scale_rt_power(int cpu) | |||
2268 | u64 total, available; | 2603 | u64 total, available; |
2269 | 2604 | ||
2270 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 2605 | total = sched_avg_period() + (rq->clock - rq->age_stamp); |
2271 | available = total - rq->rt_avg; | ||
2272 | 2606 | ||
2273 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | 2607 | if (unlikely(total < rq->rt_avg)) { |
2274 | total = SCHED_LOAD_SCALE; | 2608 | /* Ensures that power won't end up being negative */ |
2609 | available = 0; | ||
2610 | } else { | ||
2611 | available = total - rq->rt_avg; | ||
2612 | } | ||
2613 | |||
2614 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | ||
2615 | total = SCHED_POWER_SCALE; | ||
2275 | 2616 | ||
2276 | total >>= SCHED_LOAD_SHIFT; | 2617 | total >>= SCHED_POWER_SHIFT; |
2277 | 2618 | ||
2278 | return div_u64(available, total); | 2619 | return div_u64(available, total); |
2279 | } | 2620 | } |
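The guard added above can be read as plain integer arithmetic: clamp the time left after real-time activity at zero before dividing, and never let the divisor drop below one power unit. The stand-alone sketch below mirrors that arithmetic outside the kernel; SCHED_POWER_SHIFT is assumed to be 10 (so SCHED_POWER_SCALE is 1024), and the function is only an illustration of the diff, not the in-tree scale_rt_power().

#include <stdint.h>

#define SCHED_POWER_SHIFT 10	/* assumed; gives SCHED_POWER_SCALE = 1024 */
#define SCHED_POWER_SCALE (1ULL << SCHED_POWER_SHIFT)

/* Model of the new guard: 'available' is clamped at zero so the returned
 * power fraction cannot underflow when rt_avg exceeds the sampled total. */
static unsigned long scale_rt_power_model(uint64_t total, uint64_t rt_avg)
{
	uint64_t available;

	if (total < rt_avg)
		available = 0;			/* power must not go negative */
	else
		available = total - rt_avg;

	if ((int64_t)total < (int64_t)SCHED_POWER_SCALE)
		total = SCHED_POWER_SCALE;	/* keep the divisor sane */

	total >>= SCHED_POWER_SHIFT;

	return (unsigned long)(available / total);
}

For total = 2 * SCHED_POWER_SCALE and rt_avg = SCHED_POWER_SCALE this still returns 512, half the nominal power, as before; the behaviour only changes in the rt_avg > total corner case, which previously wrapped around.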
@@ -2281,7 +2622,7 @@ unsigned long scale_rt_power(int cpu) | |||
2281 | static void update_cpu_power(struct sched_domain *sd, int cpu) | 2622 | static void update_cpu_power(struct sched_domain *sd, int cpu) |
2282 | { | 2623 | { |
2283 | unsigned long weight = sd->span_weight; | 2624 | unsigned long weight = sd->span_weight; |
2284 | unsigned long power = SCHED_LOAD_SCALE; | 2625 | unsigned long power = SCHED_POWER_SCALE; |
2285 | struct sched_group *sdg = sd->groups; | 2626 | struct sched_group *sdg = sd->groups; |
2286 | 2627 | ||
2287 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 2628 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
@@ -2290,26 +2631,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2290 | else | 2631 | else |
2291 | power *= default_scale_smt_power(sd, cpu); | 2632 | power *= default_scale_smt_power(sd, cpu); |
2292 | 2633 | ||
2293 | power >>= SCHED_LOAD_SHIFT; | 2634 | power >>= SCHED_POWER_SHIFT; |
2294 | } | 2635 | } |
2295 | 2636 | ||
2296 | sdg->cpu_power_orig = power; | 2637 | sdg->sgp->power_orig = power; |
2297 | 2638 | ||
2298 | if (sched_feat(ARCH_POWER)) | 2639 | if (sched_feat(ARCH_POWER)) |
2299 | power *= arch_scale_freq_power(sd, cpu); | 2640 | power *= arch_scale_freq_power(sd, cpu); |
2300 | else | 2641 | else |
2301 | power *= default_scale_freq_power(sd, cpu); | 2642 | power *= default_scale_freq_power(sd, cpu); |
2302 | 2643 | ||
2303 | power >>= SCHED_LOAD_SHIFT; | 2644 | power >>= SCHED_POWER_SHIFT; |
2304 | 2645 | ||
2305 | power *= scale_rt_power(cpu); | 2646 | power *= scale_rt_power(cpu); |
2306 | power >>= SCHED_LOAD_SHIFT; | 2647 | power >>= SCHED_POWER_SHIFT; |
2307 | 2648 | ||
2308 | if (!power) | 2649 | if (!power) |
2309 | power = 1; | 2650 | power = 1; |
2310 | 2651 | ||
2311 | cpu_rq(cpu)->cpu_power = power; | 2652 | cpu_rq(cpu)->cpu_power = power; |
2312 | sdg->cpu_power = power; | 2653 | sdg->sgp->power = power; |
2313 | } | 2654 | } |
2314 | 2655 | ||
2315 | static void update_group_power(struct sched_domain *sd, int cpu) | 2656 | static void update_group_power(struct sched_domain *sd, int cpu) |
@@ -2327,11 +2668,11 @@ static void update_group_power(struct sched_domain *sd, int cpu) | |||
2327 | 2668 | ||
2328 | group = child->groups; | 2669 | group = child->groups; |
2329 | do { | 2670 | do { |
2330 | power += group->cpu_power; | 2671 | power += group->sgp->power; |
2331 | group = group->next; | 2672 | group = group->next; |
2332 | } while (group != child->groups); | 2673 | } while (group != child->groups); |
2333 | 2674 | ||
2334 | sdg->cpu_power = power; | 2675 | sdg->sgp->power = power; |
2335 | } | 2676 | } |
2336 | 2677 | ||
2337 | /* | 2678 | /* |
@@ -2345,15 +2686,15 @@ static inline int | |||
2345 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 2686 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) |
2346 | { | 2687 | { |
2347 | /* | 2688 | /* |
2348 | * Only siblings can have significantly less than SCHED_LOAD_SCALE | 2689 | * Only siblings can have significantly less than SCHED_POWER_SCALE |
2349 | */ | 2690 | */ |
2350 | if (sd->level != SD_LV_SIBLING) | 2691 | if (!(sd->flags & SD_SHARE_CPUPOWER)) |
2351 | return 0; | 2692 | return 0; |
2352 | 2693 | ||
2353 | /* | 2694 | /* |
2354 | * If ~90% of the cpu_power is still there, we're good. | 2695 | * If ~90% of the cpu_power is still there, we're good. |
2355 | */ | 2696 | */ |
2356 | if (group->cpu_power * 32 > group->cpu_power_orig * 29) | 2697 | if (group->sgp->power * 32 > group->sgp->power_orig * 29) |
2357 | return 1; | 2698 | return 1; |
2358 | 2699 | ||
2359 | return 0; | 2700 | return 0; |
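The "~90%" comment corresponds to the integer comparison power * 32 > power_orig * 29, since 29/32 = 0.90625. A minimal stand-alone predicate, using plain unsigned longs in place of the sched_group_power fields:

/* Returns 1 while at least ~90.6% (29/32) of the group's original
 * cpu_power survives frequency and rt scaling. */
static int mostly_full_capacity(unsigned long power, unsigned long power_orig)
{
	return power * 32 > power_orig * 29;
}

With power_orig = 1024, for example, the check still passes at power = 929 and first fails at 928.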
@@ -2366,7 +2707,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
2366 | * @this_cpu: Cpu for which load balance is currently performed. | 2707 | * @this_cpu: Cpu for which load balance is currently performed. |
2367 | * @idle: Idle status of this_cpu | 2708 | * @idle: Idle status of this_cpu |
2368 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 2709 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
2369 | * @sd_idle: Idle status of the sched_domain containing group. | ||
2370 | * @local_group: Does group contain this_cpu. | 2710 | * @local_group: Does group contain this_cpu. |
2371 | * @cpus: Set of cpus considered for load balancing. | 2711 | * @cpus: Set of cpus considered for load balancing. |
2372 | * @balance: Should we balance. | 2712 | * @balance: Should we balance. |
@@ -2374,11 +2714,11 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
2374 | */ | 2714 | */ |
2375 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 2715 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
2376 | struct sched_group *group, int this_cpu, | 2716 | struct sched_group *group, int this_cpu, |
2377 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 2717 | enum cpu_idle_type idle, int load_idx, |
2378 | int local_group, const struct cpumask *cpus, | 2718 | int local_group, const struct cpumask *cpus, |
2379 | int *balance, struct sg_lb_stats *sgs) | 2719 | int *balance, struct sg_lb_stats *sgs) |
2380 | { | 2720 | { |
2381 | unsigned long load, max_cpu_load, min_cpu_load; | 2721 | unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; |
2382 | int i; | 2722 | int i; |
2383 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 2723 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
2384 | unsigned long avg_load_per_task = 0; | 2724 | unsigned long avg_load_per_task = 0; |
@@ -2389,13 +2729,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2389 | /* Tally up the load of all CPUs in the group */ | 2729 | /* Tally up the load of all CPUs in the group */ |
2390 | max_cpu_load = 0; | 2730 | max_cpu_load = 0; |
2391 | min_cpu_load = ~0UL; | 2731 | min_cpu_load = ~0UL; |
2732 | max_nr_running = 0; | ||
2392 | 2733 | ||
2393 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 2734 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
2394 | struct rq *rq = cpu_rq(i); | 2735 | struct rq *rq = cpu_rq(i); |
2395 | 2736 | ||
2396 | if (*sd_idle && rq->nr_running) | ||
2397 | *sd_idle = 0; | ||
2398 | |||
2399 | /* Bias balancing toward cpus of our domain */ | 2737 | /* Bias balancing toward cpus of our domain */ |
2400 | if (local_group) { | 2738 | if (local_group) { |
2401 | if (idle_cpu(i) && !first_idle_cpu) { | 2739 | if (idle_cpu(i) && !first_idle_cpu) { |
@@ -2406,8 +2744,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2406 | load = target_load(i, load_idx); | 2744 | load = target_load(i, load_idx); |
2407 | } else { | 2745 | } else { |
2408 | load = source_load(i, load_idx); | 2746 | load = source_load(i, load_idx); |
2409 | if (load > max_cpu_load) | 2747 | if (load > max_cpu_load) { |
2410 | max_cpu_load = load; | 2748 | max_cpu_load = load; |
2749 | max_nr_running = rq->nr_running; | ||
2750 | } | ||
2411 | if (min_cpu_load > load) | 2751 | if (min_cpu_load > load) |
2412 | min_cpu_load = load; | 2752 | min_cpu_load = load; |
2413 | } | 2753 | } |
@@ -2415,7 +2755,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2415 | sgs->group_load += load; | 2755 | sgs->group_load += load; |
2416 | sgs->sum_nr_running += rq->nr_running; | 2756 | sgs->sum_nr_running += rq->nr_running; |
2417 | sgs->sum_weighted_load += weighted_cpuload(i); | 2757 | sgs->sum_weighted_load += weighted_cpuload(i); |
2418 | 2758 | if (idle_cpu(i)) | |
2759 | sgs->idle_cpus++; | ||
2419 | } | 2760 | } |
2420 | 2761 | ||
2421 | /* | 2762 | /* |
@@ -2433,11 +2774,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2433 | } | 2774 | } |
2434 | 2775 | ||
2435 | /* Adjust by relative CPU power of the group */ | 2776 | /* Adjust by relative CPU power of the group */ |
2436 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2777 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; |
2437 | 2778 | ||
2438 | /* | 2779 | /* |
2439 | * Consider the group unbalanced when the imbalance is larger | 2780 | * Consider the group unbalanced when the imbalance is larger |
2440 | * than the average weight of two tasks. | 2781 | * than the average weight of a task. |
2441 | * | 2782 | * |
2442 | * APZ: with cgroup the avg task weight can vary wildly and | 2783 | * APZ: with cgroup the avg task weight can vary wildly and |
2443 | * might not be a suitable number - should we keep a | 2784 | * might not be a suitable number - should we keep a |
@@ -2447,13 +2788,17 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2447 | if (sgs->sum_nr_running) | 2788 | if (sgs->sum_nr_running) |
2448 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 2789 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
2449 | 2790 | ||
2450 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 2791 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) |
2451 | sgs->group_imb = 1; | 2792 | sgs->group_imb = 1; |
2452 | 2793 | ||
2453 | sgs->group_capacity = | 2794 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, |
2454 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2795 | SCHED_POWER_SCALE); |
2455 | if (!sgs->group_capacity) | 2796 | if (!sgs->group_capacity) |
2456 | sgs->group_capacity = fix_small_capacity(sd, group); | 2797 | sgs->group_capacity = fix_small_capacity(sd, group); |
2798 | sgs->group_weight = group->group_weight; | ||
2799 | |||
2800 | if (sgs->group_capacity > sgs->sum_nr_running) | ||
2801 | sgs->group_has_capacity = 1; | ||
2457 | } | 2802 | } |
2458 | 2803 | ||
2459 | /** | 2804 | /** |
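Two of the changes above are easier to see in isolation: group capacity is now derived from sgp->power by round-to-nearest division against SCHED_POWER_SCALE, and a group is flagged imbalanced once the per-cpu load spread reaches one average task weight and the most loaded cpu runs more than one task, instead of requiring a spread of two task weights. The sketch below restates both in stand-alone C; SCHED_POWER_SCALE is assumed to be 1024 and div_round_closest() is only a stand-in for the kernel's DIV_ROUND_CLOSEST() macro.

#define SCHED_POWER_SCALE 1024UL	/* assumed value */

/* Stand-in for DIV_ROUND_CLOSEST(): divide, rounding to nearest. */
static unsigned long div_round_closest(unsigned long x, unsigned long divisor)
{
	return (x + divisor / 2) / divisor;
}

static unsigned long group_capacity(unsigned long group_power)
{
	return div_round_closest(group_power, SCHED_POWER_SCALE);
}

/* New imbalance test: one task-weight of spread is enough, but only if
 * the busiest cpu actually has a task to spare (max_nr_running > 1). */
static int group_is_imbalanced(unsigned long max_cpu_load,
			       unsigned long min_cpu_load,
			       unsigned long avg_load_per_task,
			       unsigned long max_nr_running)
{
	return (max_cpu_load - min_cpu_load) >= avg_load_per_task &&
	       max_nr_running > 1;
}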
@@ -2504,15 +2849,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
2504 | * @sd: sched_domain whose statistics are to be updated. | 2849 | * @sd: sched_domain whose statistics are to be updated. |
2505 | * @this_cpu: Cpu for which load balance is currently performed. | 2850 | * @this_cpu: Cpu for which load balance is currently performed. |
2506 | * @idle: Idle status of this_cpu | 2851 | * @idle: Idle status of this_cpu |
2507 | * @sd_idle: Idle status of the sched_domain containing sg. | ||
2508 | * @cpus: Set of cpus considered for load balancing. | 2852 | * @cpus: Set of cpus considered for load balancing. |
2509 | * @balance: Should we balance. | 2853 | * @balance: Should we balance. |
2510 | * @sds: variable to hold the statistics for this sched_domain. | 2854 | * @sds: variable to hold the statistics for this sched_domain. |
2511 | */ | 2855 | */ |
2512 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 2856 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, |
2513 | enum cpu_idle_type idle, int *sd_idle, | 2857 | enum cpu_idle_type idle, const struct cpumask *cpus, |
2514 | const struct cpumask *cpus, int *balance, | 2858 | int *balance, struct sd_lb_stats *sds) |
2515 | struct sd_lb_stats *sds) | ||
2516 | { | 2859 | { |
2517 | struct sched_domain *child = sd->child; | 2860 | struct sched_domain *child = sd->child; |
2518 | struct sched_group *sg = sd->groups; | 2861 | struct sched_group *sg = sd->groups; |
@@ -2530,21 +2873,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2530 | 2873 | ||
2531 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 2874 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); |
2532 | memset(&sgs, 0, sizeof(sgs)); | 2875 | memset(&sgs, 0, sizeof(sgs)); |
2533 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, | 2876 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, |
2534 | local_group, cpus, balance, &sgs); | 2877 | local_group, cpus, balance, &sgs); |
2535 | 2878 | ||
2536 | if (local_group && !(*balance)) | 2879 | if (local_group && !(*balance)) |
2537 | return; | 2880 | return; |
2538 | 2881 | ||
2539 | sds->total_load += sgs.group_load; | 2882 | sds->total_load += sgs.group_load; |
2540 | sds->total_pwr += sg->cpu_power; | 2883 | sds->total_pwr += sg->sgp->power; |
2541 | 2884 | ||
2542 | /* | 2885 | /* |
2543 | * In case the child domain prefers tasks go to siblings | 2886 | * In case the child domain prefers tasks go to siblings |
2544 | * first, lower the sg capacity to one so that we'll try | 2887 | * first, lower the sg capacity to one so that we'll try |
2545 | * and move all the excess tasks away. | 2888 | * and move all the excess tasks away. We lower the capacity |
2889 | * of a group only if the local group has the capacity to fit | ||
2890 | * these excess tasks, i.e. nr_running < group_capacity. The | ||
2891 | * extra check prevents the case where you always pull from the | ||
2892 | * heaviest group when it is already under-utilized (possible | ||
2893 | * when a single large-weight task outweighs the tasks on the system). | ||
2546 | */ | 2894 | */ |
2547 | if (prefer_sibling) | 2895 | if (prefer_sibling && !local_group && sds->this_has_capacity) |
2548 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 2896 | sgs.group_capacity = min(sgs.group_capacity, 1UL); |
2549 | 2897 | ||
2550 | if (local_group) { | 2898 | if (local_group) { |
@@ -2552,12 +2900,17 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2552 | sds->this = sg; | 2900 | sds->this = sg; |
2553 | sds->this_nr_running = sgs.sum_nr_running; | 2901 | sds->this_nr_running = sgs.sum_nr_running; |
2554 | sds->this_load_per_task = sgs.sum_weighted_load; | 2902 | sds->this_load_per_task = sgs.sum_weighted_load; |
2903 | sds->this_has_capacity = sgs.group_has_capacity; | ||
2904 | sds->this_idle_cpus = sgs.idle_cpus; | ||
2555 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 2905 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { |
2556 | sds->max_load = sgs.avg_load; | 2906 | sds->max_load = sgs.avg_load; |
2557 | sds->busiest = sg; | 2907 | sds->busiest = sg; |
2558 | sds->busiest_nr_running = sgs.sum_nr_running; | 2908 | sds->busiest_nr_running = sgs.sum_nr_running; |
2909 | sds->busiest_idle_cpus = sgs.idle_cpus; | ||
2559 | sds->busiest_group_capacity = sgs.group_capacity; | 2910 | sds->busiest_group_capacity = sgs.group_capacity; |
2560 | sds->busiest_load_per_task = sgs.sum_weighted_load; | 2911 | sds->busiest_load_per_task = sgs.sum_weighted_load; |
2912 | sds->busiest_has_capacity = sgs.group_has_capacity; | ||
2913 | sds->busiest_group_weight = sgs.group_weight; | ||
2561 | sds->group_imb = sgs.group_imb; | 2914 | sds->group_imb = sgs.group_imb; |
2562 | } | 2915 | } |
2563 | 2916 | ||
@@ -2612,8 +2965,8 @@ static int check_asym_packing(struct sched_domain *sd, | |||
2612 | if (this_cpu > busiest_cpu) | 2965 | if (this_cpu > busiest_cpu) |
2613 | return 0; | 2966 | return 0; |
2614 | 2967 | ||
2615 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, | 2968 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, |
2616 | SCHED_LOAD_SCALE); | 2969 | SCHED_POWER_SCALE); |
2617 | return 1; | 2970 | return 1; |
2618 | } | 2971 | } |
2619 | 2972 | ||
@@ -2642,8 +2995,8 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
2642 | cpu_avg_load_per_task(this_cpu); | 2995 | cpu_avg_load_per_task(this_cpu); |
2643 | 2996 | ||
2644 | scaled_busy_load_per_task = sds->busiest_load_per_task | 2997 | scaled_busy_load_per_task = sds->busiest_load_per_task |
2645 | * SCHED_LOAD_SCALE; | 2998 | * SCHED_POWER_SCALE; |
2646 | scaled_busy_load_per_task /= sds->busiest->cpu_power; | 2999 | scaled_busy_load_per_task /= sds->busiest->sgp->power; |
2647 | 3000 | ||
2648 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 3001 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= |
2649 | (scaled_busy_load_per_task * imbn)) { | 3002 | (scaled_busy_load_per_task * imbn)) { |
@@ -2657,30 +3010,30 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
2657 | * moving them. | 3010 | * moving them. |
2658 | */ | 3011 | */ |
2659 | 3012 | ||
2660 | pwr_now += sds->busiest->cpu_power * | 3013 | pwr_now += sds->busiest->sgp->power * |
2661 | min(sds->busiest_load_per_task, sds->max_load); | 3014 | min(sds->busiest_load_per_task, sds->max_load); |
2662 | pwr_now += sds->this->cpu_power * | 3015 | pwr_now += sds->this->sgp->power * |
2663 | min(sds->this_load_per_task, sds->this_load); | 3016 | min(sds->this_load_per_task, sds->this_load); |
2664 | pwr_now /= SCHED_LOAD_SCALE; | 3017 | pwr_now /= SCHED_POWER_SCALE; |
2665 | 3018 | ||
2666 | /* Amount of load we'd subtract */ | 3019 | /* Amount of load we'd subtract */ |
2667 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | 3020 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / |
2668 | sds->busiest->cpu_power; | 3021 | sds->busiest->sgp->power; |
2669 | if (sds->max_load > tmp) | 3022 | if (sds->max_load > tmp) |
2670 | pwr_move += sds->busiest->cpu_power * | 3023 | pwr_move += sds->busiest->sgp->power * |
2671 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 3024 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
2672 | 3025 | ||
2673 | /* Amount of load we'd add */ | 3026 | /* Amount of load we'd add */ |
2674 | if (sds->max_load * sds->busiest->cpu_power < | 3027 | if (sds->max_load * sds->busiest->sgp->power < |
2675 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 3028 | sds->busiest_load_per_task * SCHED_POWER_SCALE) |
2676 | tmp = (sds->max_load * sds->busiest->cpu_power) / | 3029 | tmp = (sds->max_load * sds->busiest->sgp->power) / |
2677 | sds->this->cpu_power; | 3030 | sds->this->sgp->power; |
2678 | else | 3031 | else |
2679 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | 3032 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / |
2680 | sds->this->cpu_power; | 3033 | sds->this->sgp->power; |
2681 | pwr_move += sds->this->cpu_power * | 3034 | pwr_move += sds->this->sgp->power * |
2682 | min(sds->this_load_per_task, sds->this_load + tmp); | 3035 | min(sds->this_load_per_task, sds->this_load + tmp); |
2683 | pwr_move /= SCHED_LOAD_SCALE; | 3036 | pwr_move /= SCHED_POWER_SCALE; |
2684 | 3037 | ||
2685 | /* Move if we gain throughput */ | 3038 | /* Move if we gain throughput */ |
2686 | if (pwr_move > pwr_now) | 3039 | if (pwr_move > pwr_now) |
@@ -2722,9 +3075,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2722 | load_above_capacity = (sds->busiest_nr_running - | 3075 | load_above_capacity = (sds->busiest_nr_running - |
2723 | sds->busiest_group_capacity); | 3076 | sds->busiest_group_capacity); |
2724 | 3077 | ||
2725 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); | 3078 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); |
2726 | 3079 | ||
2727 | load_above_capacity /= sds->busiest->cpu_power; | 3080 | load_above_capacity /= sds->busiest->sgp->power; |
2728 | } | 3081 | } |
2729 | 3082 | ||
2730 | /* | 3083 | /* |
@@ -2740,13 +3093,13 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2740 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 3093 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); |
2741 | 3094 | ||
2742 | /* How much load to actually move to equalise the imbalance */ | 3095 | /* How much load to actually move to equalise the imbalance */ |
2743 | *imbalance = min(max_pull * sds->busiest->cpu_power, | 3096 | *imbalance = min(max_pull * sds->busiest->sgp->power, |
2744 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | 3097 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) |
2745 | / SCHED_LOAD_SCALE; | 3098 | / SCHED_POWER_SCALE; |
2746 | 3099 | ||
2747 | /* | 3100 | /* |
2748 | * if *imbalance is less than the average load per runnable task | 3101 | * if *imbalance is less than the average load per runnable task |
2749 | * there is no gaurantee that any tasks will be moved so we'll have | 3102 | * there is no guarantee that any tasks will be moved so we'll have |
2750 | * a think about bumping its value to force at least one task to be | 3103 | * a think about bumping its value to force at least one task to be |
2751 | * moved | 3104 | * moved |
2752 | */ | 3105 | */ |
@@ -2754,6 +3107,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2754 | return fix_small_imbalance(sds, this_cpu, imbalance); | 3107 | return fix_small_imbalance(sds, this_cpu, imbalance); |
2755 | 3108 | ||
2756 | } | 3109 | } |
3110 | |||
2757 | /******* find_busiest_group() helpers end here *********************/ | 3111 | /******* find_busiest_group() helpers end here *********************/ |
2758 | 3112 | ||
2759 | /** | 3113 | /** |
@@ -2771,7 +3125,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2771 | * @imbalance: Variable which stores amount of weighted load which should | 3125 | * @imbalance: Variable which stores amount of weighted load which should |
2772 | * be moved to restore balance/put a group to idle. | 3126 | * be moved to restore balance/put a group to idle. |
2773 | * @idle: The idle status of this_cpu. | 3127 | * @idle: The idle status of this_cpu. |
2774 | * @sd_idle: The idleness of sd | ||
2775 | * @cpus: The set of CPUs under consideration for load-balancing. | 3128 | * @cpus: The set of CPUs under consideration for load-balancing. |
2776 | * @balance: Pointer to a variable indicating if this_cpu | 3129 | * @balance: Pointer to a variable indicating if this_cpu |
2777 | * is the appropriate cpu to perform load balancing at this_level. | 3130 | * is the appropriate cpu to perform load balancing at this_level. |
@@ -2784,7 +3137,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2784 | static struct sched_group * | 3137 | static struct sched_group * |
2785 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 3138 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2786 | unsigned long *imbalance, enum cpu_idle_type idle, | 3139 | unsigned long *imbalance, enum cpu_idle_type idle, |
2787 | int *sd_idle, const struct cpumask *cpus, int *balance) | 3140 | const struct cpumask *cpus, int *balance) |
2788 | { | 3141 | { |
2789 | struct sd_lb_stats sds; | 3142 | struct sd_lb_stats sds; |
2790 | 3143 | ||
@@ -2794,17 +3147,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2794 | * Compute the various statistics relevant for load balancing at | 3147 | * Compute the various statistics relevant for load balancing at |
2795 | * this level. | 3148 | * this level. |
2796 | */ | 3149 | */ |
2797 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | 3150 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); |
2798 | balance, &sds); | ||
2799 | 3151 | ||
2800 | /* Cases where imbalance does not exist from POV of this_cpu */ | 3152 | /* |
2801 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | 3153 | * this_cpu is not the appropriate cpu to perform load balancing at |
2802 | * at this level. | 3154 | * this level. |
2803 | * 2) There is no busy sibling group to pull from. | ||
2804 | * 3) This group is the busiest group. | ||
2805 | * 4) This group is more busy than the avg busieness at this | ||
2806 | * sched_domain. | ||
2807 | * 5) The imbalance is within the specified limit. | ||
2808 | */ | 3155 | */ |
2809 | if (!(*balance)) | 3156 | if (!(*balance)) |
2810 | goto ret; | 3157 | goto ret; |
@@ -2813,20 +3160,59 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2813 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 3160 | check_asym_packing(sd, &sds, this_cpu, imbalance)) |
2814 | return sds.busiest; | 3161 | return sds.busiest; |
2815 | 3162 | ||
3163 | /* There is no busy sibling group to pull tasks from */ | ||
2816 | if (!sds.busiest || sds.busiest_nr_running == 0) | 3164 | if (!sds.busiest || sds.busiest_nr_running == 0) |
2817 | goto out_balanced; | 3165 | goto out_balanced; |
2818 | 3166 | ||
3167 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | ||
3168 | |||
3169 | /* | ||
3170 | * If the busiest group is imbalanced the below checks don't | ||
3171 | * work because they assume all things are equal, which typically | ||
3172 | * isn't true due to cpus_allowed constraints and the like. | ||
3173 | */ | ||
3174 | if (sds.group_imb) | ||
3175 | goto force_balance; | ||
3176 | |||
3177 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | ||
3178 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | ||
3179 | !sds.busiest_has_capacity) | ||
3180 | goto force_balance; | ||
3181 | |||
3182 | /* | ||
3183 | * If the local group is more busy than the selected busiest group | ||
3184 | * don't try and pull any tasks. | ||
3185 | */ | ||
2819 | if (sds.this_load >= sds.max_load) | 3186 | if (sds.this_load >= sds.max_load) |
2820 | goto out_balanced; | 3187 | goto out_balanced; |
2821 | 3188 | ||
2822 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | 3189 | /* |
2823 | 3190 | * Don't pull any tasks if this group is already above the domain | |
3191 | * average load. | ||
3192 | */ | ||
2824 | if (sds.this_load >= sds.avg_load) | 3193 | if (sds.this_load >= sds.avg_load) |
2825 | goto out_balanced; | 3194 | goto out_balanced; |
2826 | 3195 | ||
2827 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 3196 | if (idle == CPU_IDLE) { |
2828 | goto out_balanced; | 3197 | /* |
3198 | * This cpu is idle. If the busiest group doesn't have | ||
3199 | * more tasks than the number of available cpus and there | ||
3200 | * is no imbalance between this and the busiest group with | ||
3201 | * respect to idle cpus, it is balanced. | ||
3202 | */ | ||
3203 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | ||
3204 | sds.busiest_nr_running <= sds.busiest_group_weight) | ||
3205 | goto out_balanced; | ||
3206 | } else { | ||
3207 | /* | ||
3208 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | ||
3209 | * imbalance_pct to be conservative. | ||
3210 | */ | ||
3211 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
3212 | goto out_balanced; | ||
3213 | } | ||
2829 | 3214 | ||
3215 | force_balance: | ||
2830 | /* Looks like there is an imbalance. Compute it */ | 3216 | /* Looks like there is an imbalance. Compute it */ |
2831 | calculate_imbalance(&sds, this_cpu, imbalance); | 3217 | calculate_imbalance(&sds, this_cpu, imbalance); |
2832 | return sds.busiest; | 3218 | return sds.busiest; |
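Because this hunk both reorders and extends the early-exit checks, the resulting decision ladder is easier to follow condensed into a single function. The model below mirrors the new control flow; the field names echo struct sd_lb_stats but this is not the kernel routine, avg_load is assumed to have been computed from total_load and total_pwr beforehand, and both non-balanced outcomes proceed to calculate_imbalance() in the real code.

enum model_idle { MODEL_NOT_IDLE, MODEL_IDLE, MODEL_NEWLY_IDLE };
enum lb_decision { LB_BALANCED, LB_FORCE_BALANCE, LB_COMPUTE_IMBALANCE };

struct lb_stats_model {
	unsigned long this_load, max_load, avg_load;
	unsigned long busiest_nr_running, busiest_group_weight;
	unsigned int this_idle_cpus, busiest_idle_cpus;
	int group_imb, this_has_capacity, busiest_has_capacity;
};

static enum lb_decision classify_balance(const struct lb_stats_model *s,
					 enum model_idle idle,
					 unsigned int imbalance_pct)
{
	if (!s->busiest_nr_running)
		return LB_BALANCED;		/* no busy group to pull from */

	if (s->group_imb)
		return LB_FORCE_BALANCE;	/* cpus_allowed etc. skew the averages */

	if (idle == MODEL_NEWLY_IDLE &&
	    s->this_has_capacity && !s->busiest_has_capacity)
		return LB_FORCE_BALANCE;	/* newidle balance trumps SMP nice */

	if (s->this_load >= s->max_load || s->this_load >= s->avg_load)
		return LB_BALANCED;		/* local group is not the lighter side */

	if (idle == MODEL_IDLE) {
		if (s->this_idle_cpus <= s->busiest_idle_cpus + 1 &&
		    s->busiest_nr_running <= s->busiest_group_weight)
			return LB_BALANCED;	/* no idle-cpu imbalance either */
	} else if (100 * s->max_load <= imbalance_pct * s->this_load) {
		return LB_BALANCED;		/* within the imbalance_pct margin */
	}

	return LB_COMPUTE_IMBALANCE;
}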
@@ -2857,7 +3243,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
2857 | 3243 | ||
2858 | for_each_cpu(i, sched_group_cpus(group)) { | 3244 | for_each_cpu(i, sched_group_cpus(group)) { |
2859 | unsigned long power = power_of(i); | 3245 | unsigned long power = power_of(i); |
2860 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 3246 | unsigned long capacity = DIV_ROUND_CLOSEST(power, |
3247 | SCHED_POWER_SCALE); | ||
2861 | unsigned long wl; | 3248 | unsigned long wl; |
2862 | 3249 | ||
2863 | if (!capacity) | 3250 | if (!capacity) |
@@ -2882,7 +3269,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
2882 | * the load can be moved away from the cpu that is potentially | 3269 | * the load can be moved away from the cpu that is potentially |
2883 | * running at a lower capacity. | 3270 | * running at a lower capacity. |
2884 | */ | 3271 | */ |
2885 | wl = (wl * SCHED_LOAD_SCALE) / power; | 3272 | wl = (wl * SCHED_POWER_SCALE) / power; |
2886 | 3273 | ||
2887 | if (wl > max_load) { | 3274 | if (wl > max_load) { |
2888 | max_load = wl; | 3275 | max_load = wl; |
@@ -2902,7 +3289,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
2902 | /* Working cpumask for load_balance and load_balance_newidle. */ | 3289 | /* Working cpumask for load_balance and load_balance_newidle. */ |
2903 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 3290 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
2904 | 3291 | ||
2905 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, | 3292 | static int need_active_balance(struct sched_domain *sd, int idle, |
2906 | int busiest_cpu, int this_cpu) | 3293 | int busiest_cpu, int this_cpu) |
2907 | { | 3294 | { |
2908 | if (idle == CPU_NEWLY_IDLE) { | 3295 | if (idle == CPU_NEWLY_IDLE) { |
@@ -2934,10 +3321,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, | |||
2934 | * move_tasks() will succeed. ld_moved will be true and this | 3321 | * move_tasks() will succeed. ld_moved will be true and this |
2935 | * active balance code will not be triggered. | 3322 | * active balance code will not be triggered. |
2936 | */ | 3323 | */ |
2937 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
2938 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
2939 | return 0; | ||
2940 | |||
2941 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | 3324 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) |
2942 | return 0; | 3325 | return 0; |
2943 | } | 3326 | } |
@@ -2955,7 +3338,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2955 | struct sched_domain *sd, enum cpu_idle_type idle, | 3338 | struct sched_domain *sd, enum cpu_idle_type idle, |
2956 | int *balance) | 3339 | int *balance) |
2957 | { | 3340 | { |
2958 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 3341 | int ld_moved, all_pinned = 0, active_balance = 0; |
2959 | struct sched_group *group; | 3342 | struct sched_group *group; |
2960 | unsigned long imbalance; | 3343 | unsigned long imbalance; |
2961 | struct rq *busiest; | 3344 | struct rq *busiest; |
@@ -2964,21 +3347,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2964 | 3347 | ||
2965 | cpumask_copy(cpus, cpu_active_mask); | 3348 | cpumask_copy(cpus, cpu_active_mask); |
2966 | 3349 | ||
2967 | /* | ||
2968 | * When power savings policy is enabled for the parent domain, idle | ||
2969 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
2970 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
2971 | * portraying it as CPU_NOT_IDLE. | ||
2972 | */ | ||
2973 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
2974 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
2975 | sd_idle = 1; | ||
2976 | |||
2977 | schedstat_inc(sd, lb_count[idle]); | 3350 | schedstat_inc(sd, lb_count[idle]); |
2978 | 3351 | ||
2979 | redo: | 3352 | redo: |
2980 | update_shares(sd); | 3353 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, |
2981 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
2982 | cpus, balance); | 3354 | cpus, balance); |
2983 | 3355 | ||
2984 | if (*balance == 0) | 3356 | if (*balance == 0) |
@@ -3007,6 +3379,7 @@ redo: | |||
3007 | * still unbalanced. ld_moved simply stays zero, so it is | 3379 | * still unbalanced. ld_moved simply stays zero, so it is |
3008 | * correctly treated as an imbalance. | 3380 | * correctly treated as an imbalance. |
3009 | */ | 3381 | */ |
3382 | all_pinned = 1; | ||
3010 | local_irq_save(flags); | 3383 | local_irq_save(flags); |
3011 | double_rq_lock(this_rq, busiest); | 3384 | double_rq_lock(this_rq, busiest); |
3012 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 3385 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
@@ -3031,10 +3404,16 @@ redo: | |||
3031 | 3404 | ||
3032 | if (!ld_moved) { | 3405 | if (!ld_moved) { |
3033 | schedstat_inc(sd, lb_failed[idle]); | 3406 | schedstat_inc(sd, lb_failed[idle]); |
3034 | sd->nr_balance_failed++; | 3407 | /* |
3408 | * Increment the failure counter only on periodic balance. | ||
3409 | * We do not want newidle balance, which can be very | ||
3410 | * frequent, to pollute the failure counter and cause | ||
3411 | * excessive cache_hot migrations and active balances. | ||
3412 | */ | ||
3413 | if (idle != CPU_NEWLY_IDLE) | ||
3414 | sd->nr_balance_failed++; | ||
3035 | 3415 | ||
3036 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), | 3416 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { |
3037 | this_cpu)) { | ||
3038 | raw_spin_lock_irqsave(&busiest->lock, flags); | 3417 | raw_spin_lock_irqsave(&busiest->lock, flags); |
3039 | 3418 | ||
3040 | /* don't kick the active_load_balance_cpu_stop, | 3419 | /* don't kick the active_load_balance_cpu_stop, |
@@ -3089,10 +3468,6 @@ redo: | |||
3089 | sd->balance_interval *= 2; | 3468 | sd->balance_interval *= 2; |
3090 | } | 3469 | } |
3091 | 3470 | ||
3092 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
3093 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3094 | ld_moved = -1; | ||
3095 | |||
3096 | goto out; | 3471 | goto out; |
3097 | 3472 | ||
3098 | out_balanced: | 3473 | out_balanced: |
@@ -3106,14 +3481,8 @@ out_one_pinned: | |||
3106 | (sd->balance_interval < sd->max_interval)) | 3481 | (sd->balance_interval < sd->max_interval)) |
3107 | sd->balance_interval *= 2; | 3482 | sd->balance_interval *= 2; |
3108 | 3483 | ||
3109 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3484 | ld_moved = 0; |
3110 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3111 | ld_moved = -1; | ||
3112 | else | ||
3113 | ld_moved = 0; | ||
3114 | out: | 3485 | out: |
3115 | if (ld_moved) | ||
3116 | update_shares(sd); | ||
3117 | return ld_moved; | 3486 | return ld_moved; |
3118 | } | 3487 | } |
3119 | 3488 | ||
@@ -3137,6 +3506,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3137 | */ | 3506 | */ |
3138 | raw_spin_unlock(&this_rq->lock); | 3507 | raw_spin_unlock(&this_rq->lock); |
3139 | 3508 | ||
3509 | update_shares(this_cpu); | ||
3510 | rcu_read_lock(); | ||
3140 | for_each_domain(this_cpu, sd) { | 3511 | for_each_domain(this_cpu, sd) { |
3141 | unsigned long interval; | 3512 | unsigned long interval; |
3142 | int balance = 1; | 3513 | int balance = 1; |
@@ -3158,6 +3529,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3158 | break; | 3529 | break; |
3159 | } | 3530 | } |
3160 | } | 3531 | } |
3532 | rcu_read_unlock(); | ||
3161 | 3533 | ||
3162 | raw_spin_lock(&this_rq->lock); | 3534 | raw_spin_lock(&this_rq->lock); |
3163 | 3535 | ||
@@ -3206,6 +3578,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
3206 | double_lock_balance(busiest_rq, target_rq); | 3578 | double_lock_balance(busiest_rq, target_rq); |
3207 | 3579 | ||
3208 | /* Search for an sd spanning us and the target CPU. */ | 3580 | /* Search for an sd spanning us and the target CPU. */ |
3581 | rcu_read_lock(); | ||
3209 | for_each_domain(target_cpu, sd) { | 3582 | for_each_domain(target_cpu, sd) { |
3210 | if ((sd->flags & SD_LOAD_BALANCE) && | 3583 | if ((sd->flags & SD_LOAD_BALANCE) && |
3211 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | 3584 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) |
@@ -3221,6 +3594,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
3221 | else | 3594 | else |
3222 | schedstat_inc(sd, alb_failed); | 3595 | schedstat_inc(sd, alb_failed); |
3223 | } | 3596 | } |
3597 | rcu_read_unlock(); | ||
3224 | double_unlock_balance(busiest_rq, target_rq); | 3598 | double_unlock_balance(busiest_rq, target_rq); |
3225 | out_unlock: | 3599 | out_unlock: |
3226 | busiest_rq->active_balance = 0; | 3600 | busiest_rq->active_balance = 0; |
@@ -3347,6 +3721,7 @@ static int find_new_ilb(int cpu) | |||
3347 | { | 3721 | { |
3348 | struct sched_domain *sd; | 3722 | struct sched_domain *sd; |
3349 | struct sched_group *ilb_group; | 3723 | struct sched_group *ilb_group; |
3724 | int ilb = nr_cpu_ids; | ||
3350 | 3725 | ||
3351 | /* | 3726 | /* |
3352 | * Have idle load balancer selection from semi-idle packages only | 3727 | * Have idle load balancer selection from semi-idle packages only |
@@ -3362,20 +3737,25 @@ static int find_new_ilb(int cpu) | |||
3362 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | 3737 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
3363 | goto out_done; | 3738 | goto out_done; |
3364 | 3739 | ||
3740 | rcu_read_lock(); | ||
3365 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3741 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
3366 | ilb_group = sd->groups; | 3742 | ilb_group = sd->groups; |
3367 | 3743 | ||
3368 | do { | 3744 | do { |
3369 | if (is_semi_idle_group(ilb_group)) | 3745 | if (is_semi_idle_group(ilb_group)) { |
3370 | return cpumask_first(nohz.grp_idle_mask); | 3746 | ilb = cpumask_first(nohz.grp_idle_mask); |
3747 | goto unlock; | ||
3748 | } | ||
3371 | 3749 | ||
3372 | ilb_group = ilb_group->next; | 3750 | ilb_group = ilb_group->next; |
3373 | 3751 | ||
3374 | } while (ilb_group != sd->groups); | 3752 | } while (ilb_group != sd->groups); |
3375 | } | 3753 | } |
3754 | unlock: | ||
3755 | rcu_read_unlock(); | ||
3376 | 3756 | ||
3377 | out_done: | 3757 | out_done: |
3378 | return nr_cpu_ids; | 3758 | return ilb; |
3379 | } | 3759 | } |
3380 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3760 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
3381 | static inline int find_new_ilb(int call_cpu) | 3761 | static inline int find_new_ilb(int call_cpu) |
@@ -3490,6 +3870,17 @@ void select_nohz_load_balancer(int stop_tick) | |||
3490 | 3870 | ||
3491 | static DEFINE_SPINLOCK(balancing); | 3871 | static DEFINE_SPINLOCK(balancing); |
3492 | 3872 | ||
3873 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | ||
3874 | |||
3875 | /* | ||
3876 | * Scale the max load_balance interval with the number of CPUs in the system. | ||
3877 | * This trades load-balance latency on larger machines for less cross talk. | ||
3878 | */ | ||
3879 | static void update_max_interval(void) | ||
3880 | { | ||
3881 | max_load_balance_interval = HZ*num_online_cpus()/10; | ||
3882 | } | ||
3883 | |||
3493 | /* | 3884 | /* |
3494 | * It checks each scheduling domain to see if it is due to be balanced, | 3885 | * It checks each scheduling domain to see if it is due to be balanced, |
3495 | * and initiates a balancing operation if so. | 3886 | * and initiates a balancing operation if so. |
@@ -3507,6 +3898,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3507 | int update_next_balance = 0; | 3898 | int update_next_balance = 0; |
3508 | int need_serialize; | 3899 | int need_serialize; |
3509 | 3900 | ||
3901 | update_shares(cpu); | ||
3902 | |||
3903 | rcu_read_lock(); | ||
3510 | for_each_domain(cpu, sd) { | 3904 | for_each_domain(cpu, sd) { |
3511 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3905 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3512 | continue; | 3906 | continue; |
@@ -3517,10 +3911,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3517 | 3911 | ||
3518 | /* scale ms to jiffies */ | 3912 | /* scale ms to jiffies */ |
3519 | interval = msecs_to_jiffies(interval); | 3913 | interval = msecs_to_jiffies(interval); |
3520 | if (unlikely(!interval)) | 3914 | interval = clamp(interval, 1UL, max_load_balance_interval); |
3521 | interval = 1; | ||
3522 | if (interval > HZ*NR_CPUS/10) | ||
3523 | interval = HZ*NR_CPUS/10; | ||
3524 | 3915 | ||
3525 | need_serialize = sd->flags & SD_SERIALIZE; | 3916 | need_serialize = sd->flags & SD_SERIALIZE; |
3526 | 3917 | ||
@@ -3533,8 +3924,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3533 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 3924 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
3534 | /* | 3925 | /* |
3535 | * We've pulled tasks over so either we're no | 3926 | * We've pulled tasks over so either we're no |
3536 | * longer idle, or one of our SMT siblings is | 3927 | * longer idle. |
3537 | * not idle. | ||
3538 | */ | 3928 | */ |
3539 | idle = CPU_NOT_IDLE; | 3929 | idle = CPU_NOT_IDLE; |
3540 | } | 3930 | } |
@@ -3556,6 +3946,7 @@ out: | |||
3556 | if (!balance) | 3946 | if (!balance) |
3557 | break; | 3947 | break; |
3558 | } | 3948 | } |
3949 | rcu_read_unlock(); | ||
3559 | 3950 | ||
3560 | /* | 3951 | /* |
3561 | * next_balance will be updated only when there is a need. | 3952 | * next_balance will be updated only when there is a need. |
@@ -3751,8 +4142,11 @@ static void task_fork_fair(struct task_struct *p) | |||
3751 | 4142 | ||
3752 | update_rq_clock(rq); | 4143 | update_rq_clock(rq); |
3753 | 4144 | ||
3754 | if (unlikely(task_cpu(p) != this_cpu)) | 4145 | if (unlikely(task_cpu(p) != this_cpu)) { |
4146 | rcu_read_lock(); | ||
3755 | __set_task_cpu(p, this_cpu); | 4147 | __set_task_cpu(p, this_cpu); |
4148 | rcu_read_unlock(); | ||
4149 | } | ||
3756 | 4150 | ||
3757 | update_curr(cfs_rq); | 4151 | update_curr(cfs_rq); |
3758 | 4152 | ||
@@ -3778,33 +4172,62 @@ static void task_fork_fair(struct task_struct *p) | |||
3778 | * Priority of the task has changed. Check to see if we preempt | 4172 | * Priority of the task has changed. Check to see if we preempt |
3779 | * the current task. | 4173 | * the current task. |
3780 | */ | 4174 | */ |
3781 | static void prio_changed_fair(struct rq *rq, struct task_struct *p, | 4175 | static void |
3782 | int oldprio, int running) | 4176 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) |
3783 | { | 4177 | { |
4178 | if (!p->se.on_rq) | ||
4179 | return; | ||
4180 | |||
3784 | /* | 4181 | /* |
3785 | * Reschedule if we are currently running on this runqueue and | 4182 | * Reschedule if we are currently running on this runqueue and |
3786 | * our priority decreased, or if we are not currently running on | 4183 | * our priority decreased, or if we are not currently running on |
3787 | * this runqueue and our priority is higher than the current's | 4184 | * this runqueue and our priority is higher than the current's |
3788 | */ | 4185 | */ |
3789 | if (running) { | 4186 | if (rq->curr == p) { |
3790 | if (p->prio > oldprio) | 4187 | if (p->prio > oldprio) |
3791 | resched_task(rq->curr); | 4188 | resched_task(rq->curr); |
3792 | } else | 4189 | } else |
3793 | check_preempt_curr(rq, p, 0); | 4190 | check_preempt_curr(rq, p, 0); |
3794 | } | 4191 | } |
3795 | 4192 | ||
4193 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | ||
4194 | { | ||
4195 | struct sched_entity *se = &p->se; | ||
4196 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
4197 | |||
4198 | /* | ||
4199 | * Ensure the task's vruntime is normalized, so that when its | ||
4200 | * switched back to the fair class the enqueue_entity(.flags=0) will | ||
4201 | * do the right thing. | ||
4202 | * | ||
4203 | * If it was on_rq, then the dequeue_entity(.flags=0) will already | ||
4204 | * have normalized the vruntime, if it was !on_rq, then only when | ||
4205 | * the task is sleeping will it still have non-normalized vruntime. | ||
4206 | */ | ||
4207 | if (!se->on_rq && p->state != TASK_RUNNING) { | ||
4208 | /* | ||
4209 | * Fix up our vruntime so that the current sleep doesn't | ||
4210 | * cause 'unlimited' sleep bonus. | ||
4211 | */ | ||
4212 | place_entity(cfs_rq, se, 0); | ||
4213 | se->vruntime -= cfs_rq->min_vruntime; | ||
4214 | } | ||
4215 | } | ||
4216 | |||
3796 | /* | 4217 | /* |
3797 | * We switched to the sched_fair class. | 4218 | * We switched to the sched_fair class. |
3798 | */ | 4219 | */ |
3799 | static void switched_to_fair(struct rq *rq, struct task_struct *p, | 4220 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
3800 | int running) | ||
3801 | { | 4221 | { |
4222 | if (!p->se.on_rq) | ||
4223 | return; | ||
4224 | |||
3802 | /* | 4225 | /* |
3803 | * We were most likely switched from sched_rt, so | 4226 | * We were most likely switched from sched_rt, so |
3804 | * kick off the schedule if running, otherwise just see | 4227 | * kick off the schedule if running, otherwise just see |
3805 | * if we can still preempt the current task. | 4228 | * if we can still preempt the current task. |
3806 | */ | 4229 | */ |
3807 | if (running) | 4230 | if (rq->curr == p) |
3808 | resched_task(rq->curr); | 4231 | resched_task(rq->curr); |
3809 | else | 4232 | else |
3810 | check_preempt_curr(rq, p, 0); | 4233 | check_preempt_curr(rq, p, 0); |
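The long comment in switched_from_fair() describes a single invariant: entities queued on (or sleeping against) a cfs_rq hold an absolute vruntime that includes that queue's min_vruntime, while entities in transit hold a relative value so that enqueueing can add the destination's min_vruntime back. The toy model below tracks only that bookkeeping; the place_entity() fixup is omitted, and struct toy_se and the helpers are illustrative names, not kernel API.

#include <stdint.h>

struct toy_se {
	uint64_t vruntime;	/* absolute while queued or sleeping, relative in transit */
	int on_rq;
};

/* Leaving the fair class while asleep: strip the old queue's min_vruntime
 * so the stored value becomes relative (dequeue_entity() already did this
 * for tasks that were on the runqueue). */
static void toy_switched_from_fair(struct toy_se *se, int task_running,
				   uint64_t min_vruntime)
{
	if (!se->on_rq && !task_running)
		se->vruntime -= min_vruntime;
}

/* Re-entering the fair class adds the then-current min_vruntime back,
 * restoring an absolute position without an 'unlimited' sleep bonus. */
static void toy_enqueue_fair(struct toy_se *se, uint64_t min_vruntime)
{
	se->vruntime += min_vruntime;
	se->on_rq = 1;
}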
@@ -3824,13 +4247,26 @@ static void set_curr_task_fair(struct rq *rq) | |||
3824 | } | 4247 | } |
3825 | 4248 | ||
3826 | #ifdef CONFIG_FAIR_GROUP_SCHED | 4249 | #ifdef CONFIG_FAIR_GROUP_SCHED |
3827 | static void moved_group_fair(struct task_struct *p, int on_rq) | 4250 | static void task_move_group_fair(struct task_struct *p, int on_rq) |
3828 | { | 4251 | { |
3829 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 4252 | /* |
3830 | 4253 | * If the task was not on the rq at the time of this cgroup movement | |
3831 | update_curr(cfs_rq); | 4254 | * it must have been asleep, sleeping tasks keep their ->vruntime |
4255 | * absolute on their old rq until wakeup (needed for the fair sleeper | ||
4256 | * bonus in place_entity()). | ||
4257 | * | ||
4258 | * If it was on the rq, we've just 'preempted' it, which does convert | ||
4259 | * ->vruntime to a relative base. | ||
4260 | * | ||
4261 | * Make sure both cases convert their relative position when migrating | ||
4262 | * to another cgroup's rq. This does somewhat interfere with the | ||
4263 | * fair sleeper stuff for the first placement, but who cares. | ||
4264 | */ | ||
4265 | if (!on_rq) | ||
4266 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | ||
4267 | set_task_rq(p, task_cpu(p)); | ||
3832 | if (!on_rq) | 4268 | if (!on_rq) |
3833 | place_entity(cfs_rq, &p->se, 1); | 4269 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; |
3834 | } | 4270 | } |
3835 | #endif | 4271 | #endif |
3836 | 4272 | ||
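task_move_group_fair() applies the same relative/absolute convention across cgroups: a sleeping task still holds an absolute vruntime from its old group's cfs_rq, so the move subtracts the source min_vruntime, re-parents the task, and adds the destination min_vruntime. A hedged sketch follows; the min_vruntime values are passed in explicitly here, which is not how the kernel obtains them.

#include <stdint.h>

struct toy_task { uint64_t vruntime; };

/* Model of the cgroup-move fixup. 'on_rq' distinguishes a preempted task,
 * whose vruntime the scheduler has already made relative, from a sleeping
 * one that still carries an absolute value. */
static void toy_task_move_group(struct toy_task *p, int on_rq,
				uint64_t src_min_vruntime,
				uint64_t dst_min_vruntime)
{
	if (!on_rq)
		p->vruntime -= src_min_vruntime;  /* was absolute on the old queue */

	/* set_task_rq(p, task_cpu(p)) re-parents the task at this point */

	if (!on_rq)
		p->vruntime += dst_min_vruntime;  /* absolute again on the new queue */
}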
@@ -3857,6 +4293,7 @@ static const struct sched_class fair_sched_class = { | |||
3857 | .enqueue_task = enqueue_task_fair, | 4293 | .enqueue_task = enqueue_task_fair, |
3858 | .dequeue_task = dequeue_task_fair, | 4294 | .dequeue_task = dequeue_task_fair, |
3859 | .yield_task = yield_task_fair, | 4295 | .yield_task = yield_task_fair, |
4296 | .yield_to_task = yield_to_task_fair, | ||
3860 | 4297 | ||
3861 | .check_preempt_curr = check_preempt_wakeup, | 4298 | .check_preempt_curr = check_preempt_wakeup, |
3862 | 4299 | ||
@@ -3877,12 +4314,13 @@ static const struct sched_class fair_sched_class = { | |||
3877 | .task_fork = task_fork_fair, | 4314 | .task_fork = task_fork_fair, |
3878 | 4315 | ||
3879 | .prio_changed = prio_changed_fair, | 4316 | .prio_changed = prio_changed_fair, |
4317 | .switched_from = switched_from_fair, | ||
3880 | .switched_to = switched_to_fair, | 4318 | .switched_to = switched_to_fair, |
3881 | 4319 | ||
3882 | .get_rr_interval = get_rr_interval_fair, | 4320 | .get_rr_interval = get_rr_interval_fair, |
3883 | 4321 | ||
3884 | #ifdef CONFIG_FAIR_GROUP_SCHED | 4322 | #ifdef CONFIG_FAIR_GROUP_SCHED |
3885 | .moved_group = moved_group_fair, | 4323 | .task_move_group = task_move_group_fair, |
3886 | #endif | 4324 | #endif |
3887 | }; | 4325 | }; |
3888 | 4326 | ||
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 83c66e8ad3ee..1e7066d76c26 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) | |||
52 | SCHED_FEAT(HRTICK, 0) | 52 | SCHED_FEAT(HRTICK, 0) |
53 | SCHED_FEAT(DOUBLE_TICK, 0) | 53 | SCHED_FEAT(DOUBLE_TICK, 0) |
54 | SCHED_FEAT(LB_BIAS, 1) | 54 | SCHED_FEAT(LB_BIAS, 1) |
55 | SCHED_FEAT(LB_SHARES_UPDATE, 1) | ||
56 | SCHED_FEAT(ASYM_EFF_LOAD, 1) | ||
57 | 55 | ||
58 | /* | 56 | /* |
59 | * Spin-wait on mutex acquisition when the mutex owner is running on | 57 | * Spin-wait on mutex acquisition when the mutex owner is running on |
@@ -61,3 +59,16 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1) | |||
61 | * release the lock. Decreases scheduling overhead. | 59 | * release the lock. Decreases scheduling overhead. |
62 | */ | 60 | */ |
63 | SCHED_FEAT(OWNER_SPIN, 1) | 61 | SCHED_FEAT(OWNER_SPIN, 1) |
62 | |||
63 | /* | ||
64 | * Decrement CPU power based on irq activity | ||
65 | */ | ||
66 | SCHED_FEAT(NONIRQ_POWER, 1) | ||
67 | |||
68 | /* | ||
69 | * Queue remote wakeups on the target CPU and process them | ||
70 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | ||
71 | */ | ||
72 | SCHED_FEAT(TTWU_QUEUE, 1) | ||
73 | |||
74 | SCHED_FEAT(FORCE_SD_OVERLAP, 0) | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9fa0f402c87c..0a51882534ea 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -7,7 +7,7 @@ | |||
7 | 7 | ||
8 | #ifdef CONFIG_SMP | 8 | #ifdef CONFIG_SMP |
9 | static int | 9 | static int |
10 | select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | 10 | select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) |
11 | { | 11 | { |
12 | return task_cpu(p); /* IDLE tasks are never migrated */ | 12 | return task_cpu(p); /* IDLE tasks are never migrated */ |
13 | } | 13 | } |
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq) | |||
52 | { | 52 | { |
53 | } | 53 | } |
54 | 54 | ||
55 | static void switched_to_idle(struct rq *rq, struct task_struct *p, | 55 | static void switched_to_idle(struct rq *rq, struct task_struct *p) |
56 | int running) | ||
57 | { | 56 | { |
58 | /* Can this actually happen?? */ | 57 | BUG(); |
59 | if (running) | ||
60 | resched_task(rq->curr); | ||
61 | else | ||
62 | check_preempt_curr(rq, p, 0); | ||
63 | } | 58 | } |
64 | 59 | ||
65 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | 60 | static void |
66 | int oldprio, int running) | 61 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) |
67 | { | 62 | { |
68 | /* This can happen for hot plug CPUS */ | 63 | BUG(); |
69 | |||
70 | /* | ||
71 | * Reschedule if we are currently running on this runqueue and | ||
72 | * our priority decreased, or if we are not currently running on | ||
73 | * this runqueue and our priority is higher than the current's | ||
74 | */ | ||
75 | if (running) { | ||
76 | if (p->prio > oldprio) | ||
77 | resched_task(rq->curr); | ||
78 | } else | ||
79 | check_preempt_curr(rq, p, 0); | ||
80 | } | 64 | } |
81 | 65 | ||
82 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | 66 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) |
@@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = { | |||
110 | 94 | ||
111 | .prio_changed = prio_changed_idle, | 95 | .prio_changed = prio_changed_idle, |
112 | .switched_to = switched_to_idle, | 96 | .switched_to = switched_to_idle, |
113 | |||
114 | /* no .task_new for idle tasks */ | ||
115 | }; | 97 | }; |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e40e7fe43170..58cf5d18dfdc 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -183,6 +183,25 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | 183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); |
184 | } | 184 | } |
185 | 185 | ||
186 | typedef struct task_group *rt_rq_iter_t; | ||
187 | |||
188 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
189 | for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ | ||
190 | (&iter->list != &task_groups) && \ | ||
191 | (rt_rq = iter->rt_rq[cpu_of(rq)]); \ | ||
192 | iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) | ||
193 | |||
194 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
195 | { | ||
196 | list_add_rcu(&rt_rq->leaf_rt_rq_list, | ||
197 | &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); | ||
198 | } | ||
199 | |||
200 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
201 | { | ||
202 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
203 | } | ||
204 | |||
186 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 205 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
187 | list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | 206 | list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) |
188 | 207 | ||
@@ -199,11 +218,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | |||
199 | 218 | ||
200 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 219 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
201 | { | 220 | { |
202 | int this_cpu = smp_processor_id(); | ||
203 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | 221 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; |
204 | struct sched_rt_entity *rt_se; | 222 | struct sched_rt_entity *rt_se; |
205 | 223 | ||
206 | rt_se = rt_rq->tg->rt_se[this_cpu]; | 224 | int cpu = cpu_of(rq_of_rt_rq(rt_rq)); |
225 | |||
226 | rt_se = rt_rq->tg->rt_se[cpu]; | ||
207 | 227 | ||
208 | if (rt_rq->rt_nr_running) { | 228 | if (rt_rq->rt_nr_running) { |
209 | if (rt_se && !on_rt_rq(rt_se)) | 229 | if (rt_se && !on_rt_rq(rt_se)) |
@@ -215,10 +235,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
215 | 235 | ||
216 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | 236 | static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) |
217 | { | 237 | { |
218 | int this_cpu = smp_processor_id(); | ||
219 | struct sched_rt_entity *rt_se; | 238 | struct sched_rt_entity *rt_se; |
239 | int cpu = cpu_of(rq_of_rt_rq(rt_rq)); | ||
220 | 240 | ||
221 | rt_se = rt_rq->tg->rt_se[this_cpu]; | 241 | rt_se = rt_rq->tg->rt_se[cpu]; |
222 | 242 | ||
223 | if (rt_se && on_rt_rq(rt_se)) | 243 | if (rt_se && on_rt_rq(rt_se)) |
224 | dequeue_rt_entity(rt_se); | 244 | dequeue_rt_entity(rt_se); |
@@ -276,6 +296,19 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
276 | return ktime_to_ns(def_rt_bandwidth.rt_period); | 296 | return ktime_to_ns(def_rt_bandwidth.rt_period); |
277 | } | 297 | } |
278 | 298 | ||
299 | typedef struct rt_rq *rt_rq_iter_t; | ||
300 | |||
301 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
302 | for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
303 | |||
304 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | ||
305 | { | ||
306 | } | ||
307 | |||
308 | static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) | ||
309 | { | ||
310 | } | ||
311 | |||
279 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | 312 | #define for_each_leaf_rt_rq(rt_rq, rq) \ |
280 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | 313 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) |
281 | 314 | ||
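The rt_rq_iter_t / for_each_rt_rq() pair lets __disable_runtime() and __enable_runtime() visit every rt_rq of a runqueue, not only those currently linked on the leaf list. In the !CONFIG_RT_GROUP_SCHED variant shown directly above, the iterator is a dummy: the comma expression evaluates it once and the loop body runs exactly once for rq->rt. A compilable illustration of that trick, with the types trimmed to the bare minimum:

#include <stdio.h>

struct rt_rq { int id; };
struct rq { struct rt_rq rt; };

typedef struct rt_rq *rt_rq_iter_t;

/* Mirrors the !CONFIG_RT_GROUP_SCHED macro: 'iter' is only touched to keep
 * the signature uniform with the group-scheduling variant. */
#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &(rq)->rt; rt_rq; rt_rq = NULL)

int main(void)
{
	struct rq rq = { .rt = { .id = 0 } };
	rt_rq_iter_t iter = NULL;
	struct rt_rq *rt_rq;

	for_each_rt_rq(rt_rq, iter, &rq)
		printf("visiting rt_rq %d\n", rt_rq->id);	/* prints once */

	return 0;
}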
@@ -382,12 +415,13 @@ next: | |||
382 | static void __disable_runtime(struct rq *rq) | 415 | static void __disable_runtime(struct rq *rq) |
383 | { | 416 | { |
384 | struct root_domain *rd = rq->rd; | 417 | struct root_domain *rd = rq->rd; |
418 | rt_rq_iter_t iter; | ||
385 | struct rt_rq *rt_rq; | 419 | struct rt_rq *rt_rq; |
386 | 420 | ||
387 | if (unlikely(!scheduler_running)) | 421 | if (unlikely(!scheduler_running)) |
388 | return; | 422 | return; |
389 | 423 | ||
390 | for_each_leaf_rt_rq(rt_rq, rq) { | 424 | for_each_rt_rq(rt_rq, iter, rq) { |
391 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 425 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
392 | s64 want; | 426 | s64 want; |
393 | int i; | 427 | int i; |
@@ -467,6 +501,7 @@ static void disable_runtime(struct rq *rq) | |||
467 | 501 | ||
468 | static void __enable_runtime(struct rq *rq) | 502 | static void __enable_runtime(struct rq *rq) |
469 | { | 503 | { |
504 | rt_rq_iter_t iter; | ||
470 | struct rt_rq *rt_rq; | 505 | struct rt_rq *rt_rq; |
471 | 506 | ||
472 | if (unlikely(!scheduler_running)) | 507 | if (unlikely(!scheduler_running)) |
@@ -475,7 +510,7 @@ static void __enable_runtime(struct rq *rq) | |||
475 | /* | 510 | /* |
476 | * Reset each runqueue's bandwidth settings | 511 | * Reset each runqueue's bandwidth settings |
477 | */ | 512 | */ |
478 | for_each_leaf_rt_rq(rt_rq, rq) { | 513 | for_each_rt_rq(rt_rq, iter, rq) { |
479 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 514 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
480 | 515 | ||
481 | raw_spin_lock(&rt_b->rt_runtime_lock); | 516 | raw_spin_lock(&rt_b->rt_runtime_lock); |
@@ -542,12 +577,22 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
542 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | 577 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { |
543 | rt_rq->rt_throttled = 0; | 578 | rt_rq->rt_throttled = 0; |
544 | enqueue = 1; | 579 | enqueue = 1; |
580 | |||
581 | /* | ||
582 | * Force a clock update if the CPU was idle, | ||
583 | * lest wakeup -> unthrottle time accumulate. | ||
584 | */ | ||
585 | if (rt_rq->rt_nr_running && rq->curr == rq->idle) | ||
586 | rq->skip_clock_update = -1; | ||
545 | } | 587 | } |
546 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 588 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
547 | idle = 0; | 589 | idle = 0; |
548 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 590 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
549 | } else if (rt_rq->rt_nr_running) | 591 | } else if (rt_rq->rt_nr_running) { |
550 | idle = 0; | 592 | idle = 0; |
593 | if (!rt_rq_throttled(rt_rq)) | ||
594 | enqueue = 1; | ||
595 | } | ||
551 | 596 | ||
552 | if (enqueue) | 597 | if (enqueue) |
553 | sched_rt_rq_enqueue(rt_rq); | 598 | sched_rt_rq_enqueue(rt_rq); |
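The reworked period-timer branch above (re)enqueues an rt_rq in two cases: the throttle was just lifted because rt_time dropped below the runtime budget, or the group accrued no runtime this period yet still has runnable tasks and is not throttled (the new else branch). A standalone distillation of that decision, with booleans standing in for rt_rq->rt_time, rt_throttled, the rt_time < runtime test, and rt_nr_running; this sketches the branch structure, not the kernel function:

#include <stdbool.h>
#include <stdio.h>

/* booleans standing in for the rt_rq fields the branch reads */
struct rt_rq_state {
        bool has_rt_time;      /* rt_rq->rt_time != 0       */
        bool throttled;        /* rt_rq->rt_throttled       */
        bool under_runtime;    /* rt_rq->rt_time < runtime  */
        bool has_tasks;        /* rt_rq->rt_nr_running != 0 */
};

/* returns true when sched_rt_rq_enqueue() should be called for this period */
static bool should_enqueue(struct rt_rq_state *s)
{
        if (s->has_rt_time) {
                /* ran this period: re-enqueue only if the throttle just lifted */
                if (s->throttled && s->under_runtime) {
                        s->throttled = false;
                        return true;
                }
                return false;
        }
        /* no accrued runtime: keep unthrottled groups with runnable tasks enqueued */
        return s->has_tasks && !s->throttled;
}

int main(void)
{
        struct rt_rq_state lifted  = { true,  true,  true,  true };
        struct rt_rq_state idleish = { false, false, false, true };
        struct rt_rq_state stuck   = { true,  true,  false, true };

        printf("%d %d %d\n", should_enqueue(&lifted),
               should_enqueue(&idleish), should_enqueue(&stuck));   /* 1 1 0 */
        return 0;
}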
@@ -606,10 +651,10 @@ static void update_curr_rt(struct rq *rq) | |||
606 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 651 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
607 | u64 delta_exec; | 652 | u64 delta_exec; |
608 | 653 | ||
609 | if (!task_has_rt_policy(curr)) | 654 | if (curr->sched_class != &rt_sched_class) |
610 | return; | 655 | return; |
611 | 656 | ||
612 | delta_exec = rq->clock - curr->se.exec_start; | 657 | delta_exec = rq->clock_task - curr->se.exec_start; |
613 | if (unlikely((s64)delta_exec < 0)) | 658 | if (unlikely((s64)delta_exec < 0)) |
614 | delta_exec = 0; | 659 | delta_exec = 0; |
615 | 660 | ||
@@ -618,7 +663,7 @@ static void update_curr_rt(struct rq *rq) | |||
618 | curr->se.sum_exec_runtime += delta_exec; | 663 | curr->se.sum_exec_runtime += delta_exec; |
619 | account_group_exec_runtime(curr, delta_exec); | 664 | account_group_exec_runtime(curr, delta_exec); |
620 | 665 | ||
621 | curr->se.exec_start = rq->clock; | 666 | curr->se.exec_start = rq->clock_task; |
622 | cpuacct_charge(curr, delta_exec); | 667 | cpuacct_charge(curr, delta_exec); |
623 | 668 | ||
624 | sched_rt_avg_update(rq, delta_exec); | 669 | sched_rt_avg_update(rq, delta_exec); |
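update_curr_rt() now charges runtime against rq->clock_task rather than rq->clock, i.e. against the per-runqueue clock that, on kernels which account it, already has interrupt-servicing time subtracted, so an RT task is only billed for cycles it could actually use. A toy userspace sketch of the same delta bookkeeping; the field names mirror the ones above but the types are stand-ins:

#include <stdio.h>

typedef unsigned long long u64;

/* stand-ins: just enough of rq / sched_entity to show the accounting */
struct sched_entity { u64 exec_start; u64 sum_exec_runtime; };
struct rq           { u64 clock_task; };

/* mirrors the delta handling in update_curr_rt(): never charge a negative span */
static void update_curr_sketch(struct rq *rq, struct sched_entity *se)
{
        long long delta_exec = (long long)(rq->clock_task - se->exec_start);

        if (delta_exec < 0)              /* clock skew or reset: charge nothing */
                delta_exec = 0;

        se->sum_exec_runtime += (u64)delta_exec;
        se->exec_start = rq->clock_task; /* next delta starts from "now" */
}

int main(void)
{
        struct rq rq = { .clock_task = 1000000 };
        struct sched_entity se = { .exec_start = 400000, .sum_exec_runtime = 0 };

        update_curr_sketch(&rq, &se);
        printf("charged %llu ns\n", se.sum_exec_runtime);  /* 600000 */
        return 0;
}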
@@ -825,6 +870,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | |||
825 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 870 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) |
826 | return; | 871 | return; |
827 | 872 | ||
873 | if (!rt_rq->rt_nr_running) | ||
874 | list_add_leaf_rt_rq(rt_rq); | ||
875 | |||
828 | if (head) | 876 | if (head) |
829 | list_add(&rt_se->run_list, queue); | 877 | list_add(&rt_se->run_list, queue); |
830 | else | 878 | else |
@@ -844,6 +892,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
844 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | 892 | __clear_bit(rt_se_prio(rt_se), array->bitmap); |
845 | 893 | ||
846 | dec_rt_tasks(rt_se, rt_rq); | 894 | dec_rt_tasks(rt_se, rt_rq); |
895 | if (!rt_rq->rt_nr_running) | ||
896 | list_del_leaf_rt_rq(rt_rq); | ||
847 | } | 897 | } |
848 | 898 | ||
849 | /* | 899 | /* |
@@ -949,40 +999,55 @@ static void yield_task_rt(struct rq *rq) | |||
949 | static int find_lowest_rq(struct task_struct *task); | 999 | static int find_lowest_rq(struct task_struct *task); |
950 | 1000 | ||
951 | static int | 1001 | static int |
952 | select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | 1002 | select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) |
953 | { | 1003 | { |
1004 | struct task_struct *curr; | ||
1005 | struct rq *rq; | ||
1006 | int cpu; | ||
1007 | |||
954 | if (sd_flag != SD_BALANCE_WAKE) | 1008 | if (sd_flag != SD_BALANCE_WAKE) |
955 | return smp_processor_id(); | 1009 | return smp_processor_id(); |
956 | 1010 | ||
1011 | cpu = task_cpu(p); | ||
1012 | rq = cpu_rq(cpu); | ||
1013 | |||
1014 | rcu_read_lock(); | ||
1015 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | ||
1016 | |||
957 | /* | 1017 | /* |
958 | * If the current task is an RT task, then | 1018 | * If the current task on @p's runqueue is an RT task, then |
959 | * try to see if we can wake this RT task up on another | 1019 | * try to see if we can wake this RT task up on another |
960 | * runqueue. Otherwise simply start this RT task | 1020 | * runqueue. Otherwise simply start this RT task |
961 | * on its current runqueue. | 1021 | * on its current runqueue. |
962 | * | 1022 | * |
963 | * We want to avoid overloading runqueues. Even if | 1023 | * We want to avoid overloading runqueues. If the woken |
964 | * the RT task is of higher priority than the current RT task. | 1024 | * task is a higher priority, then it will stay on this CPU |
965 | * RT tasks behave differently than other tasks. If | 1025 | * and the lower prio task should be moved to another CPU. |
966 | * one gets preempted, we try to push it off to another queue. | 1026 | * Even though this will probably make the lower prio task |
967 | * So trying to keep a preempting RT task on the same | 1027 | * lose its cache, we do not want to bounce a higher task |
968 | * cache hot CPU will force the running RT task to | 1028 | * around just because it gave up its CPU, perhaps for a |
969 | * a cold CPU. So we waste all the cache for the lower | 1029 | * lock? |
970 | * RT task in hopes of saving some of a RT task | 1030 | * |
971 | * that is just being woken and probably will have | 1031 | * For equal prio tasks, we just let the scheduler sort it out. |
972 | * cold cache anyway. | 1032 | * |
1033 | * Otherwise, just let it ride on the affined RQ and the | ||
1034 | * post-schedule router will push the preempted task away | ||
1035 | * | ||
1036 | * This test is optimistic, if we get it wrong the load-balancer | ||
1037 | * will have to sort it out. | ||
973 | */ | 1038 | */ |
974 | if (unlikely(rt_task(rq->curr)) && | 1039 | if (curr && unlikely(rt_task(curr)) && |
1040 | (curr->rt.nr_cpus_allowed < 2 || | ||
1041 | curr->prio < p->prio) && | ||
975 | (p->rt.nr_cpus_allowed > 1)) { | 1042 | (p->rt.nr_cpus_allowed > 1)) { |
976 | int cpu = find_lowest_rq(p); | 1043 | int target = find_lowest_rq(p); |
977 | 1044 | ||
978 | return (cpu == -1) ? task_cpu(p) : cpu; | 1045 | if (target != -1) |
1046 | cpu = target; | ||
979 | } | 1047 | } |
1048 | rcu_read_unlock(); | ||
980 | 1049 | ||
981 | /* | 1050 | return cpu; |
982 | * Otherwise, just let it ride on the affined RQ and the | ||
983 | * post-schedule router will push the preempted task away | ||
984 | */ | ||
985 | return task_cpu(p); | ||
986 | } | 1051 | } |
987 | 1052 | ||
988 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1053 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
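The rewritten select_task_rq_rt() reduces to one optimistic test: send the waking task through find_lowest_rq() only when the task currently running on its old CPU is itself an RT task that either cannot migrate (nr_cpus_allowed < 2) or outranks the waker, and the waker itself is allowed on more than one CPU; otherwise the task stays put and the post-schedule push/pull logic cleans up. A standalone restatement of that predicate (lower prio value means higher priority, as in the kernel); the struct is a simplified stand-in, not task_struct:

#include <stdbool.h>
#include <stdio.h>

/* stand-in for the few task fields the decision reads */
struct task {
        bool is_rt;            /* rt_task(p)            */
        int  prio;             /* lower value wins      */
        int  nr_cpus_allowed;  /* p->rt.nr_cpus_allowed */
};

/* true -> consult find_lowest_rq(); false -> leave the waker on its previous CPU */
static bool should_search_lowest_rq(const struct task *curr, const struct task *waking)
{
        return curr && curr->is_rt &&
               (curr->nr_cpus_allowed < 2 || curr->prio < waking->prio) &&
               waking->nr_cpus_allowed > 1;
}

int main(void)
{
        struct task curr   = { .is_rt = true, .prio = 10, .nr_cpus_allowed = 4 };
        struct task waking = { .is_rt = true, .prio = 20, .nr_cpus_allowed = 4 };

        /* curr outranks the waker, so the waker should look for a less loaded CPU */
        printf("search lowest rq: %s\n",
               should_search_lowest_rq(&curr, &waking) ? "yes" : "no");
        return 0;
}

Getting the test wrong is cheap by design: as the comment above notes, the load balancer sorts out a bad guess later.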
@@ -1031,7 +1096,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag | |||
1031 | * to move current somewhere else, making room for our non-migratable | 1096 | * to move current somewhere else, making room for our non-migratable |
1032 | * task. | 1097 | * task. |
1033 | */ | 1098 | */ |
1034 | if (p->prio == rq->curr->prio && !need_resched()) | 1099 | if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) |
1035 | check_preempt_equal_prio(rq, p); | 1100 | check_preempt_equal_prio(rq, p); |
1036 | #endif | 1101 | #endif |
1037 | } | 1102 | } |
@@ -1074,7 +1139,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
1074 | } while (rt_rq); | 1139 | } while (rt_rq); |
1075 | 1140 | ||
1076 | p = rt_task_of(rt_se); | 1141 | p = rt_task_of(rt_se); |
1077 | p->se.exec_start = rq->clock; | 1142 | p->se.exec_start = rq->clock_task; |
1078 | 1143 | ||
1079 | return p; | 1144 | return p; |
1080 | } | 1145 | } |
@@ -1107,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1107 | * The previous task needs to be made eligible for pushing | 1172 | * The previous task needs to be made eligible for pushing |
1108 | * if it is still active | 1173 | * if it is still active |
1109 | */ | 1174 | */ |
1110 | if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) | 1175 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) |
1111 | enqueue_pushable_task(rq, p); | 1176 | enqueue_pushable_task(rq, p); |
1112 | } | 1177 | } |
1113 | 1178 | ||
@@ -1139,7 +1204,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | |||
1139 | for_each_leaf_rt_rq(rt_rq, rq) { | 1204 | for_each_leaf_rt_rq(rt_rq, rq) { |
1140 | array = &rt_rq->active; | 1205 | array = &rt_rq->active; |
1141 | idx = sched_find_first_bit(array->bitmap); | 1206 | idx = sched_find_first_bit(array->bitmap); |
1142 | next_idx: | 1207 | next_idx: |
1143 | if (idx >= MAX_RT_PRIO) | 1208 | if (idx >= MAX_RT_PRIO) |
1144 | continue; | 1209 | continue; |
1145 | if (next && next->prio < idx) | 1210 | if (next && next->prio < idx) |
@@ -1174,6 +1239,10 @@ static int find_lowest_rq(struct task_struct *task) | |||
1174 | int this_cpu = smp_processor_id(); | 1239 | int this_cpu = smp_processor_id(); |
1175 | int cpu = task_cpu(task); | 1240 | int cpu = task_cpu(task); |
1176 | 1241 | ||
1242 | /* Make sure the mask is initialized first */ | ||
1243 | if (unlikely(!lowest_mask)) | ||
1244 | return -1; | ||
1245 | |||
1177 | if (task->rt.nr_cpus_allowed == 1) | 1246 | if (task->rt.nr_cpus_allowed == 1) |
1178 | return -1; /* No other targets possible */ | 1247 | return -1; /* No other targets possible */ |
1179 | 1248 | ||
@@ -1198,6 +1267,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1198 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) | 1267 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) |
1199 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ | 1268 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ |
1200 | 1269 | ||
1270 | rcu_read_lock(); | ||
1201 | for_each_domain(cpu, sd) { | 1271 | for_each_domain(cpu, sd) { |
1202 | if (sd->flags & SD_WAKE_AFFINE) { | 1272 | if (sd->flags & SD_WAKE_AFFINE) { |
1203 | int best_cpu; | 1273 | int best_cpu; |
@@ -1207,15 +1277,20 @@ static int find_lowest_rq(struct task_struct *task) | |||
1207 | * remote processor. | 1277 | * remote processor. |
1208 | */ | 1278 | */ |
1209 | if (this_cpu != -1 && | 1279 | if (this_cpu != -1 && |
1210 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) | 1280 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { |
1281 | rcu_read_unlock(); | ||
1211 | return this_cpu; | 1282 | return this_cpu; |
1283 | } | ||
1212 | 1284 | ||
1213 | best_cpu = cpumask_first_and(lowest_mask, | 1285 | best_cpu = cpumask_first_and(lowest_mask, |
1214 | sched_domain_span(sd)); | 1286 | sched_domain_span(sd)); |
1215 | if (best_cpu < nr_cpu_ids) | 1287 | if (best_cpu < nr_cpu_ids) { |
1288 | rcu_read_unlock(); | ||
1216 | return best_cpu; | 1289 | return best_cpu; |
1290 | } | ||
1217 | } | 1291 | } |
1218 | } | 1292 | } |
1293 | rcu_read_unlock(); | ||
1219 | 1294 | ||
1220 | /* | 1295 | /* |
1221 | * And finally, if there were no matches within the domains | 1296 | * And finally, if there were no matches within the domains |
@@ -1258,7 +1333,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1258 | !cpumask_test_cpu(lowest_rq->cpu, | 1333 | !cpumask_test_cpu(lowest_rq->cpu, |
1259 | &task->cpus_allowed) || | 1334 | &task->cpus_allowed) || |
1260 | task_running(rq, task) || | 1335 | task_running(rq, task) || |
1261 | !task->se.on_rq)) { | 1336 | !task->on_rq)) { |
1262 | 1337 | ||
1263 | raw_spin_unlock(&lowest_rq->lock); | 1338 | raw_spin_unlock(&lowest_rq->lock); |
1264 | lowest_rq = NULL; | 1339 | lowest_rq = NULL; |
@@ -1292,7 +1367,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
1292 | BUG_ON(task_current(rq, p)); | 1367 | BUG_ON(task_current(rq, p)); |
1293 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1368 | BUG_ON(p->rt.nr_cpus_allowed <= 1); |
1294 | 1369 | ||
1295 | BUG_ON(!p->se.on_rq); | 1370 | BUG_ON(!p->on_rq); |
1296 | BUG_ON(!rt_task(p)); | 1371 | BUG_ON(!rt_task(p)); |
1297 | 1372 | ||
1298 | return p; | 1373 | return p; |
@@ -1315,7 +1390,7 @@ static int push_rt_task(struct rq *rq) | |||
1315 | if (!next_task) | 1390 | if (!next_task) |
1316 | return 0; | 1391 | return 0; |
1317 | 1392 | ||
1318 | retry: | 1393 | retry: |
1319 | if (unlikely(next_task == rq->curr)) { | 1394 | if (unlikely(next_task == rq->curr)) { |
1320 | WARN_ON(1); | 1395 | WARN_ON(1); |
1321 | return 0; | 1396 | return 0; |
@@ -1349,7 +1424,7 @@ static int push_rt_task(struct rq *rq) | |||
1349 | task = pick_next_pushable_task(rq); | 1424 | task = pick_next_pushable_task(rq); |
1350 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | 1425 | if (task_cpu(next_task) == rq->cpu && task == next_task) { |
1351 | /* | 1426 | /* |
1352 | * If we get here, the task hasnt moved at all, but | 1427 | * If we get here, the task hasn't moved at all, but |
1353 | * it has failed to push. We will not try again, | 1428 | * it has failed to push. We will not try again, |
1354 | * since the other cpus will pull from us when they | 1429 | * since the other cpus will pull from us when they |
1355 | * are ready. | 1430 | * are ready. |
@@ -1438,7 +1513,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
1438 | */ | 1513 | */ |
1439 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { | 1514 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { |
1440 | WARN_ON(p == src_rq->curr); | 1515 | WARN_ON(p == src_rq->curr); |
1441 | WARN_ON(!p->se.on_rq); | 1516 | WARN_ON(!p->on_rq); |
1442 | 1517 | ||
1443 | /* | 1518 | /* |
1444 | * There's a chance that p is higher in priority | 1519 | * There's a chance that p is higher in priority |
@@ -1459,11 +1534,11 @@ static int pull_rt_task(struct rq *this_rq) | |||
1459 | /* | 1534 | /* |
1460 | * We continue with the search, just in | 1535 | * We continue with the search, just in |
1461 | * case there's an even higher prio task | 1536 | * case there's an even higher prio task |
1462 | * in another runqueue. (low likelyhood | 1537 | * in another runqueue. (low likelihood |
1463 | * but possible) | 1538 | * but possible) |
1464 | */ | 1539 | */ |
1465 | } | 1540 | } |
1466 | skip: | 1541 | skip: |
1467 | double_unlock_balance(this_rq, src_rq); | 1542 | double_unlock_balance(this_rq, src_rq); |
1468 | } | 1543 | } |
1469 | 1544 | ||
@@ -1491,7 +1566,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1491 | if (!task_running(rq, p) && | 1566 | if (!task_running(rq, p) && |
1492 | !test_tsk_need_resched(rq->curr) && | 1567 | !test_tsk_need_resched(rq->curr) && |
1493 | has_pushable_tasks(rq) && | 1568 | has_pushable_tasks(rq) && |
1494 | p->rt.nr_cpus_allowed > 1) | 1569 | p->rt.nr_cpus_allowed > 1 && |
1570 | rt_task(rq->curr) && | ||
1571 | (rq->curr->rt.nr_cpus_allowed < 2 || | ||
1572 | rq->curr->prio < p->prio)) | ||
1495 | push_rt_tasks(rq); | 1573 | push_rt_tasks(rq); |
1496 | } | 1574 | } |
1497 | 1575 | ||
@@ -1506,7 +1584,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
1506 | * Update the migration status of the RQ if we have an RT task | 1584 | * Update the migration status of the RQ if we have an RT task |
1507 | * which is running AND changing its weight value. | 1585 | * which is running AND changing its weight value. |
1508 | */ | 1586 | */ |
1509 | if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { | 1587 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { |
1510 | struct rq *rq = task_rq(p); | 1588 | struct rq *rq = task_rq(p); |
1511 | 1589 | ||
1512 | if (!task_current(rq, p)) { | 1590 | if (!task_current(rq, p)) { |
@@ -1567,8 +1645,7 @@ static void rq_offline_rt(struct rq *rq) | |||
1567 | * When switch from the rt queue, we bring ourselves to a position | 1645 | * When switch from the rt queue, we bring ourselves to a position |
1568 | * that we might want to pull RT tasks from other runqueues. | 1646 | * that we might want to pull RT tasks from other runqueues. |
1569 | */ | 1647 | */ |
1570 | static void switched_from_rt(struct rq *rq, struct task_struct *p, | 1648 | static void switched_from_rt(struct rq *rq, struct task_struct *p) |
1571 | int running) | ||
1572 | { | 1649 | { |
1573 | /* | 1650 | /* |
1574 | * If there are other RT tasks then we will reschedule | 1651 | * If there are other RT tasks then we will reschedule |
@@ -1577,7 +1654,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, | |||
1577 | * we may need to handle the pulling of RT tasks | 1654 | * we may need to handle the pulling of RT tasks |
1578 | * now. | 1655 | * now. |
1579 | */ | 1656 | */ |
1580 | if (!rq->rt.rt_nr_running) | 1657 | if (p->on_rq && !rq->rt.rt_nr_running) |
1581 | pull_rt_task(rq); | 1658 | pull_rt_task(rq); |
1582 | } | 1659 | } |
1583 | 1660 | ||
@@ -1596,8 +1673,7 @@ static inline void init_sched_rt_class(void) | |||
1596 | * with RT tasks. In this case we try to push them off to | 1673 | * with RT tasks. In this case we try to push them off to |
1597 | * other runqueues. | 1674 | * other runqueues. |
1598 | */ | 1675 | */ |
1599 | static void switched_to_rt(struct rq *rq, struct task_struct *p, | 1676 | static void switched_to_rt(struct rq *rq, struct task_struct *p) |
1600 | int running) | ||
1601 | { | 1677 | { |
1602 | int check_resched = 1; | 1678 | int check_resched = 1; |
1603 | 1679 | ||
@@ -1608,7 +1684,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, | |||
1608 | * If that current running task is also an RT task | 1684 | * If that current running task is also an RT task |
1609 | * then see if we can move to another run queue. | 1685 | * then see if we can move to another run queue. |
1610 | */ | 1686 | */ |
1611 | if (!running) { | 1687 | if (p->on_rq && rq->curr != p) { |
1612 | #ifdef CONFIG_SMP | 1688 | #ifdef CONFIG_SMP |
1613 | if (rq->rt.overloaded && push_rt_task(rq) && | 1689 | if (rq->rt.overloaded && push_rt_task(rq) && |
1614 | /* Don't resched if we changed runqueues */ | 1690 | /* Don't resched if we changed runqueues */ |
@@ -1624,10 +1700,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, | |||
1624 | * Priority of the task has changed. This may cause | 1700 | * Priority of the task has changed. This may cause |
1625 | * us to initiate a push or pull. | 1701 | * us to initiate a push or pull. |
1626 | */ | 1702 | */ |
1627 | static void prio_changed_rt(struct rq *rq, struct task_struct *p, | 1703 | static void |
1628 | int oldprio, int running) | 1704 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
1629 | { | 1705 | { |
1630 | if (running) { | 1706 | if (!p->on_rq) |
1707 | return; | ||
1708 | |||
1709 | if (rq->curr == p) { | ||
1631 | #ifdef CONFIG_SMP | 1710 | #ifdef CONFIG_SMP |
1632 | /* | 1711 | /* |
1633 | * If our priority decreases while running, we | 1712 | * If our priority decreases while running, we |
@@ -1709,7 +1788,7 @@ static void set_curr_task_rt(struct rq *rq) | |||
1709 | { | 1788 | { |
1710 | struct task_struct *p = rq->curr; | 1789 | struct task_struct *p = rq->curr; |
1711 | 1790 | ||
1712 | p->se.exec_start = rq->clock; | 1791 | p->se.exec_start = rq->clock_task; |
1713 | 1792 | ||
1714 | /* The running task is never eligible for pushing */ | 1793 | /* The running task is never eligible for pushing */ |
1715 | dequeue_pushable_task(rq, p); | 1794 | dequeue_pushable_task(rq, p); |
@@ -1763,10 +1842,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | |||
1763 | 1842 | ||
1764 | static void print_rt_stats(struct seq_file *m, int cpu) | 1843 | static void print_rt_stats(struct seq_file *m, int cpu) |
1765 | { | 1844 | { |
1845 | rt_rq_iter_t iter; | ||
1766 | struct rt_rq *rt_rq; | 1846 | struct rt_rq *rt_rq; |
1767 | 1847 | ||
1768 | rcu_read_lock(); | 1848 | rcu_read_lock(); |
1769 | for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) | 1849 | for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) |
1770 | print_rt_rq(m, cpu, rt_rq); | 1850 | print_rt_rq(m, cpu, rt_rq); |
1771 | rcu_read_unlock(); | 1851 | rcu_read_unlock(); |
1772 | } | 1852 | } |
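A pattern repeated in the sched_rt.c hunks above and the sched_stats.h hunk below: sched domains are protected by RCU, so every for_each_domain() walk is now bracketed by rcu_read_lock()/rcu_read_unlock(), and each early return inside the walk drops the read side first, as find_lowest_rq() does. A standalone sketch of that bracketing discipline; the rwlock and the parent-linked list here are stand-ins for the RCU read side and the real domain hierarchy, not kernel APIs:

#include <pthread.h>
#include <stdio.h>

/* stand-in: an rwlock read side plays the role of rcu_read_lock() here */
static pthread_rwlock_t domain_lock = PTHREAD_RWLOCK_INITIALIZER;

struct domain { int cpu; struct domain *parent; };

/* every exit path, including the early "found it" return, drops the read side,
 * mirroring how find_lowest_rq() now calls rcu_read_unlock() before each return */
static int find_matching_cpu(struct domain *base, int wanted)
{
        struct domain *d;

        pthread_rwlock_rdlock(&domain_lock);
        for (d = base; d; d = d->parent) {
                if (d->cpu == wanted) {
                        pthread_rwlock_unlock(&domain_lock);
                        return d->cpu;          /* early return: lock already dropped */
                }
        }
        pthread_rwlock_unlock(&domain_lock);
        return -1;                              /* no match anywhere in the hierarchy */
}

int main(void)
{
        struct domain top  = { .cpu = 3, .parent = NULL };
        struct domain leaf = { .cpu = 1, .parent = &top };

        printf("lookup 3 -> %d\n", find_matching_cpu(&leaf, 3));   /* 3 */
        printf("lookup 7 -> %d\n", find_matching_cpu(&leaf, 7));   /* -1 */
        return 0;
}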
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 25c2f962f6fc..331e01bcd026 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
37 | 37 | ||
38 | #ifdef CONFIG_SMP | 38 | #ifdef CONFIG_SMP |
39 | /* domain-specific stats */ | 39 | /* domain-specific stats */ |
40 | preempt_disable(); | 40 | rcu_read_lock(); |
41 | for_each_domain(cpu, sd) { | 41 | for_each_domain(cpu, sd) { |
42 | enum cpu_idle_type itype; | 42 | enum cpu_idle_type itype; |
43 | 43 | ||
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
64 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | 64 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
65 | sd->ttwu_move_balance); | 65 | sd->ttwu_move_balance); |
66 | } | 66 | } |
67 | preempt_enable(); | 67 | rcu_read_unlock(); |
68 | #endif | 68 | #endif |
69 | } | 69 | } |
70 | kfree(mask_str); | 70 | kfree(mask_str); |

@@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
157 | } | 157 | } |
158 | 158 | ||
159 | /* | 159 | /* |
160 | * Called when a process is dequeued from the active array and given | 160 | * We are interested in knowing how long it was from the *first* time a |
161 | * the cpu. We should note that with the exception of interactive | ||
162 | * tasks, the expired queue will become the active queue after the active | ||
163 | * queue is empty, without explicitly dequeuing and requeuing tasks in the | ||
164 | * expired queue. (Interactive tasks may be requeued directly to the | ||
165 | * active queue, thus delaying tasks in the expired queue from running; | ||
166 | * see scheduler_tick()). | ||
167 | * | ||
168 | * Though we are interested in knowing how long it was from the *first* time a | ||
169 | * task was queued to the time that it finally hit a cpu, we call this routine | 161 | * task was queued to the time that it finally hit a cpu, we call this routine |
170 | * from dequeue_task() to account for possible rq->clock skew across cpus. The | 162 | * from dequeue_task() to account for possible rq->clock skew across cpus. The |
171 | * delta taken on each cpu would annul the skew. | 163 | * delta taken on each cpu would annul the skew. |
@@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t) | |||
203 | } | 195 | } |
204 | 196 | ||
205 | /* | 197 | /* |
206 | * Called when a process is queued into either the active or expired | ||
207 | * array. The time is noted and later used to determine how long we | ||
208 | * had to wait for us to reach the cpu. Since the expired queue will | ||
209 | * become the active queue after active queue is empty, without dequeuing | ||
210 | * and requeuing any tasks, we are interested in queuing to either. It | ||
211 | * is unusual but not impossible for tasks to be dequeued and immediately | ||
212 | * requeued in the same or another array: this can happen in sched_yield(), | ||
213 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue | ||
214 | * to runqueue. | ||
215 | * | ||
216 | * This function is only called from enqueue_task(), but also only updates | 198 | * This function is only called from enqueue_task(), but also only updates |
217 | * the timestamp if it is already not set. It's assumed that | 199 | * the timestamp if it is already not set. It's assumed that |
218 | * sched_info_dequeued() will clear that stamp when appropriate. | 200 | * sched_info_dequeued() will clear that stamp when appropriate. |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c new file mode 100644 index 000000000000..6f437632afab --- /dev/null +++ b/kernel/sched_stoptask.c | |||
@@ -0,0 +1,104 @@ | |||
1 | /* | ||
2 | * stop-task scheduling class. | ||
3 | * | ||
4 | * The stop task is the highest priority task in the system, it preempts | ||
5 | * everything and will be preempted by nothing. | ||
6 | * | ||
7 | * See kernel/stop_machine.c | ||
8 | */ | ||
9 | |||
10 | #ifdef CONFIG_SMP | ||
11 | static int | ||
12 | select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) | ||
13 | { | ||
14 | return task_cpu(p); /* stop tasks never migrate */ | ||
15 | } | ||
16 | #endif /* CONFIG_SMP */ | ||
17 | |||
18 | static void | ||
19 | check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) | ||
20 | { | ||
21 | /* we're never preempted */ | ||
22 | } | ||
23 | |||
24 | static struct task_struct *pick_next_task_stop(struct rq *rq) | ||
25 | { | ||
26 | struct task_struct *stop = rq->stop; | ||
27 | |||
28 | if (stop && stop->on_rq) | ||
29 | return stop; | ||
30 | |||
31 | return NULL; | ||
32 | } | ||
33 | |||
34 | static void | ||
35 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) | ||
36 | { | ||
37 | } | ||
38 | |||
39 | static void | ||
40 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) | ||
41 | { | ||
42 | } | ||
43 | |||
44 | static void yield_task_stop(struct rq *rq) | ||
45 | { | ||
46 | BUG(); /* the stop task should never yield, it's pointless. */ | ||
47 | } | ||
48 | |||
49 | static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | ||
50 | { | ||
51 | } | ||
52 | |||
53 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | ||
54 | { | ||
55 | } | ||
56 | |||
57 | static void set_curr_task_stop(struct rq *rq) | ||
58 | { | ||
59 | } | ||
60 | |||
61 | static void switched_to_stop(struct rq *rq, struct task_struct *p) | ||
62 | { | ||
63 | BUG(); /* it's impossible to change to this class */ | ||
64 | } | ||
65 | |||
66 | static void | ||
67 | prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) | ||
68 | { | ||
69 | BUG(); /* how!?, what priority? */ | ||
70 | } | ||
71 | |||
72 | static unsigned int | ||
73 | get_rr_interval_stop(struct rq *rq, struct task_struct *task) | ||
74 | { | ||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | /* | ||
79 | * Simple, special scheduling class for the per-CPU stop tasks: | ||
80 | */ | ||
81 | static const struct sched_class stop_sched_class = { | ||
82 | .next = &rt_sched_class, | ||
83 | |||
84 | .enqueue_task = enqueue_task_stop, | ||
85 | .dequeue_task = dequeue_task_stop, | ||
86 | .yield_task = yield_task_stop, | ||
87 | |||
88 | .check_preempt_curr = check_preempt_curr_stop, | ||
89 | |||
90 | .pick_next_task = pick_next_task_stop, | ||
91 | .put_prev_task = put_prev_task_stop, | ||
92 | |||
93 | #ifdef CONFIG_SMP | ||
94 | .select_task_rq = select_task_rq_stop, | ||
95 | #endif | ||
96 | |||
97 | .set_curr_task = set_curr_task_stop, | ||
98 | .task_tick = task_tick_stop, | ||
99 | |||
100 | .get_rr_interval = get_rr_interval_stop, | ||
101 | |||
102 | .prio_changed = prio_changed_stop, | ||
103 | .switched_to = switched_to_stop, | ||
104 | }; | ||
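sched_stoptask.c links the new class above the RT class through .next = &rt_sched_class, and the core scheduler picks the next task by walking the class chain from the topmost class downward until one of them returns a task. A standalone sketch of that walk with invented class names and a trivial pick_next hook; the dispatch loop illustrates the idea and is not the kernel's pick_next_task():

#include <stddef.h>
#include <stdio.h>

/* simplified sched_class: only the linkage and one hook */
struct sched_class {
        const char *name;
        const struct sched_class *next;
        const char *(*pick_next)(void);        /* returns a task name or NULL */
};

static const char *pick_none(void)  { return NULL; }
static const char *pick_rt(void)    { return "rt_task_A"; }

/* lowest-priority classes first so the higher ones can point at them */
static const struct sched_class fair_class = { "fair", NULL,        pick_none };
static const struct sched_class rt_class   = { "rt",   &fair_class, pick_rt   };
static const struct sched_class stop_class = { "stop", &rt_class,   pick_none };

int main(void)
{
        const struct sched_class *class;

        /* walk from the topmost class until someone has a runnable task */
        for (class = &stop_class; class; class = class->next) {
                const char *task = class->pick_next();
                if (task) {
                        printf("%s class picked %s\n", class->name, task);
                        break;
                }
        }
        return 0;
}

Because the stop class only ever returns rq->stop when it is queued, it preempts everything below it and yields to nothing, which is exactly what the file's header comment promises.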
diff --git a/kernel/signal.c b/kernel/signal.c index 919562c3d6b7..415d85d6f6c6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) | |||
124 | 124 | ||
125 | static int recalc_sigpending_tsk(struct task_struct *t) | 125 | static int recalc_sigpending_tsk(struct task_struct *t) |
126 | { | 126 | { |
127 | if (t->signal->group_stop_count > 0 || | 127 | if ((t->group_stop & GROUP_STOP_PENDING) || |
128 | PENDING(&t->pending, &t->blocked) || | 128 | PENDING(&t->pending, &t->blocked) || |
129 | PENDING(&t->signal->shared_pending, &t->blocked)) { | 129 | PENDING(&t->signal->shared_pending, &t->blocked)) { |
130 | set_tsk_thread_flag(t, TIF_SIGPENDING); | 130 | set_tsk_thread_flag(t, TIF_SIGPENDING); |
@@ -223,10 +223,87 @@ static inline void print_dropped_signal(int sig) | |||
223 | current->comm, current->pid, sig); | 223 | current->comm, current->pid, sig); |
224 | } | 224 | } |
225 | 225 | ||
226 | /** | ||
227 | * task_clear_group_stop_trapping - clear group stop trapping bit | ||
228 | * @task: target task | ||
229 | * | ||
230 | * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it | ||
231 | * and wake up the ptracer. Note that we don't need any further locking. | ||
232 | * @task->siglock guarantees that @task->parent points to the ptracer. | ||
233 | * | ||
234 | * CONTEXT: | ||
235 | * Must be called with @task->sighand->siglock held. | ||
236 | */ | ||
237 | static void task_clear_group_stop_trapping(struct task_struct *task) | ||
238 | { | ||
239 | if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { | ||
240 | task->group_stop &= ~GROUP_STOP_TRAPPING; | ||
241 | __wake_up_sync_key(&task->parent->signal->wait_chldexit, | ||
242 | TASK_UNINTERRUPTIBLE, 1, task); | ||
243 | } | ||
244 | } | ||
245 | |||
246 | /** | ||
247 | * task_clear_group_stop_pending - clear pending group stop | ||
248 | * @task: target task | ||
249 | * | ||
250 | * Clear group stop states for @task. | ||
251 | * | ||
252 | * CONTEXT: | ||
253 | * Must be called with @task->sighand->siglock held. | ||
254 | */ | ||
255 | void task_clear_group_stop_pending(struct task_struct *task) | ||
256 | { | ||
257 | task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | | ||
258 | GROUP_STOP_DEQUEUED); | ||
259 | } | ||
260 | |||
261 | /** | ||
262 | * task_participate_group_stop - participate in a group stop | ||
263 | * @task: task participating in a group stop | ||
264 | * | ||
265 | * @task has GROUP_STOP_PENDING set and is participating in a group stop. | ||
266 | * Group stop states are cleared and the group stop count is consumed if | ||
267 | * %GROUP_STOP_CONSUME was set. If the consumption completes the group | ||
268 | * stop, the appropriate %SIGNAL_* flags are set. | ||
269 | * | ||
270 | * CONTEXT: | ||
271 | * Must be called with @task->sighand->siglock held. | ||
272 | * | ||
273 | * RETURNS: | ||
274 | * %true if group stop completion should be notified to the parent, %false | ||
275 | * otherwise. | ||
276 | */ | ||
277 | static bool task_participate_group_stop(struct task_struct *task) | ||
278 | { | ||
279 | struct signal_struct *sig = task->signal; | ||
280 | bool consume = task->group_stop & GROUP_STOP_CONSUME; | ||
281 | |||
282 | WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); | ||
283 | |||
284 | task_clear_group_stop_pending(task); | ||
285 | |||
286 | if (!consume) | ||
287 | return false; | ||
288 | |||
289 | if (!WARN_ON_ONCE(sig->group_stop_count == 0)) | ||
290 | sig->group_stop_count--; | ||
291 | |||
292 | /* | ||
293 | * Tell the caller to notify completion iff we are entering into a | ||
294 | * fresh group stop. Read comment in do_signal_stop() for details. | ||
295 | */ | ||
296 | if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { | ||
297 | sig->flags = SIGNAL_STOP_STOPPED; | ||
298 | return true; | ||
299 | } | ||
300 | return false; | ||
301 | } | ||
302 | |||
226 | /* | 303 | /* |
227 | * allocate a new signal queue record | 304 | * allocate a new signal queue record |
228 | * - this may be called without locks if and only if t == current, otherwise an | 305 | * - this may be called without locks if and only if t == current, otherwise an |
229 | * appopriate lock must be held to stop the target task from exiting | 306 | * appropriate lock must be held to stop the target task from exiting |
230 | */ | 307 | */ |
231 | static struct sigqueue * | 308 | static struct sigqueue * |
232 | __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) | 309 | __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) |
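The new group-stop bookkeeping packs the stop signal number and the state bits into a single task->group_stop word: do_signal_stop() clears GROUP_STOP_SIGMASK and ORs in the signr together with PENDING/CONSUME, and task_participate_group_stop() above later strips the state bits and consumes one unit of group_stop_count. A standalone sketch of that bit bookkeeping; the bit positions are invented for illustration (the real definitions live outside this hunk) and the globals stand in for task and signal_struct fields:

#include <signal.h>
#include <stdio.h>

/* invented layout: low byte carries the stop signal, higher bits carry state */
#define GROUP_STOP_SIGMASK   0x00ffU
#define GROUP_STOP_PENDING   0x0100U
#define GROUP_STOP_CONSUME   0x0200U

static unsigned int group_stop;          /* stands in for task->group_stop      */
static int group_stop_count = 2;         /* stands in for sig->group_stop_count */

/* mirror of the "start a group stop" bookkeeping in do_signal_stop() */
static void start_group_stop(int signr)
{
        group_stop &= ~GROUP_STOP_SIGMASK;
        group_stop |= (unsigned int)signr | GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
}

/* returns 1 when this participant was the one that completed the group stop */
static int participate_group_stop(void)
{
        int consume = group_stop & GROUP_STOP_CONSUME;

        group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME);
        if (consume && group_stop_count > 0)
                group_stop_count--;

        return consume && group_stop_count == 0;
}

int main(void)
{
        start_group_stop(SIGSTOP);
        printf("pending signr = %u\n", group_stop & GROUP_STOP_SIGMASK);
        printf("completed: %d\n", participate_group_stop());   /* 0: a sibling still owes a stop */

        group_stop_count = 1;                                   /* pretend one thread remains */
        start_group_stop(SIGSTOP);
        printf("completed: %d\n", participate_group_stop());   /* 1: last participant */
        return 0;
}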
@@ -375,15 +452,15 @@ int unhandled_signal(struct task_struct *tsk, int sig) | |||
375 | return !tracehook_consider_fatal_signal(tsk, sig); | 452 | return !tracehook_consider_fatal_signal(tsk, sig); |
376 | } | 453 | } |
377 | 454 | ||
378 | 455 | /* | |
379 | /* Notify the system that a driver wants to block all signals for this | 456 | * Notify the system that a driver wants to block all signals for this |
380 | * process, and wants to be notified if any signals at all were to be | 457 | * process, and wants to be notified if any signals at all were to be |
381 | * sent/acted upon. If the notifier routine returns non-zero, then the | 458 | * sent/acted upon. If the notifier routine returns non-zero, then the |
382 | * signal will be acted upon after all. If the notifier routine returns 0, | 459 | * signal will be acted upon after all. If the notifier routine returns 0, |
383 | * then then signal will be blocked. Only one block per process is | 460 | * then then signal will be blocked. Only one block per process is |
384 | * allowed. priv is a pointer to private data that the notifier routine | 461 | * allowed. priv is a pointer to private data that the notifier routine |
385 | * can use to determine if the signal should be blocked or not. */ | 462 | * can use to determine if the signal should be blocked or not. |
386 | 463 | */ | |
387 | void | 464 | void |
388 | block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) | 465 | block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) |
389 | { | 466 | { |
@@ -434,9 +511,10 @@ still_pending: | |||
434 | copy_siginfo(info, &first->info); | 511 | copy_siginfo(info, &first->info); |
435 | __sigqueue_free(first); | 512 | __sigqueue_free(first); |
436 | } else { | 513 | } else { |
437 | /* Ok, it wasn't in the queue. This must be | 514 | /* |
438 | a fast-pathed signal or we must have been | 515 | * Ok, it wasn't in the queue. This must be |
439 | out of queue space. So zero out the info. | 516 | * a fast-pathed signal or we must have been |
517 | * out of queue space. So zero out the info. | ||
440 | */ | 518 | */ |
441 | info->si_signo = sig; | 519 | info->si_signo = sig; |
442 | info->si_errno = 0; | 520 | info->si_errno = 0; |
@@ -468,7 +546,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | |||
468 | } | 546 | } |
469 | 547 | ||
470 | /* | 548 | /* |
471 | * Dequeue a signal and return the element to the caller, which is | 549 | * Dequeue a signal and return the element to the caller, which is |
472 | * expected to free it. | 550 | * expected to free it. |
473 | * | 551 | * |
474 | * All callers have to hold the siglock. | 552 | * All callers have to hold the siglock. |
@@ -490,7 +568,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
490 | * itimers are process shared and we restart periodic | 568 | * itimers are process shared and we restart periodic |
491 | * itimers in the signal delivery path to prevent DoS | 569 | * itimers in the signal delivery path to prevent DoS |
492 | * attacks in the high resolution timer case. This is | 570 | * attacks in the high resolution timer case. This is |
493 | * compliant with the old way of self restarting | 571 | * compliant with the old way of self-restarting |
494 | * itimers, as the SIGALRM is a legacy signal and only | 572 | * itimers, as the SIGALRM is a legacy signal and only |
495 | * queued once. Changing the restart behaviour to | 573 | * queued once. Changing the restart behaviour to |
496 | * restart the timer in the signal dequeue path is | 574 | * restart the timer in the signal dequeue path is |
@@ -526,7 +604,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
526 | * is to alert stop-signal processing code when another | 604 | * is to alert stop-signal processing code when another |
527 | * processor has come along and cleared the flag. | 605 | * processor has come along and cleared the flag. |
528 | */ | 606 | */ |
529 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 607 | current->group_stop |= GROUP_STOP_DEQUEUED; |
530 | } | 608 | } |
531 | if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { | 609 | if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { |
532 | /* | 610 | /* |
@@ -591,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) | |||
591 | if (sigisemptyset(&m)) | 669 | if (sigisemptyset(&m)) |
592 | return 0; | 670 | return 0; |
593 | 671 | ||
594 | signandsets(&s->signal, &s->signal, mask); | 672 | sigandnsets(&s->signal, &s->signal, mask); |
595 | list_for_each_entry_safe(q, n, &s->list, list) { | 673 | list_for_each_entry_safe(q, n, &s->list, list) { |
596 | if (sigismember(mask, q->info.si_signo)) { | 674 | if (sigismember(mask, q->info.si_signo)) { |
597 | list_del_init(&q->list); | 675 | list_del_init(&q->list); |
@@ -636,13 +714,33 @@ static inline bool si_fromuser(const struct siginfo *info) | |||
636 | } | 714 | } |
637 | 715 | ||
638 | /* | 716 | /* |
717 | * called with RCU read lock from check_kill_permission() | ||
718 | */ | ||
719 | static int kill_ok_by_cred(struct task_struct *t) | ||
720 | { | ||
721 | const struct cred *cred = current_cred(); | ||
722 | const struct cred *tcred = __task_cred(t); | ||
723 | |||
724 | if (cred->user->user_ns == tcred->user->user_ns && | ||
725 | (cred->euid == tcred->suid || | ||
726 | cred->euid == tcred->uid || | ||
727 | cred->uid == tcred->suid || | ||
728 | cred->uid == tcred->uid)) | ||
729 | return 1; | ||
730 | |||
731 | if (ns_capable(tcred->user->user_ns, CAP_KILL)) | ||
732 | return 1; | ||
733 | |||
734 | return 0; | ||
735 | } | ||
736 | |||
737 | /* | ||
639 | * Bad permissions for sending the signal | 738 | * Bad permissions for sending the signal |
640 | * - the caller must hold the RCU read lock | 739 | * - the caller must hold the RCU read lock |
641 | */ | 740 | */ |
642 | static int check_kill_permission(int sig, struct siginfo *info, | 741 | static int check_kill_permission(int sig, struct siginfo *info, |
643 | struct task_struct *t) | 742 | struct task_struct *t) |
644 | { | 743 | { |
645 | const struct cred *cred, *tcred; | ||
646 | struct pid *sid; | 744 | struct pid *sid; |
647 | int error; | 745 | int error; |
648 | 746 | ||
@@ -656,14 +754,8 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
656 | if (error) | 754 | if (error) |
657 | return error; | 755 | return error; |
658 | 756 | ||
659 | cred = current_cred(); | ||
660 | tcred = __task_cred(t); | ||
661 | if (!same_thread_group(current, t) && | 757 | if (!same_thread_group(current, t) && |
662 | (cred->euid ^ tcred->suid) && | 758 | !kill_ok_by_cred(t)) { |
663 | (cred->euid ^ tcred->uid) && | ||
664 | (cred->uid ^ tcred->suid) && | ||
665 | (cred->uid ^ tcred->uid) && | ||
666 | !capable(CAP_KILL)) { | ||
667 | switch (sig) { | 759 | switch (sig) { |
668 | case SIGCONT: | 760 | case SIGCONT: |
669 | sid = task_session(t); | 761 | sid = task_session(t); |
@@ -712,34 +804,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) | |||
712 | } else if (sig == SIGCONT) { | 804 | } else if (sig == SIGCONT) { |
713 | unsigned int why; | 805 | unsigned int why; |
714 | /* | 806 | /* |
715 | * Remove all stop signals from all queues, | 807 | * Remove all stop signals from all queues, wake all threads. |
716 | * and wake all threads. | ||
717 | */ | 808 | */ |
718 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); | 809 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); |
719 | t = p; | 810 | t = p; |
720 | do { | 811 | do { |
721 | unsigned int state; | 812 | task_clear_group_stop_pending(t); |
722 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | 813 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); |
723 | /* | 814 | wake_up_state(t, __TASK_STOPPED); |
724 | * If there is a handler for SIGCONT, we must make | ||
725 | * sure that no thread returns to user mode before | ||
726 | * we post the signal, in case it was the only | ||
727 | * thread eligible to run the signal handler--then | ||
728 | * it must not do anything between resuming and | ||
729 | * running the handler. With the TIF_SIGPENDING | ||
730 | * flag set, the thread will pause and acquire the | ||
731 | * siglock that we hold now and until we've queued | ||
732 | * the pending signal. | ||
733 | * | ||
734 | * Wake up the stopped thread _after_ setting | ||
735 | * TIF_SIGPENDING | ||
736 | */ | ||
737 | state = __TASK_STOPPED; | ||
738 | if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { | ||
739 | set_tsk_thread_flag(t, TIF_SIGPENDING); | ||
740 | state |= TASK_INTERRUPTIBLE; | ||
741 | } | ||
742 | wake_up_state(t, state); | ||
743 | } while_each_thread(p, t); | 815 | } while_each_thread(p, t); |
744 | 816 | ||
745 | /* | 817 | /* |
@@ -765,13 +837,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) | |||
765 | signal->flags = why | SIGNAL_STOP_CONTINUED; | 837 | signal->flags = why | SIGNAL_STOP_CONTINUED; |
766 | signal->group_stop_count = 0; | 838 | signal->group_stop_count = 0; |
767 | signal->group_exit_code = 0; | 839 | signal->group_exit_code = 0; |
768 | } else { | ||
769 | /* | ||
770 | * We are not stopped, but there could be a stop | ||
771 | * signal in the middle of being processed after | ||
772 | * being removed from the queue. Clear that too. | ||
773 | */ | ||
774 | signal->flags &= ~SIGNAL_STOP_DEQUEUED; | ||
775 | } | 840 | } |
776 | } | 841 | } |
777 | 842 | ||
@@ -860,6 +925,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) | |||
860 | signal->group_stop_count = 0; | 925 | signal->group_stop_count = 0; |
861 | t = p; | 926 | t = p; |
862 | do { | 927 | do { |
928 | task_clear_group_stop_pending(t); | ||
863 | sigaddset(&t->pending.signal, SIGKILL); | 929 | sigaddset(&t->pending.signal, SIGKILL); |
864 | signal_wake_up(t, 1); | 930 | signal_wake_up(t, 1); |
865 | } while_each_thread(p, t); | 931 | } while_each_thread(p, t); |
@@ -909,14 +975,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
909 | if (info == SEND_SIG_FORCED) | 975 | if (info == SEND_SIG_FORCED) |
910 | goto out_set; | 976 | goto out_set; |
911 | 977 | ||
912 | /* Real-time signals must be queued if sent by sigqueue, or | 978 | /* |
913 | some other real-time mechanism. It is implementation | 979 | * Real-time signals must be queued if sent by sigqueue, or |
914 | defined whether kill() does so. We attempt to do so, on | 980 | * some other real-time mechanism. It is implementation |
915 | the principle of least surprise, but since kill is not | 981 | * defined whether kill() does so. We attempt to do so, on |
916 | allowed to fail with EAGAIN when low on memory we just | 982 | * the principle of least surprise, but since kill is not |
917 | make sure at least one signal gets delivered and don't | 983 | * allowed to fail with EAGAIN when low on memory we just |
918 | pass on the info struct. */ | 984 | * make sure at least one signal gets delivered and don't |
919 | 985 | * pass on the info struct. | |
986 | */ | ||
920 | if (sig < SIGRTMIN) | 987 | if (sig < SIGRTMIN) |
921 | override_rlimit = (is_si_special(info) || info->si_code >= 0); | 988 | override_rlimit = (is_si_special(info) || info->si_code >= 0); |
922 | else | 989 | else |
@@ -1093,6 +1160,7 @@ int zap_other_threads(struct task_struct *p) | |||
1093 | p->signal->group_stop_count = 0; | 1160 | p->signal->group_stop_count = 0; |
1094 | 1161 | ||
1095 | while_each_thread(p, t) { | 1162 | while_each_thread(p, t) { |
1163 | task_clear_group_stop_pending(t); | ||
1096 | count++; | 1164 | count++; |
1097 | 1165 | ||
1098 | /* Don't bother with already dead threads */ | 1166 | /* Don't bother with already dead threads */ |
@@ -1105,22 +1173,30 @@ int zap_other_threads(struct task_struct *p) | |||
1105 | return count; | 1173 | return count; |
1106 | } | 1174 | } |
1107 | 1175 | ||
1108 | struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) | 1176 | struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, |
1177 | unsigned long *flags) | ||
1109 | { | 1178 | { |
1110 | struct sighand_struct *sighand; | 1179 | struct sighand_struct *sighand; |
1111 | 1180 | ||
1112 | rcu_read_lock(); | ||
1113 | for (;;) { | 1181 | for (;;) { |
1182 | local_irq_save(*flags); | ||
1183 | rcu_read_lock(); | ||
1114 | sighand = rcu_dereference(tsk->sighand); | 1184 | sighand = rcu_dereference(tsk->sighand); |
1115 | if (unlikely(sighand == NULL)) | 1185 | if (unlikely(sighand == NULL)) { |
1186 | rcu_read_unlock(); | ||
1187 | local_irq_restore(*flags); | ||
1116 | break; | 1188 | break; |
1189 | } | ||
1117 | 1190 | ||
1118 | spin_lock_irqsave(&sighand->siglock, *flags); | 1191 | spin_lock(&sighand->siglock); |
1119 | if (likely(sighand == tsk->sighand)) | 1192 | if (likely(sighand == tsk->sighand)) { |
1193 | rcu_read_unlock(); | ||
1120 | break; | 1194 | break; |
1121 | spin_unlock_irqrestore(&sighand->siglock, *flags); | 1195 | } |
1196 | spin_unlock(&sighand->siglock); | ||
1197 | rcu_read_unlock(); | ||
1198 | local_irq_restore(*flags); | ||
1122 | } | 1199 | } |
1123 | rcu_read_unlock(); | ||
1124 | 1200 | ||
1125 | return sighand; | 1201 | return sighand; |
1126 | } | 1202 | } |
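The reworked __lock_task_sighand() above disables interrupts, enters the RCU read side, takes the siglock, and keeps it only if tsk->sighand still points at the same object, retrying otherwise; callers presumably keep using the lock_task_sighand()/unlock_task_sighand() wrappers. A standalone sketch of the same validate-or-retry shape with a pthread mutex standing in for the siglock; unlike the kernel code there is no RCU here, so the sketch assumes the pointed-to object is never freed:

#include <pthread.h>
#include <stdio.h>

/* stand-in for sighand_struct: just the lock we want to hold on the live object */
struct handle {
        pthread_mutex_t lock;
};

static struct handle *current_handle;    /* may be swapped by another thread */

/*
 * Lock whatever current_handle points to, revalidating after the lock is
 * taken; the same shape as __lock_task_sighand()'s "sighand == tsk->sighand"
 * recheck.  The kernel relies on RCU to keep the old object alive across the
 * race; this sketch simply assumes it stays allocated.
 */
static struct handle *lock_current_handle(void)
{
        struct handle *h;

        for (;;) {
                h = current_handle;
                if (!h)
                        return NULL;                    /* object already gone */

                pthread_mutex_lock(&h->lock);
                if (h == current_handle)
                        return h;                       /* still the live object: keep the lock */

                pthread_mutex_unlock(&h->lock);         /* raced with a swap: retry */
        }
}

int main(void)
{
        struct handle h;

        pthread_mutex_init(&h.lock, NULL);
        current_handle = &h;

        if (lock_current_handle()) {
                printf("locked the live handle\n");
                pthread_mutex_unlock(&h.lock);
        }
        pthread_mutex_destroy(&h.lock);
        return 0;
}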
@@ -1186,8 +1262,7 @@ retry: | |||
1186 | return error; | 1262 | return error; |
1187 | } | 1263 | } |
1188 | 1264 | ||
1189 | int | 1265 | int kill_proc_info(int sig, struct siginfo *info, pid_t pid) |
1190 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) | ||
1191 | { | 1266 | { |
1192 | int error; | 1267 | int error; |
1193 | rcu_read_lock(); | 1268 | rcu_read_lock(); |
@@ -1284,8 +1359,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) | |||
1284 | * These are for backward compatibility with the rest of the kernel source. | 1359 | * These are for backward compatibility with the rest of the kernel source. |
1285 | */ | 1360 | */ |
1286 | 1361 | ||
1287 | int | 1362 | int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) |
1288 | send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | ||
1289 | { | 1363 | { |
1290 | /* | 1364 | /* |
1291 | * Make sure legacy kernel users don't send in bad values | 1365 | * Make sure legacy kernel users don't send in bad values |
@@ -1353,7 +1427,7 @@ EXPORT_SYMBOL(kill_pid); | |||
1353 | * These functions support sending signals using preallocated sigqueue | 1427 | * These functions support sending signals using preallocated sigqueue |
1354 | * structures. This is needed "because realtime applications cannot | 1428 | * structures. This is needed "because realtime applications cannot |
1355 | * afford to lose notifications of asynchronous events, like timer | 1429 | * afford to lose notifications of asynchronous events, like timer |
1356 | * expirations or I/O completions". In the case of Posix Timers | 1430 | * expirations or I/O completions". In the case of POSIX Timers |
1357 | * we allocate the sigqueue structure from the timer_create. If this | 1431 | * we allocate the sigqueue structure from the timer_create. If this |
1358 | * allocation fails we are able to report the failure to the application | 1432 | * allocation fails we are able to report the failure to the application |
1359 | * with an EAGAIN error. | 1433 | * with an EAGAIN error. |
@@ -1521,16 +1595,30 @@ int do_notify_parent(struct task_struct *tsk, int sig) | |||
1521 | return ret; | 1595 | return ret; |
1522 | } | 1596 | } |
1523 | 1597 | ||
1524 | static void do_notify_parent_cldstop(struct task_struct *tsk, int why) | 1598 | /** |
1599 | * do_notify_parent_cldstop - notify parent of stopped/continued state change | ||
1600 | * @tsk: task reporting the state change | ||
1601 | * @for_ptracer: the notification is for ptracer | ||
1602 | * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report | ||
1603 | * | ||
1604 | * Notify @tsk's parent that the stopped/continued state has changed. If | ||
1605 | * @for_ptracer is %false, @tsk's group leader notifies to its real parent. | ||
1606 | * If %true, @tsk reports to @tsk->parent which should be the ptracer. | ||
1607 | * | ||
1608 | * CONTEXT: | ||
1609 | * Must be called with tasklist_lock at least read locked. | ||
1610 | */ | ||
1611 | static void do_notify_parent_cldstop(struct task_struct *tsk, | ||
1612 | bool for_ptracer, int why) | ||
1525 | { | 1613 | { |
1526 | struct siginfo info; | 1614 | struct siginfo info; |
1527 | unsigned long flags; | 1615 | unsigned long flags; |
1528 | struct task_struct *parent; | 1616 | struct task_struct *parent; |
1529 | struct sighand_struct *sighand; | 1617 | struct sighand_struct *sighand; |
1530 | 1618 | ||
1531 | if (task_ptrace(tsk)) | 1619 | if (for_ptracer) { |
1532 | parent = tsk->parent; | 1620 | parent = tsk->parent; |
1533 | else { | 1621 | } else { |
1534 | tsk = tsk->group_leader; | 1622 | tsk = tsk->group_leader; |
1535 | parent = tsk->real_parent; | 1623 | parent = tsk->real_parent; |
1536 | } | 1624 | } |
@@ -1538,7 +1626,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) | |||
1538 | info.si_signo = SIGCHLD; | 1626 | info.si_signo = SIGCHLD; |
1539 | info.si_errno = 0; | 1627 | info.si_errno = 0; |
1540 | /* | 1628 | /* |
1541 | * see comment in do_notify_parent() abot the following 3 lines | 1629 | * see comment in do_notify_parent() about the following 4 lines |
1542 | */ | 1630 | */ |
1543 | rcu_read_lock(); | 1631 | rcu_read_lock(); |
1544 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); | 1632 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); |
@@ -1596,7 +1684,7 @@ static inline int may_ptrace_stop(void) | |||
1596 | } | 1684 | } |
1597 | 1685 | ||
1598 | /* | 1686 | /* |
1599 | * Return nonzero if there is a SIGKILL that should be waking us up. | 1687 | * Return non-zero if there is a SIGKILL that should be waking us up. |
1600 | * Called with the siglock held. | 1688 | * Called with the siglock held. |
1601 | */ | 1689 | */ |
1602 | static int sigkill_pending(struct task_struct *tsk) | 1690 | static int sigkill_pending(struct task_struct *tsk) |
@@ -1606,6 +1694,15 @@ static int sigkill_pending(struct task_struct *tsk) | |||
1606 | } | 1694 | } |
1607 | 1695 | ||
1608 | /* | 1696 | /* |
1697 | * Test whether the target task of the usual cldstop notification - the | ||
1698 | * real_parent of @child - is in the same group as the ptracer. | ||
1699 | */ | ||
1700 | static bool real_parent_is_ptracer(struct task_struct *child) | ||
1701 | { | ||
1702 | return same_thread_group(child->parent, child->real_parent); | ||
1703 | } | ||
1704 | |||
1705 | /* | ||
1609 | * This must be called with current->sighand->siglock held. | 1706 | * This must be called with current->sighand->siglock held. |
1610 | * | 1707 | * |
1611 | * This should be the path for all ptrace stops. | 1708 | * This should be the path for all ptrace stops. |
@@ -1616,8 +1713,12 @@ static int sigkill_pending(struct task_struct *tsk) | |||
1616 | * If we actually decide not to stop at all because the tracer | 1713 | * If we actually decide not to stop at all because the tracer |
1617 | * is gone, we keep current->exit_code unless clear_code. | 1714 | * is gone, we keep current->exit_code unless clear_code. |
1618 | */ | 1715 | */ |
1619 | static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | 1716 | static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) |
1717 | __releases(¤t->sighand->siglock) | ||
1718 | __acquires(¤t->sighand->siglock) | ||
1620 | { | 1719 | { |
1720 | bool gstop_done = false; | ||
1721 | |||
1621 | if (arch_ptrace_stop_needed(exit_code, info)) { | 1722 | if (arch_ptrace_stop_needed(exit_code, info)) { |
1622 | /* | 1723 | /* |
1623 | * The arch code has something special to do before a | 1724 | * The arch code has something special to do before a |
@@ -1638,21 +1739,49 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | |||
1638 | } | 1739 | } |
1639 | 1740 | ||
1640 | /* | 1741 | /* |
1641 | * If there is a group stop in progress, | 1742 | * If @why is CLD_STOPPED, we're trapping to participate in a group |
1642 | * we must participate in the bookkeeping. | 1743 | * stop. Do the bookkeeping. Note that if SIGCONT was delivered |
1744 | * while siglock was released for the arch hook, PENDING could be | ||
1745 | * clear now. We act as if SIGCONT is received after TASK_TRACED | ||
1746 | * is entered - ignore it. | ||
1643 | */ | 1747 | */ |
1644 | if (current->signal->group_stop_count > 0) | 1748 | if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) |
1645 | --current->signal->group_stop_count; | 1749 | gstop_done = task_participate_group_stop(current); |
1646 | 1750 | ||
1647 | current->last_siginfo = info; | 1751 | current->last_siginfo = info; |
1648 | current->exit_code = exit_code; | 1752 | current->exit_code = exit_code; |
1649 | 1753 | ||
1650 | /* Let the debugger run. */ | 1754 | /* |
1651 | __set_current_state(TASK_TRACED); | 1755 | * TRACED should be visible before TRAPPING is cleared; otherwise, |
1756 | * the tracer might fail do_wait(). | ||
1757 | */ | ||
1758 | set_current_state(TASK_TRACED); | ||
1759 | |||
1760 | /* | ||
1761 | * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and | ||
1762 | * transition to TASK_TRACED should be atomic with respect to | ||
1763 | * siglock. This should be done after the arch hook as siglock is | ||
1764 | * released and regrabbed across it. | ||
1765 | */ | ||
1766 | task_clear_group_stop_trapping(current); | ||
1767 | |||
1652 | spin_unlock_irq(¤t->sighand->siglock); | 1768 | spin_unlock_irq(¤t->sighand->siglock); |
1653 | read_lock(&tasklist_lock); | 1769 | read_lock(&tasklist_lock); |
1654 | if (may_ptrace_stop()) { | 1770 | if (may_ptrace_stop()) { |
1655 | do_notify_parent_cldstop(current, CLD_TRAPPED); | 1771 | /* |
1772 | * Notify parents of the stop. | ||
1773 | * | ||
1774 | * While ptraced, there are two parents - the ptracer and | ||
1775 | * the real_parent of the group_leader. The ptracer should | ||
1776 | * know about every stop while the real parent is only | ||
1777 | * interested in the completion of group stop. The states | ||
1778 | * for the two don't interact with each other. Notify | ||
1779 | * separately unless they're gonna be duplicates. | ||
1780 | */ | ||
1781 | do_notify_parent_cldstop(current, true, why); | ||
1782 | if (gstop_done && !real_parent_is_ptracer(current)) | ||
1783 | do_notify_parent_cldstop(current, false, why); | ||
1784 | |||
1656 | /* | 1785 | /* |
1657 | * Don't want to allow preemption here, because | 1786 | * Don't want to allow preemption here, because |
1658 | * sys_ptrace() needs this task to be inactive. | 1787 | * sys_ptrace() needs this task to be inactive. |
@@ -1667,7 +1796,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | |||
1667 | /* | 1796 | /* |
1668 | * By the time we got the lock, our tracer went away. | 1797 | * By the time we got the lock, our tracer went away. |
1669 | * Don't drop the lock yet, another tracer may come. | 1798 | * Don't drop the lock yet, another tracer may come. |
1799 | * | ||
1800 | * If @gstop_done, the ptracer went away between group stop | ||
1801 | * completion and here. During detach, it would have set | ||
1802 | * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED | ||
1803 | * in do_signal_stop() on return, so notifying the real | ||
1804 | * parent of the group stop completion is enough. | ||
1670 | */ | 1805 | */ |
1806 | if (gstop_done) | ||
1807 | do_notify_parent_cldstop(current, false, why); | ||
1808 | |||
1671 | __set_current_state(TASK_RUNNING); | 1809 | __set_current_state(TASK_RUNNING); |
1672 | if (clear_code) | 1810 | if (clear_code) |
1673 | current->exit_code = 0; | 1811 | current->exit_code = 0; |
@@ -1711,79 +1849,128 @@ void ptrace_notify(int exit_code) | |||
1711 | 1849 | ||
1712 | /* Let the debugger run. */ | 1850 | /* Let the debugger run. */ |
1713 | spin_lock_irq(¤t->sighand->siglock); | 1851 | spin_lock_irq(¤t->sighand->siglock); |
1714 | ptrace_stop(exit_code, 1, &info); | 1852 | ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); |
1715 | spin_unlock_irq(¤t->sighand->siglock); | 1853 | spin_unlock_irq(¤t->sighand->siglock); |
1716 | } | 1854 | } |
1717 | 1855 | ||
1718 | /* | 1856 | /* |
1719 | * This performs the stopping for SIGSTOP and other stop signals. | 1857 | * This performs the stopping for SIGSTOP and other stop signals. |
1720 | * We have to stop all threads in the thread group. | 1858 | * We have to stop all threads in the thread group. |
1721 | * Returns nonzero if we've actually stopped and released the siglock. | 1859 | * Returns non-zero if we've actually stopped and released the siglock. |
1722 | * Returns zero if we didn't stop and still hold the siglock. | 1860 | * Returns zero if we didn't stop and still hold the siglock. |
1723 | */ | 1861 | */ |
1724 | static int do_signal_stop(int signr) | 1862 | static int do_signal_stop(int signr) |
1725 | { | 1863 | { |
1726 | struct signal_struct *sig = current->signal; | 1864 | struct signal_struct *sig = current->signal; |
1727 | int notify; | ||
1728 | 1865 | ||
1729 | if (!sig->group_stop_count) { | 1866 | if (!(current->group_stop & GROUP_STOP_PENDING)) { |
1867 | unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; | ||
1730 | struct task_struct *t; | 1868 | struct task_struct *t; |
1731 | 1869 | ||
1732 | if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || | 1870 | /* signr will be recorded in task->group_stop for retries */ |
1871 | WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); | ||
1872 | |||
1873 | if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || | ||
1733 | unlikely(signal_group_exit(sig))) | 1874 | unlikely(signal_group_exit(sig))) |
1734 | return 0; | 1875 | return 0; |
1735 | /* | 1876 | /* |
1736 | * There is no group stop already in progress. | 1877 | * There is no group stop already in progress. We must |
1737 | * We must initiate one now. | 1878 | * initiate one now. |
1879 | * | ||
1880 | * While ptraced, a task may be resumed while group stop is | ||
1881 | * still in effect and then receive a stop signal and | ||
1882 | * initiate another group stop. This deviates from the | ||
1883 | * usual behavior as two consecutive stop signals can't | ||
1884 | * cause two group stops when !ptraced. That is why we | ||
1885 | * also check !task_is_stopped(t) below. | ||
1886 | * | ||
1887 | * The condition can be distinguished by testing whether | ||
1888 | * SIGNAL_STOP_STOPPED is already set. Don't generate | ||
1889 | * group_exit_code in such case. | ||
1890 | * | ||
1891 | * This is not necessary for SIGNAL_STOP_CONTINUED because | ||
1892 | * an intervening stop signal is required to cause two | ||
1893 | * continued events regardless of ptrace. | ||
1738 | */ | 1894 | */ |
1739 | sig->group_exit_code = signr; | 1895 | if (!(sig->flags & SIGNAL_STOP_STOPPED)) |
1896 | sig->group_exit_code = signr; | ||
1897 | else | ||
1898 | WARN_ON_ONCE(!task_ptrace(current)); | ||
1740 | 1899 | ||
1900 | current->group_stop &= ~GROUP_STOP_SIGMASK; | ||
1901 | current->group_stop |= signr | gstop; | ||
1741 | sig->group_stop_count = 1; | 1902 | sig->group_stop_count = 1; |
1742 | for (t = next_thread(current); t != current; t = next_thread(t)) | 1903 | for (t = next_thread(current); t != current; |
1904 | t = next_thread(t)) { | ||
1905 | t->group_stop &= ~GROUP_STOP_SIGMASK; | ||
1743 | /* | 1906 | /* |
1744 | * Setting state to TASK_STOPPED for a group | 1907 | * Setting state to TASK_STOPPED for a group |
1745 | * stop is always done with the siglock held, | 1908 | * stop is always done with the siglock held, |
1746 | * so this check has no races. | 1909 | * so this check has no races. |
1747 | */ | 1910 | */ |
1748 | if (!(t->flags & PF_EXITING) && | 1911 | if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { |
1749 | !task_is_stopped_or_traced(t)) { | 1912 | t->group_stop |= signr | gstop; |
1750 | sig->group_stop_count++; | 1913 | sig->group_stop_count++; |
1751 | signal_wake_up(t, 0); | 1914 | signal_wake_up(t, 0); |
1752 | } | 1915 | } |
1916 | } | ||
1753 | } | 1917 | } |
1754 | /* | 1918 | retry: |
1755 | * If there are no other threads in the group, or if there is | 1919 | if (likely(!task_ptrace(current))) { |
1756 | * a group stop in progress and we are the last to stop, report | 1920 | int notify = 0; |
1757 | * to the parent. When ptraced, every thread reports itself. | 1921 | |
1758 | */ | 1922 | /* |
1759 | notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; | 1923 | * If there are no other threads in the group, or if there |
1760 | notify = tracehook_notify_jctl(notify, CLD_STOPPED); | 1924 | * is a group stop in progress and we are the last to stop, |
1761 | /* | 1925 | * report to the parent. |
1762 | * tracehook_notify_jctl() can drop and reacquire siglock, so | 1926 | */ |
1763 | * we keep ->group_stop_count != 0 before the call. If SIGCONT | 1927 | if (task_participate_group_stop(current)) |
1764 | * or SIGKILL comes in between ->group_stop_count == 0. | 1928 | notify = CLD_STOPPED; |
1765 | */ | 1929 | |
1766 | if (sig->group_stop_count) { | ||
1767 | if (!--sig->group_stop_count) | ||
1768 | sig->flags = SIGNAL_STOP_STOPPED; | ||
1769 | current->exit_code = sig->group_exit_code; | ||
1770 | __set_current_state(TASK_STOPPED); | 1930 | __set_current_state(TASK_STOPPED); |
1931 | spin_unlock_irq(&current->sighand->siglock); | ||
1932 | |||
1933 | /* | ||
1934 | * Notify the parent of the group stop completion. Because | ||
1935 | * we're not holding either the siglock or tasklist_lock | ||
1936 | * here, ptracer may attach in between; however, this is for | ||
1937 | * group stop and should always be delivered to the real | ||
1938 | * parent of the group leader. The new ptracer will get | ||
1939 | * its notification when this task transitions into | ||
1940 | * TASK_TRACED. | ||
1941 | */ | ||
1942 | if (notify) { | ||
1943 | read_lock(&tasklist_lock); | ||
1944 | do_notify_parent_cldstop(current, false, notify); | ||
1945 | read_unlock(&tasklist_lock); | ||
1946 | } | ||
1947 | |||
1948 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ | ||
1949 | schedule(); | ||
1950 | |||
1951 | spin_lock_irq(&current->sighand->siglock); | ||
1952 | } else { | ||
1953 | ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, | ||
1954 | CLD_STOPPED, 0, NULL); | ||
1955 | current->exit_code = 0; | ||
1771 | } | 1956 | } |
1772 | spin_unlock_irq(&current->sighand->siglock); | ||
1773 | 1957 | ||
1774 | if (notify) { | 1958 | /* |
1775 | read_lock(&tasklist_lock); | 1959 | * GROUP_STOP_PENDING could be set if another group stop has |
1776 | do_notify_parent_cldstop(current, notify); | 1960 | * started since being woken up or ptrace wants us to transit |
1777 | read_unlock(&tasklist_lock); | 1961 | * between TASK_STOPPED and TRACED. Retry group stop. |
1962 | */ | ||
1963 | if (current->group_stop & GROUP_STOP_PENDING) { | ||
1964 | WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); | ||
1965 | goto retry; | ||
1778 | } | 1966 | } |
1779 | 1967 | ||
1780 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ | 1968 | /* PTRACE_ATTACH might have raced with task killing, clear trapping */ |
1781 | do { | 1969 | task_clear_group_stop_trapping(current); |
1782 | schedule(); | 1970 | |
1783 | } while (try_to_freeze()); | 1971 | spin_unlock_irq(¤t->sighand->siglock); |
1784 | 1972 | ||
1785 | tracehook_finish_jctl(); | 1973 | tracehook_finish_jctl(); |
1786 | current->exit_code = 0; | ||
1787 | 1974 | ||
1788 | return 1; | 1975 | return 1; |
1789 | } | 1976 | } |
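
The reworked do_signal_stop() above decides when the real parent gets the group-stop completion notification; from user space this is the event a parent observes through waitpid() with WUNTRACED/WCONTINUED. As a point of reference only (not part of the patch, standard POSIX calls, error handling trimmed), a minimal sketch that watches a child stop and continue:

    #include <sys/types.h>
    #include <sys/wait.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t pid = fork();

            if (pid == 0) {                         /* child: just idle */
                    for (;;)
                            pause();
            }

            kill(pid, SIGSTOP);                     /* initiate a group stop */
            waitpid(pid, NULL, WUNTRACED);          /* parent wakes on CLD_STOPPED */

            kill(pid, SIGCONT);                     /* resume the group */
            waitpid(pid, NULL, WCONTINUED);         /* parent wakes on CLD_CONTINUED */

            kill(pid, SIGKILL);
            waitpid(pid, NULL, 0);
            printf("observed stop and continue\n");
            return 0;
    }
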
@@ -1797,7 +1984,7 @@ static int ptrace_signal(int signr, siginfo_t *info, | |||
1797 | ptrace_signal_deliver(regs, cookie); | 1984 | ptrace_signal_deliver(regs, cookie); |
1798 | 1985 | ||
1799 | /* Let the debugger run. */ | 1986 | /* Let the debugger run. */ |
1800 | ptrace_stop(signr, 0, info); | 1987 | ptrace_stop(signr, CLD_TRAPPED, 0, info); |
1801 | 1988 | ||
1802 | /* We're back. Did the debugger cancel the sig? */ | 1989 | /* We're back. Did the debugger cancel the sig? */ |
1803 | signr = current->exit_code; | 1990 | signr = current->exit_code; |
@@ -1806,10 +1993,12 @@ static int ptrace_signal(int signr, siginfo_t *info, | |||
1806 | 1993 | ||
1807 | current->exit_code = 0; | 1994 | current->exit_code = 0; |
1808 | 1995 | ||
1809 | /* Update the siginfo structure if the signal has | 1996 | /* |
1810 | changed. If the debugger wanted something | 1997 | * Update the siginfo structure if the signal has |
1811 | specific in the siginfo structure then it should | 1998 | * changed. If the debugger wanted something |
1812 | have updated *info via PTRACE_SETSIGINFO. */ | 1999 | * specific in the siginfo structure then it should |
2000 | * have updated *info via PTRACE_SETSIGINFO. | ||
2001 | */ | ||
1813 | if (signr != info->si_signo) { | 2002 | if (signr != info->si_signo) { |
1814 | info->si_signo = signr; | 2003 | info->si_signo = signr; |
1815 | info->si_errno = 0; | 2004 | info->si_errno = 0; |
@@ -1850,25 +2039,43 @@ relock: | |||
1850 | * the CLD_ si_code into SIGNAL_CLD_MASK bits. | 2039 | * the CLD_ si_code into SIGNAL_CLD_MASK bits. |
1851 | */ | 2040 | */ |
1852 | if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { | 2041 | if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { |
1853 | int why = (signal->flags & SIGNAL_STOP_CONTINUED) | 2042 | struct task_struct *leader; |
1854 | ? CLD_CONTINUED : CLD_STOPPED; | 2043 | int why; |
2044 | |||
2045 | if (signal->flags & SIGNAL_CLD_CONTINUED) | ||
2046 | why = CLD_CONTINUED; | ||
2047 | else | ||
2048 | why = CLD_STOPPED; | ||
2049 | |||
1855 | signal->flags &= ~SIGNAL_CLD_MASK; | 2050 | signal->flags &= ~SIGNAL_CLD_MASK; |
1856 | 2051 | ||
1857 | why = tracehook_notify_jctl(why, CLD_CONTINUED); | ||
1858 | spin_unlock_irq(&sighand->siglock); | 2052 | spin_unlock_irq(&sighand->siglock); |
1859 | 2053 | ||
1860 | if (why) { | 2054 | /* |
1861 | read_lock(&tasklist_lock); | 2055 | * Notify the parent that we're continuing. This event is |
1862 | do_notify_parent_cldstop(current->group_leader, why); | 2056 | * always per-process and doesn't make whole lot of sense |
1863 | read_unlock(&tasklist_lock); | 2057 | * for ptracers, who shouldn't consume the state via |
1864 | } | 2058 | * wait(2) either, but, for backward compatibility, notify |
2059 | * the ptracer of the group leader too unless it's gonna be | ||
2060 | * a duplicate. | ||
2061 | */ | ||
2062 | read_lock(&tasklist_lock); | ||
2063 | |||
2064 | do_notify_parent_cldstop(current, false, why); | ||
2065 | |||
2066 | leader = current->group_leader; | ||
2067 | if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) | ||
2068 | do_notify_parent_cldstop(leader, true, why); | ||
2069 | |||
2070 | read_unlock(&tasklist_lock); | ||
2071 | |||
1865 | goto relock; | 2072 | goto relock; |
1866 | } | 2073 | } |
1867 | 2074 | ||
1868 | for (;;) { | 2075 | for (;;) { |
1869 | struct k_sigaction *ka; | 2076 | struct k_sigaction *ka; |
1870 | /* | 2077 | /* |
1871 | * Tracing can induce an artifical signal and choose sigaction. | 2078 | * Tracing can induce an artificial signal and choose sigaction. |
1872 | * The return value in @signr determines the default action, | 2079 | * The return value in @signr determines the default action, |
1873 | * but @info->si_signo is the signal number we will report. | 2080 | * but @info->si_signo is the signal number we will report. |
1874 | */ | 2081 | */ |
@@ -1878,8 +2085,8 @@ relock: | |||
1878 | if (unlikely(signr != 0)) | 2085 | if (unlikely(signr != 0)) |
1879 | ka = return_ka; | 2086 | ka = return_ka; |
1880 | else { | 2087 | else { |
1881 | if (unlikely(signal->group_stop_count > 0) && | 2088 | if (unlikely(current->group_stop & |
1882 | do_signal_stop(0)) | 2089 | GROUP_STOP_PENDING) && do_signal_stop(0)) |
1883 | goto relock; | 2090 | goto relock; |
1884 | 2091 | ||
1885 | signr = dequeue_signal(current, ¤t->blocked, | 2092 | signr = dequeue_signal(current, ¤t->blocked, |
@@ -1998,10 +2205,42 @@ relock: | |||
1998 | return signr; | 2205 | return signr; |
1999 | } | 2206 | } |
2000 | 2207 | ||
2208 | /* | ||
2209 | * It could be that complete_signal() picked us to notify about the | ||
2210 | * group-wide signal. Other threads should be notified now to take | ||
2211 | * the shared signals in @which since we will not. | ||
2212 | */ | ||
2213 | static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) | ||
2214 | { | ||
2215 | sigset_t retarget; | ||
2216 | struct task_struct *t; | ||
2217 | |||
2218 | sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); | ||
2219 | if (sigisemptyset(&retarget)) | ||
2220 | return; | ||
2221 | |||
2222 | t = tsk; | ||
2223 | while_each_thread(tsk, t) { | ||
2224 | if (t->flags & PF_EXITING) | ||
2225 | continue; | ||
2226 | |||
2227 | if (!has_pending_signals(&retarget, &t->blocked)) | ||
2228 | continue; | ||
2229 | /* Remove the signals this thread can handle. */ | ||
2230 | sigandsets(&retarget, &retarget, &t->blocked); | ||
2231 | |||
2232 | if (!signal_pending(t)) | ||
2233 | signal_wake_up(t, 0); | ||
2234 | |||
2235 | if (sigisemptyset(&retarget)) | ||
2236 | break; | ||
2237 | } | ||
2238 | } | ||
2239 | |||
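
retarget_shared_pending() exists because a process-directed signal sits in the shared pending queue and may be taken by any thread that has it unblocked; when the thread originally chosen bows out (it is exiting, or it is about to block the signal), another eligible thread must be woken to take it. A small user-space illustration of that delivery rule, assuming standard pthreads (the names here are made up for the example):

    #include <pthread.h>
    #include <signal.h>
    #include <unistd.h>

    static void handler(int sig)
    {
            /* async-signal-safe output for the demo */
            write(1, "handled in unblocked thread\n", 28);
    }

    static void *blocked_thread(void *arg)
    {
            sigset_t set;

            sigemptyset(&set);
            sigaddset(&set, SIGUSR1);
            pthread_sigmask(SIG_BLOCK, &set, NULL); /* this thread opts out */
            sleep(2);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            signal(SIGUSR1, handler);
            pthread_create(&t, NULL, blocked_thread, NULL);
            sleep(1);
            kill(getpid(), SIGUSR1);        /* process-directed: delivered to a
                                             * thread with SIGUSR1 unblocked */
            pthread_join(t, NULL);
            return 0;
    }
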
2001 | void exit_signals(struct task_struct *tsk) | 2240 | void exit_signals(struct task_struct *tsk) |
2002 | { | 2241 | { |
2003 | int group_stop = 0; | 2242 | int group_stop = 0; |
2004 | struct task_struct *t; | 2243 | sigset_t unblocked; |
2005 | 2244 | ||
2006 | if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { | 2245 | if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { |
2007 | tsk->flags |= PF_EXITING; | 2246 | tsk->flags |= PF_EXITING; |
@@ -2017,25 +2256,23 @@ void exit_signals(struct task_struct *tsk) | |||
2017 | if (!signal_pending(tsk)) | 2256 | if (!signal_pending(tsk)) |
2018 | goto out; | 2257 | goto out; |
2019 | 2258 | ||
2020 | /* It could be that __group_complete_signal() choose us to | 2259 | unblocked = tsk->blocked; |
2021 | * notify about group-wide signal. Another thread should be | 2260 | signotset(&unblocked); |
2022 | * woken now to take the signal since we will not. | 2261 | retarget_shared_pending(tsk, &unblocked); |
2023 | */ | ||
2024 | for (t = tsk; (t = next_thread(t)) != tsk; ) | ||
2025 | if (!signal_pending(t) && !(t->flags & PF_EXITING)) | ||
2026 | recalc_sigpending_and_wake(t); | ||
2027 | 2262 | ||
2028 | if (unlikely(tsk->signal->group_stop_count) && | 2263 | if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && |
2029 | !--tsk->signal->group_stop_count) { | 2264 | task_participate_group_stop(tsk)) |
2030 | tsk->signal->flags = SIGNAL_STOP_STOPPED; | 2265 | group_stop = CLD_STOPPED; |
2031 | group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); | ||
2032 | } | ||
2033 | out: | 2266 | out: |
2034 | spin_unlock_irq(&tsk->sighand->siglock); | 2267 | spin_unlock_irq(&tsk->sighand->siglock); |
2035 | 2268 | ||
2269 | /* | ||
2270 | * If group stop has completed, deliver the notification. This | ||
2271 | * should always go to the real parent of the group leader. | ||
2272 | */ | ||
2036 | if (unlikely(group_stop)) { | 2273 | if (unlikely(group_stop)) { |
2037 | read_lock(&tasklist_lock); | 2274 | read_lock(&tasklist_lock); |
2038 | do_notify_parent_cldstop(tsk, group_stop); | 2275 | do_notify_parent_cldstop(tsk, false, group_stop); |
2039 | read_unlock(&tasklist_lock); | 2276 | read_unlock(&tasklist_lock); |
2040 | } | 2277 | } |
2041 | } | 2278 | } |
@@ -2055,6 +2292,9 @@ EXPORT_SYMBOL(unblock_all_signals); | |||
2055 | * System call entry points. | 2292 | * System call entry points. |
2056 | */ | 2293 | */ |
2057 | 2294 | ||
2295 | /** | ||
2296 | * sys_restart_syscall - restart a system call | ||
2297 | */ | ||
2058 | SYSCALL_DEFINE0(restart_syscall) | 2298 | SYSCALL_DEFINE0(restart_syscall) |
2059 | { | 2299 | { |
2060 | struct restart_block *restart = &current_thread_info()->restart_block; | 2300 | struct restart_block *restart = &current_thread_info()->restart_block; |
@@ -2066,11 +2306,33 @@ long do_no_restart_syscall(struct restart_block *param) | |||
2066 | return -EINTR; | 2306 | return -EINTR; |
2067 | } | 2307 | } |
2068 | 2308 | ||
2069 | /* | 2309 | static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) |
2070 | * We don't need to get the kernel lock - this is all local to this | 2310 | { |
2071 | * particular thread.. (and that's good, because this is _heavily_ | 2311 | if (signal_pending(tsk) && !thread_group_empty(tsk)) { |
2072 | * used by various programs) | 2312 | sigset_t newblocked; |
2313 | /* A set of now blocked but previously unblocked signals. */ | ||
2314 | sigandnsets(&newblocked, newset, &current->blocked); | ||
2315 | retarget_shared_pending(tsk, &newblocked); | ||
2316 | } | ||
2317 | tsk->blocked = *newset; | ||
2318 | recalc_sigpending(); | ||
2319 | } | ||
2320 | |||
2321 | /** | ||
2322 | * set_current_blocked - change current->blocked mask | ||
2323 | * @newset: new mask | ||
2324 | * | ||
2325 | * It is wrong to change ->blocked directly, this helper should be used | ||
2326 | * to ensure the process can't miss a shared signal we are going to block. | ||
2073 | */ | 2327 | */ |
2328 | void set_current_blocked(const sigset_t *newset) | ||
2329 | { | ||
2330 | struct task_struct *tsk = current; | ||
2331 | |||
2332 | spin_lock_irq(&tsk->sighand->siglock); | ||
2333 | __set_task_blocked(tsk, newset); | ||
2334 | spin_unlock_irq(&tsk->sighand->siglock); | ||
2335 | } | ||
2074 | 2336 | ||
2075 | /* | 2337 | /* |
2076 | * This is also useful for kernel threads that want to temporarily | 2338 | * This is also useful for kernel threads that want to temporarily |
@@ -2082,66 +2344,66 @@ long do_no_restart_syscall(struct restart_block *param) | |||
2082 | */ | 2344 | */ |
2083 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | 2345 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) |
2084 | { | 2346 | { |
2085 | int error; | 2347 | struct task_struct *tsk = current; |
2348 | sigset_t newset; | ||
2086 | 2349 | ||
2087 | spin_lock_irq(&current->sighand->siglock); | 2350 | /* Lockless, only current can change ->blocked, never from irq */ |
2088 | if (oldset) | 2351 | if (oldset) |
2089 | *oldset = current->blocked; | 2352 | *oldset = tsk->blocked; |
2090 | 2353 | ||
2091 | error = 0; | ||
2092 | switch (how) { | 2354 | switch (how) { |
2093 | case SIG_BLOCK: | 2355 | case SIG_BLOCK: |
2094 | sigorsets(&current->blocked, &current->blocked, set); | 2356 | sigorsets(&newset, &tsk->blocked, set); |
2095 | break; | 2357 | break; |
2096 | case SIG_UNBLOCK: | 2358 | case SIG_UNBLOCK: |
2097 | signandsets(&current->blocked, &current->blocked, set); | 2359 | sigandnsets(&newset, &tsk->blocked, set); |
2098 | break; | 2360 | break; |
2099 | case SIG_SETMASK: | 2361 | case SIG_SETMASK: |
2100 | current->blocked = *set; | 2362 | newset = *set; |
2101 | break; | 2363 | break; |
2102 | default: | 2364 | default: |
2103 | error = -EINVAL; | 2365 | return -EINVAL; |
2104 | } | 2366 | } |
2105 | recalc_sigpending(); | ||
2106 | spin_unlock_irq(&current->sighand->siglock); | ||
2107 | 2367 | ||
2108 | return error; | 2368 | set_current_blocked(&newset); |
2369 | return 0; | ||
2109 | } | 2370 | } |
2110 | 2371 | ||
2111 | SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, | 2372 | /** |
2373 | * sys_rt_sigprocmask - change the list of currently blocked signals | ||
2374 | * @how: whether to add, remove, or set signals | ||
2375 | * @nset: stores pending signals | ||
2376 | * @oset: previous value of signal mask if non-null | ||
2377 | * @sigsetsize: size of sigset_t type | ||
2378 | */ | ||
2379 | SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, | ||
2112 | sigset_t __user *, oset, size_t, sigsetsize) | 2380 | sigset_t __user *, oset, size_t, sigsetsize) |
2113 | { | 2381 | { |
2114 | int error = -EINVAL; | ||
2115 | sigset_t old_set, new_set; | 2382 | sigset_t old_set, new_set; |
2383 | int error; | ||
2116 | 2384 | ||
2117 | /* XXX: Don't preclude handling different sized sigset_t's. */ | 2385 | /* XXX: Don't preclude handling different sized sigset_t's. */ |
2118 | if (sigsetsize != sizeof(sigset_t)) | 2386 | if (sigsetsize != sizeof(sigset_t)) |
2119 | goto out; | 2387 | return -EINVAL; |
2120 | 2388 | ||
2121 | if (set) { | 2389 | old_set = current->blocked; |
2122 | error = -EFAULT; | 2390 | |
2123 | if (copy_from_user(&new_set, set, sizeof(*set))) | 2391 | if (nset) { |
2124 | goto out; | 2392 | if (copy_from_user(&new_set, nset, sizeof(sigset_t))) |
2393 | return -EFAULT; | ||
2125 | sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 2394 | sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
2126 | 2395 | ||
2127 | error = sigprocmask(how, &new_set, &old_set); | 2396 | error = sigprocmask(how, &new_set, NULL); |
2128 | if (error) | 2397 | if (error) |
2129 | goto out; | 2398 | return error; |
2130 | if (oset) | 2399 | } |
2131 | goto set_old; | ||
2132 | } else if (oset) { | ||
2133 | spin_lock_irq(&current->sighand->siglock); | ||
2134 | old_set = current->blocked; | ||
2135 | spin_unlock_irq(&current->sighand->siglock); | ||
2136 | 2400 | ||
2137 | set_old: | 2401 | if (oset) { |
2138 | error = -EFAULT; | 2402 | if (copy_to_user(oset, &old_set, sizeof(sigset_t))) |
2139 | if (copy_to_user(oset, &old_set, sizeof(*oset))) | 2403 | return -EFAULT; |
2140 | goto out; | ||
2141 | } | 2404 | } |
2142 | error = 0; | 2405 | |
2143 | out: | 2406 | return 0; |
2144 | return error; | ||
2145 | } | 2407 | } |
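
The rewritten sys_rt_sigprocmask() keeps the long-standing semantics of the three `how` modes and only changes how the new mask is installed (through set_current_blocked()). For reference, the user-visible contract it implements, sketched with the standard sigprocmask() wrapper (nothing patch-specific; note that SIGKILL and SIGSTOP are silently dropped from the requested set):

    #include <signal.h>
    #include <stdio.h>

    int main(void)
    {
            sigset_t block, old;

            sigemptyset(&block);
            sigaddset(&block, SIGINT);

            sigprocmask(SIG_BLOCK, &block, &old);   /* add SIGINT to the blocked set */
            /* ... critical section: SIGINT stays pending, not delivered ... */
            sigprocmask(SIG_SETMASK, &old, NULL);   /* restore the previous mask */

            printf("SIGKILL and SIGSTOP can never be blocked\n");
            return 0;
    }
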
2146 | 2408 | ||
2147 | long do_sigpending(void __user *set, unsigned long sigsetsize) | 2409 | long do_sigpending(void __user *set, unsigned long sigsetsize) |
@@ -2166,8 +2428,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize) | |||
2166 | 2428 | ||
2167 | out: | 2429 | out: |
2168 | return error; | 2430 | return error; |
2169 | } | 2431 | } |
2170 | 2432 | ||
2433 | /** | ||
2434 | * sys_rt_sigpending - examine a pending signal that has been raised | ||
2435 | * while blocked | ||
2436 | * @set: stores pending signals | ||
2437 | * @sigsetsize: size of sigset_t type or larger | ||
2438 | */ | ||
2171 | SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) | 2439 | SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) |
2172 | { | 2440 | { |
2173 | return do_sigpending(set, sigsetsize); | 2441 | return do_sigpending(set, sigsetsize); |
@@ -2216,9 +2484,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
2216 | err |= __put_user(from->si_trapno, &to->si_trapno); | 2484 | err |= __put_user(from->si_trapno, &to->si_trapno); |
2217 | #endif | 2485 | #endif |
2218 | #ifdef BUS_MCEERR_AO | 2486 | #ifdef BUS_MCEERR_AO |
2219 | /* | 2487 | /* |
2220 | * Other callers might not initialize the si_lsb field, | 2488 | * Other callers might not initialize the si_lsb field, |
2221 | * so check explicitely for the right codes here. | 2489 | * so check explicitly for the right codes here. |
2222 | */ | 2490 | */ |
2223 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) | 2491 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) |
2224 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); | 2492 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); |
@@ -2247,15 +2515,82 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
2247 | 2515 | ||
2248 | #endif | 2516 | #endif |
2249 | 2517 | ||
2518 | /** | ||
2519 | * do_sigtimedwait - wait for queued signals specified in @which | ||
2520 | * @which: queued signals to wait for | ||
2521 | * @info: if non-null, the signal's siginfo is returned here | ||
2522 | * @ts: upper bound on process time suspension | ||
2523 | */ | ||
2524 | int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | ||
2525 | const struct timespec *ts) | ||
2526 | { | ||
2527 | struct task_struct *tsk = current; | ||
2528 | long timeout = MAX_SCHEDULE_TIMEOUT; | ||
2529 | sigset_t mask = *which; | ||
2530 | int sig; | ||
2531 | |||
2532 | if (ts) { | ||
2533 | if (!timespec_valid(ts)) | ||
2534 | return -EINVAL; | ||
2535 | timeout = timespec_to_jiffies(ts); | ||
2536 | /* | ||
2537 | * We can be close to the next tick, add another one | ||
2538 | * to ensure we will wait at least the time asked for. | ||
2539 | */ | ||
2540 | if (ts->tv_sec || ts->tv_nsec) | ||
2541 | timeout++; | ||
2542 | } | ||
2543 | |||
2544 | /* | ||
2545 | * Invert the set of allowed signals to get those we want to block. | ||
2546 | */ | ||
2547 | sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2548 | signotset(&mask); | ||
2549 | |||
2550 | spin_lock_irq(&tsk->sighand->siglock); | ||
2551 | sig = dequeue_signal(tsk, &mask, info); | ||
2552 | if (!sig && timeout) { | ||
2553 | /* | ||
2554 | * None ready, temporarily unblock those we're interested | ||
2555 | * while we are sleeping in so that we'll be awakened when | ||
2556 | * they arrive. Unblocking is always fine, we can avoid | ||
2557 | * set_current_blocked(). | ||
2558 | */ | ||
2559 | tsk->real_blocked = tsk->blocked; | ||
2560 | sigandsets(&tsk->blocked, &tsk->blocked, &mask); | ||
2561 | recalc_sigpending(); | ||
2562 | spin_unlock_irq(&tsk->sighand->siglock); | ||
2563 | |||
2564 | timeout = schedule_timeout_interruptible(timeout); | ||
2565 | |||
2566 | spin_lock_irq(&tsk->sighand->siglock); | ||
2567 | __set_task_blocked(tsk, &tsk->real_blocked); | ||
2568 | siginitset(&tsk->real_blocked, 0); | ||
2569 | sig = dequeue_signal(tsk, &mask, info); | ||
2570 | } | ||
2571 | spin_unlock_irq(&tsk->sighand->siglock); | ||
2572 | |||
2573 | if (sig) | ||
2574 | return sig; | ||
2575 | return timeout ? -EINTR : -EAGAIN; | ||
2576 | } | ||
2577 | |||
2578 | /** | ||
2579 | * sys_rt_sigtimedwait - synchronously wait for queued signals specified | ||
2580 | * in @uthese | ||
2581 | * @uthese: queued signals to wait for | ||
2582 | * @uinfo: if non-null, the signal's siginfo is returned here | ||
2583 | * @uts: upper bound on process time suspension | ||
2584 | * @sigsetsize: size of sigset_t type | ||
2585 | */ | ||
2250 | SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | 2586 | SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, |
2251 | siginfo_t __user *, uinfo, const struct timespec __user *, uts, | 2587 | siginfo_t __user *, uinfo, const struct timespec __user *, uts, |
2252 | size_t, sigsetsize) | 2588 | size_t, sigsetsize) |
2253 | { | 2589 | { |
2254 | int ret, sig; | ||
2255 | sigset_t these; | 2590 | sigset_t these; |
2256 | struct timespec ts; | 2591 | struct timespec ts; |
2257 | siginfo_t info; | 2592 | siginfo_t info; |
2258 | long timeout = 0; | 2593 | int ret; |
2259 | 2594 | ||
2260 | /* XXX: Don't preclude handling different sized sigset_t's. */ | 2595 | /* XXX: Don't preclude handling different sized sigset_t's. */ |
2261 | if (sigsetsize != sizeof(sigset_t)) | 2596 | if (sigsetsize != sizeof(sigset_t)) |
@@ -2263,65 +2598,27 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | |||
2263 | 2598 | ||
2264 | if (copy_from_user(&these, uthese, sizeof(these))) | 2599 | if (copy_from_user(&these, uthese, sizeof(these))) |
2265 | return -EFAULT; | 2600 | return -EFAULT; |
2266 | |||
2267 | /* | ||
2268 | * Invert the set of allowed signals to get those we | ||
2269 | * want to block. | ||
2270 | */ | ||
2271 | sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
2272 | signotset(&these); | ||
2273 | 2601 | ||
2274 | if (uts) { | 2602 | if (uts) { |
2275 | if (copy_from_user(&ts, uts, sizeof(ts))) | 2603 | if (copy_from_user(&ts, uts, sizeof(ts))) |
2276 | return -EFAULT; | 2604 | return -EFAULT; |
2277 | if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 | ||
2278 | || ts.tv_sec < 0) | ||
2279 | return -EINVAL; | ||
2280 | } | 2605 | } |
2281 | 2606 | ||
2282 | spin_lock_irq(&current->sighand->siglock); | 2607 | ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); |
2283 | sig = dequeue_signal(current, &these, &info); | ||
2284 | if (!sig) { | ||
2285 | timeout = MAX_SCHEDULE_TIMEOUT; | ||
2286 | if (uts) | ||
2287 | timeout = (timespec_to_jiffies(&ts) | ||
2288 | + (ts.tv_sec || ts.tv_nsec)); | ||
2289 | |||
2290 | if (timeout) { | ||
2291 | /* None ready -- temporarily unblock those we're | ||
2292 | * interested while we are sleeping in so that we'll | ||
2293 | * be awakened when they arrive. */ | ||
2294 | current->real_blocked = current->blocked; | ||
2295 | sigandsets(&current->blocked, &current->blocked, &these); | ||
2296 | recalc_sigpending(); | ||
2297 | spin_unlock_irq(&current->sighand->siglock); | ||
2298 | |||
2299 | timeout = schedule_timeout_interruptible(timeout); | ||
2300 | |||
2301 | spin_lock_irq(&current->sighand->siglock); | ||
2302 | sig = dequeue_signal(current, &these, &info); | ||
2303 | current->blocked = current->real_blocked; | ||
2304 | siginitset(¤t->real_blocked, 0); | ||
2305 | recalc_sigpending(); | ||
2306 | } | ||
2307 | } | ||
2308 | spin_unlock_irq(&current->sighand->siglock); | ||
2309 | 2608 | ||
2310 | if (sig) { | 2609 | if (ret > 0 && uinfo) { |
2311 | ret = sig; | 2610 | if (copy_siginfo_to_user(uinfo, &info)) |
2312 | if (uinfo) { | 2611 | ret = -EFAULT; |
2313 | if (copy_siginfo_to_user(uinfo, &info)) | ||
2314 | ret = -EFAULT; | ||
2315 | } | ||
2316 | } else { | ||
2317 | ret = -EAGAIN; | ||
2318 | if (timeout) | ||
2319 | ret = -EINTR; | ||
2320 | } | 2612 | } |
2321 | 2613 | ||
2322 | return ret; | 2614 | return ret; |
2323 | } | 2615 | } |
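
do_sigtimedwait()/sys_rt_sigtimedwait() above implement the synchronous wait: return the dequeued signal number, -EAGAIN when the timeout expires, -EINTR when interrupted by an unrelated signal. A small user-space sketch of that contract via the sigtimedwait() wrapper (standard POSIX; error handling trimmed; run stand-alone it simply times out after two seconds):

    #include <signal.h>
    #include <stdio.h>
    #include <errno.h>
    #include <time.h>

    int main(void)
    {
            sigset_t set;
            siginfo_t info;
            struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
            int sig;

            sigemptyset(&set);
            sigaddset(&set, SIGUSR1);
            sigprocmask(SIG_BLOCK, &set, NULL);     /* must be blocked to be waited for */

            sig = sigtimedwait(&set, &info, &ts);
            if (sig > 0)
                    printf("got signal %d from pid %d\n", sig, (int)info.si_pid);
            else if (errno == EAGAIN)
                    printf("timed out\n");          /* kernel returned -EAGAIN */
            else if (errno == EINTR)
                    printf("interrupted by another signal\n");
            return 0;
    }
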
2324 | 2616 | ||
2617 | /** | ||
2618 | * sys_kill - send a signal to a process | ||
2619 | * @pid: the PID of the process | ||
2620 | * @sig: signal to be sent | ||
2621 | */ | ||
2325 | SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) | 2622 | SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) |
2326 | { | 2623 | { |
2327 | struct siginfo info; | 2624 | struct siginfo info; |
@@ -2397,7 +2694,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) | |||
2397 | return do_tkill(tgid, pid, sig); | 2694 | return do_tkill(tgid, pid, sig); |
2398 | } | 2695 | } |
2399 | 2696 | ||
2400 | /* | 2697 | /** |
2698 | * sys_tkill - send signal to one specific task | ||
2699 | * @pid: the PID of the task | ||
2700 | * @sig: signal to be sent | ||
2701 | * | ||
2401 | * Send a signal to only one task, even if it's a CLONE_THREAD task. | 2702 | * Send a signal to only one task, even if it's a CLONE_THREAD task. |
2402 | */ | 2703 | */ |
2403 | SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) | 2704 | SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) |
@@ -2409,6 +2710,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) | |||
2409 | return do_tkill(0, pid, sig); | 2710 | return do_tkill(0, pid, sig); |
2410 | } | 2711 | } |
2411 | 2712 | ||
2713 | /** | ||
2714 | * sys_rt_sigqueueinfo - send signal information to a signal | ||
2715 | * @pid: the PID of the thread | ||
2716 | * @sig: signal to be sent | ||
2717 | * @uinfo: signal info to be sent | ||
2718 | */ | ||
2412 | SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, | 2719 | SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, |
2413 | siginfo_t __user *, uinfo) | 2720 | siginfo_t __user *, uinfo) |
2414 | { | 2721 | { |
@@ -2418,9 +2725,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, | |||
2418 | return -EFAULT; | 2725 | return -EFAULT; |
2419 | 2726 | ||
2420 | /* Not even root can pretend to send signals from the kernel. | 2727 | /* Not even root can pretend to send signals from the kernel. |
2421 | Nor can they impersonate a kill(), which adds source info. */ | 2728 | * Nor can they impersonate a kill()/tgkill(), which adds source info. |
2422 | if (info.si_code >= 0) | 2729 | */ |
2730 | if (info.si_code >= 0 || info.si_code == SI_TKILL) { | ||
2731 | /* We used to allow any < 0 si_code */ | ||
2732 | WARN_ON_ONCE(info.si_code < 0); | ||
2423 | return -EPERM; | 2733 | return -EPERM; |
2734 | } | ||
2424 | info.si_signo = sig; | 2735 | info.si_signo = sig; |
2425 | 2736 | ||
2426 | /* POSIX.1b doesn't mention process groups. */ | 2737 | /* POSIX.1b doesn't mention process groups. */ |
@@ -2434,9 +2745,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) | |||
2434 | return -EINVAL; | 2745 | return -EINVAL; |
2435 | 2746 | ||
2436 | /* Not even root can pretend to send signals from the kernel. | 2747 | /* Not even root can pretend to send signals from the kernel. |
2437 | Nor can they impersonate a kill(), which adds source info. */ | 2748 | * Nor can they impersonate a kill()/tgkill(), which adds source info. |
2438 | if (info->si_code >= 0) | 2749 | */ |
2750 | if (info->si_code >= 0 || info->si_code == SI_TKILL) { | ||
2751 | /* We used to allow any < 0 si_code */ | ||
2752 | WARN_ON_ONCE(info->si_code < 0); | ||
2439 | return -EPERM; | 2753 | return -EPERM; |
2754 | } | ||
2440 | info->si_signo = sig; | 2755 | info->si_signo = sig; |
2441 | 2756 | ||
2442 | return do_send_specific(tgid, pid, sig, info); | 2757 | return do_send_specific(tgid, pid, sig, info); |
@@ -2528,12 +2843,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s | |||
2528 | 2843 | ||
2529 | error = -EINVAL; | 2844 | error = -EINVAL; |
2530 | /* | 2845 | /* |
2531 | * | 2846 | * Note - this code used to test ss_flags incorrectly: |
2532 | * Note - this code used to test ss_flags incorrectly | ||
2533 | * old code may have been written using ss_flags==0 | 2847 | * old code may have been written using ss_flags==0 |
2534 | * to mean ss_flags==SS_ONSTACK (as this was the only | 2848 | * to mean ss_flags==SS_ONSTACK (as this was the only |
2535 | * way that worked) - this fix preserves that older | 2849 | * way that worked) - this fix preserves that older |
2536 | * mechanism | 2850 | * mechanism. |
2537 | */ | 2851 | */ |
2538 | if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) | 2852 | if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) |
2539 | goto out; | 2853 | goto out; |
@@ -2567,6 +2881,10 @@ out: | |||
2567 | 2881 | ||
2568 | #ifdef __ARCH_WANT_SYS_SIGPENDING | 2882 | #ifdef __ARCH_WANT_SYS_SIGPENDING |
2569 | 2883 | ||
2884 | /** | ||
2885 | * sys_sigpending - examine pending signals | ||
2886 | * @set: where mask of pending signal is returned | ||
2887 | */ | ||
2570 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | 2888 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) |
2571 | { | 2889 | { |
2572 | return do_sigpending(set, sizeof(*set)); | 2890 | return do_sigpending(set, sizeof(*set)); |
@@ -2575,60 +2893,65 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | |||
2575 | #endif | 2893 | #endif |
2576 | 2894 | ||
2577 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | 2895 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK |
2578 | /* Some platforms have their own version with special arguments others | 2896 | /** |
2579 | support only sys_rt_sigprocmask. */ | 2897 | * sys_sigprocmask - examine and change blocked signals |
2898 | * @how: whether to add, remove, or set signals | ||
2899 | * @nset: signals to add or remove (if non-null) | ||
2900 | * @oset: previous value of signal mask if non-null | ||
2901 | * | ||
2902 | * Some platforms have their own version with special arguments; | ||
2903 | * others support only sys_rt_sigprocmask. | ||
2904 | */ | ||
2580 | 2905 | ||
2581 | SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, | 2906 | SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, |
2582 | old_sigset_t __user *, oset) | 2907 | old_sigset_t __user *, oset) |
2583 | { | 2908 | { |
2584 | int error; | ||
2585 | old_sigset_t old_set, new_set; | 2909 | old_sigset_t old_set, new_set; |
2910 | sigset_t new_blocked; | ||
2586 | 2911 | ||
2587 | if (set) { | 2912 | old_set = current->blocked.sig[0]; |
2588 | error = -EFAULT; | 2913 | |
2589 | if (copy_from_user(&new_set, set, sizeof(*set))) | 2914 | if (nset) { |
2590 | goto out; | 2915 | if (copy_from_user(&new_set, nset, sizeof(*nset))) |
2916 | return -EFAULT; | ||
2591 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); | 2917 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); |
2592 | 2918 | ||
2593 | spin_lock_irq(&current->sighand->siglock); | 2919 | new_blocked = current->blocked; |
2594 | old_set = current->blocked.sig[0]; | ||
2595 | 2920 | ||
2596 | error = 0; | ||
2597 | switch (how) { | 2921 | switch (how) { |
2598 | default: | ||
2599 | error = -EINVAL; | ||
2600 | break; | ||
2601 | case SIG_BLOCK: | 2922 | case SIG_BLOCK: |
2602 | sigaddsetmask(&current->blocked, new_set); | 2923 | sigaddsetmask(&new_blocked, new_set); |
2603 | break; | 2924 | break; |
2604 | case SIG_UNBLOCK: | 2925 | case SIG_UNBLOCK: |
2605 | sigdelsetmask(&current->blocked, new_set); | 2926 | sigdelsetmask(&new_blocked, new_set); |
2606 | break; | 2927 | break; |
2607 | case SIG_SETMASK: | 2928 | case SIG_SETMASK: |
2608 | current->blocked.sig[0] = new_set; | 2929 | new_blocked.sig[0] = new_set; |
2609 | break; | 2930 | break; |
2931 | default: | ||
2932 | return -EINVAL; | ||
2610 | } | 2933 | } |
2611 | 2934 | ||
2612 | recalc_sigpending(); | 2935 | set_current_blocked(&new_blocked); |
2613 | spin_unlock_irq(&current->sighand->siglock); | 2936 | } |
2614 | if (error) | 2937 | |
2615 | goto out; | 2938 | if (oset) { |
2616 | if (oset) | ||
2617 | goto set_old; | ||
2618 | } else if (oset) { | ||
2619 | old_set = current->blocked.sig[0]; | ||
2620 | set_old: | ||
2621 | error = -EFAULT; | ||
2622 | if (copy_to_user(oset, &old_set, sizeof(*oset))) | 2939 | if (copy_to_user(oset, &old_set, sizeof(*oset))) |
2623 | goto out; | 2940 | return -EFAULT; |
2624 | } | 2941 | } |
2625 | error = 0; | 2942 | |
2626 | out: | 2943 | return 0; |
2627 | return error; | ||
2628 | } | 2944 | } |
2629 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ | 2945 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ |
2630 | 2946 | ||
2631 | #ifdef __ARCH_WANT_SYS_RT_SIGACTION | 2947 | #ifdef __ARCH_WANT_SYS_RT_SIGACTION |
2948 | /** | ||
2949 | * sys_rt_sigaction - alter an action taken by a process | ||
2950 | * @sig: signal to be sent | ||
2951 | * @act: new sigaction | ||
2952 | * @oact: used to save the previous sigaction | ||
2953 | * @sigsetsize: size of sigset_t type | ||
2954 | */ | ||
2632 | SYSCALL_DEFINE4(rt_sigaction, int, sig, | 2955 | SYSCALL_DEFINE4(rt_sigaction, int, sig, |
2633 | const struct sigaction __user *, act, | 2956 | const struct sigaction __user *, act, |
2634 | struct sigaction __user *, oact, | 2957 | struct sigaction __user *, oact, |
@@ -2707,14 +3030,22 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) | |||
2707 | 3030 | ||
2708 | SYSCALL_DEFINE0(pause) | 3031 | SYSCALL_DEFINE0(pause) |
2709 | { | 3032 | { |
2710 | current->state = TASK_INTERRUPTIBLE; | 3033 | while (!signal_pending(current)) { |
2711 | schedule(); | 3034 | current->state = TASK_INTERRUPTIBLE; |
3035 | schedule(); | ||
3036 | } | ||
2712 | return -ERESTARTNOHAND; | 3037 | return -ERESTARTNOHAND; |
2713 | } | 3038 | } |
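
The sys_pause() change above closes a race: the task now loops until a signal is actually pending before returning, instead of trusting a single schedule(). From user space the contract is unchanged: pause() only returns after a handled signal, always with -1/EINTR. A minimal sketch (standard POSIX, for reference only):

    #include <signal.h>
    #include <stdio.h>
    #include <errno.h>
    #include <unistd.h>

    static void on_alarm(int sig)
    {
            (void)sig;                      /* nothing to do: just interrupt pause() */
    }

    int main(void)
    {
            signal(SIGALRM, on_alarm);
            alarm(1);                       /* deliver SIGALRM in one second */

            if (pause() == -1 && errno == EINTR)
                    printf("pause() returned after the handled signal\n");
            return 0;
    }
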
2714 | 3039 | ||
2715 | #endif | 3040 | #endif |
2716 | 3041 | ||
2717 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND | 3042 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND |
3043 | /** | ||
3044 | * sys_rt_sigsuspend - replace the signal mask for a value with the | ||
3045 | * @unewset value until a signal is received | ||
3046 | * @unewset: new signal mask value | ||
3047 | * @sigsetsize: size of sigset_t type | ||
3048 | */ | ||
2718 | SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) | 3049 | SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) |
2719 | { | 3050 | { |
2720 | sigset_t newset; | 3051 | sigset_t newset; |
diff --git a/kernel/smp.c b/kernel/smp.c index ed6aacfcb7ef..fb67dfa8394e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | 15 | ||
16 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | ||
16 | static struct { | 17 | static struct { |
17 | struct list_head queue; | 18 | struct list_head queue; |
18 | raw_spinlock_t lock; | 19 | raw_spinlock_t lock; |
@@ -73,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { | |||
73 | .notifier_call = hotplug_cfd, | 74 | .notifier_call = hotplug_cfd, |
74 | }; | 75 | }; |
75 | 76 | ||
76 | static int __cpuinit init_call_single_data(void) | 77 | void __init call_function_init(void) |
77 | { | 78 | { |
78 | void *cpu = (void *)(long)smp_processor_id(); | 79 | void *cpu = (void *)(long)smp_processor_id(); |
79 | int i; | 80 | int i; |
@@ -87,10 +88,7 @@ static int __cpuinit init_call_single_data(void) | |||
87 | 88 | ||
88 | hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); | 89 | hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); |
89 | register_cpu_notifier(&hotplug_cfd_notifier); | 90 | register_cpu_notifier(&hotplug_cfd_notifier); |
90 | |||
91 | return 0; | ||
92 | } | 91 | } |
93 | early_initcall(init_call_single_data); | ||
94 | 92 | ||
95 | /* | 93 | /* |
96 | * csd_lock/csd_unlock used to serialize access to per-cpu csd resources | 94 | * csd_lock/csd_unlock used to serialize access to per-cpu csd resources |
@@ -193,23 +191,52 @@ void generic_smp_call_function_interrupt(void) | |||
193 | */ | 191 | */ |
194 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { | 192 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { |
195 | int refs; | 193 | int refs; |
194 | smp_call_func_t func; | ||
195 | |||
196 | /* | ||
197 | * Since we walk the list without any locks, we might | ||
198 | * see an entry that was completed, removed from the | ||
199 | * list and is in the process of being reused. | ||
200 | * | ||
201 | * We must check that the cpu is in the cpumask before | ||
202 | * checking the refs, and both must be set before | ||
203 | * executing the callback on this cpu. | ||
204 | */ | ||
196 | 205 | ||
197 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) | 206 | if (!cpumask_test_cpu(cpu, data->cpumask)) |
198 | continue; | 207 | continue; |
199 | 208 | ||
200 | data->csd.func(data->csd.info); | 209 | smp_rmb(); |
210 | |||
211 | if (atomic_read(&data->refs) == 0) | ||
212 | continue; | ||
213 | |||
214 | func = data->csd.func; /* save for later warn */ | ||
215 | func(data->csd.info); | ||
216 | |||
217 | /* | ||
218 | * If the cpu mask is not still set then func enabled | ||
219 | * interrupts (BUG), and this cpu took another smp call | ||
220 | * function interrupt and executed func(info) twice | ||
221 | * on this cpu. That nested execution decremented refs. | ||
222 | */ | ||
223 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { | ||
224 | WARN(1, "%pf enabled interrupts and double executed\n", func); | ||
225 | continue; | ||
226 | } | ||
201 | 227 | ||
202 | refs = atomic_dec_return(&data->refs); | 228 | refs = atomic_dec_return(&data->refs); |
203 | WARN_ON(refs < 0); | 229 | WARN_ON(refs < 0); |
204 | if (!refs) { | ||
205 | raw_spin_lock(&call_function.lock); | ||
206 | list_del_rcu(&data->csd.list); | ||
207 | raw_spin_unlock(&call_function.lock); | ||
208 | } | ||
209 | 230 | ||
210 | if (refs) | 231 | if (refs) |
211 | continue; | 232 | continue; |
212 | 233 | ||
234 | WARN_ON(!cpumask_empty(data->cpumask)); | ||
235 | |||
236 | raw_spin_lock(&call_function.lock); | ||
237 | list_del_rcu(&data->csd.list); | ||
238 | raw_spin_unlock(&call_function.lock); | ||
239 | |||
213 | csd_unlock(&data->csd); | 240 | csd_unlock(&data->csd); |
214 | } | 241 | } |
215 | 242 | ||
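
The barrier dance above (smp_wmb() on the sending side, smp_rmb() in the interrupt handler) is the usual publish/consume pattern: the payload (cpumask, func, info) must be globally visible before refs is set, and refs must be observed non-zero before the payload is trusted. The same pattern, expressed with C11 atomics purely as an analogy (this is not the kernel's implementation):

    #include <stdatomic.h>

    struct msg {
            int payload;                    /* stands in for func/info/cpumask */
            atomic_int refs;                /* 0 means "not ready yet" */
    };

    /* sender: fill the payload, then publish it by setting refs last */
    static void publish(struct msg *m, int value, int nr_consumers)
    {
            m->payload = value;
            /* release ordering plays the role of the kernel's smp_wmb() */
            atomic_store_explicit(&m->refs, nr_consumers, memory_order_release);
    }

    /* consumer: only touch the payload once refs is seen non-zero */
    static int consume(struct msg *m)
    {
            /* acquire ordering plays the role of the kernel's smp_rmb() */
            if (atomic_load_explicit(&m->refs, memory_order_acquire) == 0)
                    return -1;              /* not ready: skip this entry */
            return m->payload;
    }
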
@@ -267,7 +294,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); | |||
267 | * | 294 | * |
268 | * Returns 0 on success, else a negative status code. | 295 | * Returns 0 on success, else a negative status code. |
269 | */ | 296 | */ |
270 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | 297 | int smp_call_function_single(int cpu, smp_call_func_t func, void *info, |
271 | int wait) | 298 | int wait) |
272 | { | 299 | { |
273 | struct call_single_data d = { | 300 | struct call_single_data d = { |
@@ -336,7 +363,7 @@ EXPORT_SYMBOL(smp_call_function_single); | |||
336 | * 3) any other online cpu in @mask | 363 | * 3) any other online cpu in @mask |
337 | */ | 364 | */ |
338 | int smp_call_function_any(const struct cpumask *mask, | 365 | int smp_call_function_any(const struct cpumask *mask, |
339 | void (*func)(void *info), void *info, int wait) | 366 | smp_call_func_t func, void *info, int wait) |
340 | { | 367 | { |
341 | unsigned int cpu; | 368 | unsigned int cpu; |
342 | const struct cpumask *nodemask; | 369 | const struct cpumask *nodemask; |
@@ -416,11 +443,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, | |||
416 | * must be disabled when calling this function. | 443 | * must be disabled when calling this function. |
417 | */ | 444 | */ |
418 | void smp_call_function_many(const struct cpumask *mask, | 445 | void smp_call_function_many(const struct cpumask *mask, |
419 | void (*func)(void *), void *info, bool wait) | 446 | smp_call_func_t func, void *info, bool wait) |
420 | { | 447 | { |
421 | struct call_function_data *data; | 448 | struct call_function_data *data; |
422 | unsigned long flags; | 449 | unsigned long flags; |
423 | int cpu, next_cpu, this_cpu = smp_processor_id(); | 450 | int refs, cpu, next_cpu, this_cpu = smp_processor_id(); |
424 | 451 | ||
425 | /* | 452 | /* |
426 | * Can deadlock when called with interrupts disabled. | 453 | * Can deadlock when called with interrupts disabled. |
@@ -429,9 +456,9 @@ void smp_call_function_many(const struct cpumask *mask, | |||
429 | * can't happen. | 456 | * can't happen. |
430 | */ | 457 | */ |
431 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() | 458 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() |
432 | && !oops_in_progress); | 459 | && !oops_in_progress && !early_boot_irqs_disabled); |
433 | 460 | ||
434 | /* So, what's a CPU they want? Ignoring this one. */ | 461 | /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ |
435 | cpu = cpumask_first_and(mask, cpu_online_mask); | 462 | cpu = cpumask_first_and(mask, cpu_online_mask); |
436 | if (cpu == this_cpu) | 463 | if (cpu == this_cpu) |
437 | cpu = cpumask_next_and(cpu, mask, cpu_online_mask); | 464 | cpu = cpumask_next_and(cpu, mask, cpu_online_mask); |
@@ -454,11 +481,48 @@ void smp_call_function_many(const struct cpumask *mask, | |||
454 | data = &__get_cpu_var(cfd_data); | 481 | data = &__get_cpu_var(cfd_data); |
455 | csd_lock(&data->csd); | 482 | csd_lock(&data->csd); |
456 | 483 | ||
484 | /* This BUG_ON verifies our reuse assertions and can be removed */ | ||
485 | BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); | ||
486 | |||
487 | /* | ||
488 | * The global call function queue list add and delete are protected | ||
489 | * by a lock, but the list is traversed without any lock, relying | ||
490 | * on the rcu list add and delete to allow safe concurrent traversal. | ||
491 | * We reuse the call function data without waiting for any grace | ||
492 | * period after some other cpu removes it from the global queue. | ||
493 | * This means a cpu might find our data block as it is being | ||
494 | * filled out. | ||
495 | * | ||
496 | * We hold off the interrupt handler on the other cpu by | ||
497 | * ordering our writes to the cpu mask vs our setting of the | ||
498 | * refs counter. We assert only the cpu owning the data block | ||
499 | * will set a bit in cpumask, and each bit will only be cleared | ||
500 | * by the subject cpu. Each cpu must first find its bit is | ||
501 | * set and then check that refs is set indicating the element is | ||
502 | * ready to be processed, otherwise it must skip the entry. | ||
503 | * | ||
504 | * On the previous iteration refs was set to 0 by another cpu. | ||
505 | * To avoid the use of transitivity, set the counter to 0 here | ||
506 | * so the wmb will pair with the rmb in the interrupt handler. | ||
507 | */ | ||
508 | atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ | ||
509 | |||
457 | data->csd.func = func; | 510 | data->csd.func = func; |
458 | data->csd.info = info; | 511 | data->csd.info = info; |
512 | |||
513 | /* Ensure 0 refs is visible before mask. Also orders func and info */ | ||
514 | smp_wmb(); | ||
515 | |||
516 | /* We rely on the "and" being processed before the store */ | ||
459 | cpumask_and(data->cpumask, mask, cpu_online_mask); | 517 | cpumask_and(data->cpumask, mask, cpu_online_mask); |
460 | cpumask_clear_cpu(this_cpu, data->cpumask); | 518 | cpumask_clear_cpu(this_cpu, data->cpumask); |
461 | atomic_set(&data->refs, cpumask_weight(data->cpumask)); | 519 | refs = cpumask_weight(data->cpumask); |
520 | |||
521 | /* Some callers race with other cpus changing the passed mask */ | ||
522 | if (unlikely(!refs)) { | ||
523 | csd_unlock(&data->csd); | ||
524 | return; | ||
525 | } | ||
462 | 526 | ||
463 | raw_spin_lock_irqsave(&call_function.lock, flags); | 527 | raw_spin_lock_irqsave(&call_function.lock, flags); |
464 | /* | 528 | /* |
@@ -467,6 +531,12 @@ void smp_call_function_many(const struct cpumask *mask, | |||
467 | * will not miss any other list entries: | 531 | * will not miss any other list entries: |
468 | */ | 532 | */ |
469 | list_add_rcu(&data->csd.list, &call_function.queue); | 533 | list_add_rcu(&data->csd.list, &call_function.queue); |
534 | /* | ||
535 | * We rely on the wmb() in list_add_rcu to complete our writes | ||
536 | * to the cpumask before this write to refs, which indicates | ||
537 | * data is on the list and is ready to be processed. | ||
538 | */ | ||
539 | atomic_set(&data->refs, refs); | ||
470 | raw_spin_unlock_irqrestore(&call_function.lock, flags); | 540 | raw_spin_unlock_irqrestore(&call_function.lock, flags); |
471 | 541 | ||
472 | /* | 542 | /* |
@@ -500,7 +570,7 @@ EXPORT_SYMBOL(smp_call_function_many); | |||
500 | * You must not call this function with disabled interrupts or from a | 570 | * You must not call this function with disabled interrupts or from a |
501 | * hardware interrupt handler or from a bottom half handler. | 571 | * hardware interrupt handler or from a bottom half handler. |
502 | */ | 572 | */ |
503 | int smp_call_function(void (*func)(void *), void *info, int wait) | 573 | int smp_call_function(smp_call_func_t func, void *info, int wait) |
504 | { | 574 | { |
505 | preempt_disable(); | 575 | preempt_disable(); |
506 | smp_call_function_many(cpu_online_mask, func, info, wait); | 576 | smp_call_function_many(cpu_online_mask, func, info, wait); |
@@ -529,3 +599,105 @@ void ipi_call_unlock_irq(void) | |||
529 | { | 599 | { |
530 | raw_spin_unlock_irq(&call_function.lock); | 600 | raw_spin_unlock_irq(&call_function.lock); |
531 | } | 601 | } |
602 | #endif /* USE_GENERIC_SMP_HELPERS */ | ||
603 | |||
604 | /* Setup configured maximum number of CPUs to activate */ | ||
605 | unsigned int setup_max_cpus = NR_CPUS; | ||
606 | EXPORT_SYMBOL(setup_max_cpus); | ||
607 | |||
608 | |||
609 | /* | ||
610 | * Setup routine for controlling SMP activation | ||
611 | * | ||
612 | * Command-line option of "nosmp" or "maxcpus=0" will disable SMP | ||
613 | * activation entirely (the MPS table probe still happens, though). | ||
614 | * | ||
615 | * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer | ||
616 | * greater than 0, limits the maximum number of CPUs activated in | ||
617 | * SMP mode to <NUM>. | ||
618 | */ | ||
619 | |||
620 | void __weak arch_disable_smp_support(void) { } | ||
621 | |||
622 | static int __init nosmp(char *str) | ||
623 | { | ||
624 | setup_max_cpus = 0; | ||
625 | arch_disable_smp_support(); | ||
626 | |||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | early_param("nosmp", nosmp); | ||
631 | |||
632 | /* this is hard limit */ | ||
633 | static int __init nrcpus(char *str) | ||
634 | { | ||
635 | int nr_cpus; | ||
636 | |||
637 | get_option(&str, &nr_cpus); | ||
638 | if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) | ||
639 | nr_cpu_ids = nr_cpus; | ||
640 | |||
641 | return 0; | ||
642 | } | ||
643 | |||
644 | early_param("nr_cpus", nrcpus); | ||
645 | |||
646 | static int __init maxcpus(char *str) | ||
647 | { | ||
648 | get_option(&str, &setup_max_cpus); | ||
649 | if (setup_max_cpus == 0) | ||
650 | arch_disable_smp_support(); | ||
651 | |||
652 | return 0; | ||
653 | } | ||
654 | |||
655 | early_param("maxcpus", maxcpus); | ||
656 | |||
657 | /* Setup number of possible processor ids */ | ||
658 | int nr_cpu_ids __read_mostly = NR_CPUS; | ||
659 | EXPORT_SYMBOL(nr_cpu_ids); | ||
660 | |||
661 | /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ | ||
662 | void __init setup_nr_cpu_ids(void) | ||
663 | { | ||
664 | nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; | ||
665 | } | ||
666 | |||
667 | /* Called by boot processor to activate the rest. */ | ||
668 | void __init smp_init(void) | ||
669 | { | ||
670 | unsigned int cpu; | ||
671 | |||
672 | /* FIXME: This should be done in userspace --RR */ | ||
673 | for_each_present_cpu(cpu) { | ||
674 | if (num_online_cpus() >= setup_max_cpus) | ||
675 | break; | ||
676 | if (!cpu_online(cpu)) | ||
677 | cpu_up(cpu); | ||
678 | } | ||
679 | |||
680 | /* Any cleanup work */ | ||
681 | printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); | ||
682 | smp_cpus_done(setup_max_cpus); | ||
683 | } | ||
684 | |||
685 | /* | ||
686 | * Call a function on all processors. May be used during early boot while | ||
687 | * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead | ||
688 | * of local_irq_disable/enable(). | ||
689 | */ | ||
690 | int on_each_cpu(void (*func) (void *info), void *info, int wait) | ||
691 | { | ||
692 | unsigned long flags; | ||
693 | int ret = 0; | ||
694 | |||
695 | preempt_disable(); | ||
696 | ret = smp_call_function(func, info, wait); | ||
697 | local_irq_save(flags); | ||
698 | func(info); | ||
699 | local_irq_restore(flags); | ||
700 | preempt_enable(); | ||
701 | return ret; | ||
702 | } | ||
703 | EXPORT_SYMBOL(on_each_cpu); | ||
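
on_each_cpu() as moved here runs func on every online CPU, including the caller (via the local_irq_save() path), and returns only after all callbacks finish when wait is non-zero. A hedged kernel-style sketch of a typical caller, using a made-up per-cpu counter purely for illustration:

    #include <linux/percpu.h>
    #include <linux/smp.h>

    static DEFINE_PER_CPU(unsigned long, demo_events);

    /* runs on every CPU; keep it short and IRQ-safe */
    static void bump_demo_counter(void *info)
    {
            __this_cpu_inc(demo_events);
    }

    static void bump_all_cpus(void)
    {
            /* wait == 1: return only after every CPU has run the callback */
            on_each_cpu(bump_demo_counter, NULL, 1);
    }
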
diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73a..fca82c32042b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -54,11 +54,11 @@ EXPORT_SYMBOL(irq_stat); | |||
54 | 54 | ||
55 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; | 55 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; |
56 | 56 | ||
57 | static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | 57 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); |
58 | 58 | ||
59 | char *softirq_to_name[NR_SOFTIRQS] = { | 59 | char *softirq_to_name[NR_SOFTIRQS] = { |
60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", | 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", |
61 | "TASKLET", "SCHED", "HRTIMER", "RCU" | 61 | "TASKLET", "SCHED", "HRTIMER", "RCU" |
62 | }; | 62 | }; |
63 | 63 | ||
64 | /* | 64 | /* |
@@ -67,21 +67,31 @@ char *softirq_to_name[NR_SOFTIRQS] = { | |||
67 | * to the pending events, so lets the scheduler to balance | 67 | * to the pending events, so lets the scheduler to balance |
68 | * the softirq load for us. | 68 | * the softirq load for us. |
69 | */ | 69 | */ |
70 | void wakeup_softirqd(void) | 70 | static void wakeup_softirqd(void) |
71 | { | 71 | { |
72 | /* Interrupts are disabled: no need to stop preemption */ | 72 | /* Interrupts are disabled: no need to stop preemption */ |
73 | struct task_struct *tsk = __get_cpu_var(ksoftirqd); | 73 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); |
74 | 74 | ||
75 | if (tsk && tsk->state != TASK_RUNNING) | 75 | if (tsk && tsk->state != TASK_RUNNING) |
76 | wake_up_process(tsk); | 76 | wake_up_process(tsk); |
77 | } | 77 | } |
78 | 78 | ||
79 | /* | 79 | /* |
80 | * preempt_count and SOFTIRQ_OFFSET usage: | ||
81 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving | ||
82 | * softirq processing. | ||
83 | * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) | ||
84 | * on local_bh_disable or local_bh_enable. | ||
85 | * This lets us distinguish between whether we are currently processing | ||
86 | * softirq and whether we just have bh disabled. | ||
87 | */ | ||
88 | |||
89 | /* | ||
80 | * This one is for softirq.c-internal use, | 90 | * This one is for softirq.c-internal use, |
81 | * where hardirqs are disabled legitimately: | 91 | * where hardirqs are disabled legitimately: |
82 | */ | 92 | */ |
83 | #ifdef CONFIG_TRACE_IRQFLAGS | 93 | #ifdef CONFIG_TRACE_IRQFLAGS |
84 | static void __local_bh_disable(unsigned long ip) | 94 | static void __local_bh_disable(unsigned long ip, unsigned int cnt) |
85 | { | 95 | { |
86 | unsigned long flags; | 96 | unsigned long flags; |
87 | 97 | ||
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip) | |||
95 | * We must manually increment preempt_count here and manually | 105 | * We must manually increment preempt_count here and manually |
96 | * call the trace_preempt_off later. | 106 | * call the trace_preempt_off later. |
97 | */ | 107 | */ |
98 | preempt_count() += SOFTIRQ_OFFSET; | 108 | preempt_count() += cnt; |
99 | /* | 109 | /* |
100 | * Were softirqs turned off above: | 110 | * Were softirqs turned off above: |
101 | */ | 111 | */ |
102 | if (softirq_count() == SOFTIRQ_OFFSET) | 112 | if (softirq_count() == cnt) |
103 | trace_softirqs_off(ip); | 113 | trace_softirqs_off(ip); |
104 | raw_local_irq_restore(flags); | 114 | raw_local_irq_restore(flags); |
105 | 115 | ||
106 | if (preempt_count() == SOFTIRQ_OFFSET) | 116 | if (preempt_count() == cnt) |
107 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 117 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
108 | } | 118 | } |
109 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 119 | #else /* !CONFIG_TRACE_IRQFLAGS */ |
110 | static inline void __local_bh_disable(unsigned long ip) | 120 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) |
111 | { | 121 | { |
112 | add_preempt_count(SOFTIRQ_OFFSET); | 122 | add_preempt_count(cnt); |
113 | barrier(); | 123 | barrier(); |
114 | } | 124 | } |
115 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 125 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
116 | 126 | ||
117 | void local_bh_disable(void) | 127 | void local_bh_disable(void) |
118 | { | 128 | { |
119 | __local_bh_disable((unsigned long)__builtin_return_address(0)); | 129 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
130 | SOFTIRQ_DISABLE_OFFSET); | ||
120 | } | 131 | } |
121 | 132 | ||
122 | EXPORT_SYMBOL(local_bh_disable); | 133 | EXPORT_SYMBOL(local_bh_disable); |
123 | 134 | ||
135 | static void __local_bh_enable(unsigned int cnt) | ||
136 | { | ||
137 | WARN_ON_ONCE(in_irq()); | ||
138 | WARN_ON_ONCE(!irqs_disabled()); | ||
139 | |||
140 | if (softirq_count() == cnt) | ||
141 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | ||
142 | sub_preempt_count(cnt); | ||
143 | } | ||
144 | |||
124 | /* | 145 | /* |
125 | * Special-case - softirqs can safely be enabled in | 146 | * Special-case - softirqs can safely be enabled in |
126 | * cond_resched_softirq(), or by __do_softirq(), | 147 | * cond_resched_softirq(), or by __do_softirq(), |
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable); | |||
128 | */ | 149 | */ |
129 | void _local_bh_enable(void) | 150 | void _local_bh_enable(void) |
130 | { | 151 | { |
131 | WARN_ON_ONCE(in_irq()); | 152 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); |
132 | WARN_ON_ONCE(!irqs_disabled()); | ||
133 | |||
134 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
135 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | ||
136 | sub_preempt_count(SOFTIRQ_OFFSET); | ||
137 | } | 153 | } |
138 | 154 | ||
139 | EXPORT_SYMBOL(_local_bh_enable); | 155 | EXPORT_SYMBOL(_local_bh_enable); |
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
147 | /* | 163 | /* |
148 | * Are softirqs going to be turned on now: | 164 | * Are softirqs going to be turned on now: |
149 | */ | 165 | */ |
150 | if (softirq_count() == SOFTIRQ_OFFSET) | 166 | if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) |
151 | trace_softirqs_on(ip); | 167 | trace_softirqs_on(ip); |
152 | /* | 168 | /* |
153 | * Keep preemption disabled until we are done with | 169 | * Keep preemption disabled until we are done with |
154 | * softirq processing: | 170 | * softirq processing: |
155 | */ | 171 | */ |
156 | sub_preempt_count(SOFTIRQ_OFFSET - 1); | 172 | sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); |
157 | 173 | ||
158 | if (unlikely(!in_interrupt() && local_softirq_pending())) | 174 | if (unlikely(!in_interrupt() && local_softirq_pending())) |
159 | do_softirq(); | 175 | do_softirq(); |
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void) | |||
198 | pending = local_softirq_pending(); | 214 | pending = local_softirq_pending(); |
199 | account_system_vtime(current); | 215 | account_system_vtime(current); |
200 | 216 | ||
201 | __local_bh_disable((unsigned long)__builtin_return_address(0)); | 217 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
218 | SOFTIRQ_OFFSET); | ||
202 | lockdep_softirq_enter(); | 219 | lockdep_softirq_enter(); |
203 | 220 | ||
204 | cpu = smp_processor_id(); | 221 | cpu = smp_processor_id(); |
@@ -212,18 +229,20 @@ restart: | |||
212 | 229 | ||
213 | do { | 230 | do { |
214 | if (pending & 1) { | 231 | if (pending & 1) { |
232 | unsigned int vec_nr = h - softirq_vec; | ||
215 | int prev_count = preempt_count(); | 233 | int prev_count = preempt_count(); |
216 | kstat_incr_softirqs_this_cpu(h - softirq_vec); | ||
217 | 234 | ||
218 | trace_softirq_entry(h, softirq_vec); | 235 | kstat_incr_softirqs_this_cpu(vec_nr); |
236 | |||
237 | trace_softirq_entry(vec_nr); | ||
219 | h->action(h); | 238 | h->action(h); |
220 | trace_softirq_exit(h, softirq_vec); | 239 | trace_softirq_exit(vec_nr); |
221 | if (unlikely(prev_count != preempt_count())) { | 240 | if (unlikely(prev_count != preempt_count())) { |
222 | printk(KERN_ERR "huh, entered softirq %td %s %p" | 241 | printk(KERN_ERR "huh, entered softirq %u %s %p" |
223 | "with preempt_count %08x," | 242 | "with preempt_count %08x," |
224 | " exited with %08x?\n", h - softirq_vec, | 243 | " exited with %08x?\n", vec_nr, |
225 | softirq_to_name[h - softirq_vec], | 244 | softirq_to_name[vec_nr], h->action, |
226 | h->action, prev_count, preempt_count()); | 245 | prev_count, preempt_count()); |
227 | preempt_count() = prev_count; | 246 | preempt_count() = prev_count; |
228 | } | 247 | } |
229 | 248 | ||
@@ -245,7 +264,7 @@ restart: | |||
245 | lockdep_softirq_exit(); | 264 | lockdep_softirq_exit(); |
246 | 265 | ||
247 | account_system_vtime(current); | 266 | account_system_vtime(current); |
248 | _local_bh_enable(); | 267 | __local_bh_enable(SOFTIRQ_OFFSET); |
249 | } | 268 | } |
250 | 269 | ||
251 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 270 | #ifndef __ARCH_HAS_DO_SOFTIRQ |
@@ -279,16 +298,42 @@ void irq_enter(void) | |||
279 | 298 | ||
280 | rcu_irq_enter(); | 299 | rcu_irq_enter(); |
281 | if (idle_cpu(cpu) && !in_interrupt()) { | 300 | if (idle_cpu(cpu) && !in_interrupt()) { |
282 | __irq_enter(); | 301 | /* |
302 | * Prevent raise_softirq from needlessly waking up ksoftirqd | ||
303 | * here, as softirq will be serviced on return from interrupt. | ||
304 | */ | ||
305 | local_bh_disable(); | ||
283 | tick_check_idle(cpu); | 306 | tick_check_idle(cpu); |
284 | } else | 307 | _local_bh_enable(); |
285 | __irq_enter(); | 308 | } |
309 | |||
310 | __irq_enter(); | ||
286 | } | 311 | } |
287 | 312 | ||
288 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | 313 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED |
289 | # define invoke_softirq() __do_softirq() | 314 | static inline void invoke_softirq(void) |
315 | { | ||
316 | if (!force_irqthreads) | ||
317 | __do_softirq(); | ||
318 | else { | ||
319 | __local_bh_disable((unsigned long)__builtin_return_address(0), | ||
320 | SOFTIRQ_OFFSET); | ||
321 | wakeup_softirqd(); | ||
322 | __local_bh_enable(SOFTIRQ_OFFSET); | ||
323 | } | ||
324 | } | ||
290 | #else | 325 | #else |
291 | # define invoke_softirq() do_softirq() | 326 | static inline void invoke_softirq(void) |
327 | { | ||
328 | if (!force_irqthreads) | ||
329 | do_softirq(); | ||
330 | else { | ||
331 | __local_bh_disable((unsigned long)__builtin_return_address(0), | ||
332 | SOFTIRQ_OFFSET); | ||
333 | wakeup_softirqd(); | ||
334 | __local_bh_enable(SOFTIRQ_OFFSET); | ||
335 | } | ||
336 | } | ||
292 | #endif | 337 | #endif |
293 | 338 | ||
294 | /* | 339 | /* |
@@ -363,8 +408,8 @@ void __tasklet_schedule(struct tasklet_struct *t) | |||
363 | 408 | ||
364 | local_irq_save(flags); | 409 | local_irq_save(flags); |
365 | t->next = NULL; | 410 | t->next = NULL; |
366 | *__get_cpu_var(tasklet_vec).tail = t; | 411 | *__this_cpu_read(tasklet_vec.tail) = t; |
367 | __get_cpu_var(tasklet_vec).tail = &(t->next); | 412 | __this_cpu_write(tasklet_vec.tail, &(t->next)); |
368 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 413 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
369 | local_irq_restore(flags); | 414 | local_irq_restore(flags); |
370 | } | 415 | } |
@@ -377,8 +422,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) | |||
377 | 422 | ||
378 | local_irq_save(flags); | 423 | local_irq_save(flags); |
379 | t->next = NULL; | 424 | t->next = NULL; |
380 | *__get_cpu_var(tasklet_hi_vec).tail = t; | 425 | *__this_cpu_read(tasklet_hi_vec.tail) = t; |
381 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | 426 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); |
382 | raise_softirq_irqoff(HI_SOFTIRQ); | 427 | raise_softirq_irqoff(HI_SOFTIRQ); |
383 | local_irq_restore(flags); | 428 | local_irq_restore(flags); |
384 | } | 429 | } |
@@ -389,8 +434,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) | |||
389 | { | 434 | { |
390 | BUG_ON(!irqs_disabled()); | 435 | BUG_ON(!irqs_disabled()); |
391 | 436 | ||
392 | t->next = __get_cpu_var(tasklet_hi_vec).head; | 437 | t->next = __this_cpu_read(tasklet_hi_vec.head); |
393 | __get_cpu_var(tasklet_hi_vec).head = t; | 438 | __this_cpu_write(tasklet_hi_vec.head, t); |
394 | __raise_softirq_irqoff(HI_SOFTIRQ); | 439 | __raise_softirq_irqoff(HI_SOFTIRQ); |
395 | } | 440 | } |
396 | 441 | ||
@@ -401,9 +446,9 @@ static void tasklet_action(struct softirq_action *a) | |||
401 | struct tasklet_struct *list; | 446 | struct tasklet_struct *list; |
402 | 447 | ||
403 | local_irq_disable(); | 448 | local_irq_disable(); |
404 | list = __get_cpu_var(tasklet_vec).head; | 449 | list = __this_cpu_read(tasklet_vec.head); |
405 | __get_cpu_var(tasklet_vec).head = NULL; | 450 | __this_cpu_write(tasklet_vec.head, NULL); |
406 | __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; | 451 | __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); |
407 | local_irq_enable(); | 452 | local_irq_enable(); |
408 | 453 | ||
409 | while (list) { | 454 | while (list) { |
@@ -424,8 +469,8 @@ static void tasklet_action(struct softirq_action *a) | |||
424 | 469 | ||
425 | local_irq_disable(); | 470 | local_irq_disable(); |
426 | t->next = NULL; | 471 | t->next = NULL; |
427 | *__get_cpu_var(tasklet_vec).tail = t; | 472 | *__this_cpu_read(tasklet_vec.tail) = t; |
428 | __get_cpu_var(tasklet_vec).tail = &(t->next); | 473 | __this_cpu_write(tasklet_vec.tail, &(t->next)); |
429 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); | 474 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); |
430 | local_irq_enable(); | 475 | local_irq_enable(); |
431 | } | 476 | } |
@@ -436,9 +481,9 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
436 | struct tasklet_struct *list; | 481 | struct tasklet_struct *list; |
437 | 482 | ||
438 | local_irq_disable(); | 483 | local_irq_disable(); |
439 | list = __get_cpu_var(tasklet_hi_vec).head; | 484 | list = __this_cpu_read(tasklet_hi_vec.head); |
440 | __get_cpu_var(tasklet_hi_vec).head = NULL; | 485 | __this_cpu_write(tasklet_hi_vec.head, NULL); |
441 | __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; | 486 | __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); |
442 | local_irq_enable(); | 487 | local_irq_enable(); |
443 | 488 | ||
444 | while (list) { | 489 | while (list) { |
@@ -459,8 +504,8 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
459 | 504 | ||
460 | local_irq_disable(); | 505 | local_irq_disable(); |
461 | t->next = NULL; | 506 | t->next = NULL; |
462 | *__get_cpu_var(tasklet_hi_vec).tail = t; | 507 | *__this_cpu_read(tasklet_hi_vec.tail) = t; |
463 | __get_cpu_var(tasklet_hi_vec).tail = &(t->next); | 508 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); |
464 | __raise_softirq_irqoff(HI_SOFTIRQ); | 509 | __raise_softirq_irqoff(HI_SOFTIRQ); |
465 | local_irq_enable(); | 510 | local_irq_enable(); |
466 | } | 511 | } |
@@ -530,7 +575,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data) | |||
530 | /** | 575 | /** |
531 | * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks | 576 | * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks |
532 | * @ttimer: tasklet_hrtimer which is initialized | 577 | * @ttimer: tasklet_hrtimer which is initialized |
533 | * @function: hrtimer callback funtion which gets called from softirq context | 578 | * @function: hrtimer callback function which gets called from softirq context |
534 | * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) | 579 | * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) |
535 | * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) | 580 | * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) |
536 | */ | 581 | */ |
@@ -712,7 +757,10 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
712 | don't process */ | 757 | don't process */ |
713 | if (cpu_is_offline((long)__bind_cpu)) | 758 | if (cpu_is_offline((long)__bind_cpu)) |
714 | goto wait_to_die; | 759 | goto wait_to_die; |
715 | do_softirq(); | 760 | local_irq_disable(); |
761 | if (local_softirq_pending()) | ||
762 | __do_softirq(); | ||
763 | local_irq_enable(); | ||
716 | preempt_enable_no_resched(); | 764 | preempt_enable_no_resched(); |
717 | cond_resched(); | 765 | cond_resched(); |
718 | preempt_disable(); | 766 | preempt_disable(); |
@@ -776,16 +824,16 @@ static void takeover_tasklets(unsigned int cpu) | |||
776 | 824 | ||
777 | /* Find end, append list for that CPU. */ | 825 | /* Find end, append list for that CPU. */ |
778 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { | 826 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { |
779 | *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; | 827 | *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; |
780 | __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; | 828 | this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); |
781 | per_cpu(tasklet_vec, cpu).head = NULL; | 829 | per_cpu(tasklet_vec, cpu).head = NULL; |
782 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; | 830 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; |
783 | } | 831 | } |
784 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 832 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
785 | 833 | ||
786 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { | 834 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { |
787 | *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; | 835 | *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; |
788 | __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; | 836 | __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); |
789 | per_cpu(tasklet_hi_vec, cpu).head = NULL; | 837 | per_cpu(tasklet_hi_vec, cpu).head = NULL; |
790 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; | 838 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; |
791 | } | 839 | } |
@@ -805,7 +853,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
805 | switch (action) { | 853 | switch (action) { |
806 | case CPU_UP_PREPARE: | 854 | case CPU_UP_PREPARE: |
807 | case CPU_UP_PREPARE_FROZEN: | 855 | case CPU_UP_PREPARE_FROZEN: |
808 | p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); | 856 | p = kthread_create_on_node(run_ksoftirqd, |
857 | hcpu, | ||
858 | cpu_to_node(hotcpu), | ||
859 | "ksoftirqd/%d", hotcpu); | ||
809 | if (IS_ERR(p)) { | 860 | if (IS_ERR(p)) { |
810 | printk("ksoftirqd for %i failed\n", hotcpu); | 861 | printk("ksoftirqd for %i failed\n", hotcpu); |
811 | return notifier_from_errno(PTR_ERR(p)); | 862 | return notifier_from_errno(PTR_ERR(p)); |
@@ -827,7 +878,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
827 | cpumask_any(cpu_online_mask)); | 878 | cpumask_any(cpu_online_mask)); |
828 | case CPU_DEAD: | 879 | case CPU_DEAD: |
829 | case CPU_DEAD_FROZEN: { | 880 | case CPU_DEAD_FROZEN: { |
830 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 881 | static const struct sched_param param = { |
882 | .sched_priority = MAX_RT_PRIO-1 | ||
883 | }; | ||
831 | 884 | ||
832 | p = per_cpu(ksoftirqd, hotcpu); | 885 | p = per_cpu(ksoftirqd, hotcpu); |
833 | per_cpu(ksoftirqd, hotcpu) = NULL; | 886 | per_cpu(ksoftirqd, hotcpu) = NULL; |
@@ -857,25 +910,6 @@ static __init int spawn_ksoftirqd(void) | |||
857 | } | 910 | } |
858 | early_initcall(spawn_ksoftirqd); | 911 | early_initcall(spawn_ksoftirqd); |
859 | 912 | ||
860 | #ifdef CONFIG_SMP | ||
861 | /* | ||
862 | * Call a function on all processors | ||
863 | */ | ||
864 | int on_each_cpu(void (*func) (void *info), void *info, int wait) | ||
865 | { | ||
866 | int ret = 0; | ||
867 | |||
868 | preempt_disable(); | ||
869 | ret = smp_call_function(func, info, wait); | ||
870 | local_irq_disable(); | ||
871 | func(info); | ||
872 | local_irq_enable(); | ||
873 | preempt_enable(); | ||
874 | return ret; | ||
875 | } | ||
876 | EXPORT_SYMBOL(on_each_cpu); | ||
877 | #endif | ||
878 | |||
879 | /* | 913 | /* |
880 | * [ These __weak aliases are kept in a separate compilation unit, so that | 914 | * [ These __weak aliases are kept in a separate compilation unit, so that |
881 | * GCC does not inline them incorrectly. ] | 915 | * GCC does not inline them incorrectly. ] |
@@ -886,17 +920,14 @@ int __init __weak early_irq_init(void) | |||
886 | return 0; | 920 | return 0; |
887 | } | 921 | } |
888 | 922 | ||
923 | #ifdef CONFIG_GENERIC_HARDIRQS | ||
889 | int __init __weak arch_probe_nr_irqs(void) | 924 | int __init __weak arch_probe_nr_irqs(void) |
890 | { | 925 | { |
891 | return 0; | 926 | return NR_IRQS_LEGACY; |
892 | } | 927 | } |
893 | 928 | ||
894 | int __init __weak arch_early_irq_init(void) | 929 | int __init __weak arch_early_irq_init(void) |
895 | { | 930 | { |
896 | return 0; | 931 | return 0; |
897 | } | 932 | } |
898 | 933 | #endif | |
899 | int __weak arch_init_chip_data(struct irq_desc *desc, int node) | ||
900 | { | ||
901 | return 0; | ||
902 | } | ||
diff --git a/kernel/srcu.c b/kernel/srcu.c index 2980da3fd509..73ce23feaea9 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/rcupdate.h> | 31 | #include <linux/rcupdate.h> |
32 | #include <linux/sched.h> | 32 | #include <linux/sched.h> |
33 | #include <linux/smp.h> | 33 | #include <linux/smp.h> |
34 | #include <linux/delay.h> | ||
34 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
35 | 36 | ||
36 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 37 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
@@ -46,11 +47,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) | |||
46 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | 47 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, |
47 | struct lock_class_key *key) | 48 | struct lock_class_key *key) |
48 | { | 49 | { |
49 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
50 | /* Don't re-initialize a lock while it is held. */ | 50 | /* Don't re-initialize a lock while it is held. */ |
51 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | 51 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); |
52 | lockdep_init_map(&sp->dep_map, name, key, 0); | 52 | lockdep_init_map(&sp->dep_map, name, key, 0); |
53 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
54 | return init_srcu_struct_fields(sp); | 53 | return init_srcu_struct_fields(sp); |
55 | } | 54 | } |
56 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | 55 | EXPORT_SYMBOL_GPL(__init_srcu_struct); |
@@ -157,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) | |||
157 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
158 | 157 | ||
159 | /* | 158 | /* |
159 | * We use an adaptive strategy for synchronize_srcu() and especially for | ||
160 | * synchronize_srcu_expedited(). We spin for a fixed time period | ||
161 | * (defined below) to allow SRCU readers to exit their read-side critical | ||
162 | * sections. If there are still some readers after 10 microseconds, | ||
163 | * we repeatedly block for 1-millisecond time periods. This approach | ||
164 | * has done well in testing, so there is no need for a config parameter. | ||
165 | */ | ||
166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | ||
167 | |||
168 | /* | ||
160 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
161 | */ | 170 | */ |
162 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) |
@@ -205,9 +214,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
205 | * all srcu_read_lock() calls using the old counters have completed. | 214 | * all srcu_read_lock() calls using the old counters have completed. |
206 | * Their corresponding critical sections might well be still | 215 | * Their corresponding critical sections might well be still |
207 | * executing, but the srcu_read_lock() primitives themselves | 216 | * executing, but the srcu_read_lock() primitives themselves |
208 | * will have finished executing. | 217 | * will have finished executing. We initially give readers |
218 | * an arbitrarily chosen 10 microseconds to get out of their | ||
219 | * SRCU read-side critical sections, then loop waiting 1/HZ | ||
220 | * seconds per iteration. The 10-microsecond value has done | ||
221 | * very well in testing. | ||
209 | */ | 222 | */ |
210 | 223 | ||
224 | if (srcu_readers_active_idx(sp, idx)) | ||
225 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | ||
211 | while (srcu_readers_active_idx(sp, idx)) | 226 | while (srcu_readers_active_idx(sp, idx)) |
212 | schedule_timeout_interruptible(1); | 227 | schedule_timeout_interruptible(1); |
213 | 228 | ||
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4372ccb25127..e3516b29076c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -262,7 +262,7 @@ repeat: | |||
262 | cpu_stop_fn_t fn = work->fn; | 262 | cpu_stop_fn_t fn = work->fn; |
263 | void *arg = work->arg; | 263 | void *arg = work->arg; |
264 | struct cpu_stop_done *done = work->done; | 264 | struct cpu_stop_done *done = work->done; |
265 | char ksym_buf[KSYM_NAME_LEN]; | 265 | char ksym_buf[KSYM_NAME_LEN] __maybe_unused; |
266 | 266 | ||
267 | __set_current_state(TASK_RUNNING); | 267 | __set_current_state(TASK_RUNNING); |
268 | 268 | ||
@@ -287,11 +287,12 @@ repeat: | |||
287 | goto repeat; | 287 | goto repeat; |
288 | } | 288 | } |
289 | 289 | ||
290 | extern void sched_set_stop_task(int cpu, struct task_struct *stop); | ||
291 | |||
290 | /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ | 292 | /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ |
291 | static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | 293 | static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, |
292 | unsigned long action, void *hcpu) | 294 | unsigned long action, void *hcpu) |
293 | { | 295 | { |
294 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
295 | unsigned int cpu = (unsigned long)hcpu; | 296 | unsigned int cpu = (unsigned long)hcpu; |
296 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 297 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
297 | struct task_struct *p; | 298 | struct task_struct *p; |
@@ -300,17 +301,19 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | |||
300 | case CPU_UP_PREPARE: | 301 | case CPU_UP_PREPARE: |
301 | BUG_ON(stopper->thread || stopper->enabled || | 302 | BUG_ON(stopper->thread || stopper->enabled || |
302 | !list_empty(&stopper->works)); | 303 | !list_empty(&stopper->works)); |
303 | p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", | 304 | p = kthread_create_on_node(cpu_stopper_thread, |
304 | cpu); | 305 | stopper, |
306 | cpu_to_node(cpu), | ||
307 | "migration/%d", cpu); | ||
305 | if (IS_ERR(p)) | 308 | if (IS_ERR(p)) |
306 | return NOTIFY_BAD; | 309 | return notifier_from_errno(PTR_ERR(p)); |
307 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | ||
308 | get_task_struct(p); | 310 | get_task_struct(p); |
311 | kthread_bind(p, cpu); | ||
312 | sched_set_stop_task(cpu, p); | ||
309 | stopper->thread = p; | 313 | stopper->thread = p; |
310 | break; | 314 | break; |
311 | 315 | ||
312 | case CPU_ONLINE: | 316 | case CPU_ONLINE: |
313 | kthread_bind(stopper->thread, cpu); | ||
314 | /* strictly unnecessary, as first user will wake it */ | 317 | /* strictly unnecessary, as first user will wake it */ |
315 | wake_up_process(stopper->thread); | 318 | wake_up_process(stopper->thread); |
316 | /* mark enabled */ | 319 | /* mark enabled */ |
@@ -325,6 +328,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | |||
325 | { | 328 | { |
326 | struct cpu_stop_work *work; | 329 | struct cpu_stop_work *work; |
327 | 330 | ||
331 | sched_set_stop_task(cpu, NULL); | ||
328 | /* kill the stopper */ | 332 | /* kill the stopper */ |
329 | kthread_stop(stopper->thread); | 333 | kthread_stop(stopper->thread); |
330 | /* drain remaining works */ | 334 | /* drain remaining works */ |
@@ -370,7 +374,7 @@ static int __init cpu_stop_init(void) | |||
370 | /* start one for the boot cpu */ | 374 | /* start one for the boot cpu */ |
371 | err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, | 375 | err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, |
372 | bcpu); | 376 | bcpu); |
373 | BUG_ON(err == NOTIFY_BAD); | 377 | BUG_ON(err != NOTIFY_OK); |
374 | cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); | 378 | cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); |
375 | register_cpu_notifier(&cpu_stop_cpu_notifier); | 379 | register_cpu_notifier(&cpu_stop_cpu_notifier); |
376 | 380 | ||
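Note on the stop_machine.c hunk above (and the matching ksoftirqd hunk earlier): per-CPU kernel threads are now created with kthread_create_on_node() so their task_struct is allocated on the memory node of the CPU they will serve, and the stopper's priority comes from sched_set_stop_task() rather than a SCHED_FIFO setscheduler call. A minimal sketch of that bring-up pattern follows; my_thread_fn and the "mythread/%d" name are placeholders, and error handling stays with the caller's CPU notifier as in the code above.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/err.h>

static struct task_struct *spawn_percpu_thread(int cpu,
					       int (*my_thread_fn)(void *data))
{
	struct task_struct *p;

	/* Allocate the thread's task_struct near the CPU it will serve. */
	p = kthread_create_on_node(my_thread_fn, (void *)(long)cpu,
				   cpu_to_node(cpu), "mythread/%d", cpu);
	if (IS_ERR(p))
		return p;		/* caller maps this to notifier_from_errno() */

	kthread_bind(p, cpu);		/* thread may only run on that CPU */
	wake_up_process(p);		/* the real code defers this to CPU_ONLINE */
	return p;
}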
diff --git a/kernel/sys.c b/kernel/sys.c index 7f5a0cd296a9..e4128b278f23 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -37,12 +37,15 @@ | |||
37 | #include <linux/ptrace.h> | 37 | #include <linux/ptrace.h> |
38 | #include <linux/fs_struct.h> | 38 | #include <linux/fs_struct.h> |
39 | #include <linux/gfp.h> | 39 | #include <linux/gfp.h> |
40 | #include <linux/syscore_ops.h> | ||
40 | 41 | ||
41 | #include <linux/compat.h> | 42 | #include <linux/compat.h> |
42 | #include <linux/syscalls.h> | 43 | #include <linux/syscalls.h> |
43 | #include <linux/kprobes.h> | 44 | #include <linux/kprobes.h> |
44 | #include <linux/user_namespace.h> | 45 | #include <linux/user_namespace.h> |
45 | 46 | ||
47 | #include <linux/kmsg_dump.h> | ||
48 | |||
46 | #include <asm/uaccess.h> | 49 | #include <asm/uaccess.h> |
47 | #include <asm/io.h> | 50 | #include <asm/io.h> |
48 | #include <asm/unistd.h> | 51 | #include <asm/unistd.h> |
@@ -117,16 +120,33 @@ EXPORT_SYMBOL(cad_pid); | |||
117 | void (*pm_power_off_prepare)(void); | 120 | void (*pm_power_off_prepare)(void); |
118 | 121 | ||
119 | /* | 122 | /* |
123 | * Returns true if current's euid is same as p's uid or euid, | ||
124 | * or has CAP_SYS_NICE to p's user_ns. | ||
125 | * | ||
126 | * Called with rcu_read_lock, creds are safe | ||
127 | */ | ||
128 | static bool set_one_prio_perm(struct task_struct *p) | ||
129 | { | ||
130 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); | ||
131 | |||
132 | if (pcred->user->user_ns == cred->user->user_ns && | ||
133 | (pcred->uid == cred->euid || | ||
134 | pcred->euid == cred->euid)) | ||
135 | return true; | ||
136 | if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) | ||
137 | return true; | ||
138 | return false; | ||
139 | } | ||
140 | |||
141 | /* | ||
120 | * set the priority of a task | 142 | * set the priority of a task |
121 | * - the caller must hold the RCU read lock | 143 | * - the caller must hold the RCU read lock |
122 | */ | 144 | */ |
123 | static int set_one_prio(struct task_struct *p, int niceval, int error) | 145 | static int set_one_prio(struct task_struct *p, int niceval, int error) |
124 | { | 146 | { |
125 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); | ||
126 | int no_nice; | 147 | int no_nice; |
127 | 148 | ||
128 | if (pcred->uid != cred->euid && | 149 | if (!set_one_prio_perm(p)) { |
129 | pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { | ||
130 | error = -EPERM; | 150 | error = -EPERM; |
131 | goto out; | 151 | goto out; |
132 | } | 152 | } |
@@ -285,6 +305,7 @@ out_unlock: | |||
285 | */ | 305 | */ |
286 | void emergency_restart(void) | 306 | void emergency_restart(void) |
287 | { | 307 | { |
308 | kmsg_dump(KMSG_DUMP_EMERG); | ||
288 | machine_emergency_restart(); | 309 | machine_emergency_restart(); |
289 | } | 310 | } |
290 | EXPORT_SYMBOL_GPL(emergency_restart); | 311 | EXPORT_SYMBOL_GPL(emergency_restart); |
@@ -293,8 +314,9 @@ void kernel_restart_prepare(char *cmd) | |||
293 | { | 314 | { |
294 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 315 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
295 | system_state = SYSTEM_RESTART; | 316 | system_state = SYSTEM_RESTART; |
317 | usermodehelper_disable(); | ||
296 | device_shutdown(); | 318 | device_shutdown(); |
297 | sysdev_shutdown(); | 319 | syscore_shutdown(); |
298 | } | 320 | } |
299 | 321 | ||
300 | /** | 322 | /** |
@@ -312,6 +334,7 @@ void kernel_restart(char *cmd) | |||
312 | printk(KERN_EMERG "Restarting system.\n"); | 334 | printk(KERN_EMERG "Restarting system.\n"); |
313 | else | 335 | else |
314 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); | 336 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); |
337 | kmsg_dump(KMSG_DUMP_RESTART); | ||
315 | machine_restart(cmd); | 338 | machine_restart(cmd); |
316 | } | 339 | } |
317 | EXPORT_SYMBOL_GPL(kernel_restart); | 340 | EXPORT_SYMBOL_GPL(kernel_restart); |
@@ -321,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state) | |||
321 | blocking_notifier_call_chain(&reboot_notifier_list, | 344 | blocking_notifier_call_chain(&reboot_notifier_list, |
322 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); | 345 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); |
323 | system_state = state; | 346 | system_state = state; |
347 | usermodehelper_disable(); | ||
324 | device_shutdown(); | 348 | device_shutdown(); |
325 | } | 349 | } |
326 | /** | 350 | /** |
@@ -331,8 +355,9 @@ static void kernel_shutdown_prepare(enum system_states state) | |||
331 | void kernel_halt(void) | 355 | void kernel_halt(void) |
332 | { | 356 | { |
333 | kernel_shutdown_prepare(SYSTEM_HALT); | 357 | kernel_shutdown_prepare(SYSTEM_HALT); |
334 | sysdev_shutdown(); | 358 | syscore_shutdown(); |
335 | printk(KERN_EMERG "System halted.\n"); | 359 | printk(KERN_EMERG "System halted.\n"); |
360 | kmsg_dump(KMSG_DUMP_HALT); | ||
336 | machine_halt(); | 361 | machine_halt(); |
337 | } | 362 | } |
338 | 363 | ||
@@ -349,8 +374,9 @@ void kernel_power_off(void) | |||
349 | if (pm_power_off_prepare) | 374 | if (pm_power_off_prepare) |
350 | pm_power_off_prepare(); | 375 | pm_power_off_prepare(); |
351 | disable_nonboot_cpus(); | 376 | disable_nonboot_cpus(); |
352 | sysdev_shutdown(); | 377 | syscore_shutdown(); |
353 | printk(KERN_EMERG "Power down.\n"); | 378 | printk(KERN_EMERG "Power down.\n"); |
379 | kmsg_dump(KMSG_DUMP_POWEROFF); | ||
354 | machine_power_off(); | 380 | machine_power_off(); |
355 | } | 381 | } |
356 | EXPORT_SYMBOL_GPL(kernel_power_off); | 382 | EXPORT_SYMBOL_GPL(kernel_power_off); |
@@ -496,7 +522,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
496 | if (rgid != (gid_t) -1) { | 522 | if (rgid != (gid_t) -1) { |
497 | if (old->gid == rgid || | 523 | if (old->gid == rgid || |
498 | old->egid == rgid || | 524 | old->egid == rgid || |
499 | capable(CAP_SETGID)) | 525 | nsown_capable(CAP_SETGID)) |
500 | new->gid = rgid; | 526 | new->gid = rgid; |
501 | else | 527 | else |
502 | goto error; | 528 | goto error; |
@@ -505,7 +531,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
505 | if (old->gid == egid || | 531 | if (old->gid == egid || |
506 | old->egid == egid || | 532 | old->egid == egid || |
507 | old->sgid == egid || | 533 | old->sgid == egid || |
508 | capable(CAP_SETGID)) | 534 | nsown_capable(CAP_SETGID)) |
509 | new->egid = egid; | 535 | new->egid = egid; |
510 | else | 536 | else |
511 | goto error; | 537 | goto error; |
@@ -540,7 +566,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) | |||
540 | old = current_cred(); | 566 | old = current_cred(); |
541 | 567 | ||
542 | retval = -EPERM; | 568 | retval = -EPERM; |
543 | if (capable(CAP_SETGID)) | 569 | if (nsown_capable(CAP_SETGID)) |
544 | new->gid = new->egid = new->sgid = new->fsgid = gid; | 570 | new->gid = new->egid = new->sgid = new->fsgid = gid; |
545 | else if (gid == old->gid || gid == old->sgid) | 571 | else if (gid == old->gid || gid == old->sgid) |
546 | new->egid = new->fsgid = gid; | 572 | new->egid = new->fsgid = gid; |
@@ -607,7 +633,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
607 | new->uid = ruid; | 633 | new->uid = ruid; |
608 | if (old->uid != ruid && | 634 | if (old->uid != ruid && |
609 | old->euid != ruid && | 635 | old->euid != ruid && |
610 | !capable(CAP_SETUID)) | 636 | !nsown_capable(CAP_SETUID)) |
611 | goto error; | 637 | goto error; |
612 | } | 638 | } |
613 | 639 | ||
@@ -616,7 +642,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
616 | if (old->uid != euid && | 642 | if (old->uid != euid && |
617 | old->euid != euid && | 643 | old->euid != euid && |
618 | old->suid != euid && | 644 | old->suid != euid && |
619 | !capable(CAP_SETUID)) | 645 | !nsown_capable(CAP_SETUID)) |
620 | goto error; | 646 | goto error; |
621 | } | 647 | } |
622 | 648 | ||
@@ -664,7 +690,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) | |||
664 | old = current_cred(); | 690 | old = current_cred(); |
665 | 691 | ||
666 | retval = -EPERM; | 692 | retval = -EPERM; |
667 | if (capable(CAP_SETUID)) { | 693 | if (nsown_capable(CAP_SETUID)) { |
668 | new->suid = new->uid = uid; | 694 | new->suid = new->uid = uid; |
669 | if (uid != old->uid) { | 695 | if (uid != old->uid) { |
670 | retval = set_user(new); | 696 | retval = set_user(new); |
@@ -706,7 +732,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | |||
706 | old = current_cred(); | 732 | old = current_cred(); |
707 | 733 | ||
708 | retval = -EPERM; | 734 | retval = -EPERM; |
709 | if (!capable(CAP_SETUID)) { | 735 | if (!nsown_capable(CAP_SETUID)) { |
710 | if (ruid != (uid_t) -1 && ruid != old->uid && | 736 | if (ruid != (uid_t) -1 && ruid != old->uid && |
711 | ruid != old->euid && ruid != old->suid) | 737 | ruid != old->euid && ruid != old->suid) |
712 | goto error; | 738 | goto error; |
@@ -770,7 +796,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | |||
770 | old = current_cred(); | 796 | old = current_cred(); |
771 | 797 | ||
772 | retval = -EPERM; | 798 | retval = -EPERM; |
773 | if (!capable(CAP_SETGID)) { | 799 | if (!nsown_capable(CAP_SETGID)) { |
774 | if (rgid != (gid_t) -1 && rgid != old->gid && | 800 | if (rgid != (gid_t) -1 && rgid != old->gid && |
775 | rgid != old->egid && rgid != old->sgid) | 801 | rgid != old->egid && rgid != old->sgid) |
776 | goto error; | 802 | goto error; |
@@ -830,7 +856,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) | |||
830 | 856 | ||
831 | if (uid == old->uid || uid == old->euid || | 857 | if (uid == old->uid || uid == old->euid || |
832 | uid == old->suid || uid == old->fsuid || | 858 | uid == old->suid || uid == old->fsuid || |
833 | capable(CAP_SETUID)) { | 859 | nsown_capable(CAP_SETUID)) { |
834 | if (uid != old_fsuid) { | 860 | if (uid != old_fsuid) { |
835 | new->fsuid = uid; | 861 | new->fsuid = uid; |
836 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) | 862 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) |
@@ -863,7 +889,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) | |||
863 | 889 | ||
864 | if (gid == old->gid || gid == old->egid || | 890 | if (gid == old->gid || gid == old->egid || |
865 | gid == old->sgid || gid == old->fsgid || | 891 | gid == old->sgid || gid == old->fsgid || |
866 | capable(CAP_SETGID)) { | 892 | nsown_capable(CAP_SETGID)) { |
867 | if (gid != old_fsgid) { | 893 | if (gid != old_fsgid) { |
868 | new->fsgid = gid; | 894 | new->fsgid = gid; |
869 | goto change_okay; | 895 | goto change_okay; |
@@ -1080,8 +1106,10 @@ SYSCALL_DEFINE0(setsid) | |||
1080 | err = session; | 1106 | err = session; |
1081 | out: | 1107 | out: |
1082 | write_unlock_irq(&tasklist_lock); | 1108 | write_unlock_irq(&tasklist_lock); |
1083 | if (err > 0) | 1109 | if (err > 0) { |
1084 | proc_sid_connector(group_leader); | 1110 | proc_sid_connector(group_leader); |
1111 | sched_autogroup_create_attach(group_leader); | ||
1112 | } | ||
1085 | return err; | 1113 | return err; |
1086 | } | 1114 | } |
1087 | 1115 | ||
@@ -1169,8 +1197,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) | |||
1169 | int errno; | 1197 | int errno; |
1170 | char tmp[__NEW_UTS_LEN]; | 1198 | char tmp[__NEW_UTS_LEN]; |
1171 | 1199 | ||
1172 | if (!capable(CAP_SYS_ADMIN)) | 1200 | if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) |
1173 | return -EPERM; | 1201 | return -EPERM; |
1202 | |||
1174 | if (len < 0 || len > __NEW_UTS_LEN) | 1203 | if (len < 0 || len > __NEW_UTS_LEN) |
1175 | return -EINVAL; | 1204 | return -EINVAL; |
1176 | down_write(&uts_sem); | 1205 | down_write(&uts_sem); |
@@ -1218,7 +1247,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) | |||
1218 | int errno; | 1247 | int errno; |
1219 | char tmp[__NEW_UTS_LEN]; | 1248 | char tmp[__NEW_UTS_LEN]; |
1220 | 1249 | ||
1221 | if (!capable(CAP_SYS_ADMIN)) | 1250 | if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) |
1222 | return -EPERM; | 1251 | return -EPERM; |
1223 | if (len < 0 || len > __NEW_UTS_LEN) | 1252 | if (len < 0 || len > __NEW_UTS_LEN) |
1224 | return -EINVAL; | 1253 | return -EINVAL; |
@@ -1333,6 +1362,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, | |||
1333 | rlim = tsk->signal->rlim + resource; | 1362 | rlim = tsk->signal->rlim + resource; |
1334 | task_lock(tsk->group_leader); | 1363 | task_lock(tsk->group_leader); |
1335 | if (new_rlim) { | 1364 | if (new_rlim) { |
1365 | /* Keep the capable check against init_user_ns until | ||
1366 | cgroups can contain all limits */ | ||
1336 | if (new_rlim->rlim_max > rlim->rlim_max && | 1367 | if (new_rlim->rlim_max > rlim->rlim_max && |
1337 | !capable(CAP_SYS_RESOURCE)) | 1368 | !capable(CAP_SYS_RESOURCE)) |
1338 | retval = -EPERM; | 1369 | retval = -EPERM; |
@@ -1376,18 +1407,22 @@ static int check_prlimit_permission(struct task_struct *task) | |||
1376 | { | 1407 | { |
1377 | const struct cred *cred = current_cred(), *tcred; | 1408 | const struct cred *cred = current_cred(), *tcred; |
1378 | 1409 | ||
1379 | tcred = __task_cred(task); | 1410 | if (current == task) |
1380 | if ((cred->uid != tcred->euid || | 1411 | return 0; |
1381 | cred->uid != tcred->suid || | ||
1382 | cred->uid != tcred->uid || | ||
1383 | cred->gid != tcred->egid || | ||
1384 | cred->gid != tcred->sgid || | ||
1385 | cred->gid != tcred->gid) && | ||
1386 | !capable(CAP_SYS_RESOURCE)) { | ||
1387 | return -EPERM; | ||
1388 | } | ||
1389 | 1412 | ||
1390 | return 0; | 1413 | tcred = __task_cred(task); |
1414 | if (cred->user->user_ns == tcred->user->user_ns && | ||
1415 | (cred->uid == tcred->euid && | ||
1416 | cred->uid == tcred->suid && | ||
1417 | cred->uid == tcred->uid && | ||
1418 | cred->gid == tcred->egid && | ||
1419 | cred->gid == tcred->sgid && | ||
1420 | cred->gid == tcred->gid)) | ||
1421 | return 0; | ||
1422 | if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) | ||
1423 | return 0; | ||
1424 | |||
1425 | return -EPERM; | ||
1391 | } | 1426 | } |
1392 | 1427 | ||
1393 | SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, | 1428 | SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, |
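Note on the sys.c hunks above: the uid/gid, hostname and prlimit paths stop using the global capable() check and instead ask ns_capable()/nsown_capable() about a specific user namespace, which is what makes these privilege checks meaningful inside containers. Below is a hedged sketch of the shape of such a check; my_may_set_uid() is an illustration, not a function in this tree, and nsown_capable(cap) here stands for the 2.6.39-era helper, i.e. ns_capable(current_user_ns(), cap).

#include <linux/types.h>
#include <linux/capability.h>
#include <linux/cred.h>

static bool my_may_set_uid(const struct cred *old, uid_t uid)
{
	/* Changing to an id the task already holds needs no privilege. */
	if (uid == old->uid || uid == old->euid || uid == old->suid)
		return true;

	/* Otherwise CAP_SETUID is checked against current's own user ns. */
	return nsown_capable(CAP_SETUID);
}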
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bad369ec5403..62cbc8877fef 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -46,10 +46,13 @@ cond_syscall(sys_getsockopt); | |||
46 | cond_syscall(compat_sys_getsockopt); | 46 | cond_syscall(compat_sys_getsockopt); |
47 | cond_syscall(sys_shutdown); | 47 | cond_syscall(sys_shutdown); |
48 | cond_syscall(sys_sendmsg); | 48 | cond_syscall(sys_sendmsg); |
49 | cond_syscall(sys_sendmmsg); | ||
49 | cond_syscall(compat_sys_sendmsg); | 50 | cond_syscall(compat_sys_sendmsg); |
51 | cond_syscall(compat_sys_sendmmsg); | ||
50 | cond_syscall(sys_recvmsg); | 52 | cond_syscall(sys_recvmsg); |
51 | cond_syscall(sys_recvmmsg); | 53 | cond_syscall(sys_recvmmsg); |
52 | cond_syscall(compat_sys_recvmsg); | 54 | cond_syscall(compat_sys_recvmsg); |
55 | cond_syscall(compat_sys_recv); | ||
53 | cond_syscall(compat_sys_recvfrom); | 56 | cond_syscall(compat_sys_recvfrom); |
54 | cond_syscall(compat_sys_recvmmsg); | 57 | cond_syscall(compat_sys_recvmmsg); |
55 | cond_syscall(sys_socketcall); | 58 | cond_syscall(sys_socketcall); |
@@ -68,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait); | |||
68 | cond_syscall(sys_semget); | 71 | cond_syscall(sys_semget); |
69 | cond_syscall(sys_semop); | 72 | cond_syscall(sys_semop); |
70 | cond_syscall(sys_semtimedop); | 73 | cond_syscall(sys_semtimedop); |
74 | cond_syscall(compat_sys_semtimedop); | ||
71 | cond_syscall(sys_semctl); | 75 | cond_syscall(sys_semctl); |
76 | cond_syscall(compat_sys_semctl); | ||
72 | cond_syscall(sys_msgget); | 77 | cond_syscall(sys_msgget); |
73 | cond_syscall(sys_msgsnd); | 78 | cond_syscall(sys_msgsnd); |
79 | cond_syscall(compat_sys_msgsnd); | ||
74 | cond_syscall(sys_msgrcv); | 80 | cond_syscall(sys_msgrcv); |
81 | cond_syscall(compat_sys_msgrcv); | ||
75 | cond_syscall(sys_msgctl); | 82 | cond_syscall(sys_msgctl); |
83 | cond_syscall(compat_sys_msgctl); | ||
76 | cond_syscall(sys_shmget); | 84 | cond_syscall(sys_shmget); |
77 | cond_syscall(sys_shmat); | 85 | cond_syscall(sys_shmat); |
86 | cond_syscall(compat_sys_shmat); | ||
78 | cond_syscall(sys_shmdt); | 87 | cond_syscall(sys_shmdt); |
79 | cond_syscall(sys_shmctl); | 88 | cond_syscall(sys_shmctl); |
89 | cond_syscall(compat_sys_shmctl); | ||
80 | cond_syscall(sys_mq_open); | 90 | cond_syscall(sys_mq_open); |
81 | cond_syscall(sys_mq_unlink); | 91 | cond_syscall(sys_mq_unlink); |
82 | cond_syscall(sys_mq_timedsend); | 92 | cond_syscall(sys_mq_timedsend); |
@@ -185,3 +195,8 @@ cond_syscall(sys_perf_event_open); | |||
185 | /* fanotify! */ | 195 | /* fanotify! */ |
186 | cond_syscall(sys_fanotify_init); | 196 | cond_syscall(sys_fanotify_init); |
187 | cond_syscall(sys_fanotify_mark); | 197 | cond_syscall(sys_fanotify_mark); |
198 | |||
199 | /* open by handle */ | ||
200 | cond_syscall(sys_name_to_handle_at); | ||
201 | cond_syscall(sys_open_by_handle_at); | ||
202 | cond_syscall(compat_sys_open_by_handle_at); | ||
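Note on the sys_ni.c hunk above: cond_syscall() declares each listed entry point as a weak alias of sys_ni_syscall(), so a kernel built without the corresponding subsystem still links and the syscall simply returns -ENOSYS at run time. A simplified model of the mechanism follows; the in-tree macro lives in include/linux/syscalls.h and also has a sparse (__CHECKER__) variant.

#include <linux/errno.h>
#include <linux/linkage.h>

/* The stub every unimplemented syscall falls back to (as in sys_ni.c). */
asmlinkage long sys_ni_syscall(void)
{
	return -ENOSYS;
}

/*
 * Emit a weak assembler alias so the symbol resolves to the stub unless a
 * real (strong) definition is linked in elsewhere.
 */
#define cond_syscall(x) \
	asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")

cond_syscall(sys_sendmmsg);	/* -ENOSYS unless the networking code provides it */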
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770f..f175d98bd355 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/sysctl.h> | 25 | #include <linux/sysctl.h> |
26 | #include <linux/signal.h> | 26 | #include <linux/signal.h> |
27 | #include <linux/printk.h> | ||
27 | #include <linux/proc_fs.h> | 28 | #include <linux/proc_fs.h> |
28 | #include <linux/security.h> | 29 | #include <linux/security.h> |
29 | #include <linux/ctype.h> | 30 | #include <linux/ctype.h> |
@@ -55,6 +56,7 @@ | |||
55 | #include <linux/kprobes.h> | 56 | #include <linux/kprobes.h> |
56 | #include <linux/pipe_fs_i.h> | 57 | #include <linux/pipe_fs_i.h> |
57 | #include <linux/oom.h> | 58 | #include <linux/oom.h> |
59 | #include <linux/kmod.h> | ||
58 | 60 | ||
59 | #include <asm/uaccess.h> | 61 | #include <asm/uaccess.h> |
60 | #include <asm/processor.h> | 62 | #include <asm/processor.h> |
@@ -116,6 +118,7 @@ static int neg_one = -1; | |||
116 | static int zero; | 118 | static int zero; |
117 | static int __maybe_unused one = 1; | 119 | static int __maybe_unused one = 1; |
118 | static int __maybe_unused two = 2; | 120 | static int __maybe_unused two = 2; |
121 | static int __maybe_unused three = 3; | ||
119 | static unsigned long one_ul = 1; | 122 | static unsigned long one_ul = 1; |
120 | static int one_hundred = 100; | 123 | static int one_hundred = 100; |
121 | #ifdef CONFIG_PRINTK | 124 | #ifdef CONFIG_PRINTK |
@@ -161,8 +164,6 @@ extern int no_unaligned_warning; | |||
161 | extern int unaligned_dump_stack; | 164 | extern int unaligned_dump_stack; |
162 | #endif | 165 | #endif |
163 | 166 | ||
164 | extern struct ratelimit_state printk_ratelimit_state; | ||
165 | |||
166 | #ifdef CONFIG_PROC_SYSCTL | 167 | #ifdef CONFIG_PROC_SYSCTL |
167 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 168 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
168 | void __user *buffer, size_t *lenp, loff_t *ppos); | 169 | void __user *buffer, size_t *lenp, loff_t *ppos); |
@@ -170,8 +171,14 @@ static int proc_taint(struct ctl_table *table, int write, | |||
170 | void __user *buffer, size_t *lenp, loff_t *ppos); | 171 | void __user *buffer, size_t *lenp, loff_t *ppos); |
171 | #endif | 172 | #endif |
172 | 173 | ||
174 | #ifdef CONFIG_PRINTK | ||
175 | static int proc_dmesg_restrict(struct ctl_table *table, int write, | ||
176 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
177 | #endif | ||
178 | |||
173 | #ifdef CONFIG_MAGIC_SYSRQ | 179 | #ifdef CONFIG_MAGIC_SYSRQ |
174 | static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ | 180 | /* Note: sysrq code uses it's own private copy */ |
181 | static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; | ||
175 | 182 | ||
176 | static int sysrq_sysctl_handler(ctl_table *table, int write, | 183 | static int sysrq_sysctl_handler(ctl_table *table, int write, |
177 | void __user *buffer, size_t *lenp, | 184 | void __user *buffer, size_t *lenp, |
@@ -194,9 +201,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, | |||
194 | static struct ctl_table root_table[]; | 201 | static struct ctl_table root_table[]; |
195 | static struct ctl_table_root sysctl_table_root; | 202 | static struct ctl_table_root sysctl_table_root; |
196 | static struct ctl_table_header root_table_header = { | 203 | static struct ctl_table_header root_table_header = { |
197 | .count = 1, | 204 | {{.count = 1, |
198 | .ctl_table = root_table, | 205 | .ctl_table = root_table, |
199 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), | 206 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, |
200 | .root = &sysctl_table_root, | 207 | .root = &sysctl_table_root, |
201 | .set = &sysctl_table_root.default_set, | 208 | .set = &sysctl_table_root.default_set, |
202 | }; | 209 | }; |
@@ -247,10 +254,6 @@ static struct ctl_table root_table[] = { | |||
247 | .mode = 0555, | 254 | .mode = 0555, |
248 | .child = dev_table, | 255 | .child = dev_table, |
249 | }, | 256 | }, |
250 | /* | ||
251 | * NOTE: do not add new entries to this table unless you have read | ||
252 | * Documentation/sysctl/ctl_unnumbered.txt | ||
253 | */ | ||
254 | { } | 257 | { } |
255 | }; | 258 | }; |
256 | 259 | ||
@@ -261,8 +264,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ | |||
261 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 264 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
262 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; | 265 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; |
263 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; | 266 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; |
264 | static int min_sched_shares_ratelimit = 100000; /* 100 usec */ | ||
265 | static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ | ||
266 | #endif | 267 | #endif |
267 | 268 | ||
268 | #ifdef CONFIG_COMPACTION | 269 | #ifdef CONFIG_COMPACTION |
@@ -307,15 +308,6 @@ static struct ctl_table kern_table[] = { | |||
307 | .extra2 = &max_wakeup_granularity_ns, | 308 | .extra2 = &max_wakeup_granularity_ns, |
308 | }, | 309 | }, |
309 | { | 310 | { |
310 | .procname = "sched_shares_ratelimit", | ||
311 | .data = &sysctl_sched_shares_ratelimit, | ||
312 | .maxlen = sizeof(unsigned int), | ||
313 | .mode = 0644, | ||
314 | .proc_handler = sched_proc_update_handler, | ||
315 | .extra1 = &min_sched_shares_ratelimit, | ||
316 | .extra2 = &max_sched_shares_ratelimit, | ||
317 | }, | ||
318 | { | ||
319 | .procname = "sched_tunable_scaling", | 311 | .procname = "sched_tunable_scaling", |
320 | .data = &sysctl_sched_tunable_scaling, | 312 | .data = &sysctl_sched_tunable_scaling, |
321 | .maxlen = sizeof(enum sched_tunable_scaling), | 313 | .maxlen = sizeof(enum sched_tunable_scaling), |
@@ -325,14 +317,6 @@ static struct ctl_table kern_table[] = { | |||
325 | .extra2 = &max_sched_tunable_scaling, | 317 | .extra2 = &max_sched_tunable_scaling, |
326 | }, | 318 | }, |
327 | { | 319 | { |
328 | .procname = "sched_shares_thresh", | ||
329 | .data = &sysctl_sched_shares_thresh, | ||
330 | .maxlen = sizeof(unsigned int), | ||
331 | .mode = 0644, | ||
332 | .proc_handler = proc_dointvec_minmax, | ||
333 | .extra1 = &zero, | ||
334 | }, | ||
335 | { | ||
336 | .procname = "sched_migration_cost", | 320 | .procname = "sched_migration_cost", |
337 | .data = &sysctl_sched_migration_cost, | 321 | .data = &sysctl_sched_migration_cost, |
338 | .maxlen = sizeof(unsigned int), | 322 | .maxlen = sizeof(unsigned int), |
@@ -354,6 +338,13 @@ static struct ctl_table kern_table[] = { | |||
354 | .proc_handler = proc_dointvec, | 338 | .proc_handler = proc_dointvec, |
355 | }, | 339 | }, |
356 | { | 340 | { |
341 | .procname = "sched_shares_window", | ||
342 | .data = &sysctl_sched_shares_window, | ||
343 | .maxlen = sizeof(unsigned int), | ||
344 | .mode = 0644, | ||
345 | .proc_handler = proc_dointvec, | ||
346 | }, | ||
347 | { | ||
357 | .procname = "timer_migration", | 348 | .procname = "timer_migration", |
358 | .data = &sysctl_timer_migration, | 349 | .data = &sysctl_timer_migration, |
359 | .maxlen = sizeof(unsigned int), | 350 | .maxlen = sizeof(unsigned int), |
@@ -377,13 +368,17 @@ static struct ctl_table kern_table[] = { | |||
377 | .mode = 0644, | 368 | .mode = 0644, |
378 | .proc_handler = sched_rt_handler, | 369 | .proc_handler = sched_rt_handler, |
379 | }, | 370 | }, |
371 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
380 | { | 372 | { |
381 | .procname = "sched_compat_yield", | 373 | .procname = "sched_autogroup_enabled", |
382 | .data = &sysctl_sched_compat_yield, | 374 | .data = &sysctl_sched_autogroup_enabled, |
383 | .maxlen = sizeof(unsigned int), | 375 | .maxlen = sizeof(unsigned int), |
384 | .mode = 0644, | 376 | .mode = 0644, |
385 | .proc_handler = proc_dointvec, | 377 | .proc_handler = proc_dointvec_minmax, |
378 | .extra1 = &zero, | ||
379 | .extra2 = &one, | ||
386 | }, | 380 | }, |
381 | #endif | ||
387 | #ifdef CONFIG_PROVE_LOCKING | 382 | #ifdef CONFIG_PROVE_LOCKING |
388 | { | 383 | { |
389 | .procname = "prove_locking", | 384 | .procname = "prove_locking", |
@@ -622,6 +617,11 @@ static struct ctl_table kern_table[] = { | |||
622 | .child = random_table, | 617 | .child = random_table, |
623 | }, | 618 | }, |
624 | { | 619 | { |
620 | .procname = "usermodehelper", | ||
621 | .mode = 0555, | ||
622 | .child = usermodehelper_table, | ||
623 | }, | ||
624 | { | ||
625 | .procname = "overflowuid", | 625 | .procname = "overflowuid", |
626 | .data = &overflowuid, | 626 | .data = &overflowuid, |
627 | .maxlen = sizeof(int), | 627 | .maxlen = sizeof(int), |
@@ -704,6 +704,24 @@ static struct ctl_table kern_table[] = { | |||
704 | .extra1 = &zero, | 704 | .extra1 = &zero, |
705 | .extra2 = &ten_thousand, | 705 | .extra2 = &ten_thousand, |
706 | }, | 706 | }, |
707 | { | ||
708 | .procname = "dmesg_restrict", | ||
709 | .data = &dmesg_restrict, | ||
710 | .maxlen = sizeof(int), | ||
711 | .mode = 0644, | ||
712 | .proc_handler = proc_dointvec_minmax, | ||
713 | .extra1 = &zero, | ||
714 | .extra2 = &one, | ||
715 | }, | ||
716 | { | ||
717 | .procname = "kptr_restrict", | ||
718 | .data = &kptr_restrict, | ||
719 | .maxlen = sizeof(int), | ||
720 | .mode = 0644, | ||
721 | .proc_handler = proc_dmesg_restrict, | ||
722 | .extra1 = &zero, | ||
723 | .extra2 = &two, | ||
724 | }, | ||
707 | #endif | 725 | #endif |
708 | { | 726 | { |
709 | .procname = "ngroups_max", | 727 | .procname = "ngroups_max", |
@@ -718,14 +736,16 @@ static struct ctl_table kern_table[] = { | |||
718 | .data = &watchdog_enabled, | 736 | .data = &watchdog_enabled, |
719 | .maxlen = sizeof (int), | 737 | .maxlen = sizeof (int), |
720 | .mode = 0644, | 738 | .mode = 0644, |
721 | .proc_handler = proc_dowatchdog_enabled, | 739 | .proc_handler = proc_dowatchdog, |
740 | .extra1 = &zero, | ||
741 | .extra2 = &one, | ||
722 | }, | 742 | }, |
723 | { | 743 | { |
724 | .procname = "watchdog_thresh", | 744 | .procname = "watchdog_thresh", |
725 | .data = &softlockup_thresh, | 745 | .data = &watchdog_thresh, |
726 | .maxlen = sizeof(int), | 746 | .maxlen = sizeof(int), |
727 | .mode = 0644, | 747 | .mode = 0644, |
728 | .proc_handler = proc_dowatchdog_thresh, | 748 | .proc_handler = proc_dowatchdog, |
729 | .extra1 = &neg_one, | 749 | .extra1 = &neg_one, |
730 | .extra2 = &sixty, | 750 | .extra2 = &sixty, |
731 | }, | 751 | }, |
@@ -738,21 +758,23 @@ static struct ctl_table kern_table[] = { | |||
738 | .extra1 = &zero, | 758 | .extra1 = &zero, |
739 | .extra2 = &one, | 759 | .extra2 = &one, |
740 | }, | 760 | }, |
741 | #endif | ||
742 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) | ||
743 | { | 761 | { |
744 | .procname = "unknown_nmi_panic", | 762 | .procname = "nmi_watchdog", |
745 | .data = &unknown_nmi_panic, | 763 | .data = &watchdog_enabled, |
746 | .maxlen = sizeof (int), | 764 | .maxlen = sizeof (int), |
747 | .mode = 0644, | 765 | .mode = 0644, |
748 | .proc_handler = proc_dointvec, | 766 | .proc_handler = proc_dowatchdog, |
767 | .extra1 = &zero, | ||
768 | .extra2 = &one, | ||
749 | }, | 769 | }, |
770 | #endif | ||
771 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
750 | { | 772 | { |
751 | .procname = "nmi_watchdog", | 773 | .procname = "unknown_nmi_panic", |
752 | .data = &nmi_watchdog_enabled, | 774 | .data = &unknown_nmi_panic, |
753 | .maxlen = sizeof (int), | 775 | .maxlen = sizeof (int), |
754 | .mode = 0644, | 776 | .mode = 0644, |
755 | .proc_handler = proc_nmi_enabled, | 777 | .proc_handler = proc_dointvec, |
756 | }, | 778 | }, |
757 | #endif | 779 | #endif |
758 | #if defined(CONFIG_X86) | 780 | #if defined(CONFIG_X86) |
@@ -916,6 +938,12 @@ static struct ctl_table kern_table[] = { | |||
916 | }, | 938 | }, |
917 | #endif | 939 | #endif |
918 | #ifdef CONFIG_PERF_EVENTS | 940 | #ifdef CONFIG_PERF_EVENTS |
941 | /* | ||
942 | * User-space scripts rely on the existence of this file | ||
943 | * as a feature check for perf_events being enabled. | ||
944 | * | ||
945 | * So it's an ABI, do not remove! | ||
946 | */ | ||
919 | { | 947 | { |
920 | .procname = "perf_event_paranoid", | 948 | .procname = "perf_event_paranoid", |
921 | .data = &sysctl_perf_event_paranoid, | 949 | .data = &sysctl_perf_event_paranoid, |
@@ -935,7 +963,7 @@ static struct ctl_table kern_table[] = { | |||
935 | .data = &sysctl_perf_event_sample_rate, | 963 | .data = &sysctl_perf_event_sample_rate, |
936 | .maxlen = sizeof(sysctl_perf_event_sample_rate), | 964 | .maxlen = sizeof(sysctl_perf_event_sample_rate), |
937 | .mode = 0644, | 965 | .mode = 0644, |
938 | .proc_handler = proc_dointvec, | 966 | .proc_handler = perf_proc_update_handler, |
939 | }, | 967 | }, |
940 | #endif | 968 | #endif |
941 | #ifdef CONFIG_KMEMCHECK | 969 | #ifdef CONFIG_KMEMCHECK |
@@ -956,10 +984,6 @@ static struct ctl_table kern_table[] = { | |||
956 | .proc_handler = proc_dointvec, | 984 | .proc_handler = proc_dointvec, |
957 | }, | 985 | }, |
958 | #endif | 986 | #endif |
959 | /* | ||
960 | * NOTE: do not add new entries to this table unless you have read | ||
961 | * Documentation/sysctl/ctl_unnumbered.txt | ||
962 | */ | ||
963 | { } | 987 | { } |
964 | }; | 988 | }; |
965 | 989 | ||
@@ -969,14 +993,18 @@ static struct ctl_table vm_table[] = { | |||
969 | .data = &sysctl_overcommit_memory, | 993 | .data = &sysctl_overcommit_memory, |
970 | .maxlen = sizeof(sysctl_overcommit_memory), | 994 | .maxlen = sizeof(sysctl_overcommit_memory), |
971 | .mode = 0644, | 995 | .mode = 0644, |
972 | .proc_handler = proc_dointvec, | 996 | .proc_handler = proc_dointvec_minmax, |
997 | .extra1 = &zero, | ||
998 | .extra2 = &two, | ||
973 | }, | 999 | }, |
974 | { | 1000 | { |
975 | .procname = "panic_on_oom", | 1001 | .procname = "panic_on_oom", |
976 | .data = &sysctl_panic_on_oom, | 1002 | .data = &sysctl_panic_on_oom, |
977 | .maxlen = sizeof(sysctl_panic_on_oom), | 1003 | .maxlen = sizeof(sysctl_panic_on_oom), |
978 | .mode = 0644, | 1004 | .mode = 0644, |
979 | .proc_handler = proc_dointvec, | 1005 | .proc_handler = proc_dointvec_minmax, |
1006 | .extra1 = &zero, | ||
1007 | .extra2 = &two, | ||
980 | }, | 1008 | }, |
981 | { | 1009 | { |
982 | .procname = "oom_kill_allocating_task", | 1010 | .procname = "oom_kill_allocating_task", |
@@ -1004,7 +1032,8 @@ static struct ctl_table vm_table[] = { | |||
1004 | .data = &page_cluster, | 1032 | .data = &page_cluster, |
1005 | .maxlen = sizeof(int), | 1033 | .maxlen = sizeof(int), |
1006 | .mode = 0644, | 1034 | .mode = 0644, |
1007 | .proc_handler = proc_dointvec, | 1035 | .proc_handler = proc_dointvec_minmax, |
1036 | .extra1 = &zero, | ||
1008 | }, | 1037 | }, |
1009 | { | 1038 | { |
1010 | .procname = "dirty_background_ratio", | 1039 | .procname = "dirty_background_ratio", |
@@ -1052,7 +1081,8 @@ static struct ctl_table vm_table[] = { | |||
1052 | .data = &dirty_expire_interval, | 1081 | .data = &dirty_expire_interval, |
1053 | .maxlen = sizeof(dirty_expire_interval), | 1082 | .maxlen = sizeof(dirty_expire_interval), |
1054 | .mode = 0644, | 1083 | .mode = 0644, |
1055 | .proc_handler = proc_dointvec, | 1084 | .proc_handler = proc_dointvec_minmax, |
1085 | .extra1 = &zero, | ||
1056 | }, | 1086 | }, |
1057 | { | 1087 | { |
1058 | .procname = "nr_pdflush_threads", | 1088 | .procname = "nr_pdflush_threads", |
@@ -1128,6 +1158,8 @@ static struct ctl_table vm_table[] = { | |||
1128 | .maxlen = sizeof(int), | 1158 | .maxlen = sizeof(int), |
1129 | .mode = 0644, | 1159 | .mode = 0644, |
1130 | .proc_handler = drop_caches_sysctl_handler, | 1160 | .proc_handler = drop_caches_sysctl_handler, |
1161 | .extra1 = &one, | ||
1162 | .extra2 = &three, | ||
1131 | }, | 1163 | }, |
1132 | #ifdef CONFIG_COMPACTION | 1164 | #ifdef CONFIG_COMPACTION |
1133 | { | 1165 | { |
@@ -1320,11 +1352,6 @@ static struct ctl_table vm_table[] = { | |||
1320 | .extra2 = &one, | 1352 | .extra2 = &one, |
1321 | }, | 1353 | }, |
1322 | #endif | 1354 | #endif |
1323 | |||
1324 | /* | ||
1325 | * NOTE: do not add new entries to this table unless you have read | ||
1326 | * Documentation/sysctl/ctl_unnumbered.txt | ||
1327 | */ | ||
1328 | { } | 1355 | { } |
1329 | }; | 1356 | }; |
1330 | 1357 | ||
@@ -1340,28 +1367,28 @@ static struct ctl_table fs_table[] = { | |||
1340 | .data = &inodes_stat, | 1367 | .data = &inodes_stat, |
1341 | .maxlen = 2*sizeof(int), | 1368 | .maxlen = 2*sizeof(int), |
1342 | .mode = 0444, | 1369 | .mode = 0444, |
1343 | .proc_handler = proc_dointvec, | 1370 | .proc_handler = proc_nr_inodes, |
1344 | }, | 1371 | }, |
1345 | { | 1372 | { |
1346 | .procname = "inode-state", | 1373 | .procname = "inode-state", |
1347 | .data = &inodes_stat, | 1374 | .data = &inodes_stat, |
1348 | .maxlen = 7*sizeof(int), | 1375 | .maxlen = 7*sizeof(int), |
1349 | .mode = 0444, | 1376 | .mode = 0444, |
1350 | .proc_handler = proc_dointvec, | 1377 | .proc_handler = proc_nr_inodes, |
1351 | }, | 1378 | }, |
1352 | { | 1379 | { |
1353 | .procname = "file-nr", | 1380 | .procname = "file-nr", |
1354 | .data = &files_stat, | 1381 | .data = &files_stat, |
1355 | .maxlen = 3*sizeof(int), | 1382 | .maxlen = sizeof(files_stat), |
1356 | .mode = 0444, | 1383 | .mode = 0444, |
1357 | .proc_handler = proc_nr_files, | 1384 | .proc_handler = proc_nr_files, |
1358 | }, | 1385 | }, |
1359 | { | 1386 | { |
1360 | .procname = "file-max", | 1387 | .procname = "file-max", |
1361 | .data = &files_stat.max_files, | 1388 | .data = &files_stat.max_files, |
1362 | .maxlen = sizeof(int), | 1389 | .maxlen = sizeof(files_stat.max_files), |
1363 | .mode = 0644, | 1390 | .mode = 0644, |
1364 | .proc_handler = proc_dointvec, | 1391 | .proc_handler = proc_doulongvec_minmax, |
1365 | }, | 1392 | }, |
1366 | { | 1393 | { |
1367 | .procname = "nr_open", | 1394 | .procname = "nr_open", |
@@ -1377,7 +1404,7 @@ static struct ctl_table fs_table[] = { | |||
1377 | .data = &dentry_stat, | 1404 | .data = &dentry_stat, |
1378 | .maxlen = 6*sizeof(int), | 1405 | .maxlen = 6*sizeof(int), |
1379 | .mode = 0444, | 1406 | .mode = 0444, |
1380 | .proc_handler = proc_dointvec, | 1407 | .proc_handler = proc_nr_dentry, |
1381 | }, | 1408 | }, |
1382 | { | 1409 | { |
1383 | .procname = "overflowuid", | 1410 | .procname = "overflowuid", |
@@ -1480,16 +1507,12 @@ static struct ctl_table fs_table[] = { | |||
1480 | .proc_handler = &pipe_proc_fn, | 1507 | .proc_handler = &pipe_proc_fn, |
1481 | .extra1 = &pipe_min_size, | 1508 | .extra1 = &pipe_min_size, |
1482 | }, | 1509 | }, |
1483 | /* | ||
1484 | * NOTE: do not add new entries to this table unless you have read | ||
1485 | * Documentation/sysctl/ctl_unnumbered.txt | ||
1486 | */ | ||
1487 | { } | 1510 | { } |
1488 | }; | 1511 | }; |
1489 | 1512 | ||
1490 | static struct ctl_table debug_table[] = { | 1513 | static struct ctl_table debug_table[] = { |
1491 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ | 1514 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ |
1492 | defined(CONFIG_S390) | 1515 | defined(CONFIG_S390) || defined(CONFIG_TILE) |
1493 | { | 1516 | { |
1494 | .procname = "exception-trace", | 1517 | .procname = "exception-trace", |
1495 | .data = &show_unhandled_signals, | 1518 | .data = &show_unhandled_signals, |
@@ -1567,11 +1590,16 @@ void sysctl_head_get(struct ctl_table_header *head) | |||
1567 | spin_unlock(&sysctl_lock); | 1590 | spin_unlock(&sysctl_lock); |
1568 | } | 1591 | } |
1569 | 1592 | ||
1593 | static void free_head(struct rcu_head *rcu) | ||
1594 | { | ||
1595 | kfree(container_of(rcu, struct ctl_table_header, rcu)); | ||
1596 | } | ||
1597 | |||
1570 | void sysctl_head_put(struct ctl_table_header *head) | 1598 | void sysctl_head_put(struct ctl_table_header *head) |
1571 | { | 1599 | { |
1572 | spin_lock(&sysctl_lock); | 1600 | spin_lock(&sysctl_lock); |
1573 | if (!--head->count) | 1601 | if (!--head->count) |
1574 | kfree(head); | 1602 | call_rcu(&head->rcu, free_head); |
1575 | spin_unlock(&sysctl_lock); | 1603 | spin_unlock(&sysctl_lock); |
1576 | } | 1604 | } |
1577 | 1605 | ||
@@ -1685,13 +1713,8 @@ static int test_perm(int mode, int op) | |||
1685 | 1713 | ||
1686 | int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) | 1714 | int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) |
1687 | { | 1715 | { |
1688 | int error; | ||
1689 | int mode; | 1716 | int mode; |
1690 | 1717 | ||
1691 | error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); | ||
1692 | if (error) | ||
1693 | return error; | ||
1694 | |||
1695 | if (root->permissions) | 1718 | if (root->permissions) |
1696 | mode = root->permissions(root, current->nsproxy, table); | 1719 | mode = root->permissions(root, current->nsproxy, table); |
1697 | else | 1720 | else |
@@ -1948,10 +1971,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) | |||
1948 | start_unregistering(header); | 1971 | start_unregistering(header); |
1949 | if (!--header->parent->count) { | 1972 | if (!--header->parent->count) { |
1950 | WARN_ON(1); | 1973 | WARN_ON(1); |
1951 | kfree(header->parent); | 1974 | call_rcu(&header->parent->rcu, free_head); |
1952 | } | 1975 | } |
1953 | if (!--header->count) | 1976 | if (!--header->count) |
1954 | kfree(header); | 1977 | call_rcu(&header->rcu, free_head); |
1955 | spin_unlock(&sysctl_lock); | 1978 | spin_unlock(&sysctl_lock); |
1956 | } | 1979 | } |
1957 | 1980 | ||
@@ -2392,6 +2415,17 @@ static int proc_taint(struct ctl_table *table, int write, | |||
2392 | return err; | 2415 | return err; |
2393 | } | 2416 | } |
2394 | 2417 | ||
2418 | #ifdef CONFIG_PRINTK | ||
2419 | static int proc_dmesg_restrict(struct ctl_table *table, int write, | ||
2420 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2421 | { | ||
2422 | if (write && !capable(CAP_SYS_ADMIN)) | ||
2423 | return -EPERM; | ||
2424 | |||
2425 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
2426 | } | ||
2427 | #endif | ||
2428 | |||
2395 | struct do_proc_dointvec_minmax_conv_param { | 2429 | struct do_proc_dointvec_minmax_conv_param { |
2396 | int *min; | 2430 | int *min; |
2397 | int *max; | 2431 | int *max; |
@@ -2893,7 +2927,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
2893 | } | 2927 | } |
2894 | } | 2928 | } |
2895 | 2929 | ||
2896 | #else /* CONFIG_PROC_FS */ | 2930 | #else /* CONFIG_PROC_SYSCTL */ |
2897 | 2931 | ||
2898 | int proc_dostring(struct ctl_table *table, int write, | 2932 | int proc_dostring(struct ctl_table *table, int write, |
2899 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2933 | void __user *buffer, size_t *lenp, loff_t *ppos) |
@@ -2945,7 +2979,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, | |||
2945 | } | 2979 | } |
2946 | 2980 | ||
2947 | 2981 | ||
2948 | #endif /* CONFIG_PROC_FS */ | 2982 | #endif /* CONFIG_PROC_SYSCTL */ |
2949 | 2983 | ||
2950 | /* | 2984 | /* |
2951 | * No sense putting this after each symbol definition, twice, | 2985 | * No sense putting this after each symbol definition, twice, |
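The kernel/sysctl.c hunks above repeatedly switch handlers from proc_dointvec to proc_dointvec_minmax and attach extra1/extra2 bounds, so out-of-range writes are rejected rather than silently accepted. A minimal sketch of that pattern (not taken from the patch; my_knob is an invented tunable, and zero/one mirror the bound variables kernel/sysctl.c already defines):

    /* Hypothetical entry illustrating the clamped-sysctl pattern above. */
    static int my_knob;                 /* invented tunable */
    static int zero, one = 1;           /* bounds referenced by extra1/extra2 */

    static struct ctl_table example_table[] = {
            {
                    .procname     = "my_knob",
                    .data         = &my_knob,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = proc_dointvec_minmax,
                    .extra1       = &zero,  /* lowest value a write may set */
                    .extra2       = &one,   /* highest value a write may set */
            },
            { }     /* terminator */
    };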
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 1357c5786064..3b8e028b9601 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = { | |||
136 | { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, | 136 | { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, |
137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, | 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, |
138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, | 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, |
139 | { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" }, | ||
140 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, | 139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, |
141 | {} | 140 | {} |
142 | }; | 141 | }; |
@@ -1193,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
1193 | 1192 | ||
1194 | buf[result] = '\0'; | 1193 | buf[result] = '\0'; |
1195 | 1194 | ||
1196 | /* Convert the decnet addresss to binary */ | 1195 | /* Convert the decnet address to binary */ |
1197 | result = -EIO; | 1196 | result = -EIO; |
1198 | nodep = strchr(buf, '.') + 1; | 1197 | nodep = strchr(buf, '.') + 1; |
1199 | if (!nodep) | 1198 | if (!nodep) |
@@ -1322,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1322 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 1321 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
1323 | { | 1322 | { |
1324 | const struct bin_table *table = NULL; | 1323 | const struct bin_table *table = NULL; |
1325 | struct nameidata nd; | ||
1326 | struct vfsmount *mnt; | 1324 | struct vfsmount *mnt; |
1327 | struct file *file; | 1325 | struct file *file; |
1328 | ssize_t result; | 1326 | ssize_t result; |
1329 | char *pathname; | 1327 | char *pathname; |
1330 | int flags; | 1328 | int flags; |
1331 | int acc_mode; | ||
1332 | 1329 | ||
1333 | pathname = sysctl_getname(name, nlen, &table); | 1330 | pathname = sysctl_getname(name, nlen, &table); |
1334 | result = PTR_ERR(pathname); | 1331 | result = PTR_ERR(pathname); |
@@ -1338,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1338 | /* How should the sysctl be accessed? */ | 1335 | /* How should the sysctl be accessed? */ |
1339 | if (oldval && oldlen && newval && newlen) { | 1336 | if (oldval && oldlen && newval && newlen) { |
1340 | flags = O_RDWR; | 1337 | flags = O_RDWR; |
1341 | acc_mode = MAY_READ | MAY_WRITE; | ||
1342 | } else if (newval && newlen) { | 1338 | } else if (newval && newlen) { |
1343 | flags = O_WRONLY; | 1339 | flags = O_WRONLY; |
1344 | acc_mode = MAY_WRITE; | ||
1345 | } else if (oldval && oldlen) { | 1340 | } else if (oldval && oldlen) { |
1346 | flags = O_RDONLY; | 1341 | flags = O_RDONLY; |
1347 | acc_mode = MAY_READ; | ||
1348 | } else { | 1342 | } else { |
1349 | result = 0; | 1343 | result = 0; |
1350 | goto out_putname; | 1344 | goto out_putname; |
1351 | } | 1345 | } |
1352 | 1346 | ||
1353 | mnt = current->nsproxy->pid_ns->proc_mnt; | 1347 | mnt = current->nsproxy->pid_ns->proc_mnt; |
1354 | result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); | 1348 | file = file_open_root(mnt->mnt_root, mnt, pathname, flags); |
1355 | if (result) | ||
1356 | goto out_putname; | ||
1357 | |||
1358 | result = may_open(&nd.path, acc_mode, flags); | ||
1359 | if (result) | ||
1360 | goto out_putpath; | ||
1361 | |||
1362 | file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); | ||
1363 | result = PTR_ERR(file); | 1349 | result = PTR_ERR(file); |
1364 | if (IS_ERR(file)) | 1350 | if (IS_ERR(file)) |
1365 | goto out_putname; | 1351 | goto out_putname; |
@@ -1371,10 +1357,6 @@ out_putname: | |||
1371 | putname(pathname); | 1357 | putname(pathname); |
1372 | out: | 1358 | out: |
1373 | return result; | 1359 | return result; |
1374 | |||
1375 | out_putpath: | ||
1376 | path_put(&nd.path); | ||
1377 | goto out_putname; | ||
1378 | } | 1360 | } |
1379 | 1361 | ||
1380 | 1362 | ||
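In the binary_sysctl() hunk above, the vfs_path_lookup()/may_open()/dentry_open() sequence collapses into a single file_open_root() call. A sketch of the resulting open path, assuming only the call shown in the diff (open_binary_sysctl() is an invented wrapper):

    static struct file *open_binary_sysctl(const char *pathname, int flags)
    {
            struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;

            /*
             * Looks the path up relative to the proc mount, checks access
             * for 'flags' and opens the file in one step, replacing the
             * removed lookup/may_open/dentry_open sequence.
             */
            return file_open_root(mnt->mnt_root, mnt, pathname, flags);
    }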
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 10b90d8a03c4..4e4932a7b360 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c | |||
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) | |||
111 | const char *fail = NULL; | 111 | const char *fail = NULL; |
112 | 112 | ||
113 | if (table->parent) { | 113 | if (table->parent) { |
114 | if (table->procname && !table->parent->procname) | 114 | if (!table->parent->procname) |
115 | set_fail(&fail, table, "Parent without procname"); | 115 | set_fail(&fail, table, "Parent without procname"); |
116 | } | 116 | } |
117 | if (!table->procname) | ||
118 | set_fail(&fail, table, "No procname"); | ||
119 | if (table->child) { | 117 | if (table->child) { |
120 | if (table->data) | 118 | if (table->data) |
121 | set_fail(&fail, table, "Directory with data?"); | 119 | set_fail(&fail, table, "Directory with data?"); |
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) | |||
144 | set_fail(&fail, table, "No maxlen"); | 142 | set_fail(&fail, table, "No maxlen"); |
145 | } | 143 | } |
146 | #ifdef CONFIG_PROC_SYSCTL | 144 | #ifdef CONFIG_PROC_SYSCTL |
147 | if (table->procname && !table->proc_handler) | 145 | if (!table->proc_handler) |
148 | set_fail(&fail, table, "No proc_handler"); | 146 | set_fail(&fail, table, "No proc_handler"); |
149 | #endif | 147 | #endif |
150 | #if 0 | ||
151 | if (!table->procname && table->proc_handler) | ||
152 | set_fail(&fail, table, "proc_handler without procname"); | ||
153 | #endif | ||
154 | sysctl_check_leaf(namespaces, table, &fail); | 148 | sysctl_check_leaf(namespaces, table, &fail); |
155 | } | 149 | } |
156 | if (table->mode > 0777) | 150 | if (table->mode > 0777) |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 11281d5792bd..fc0f22005417 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | |||
89 | return -ENOMEM; | 89 | return -ENOMEM; |
90 | 90 | ||
91 | if (!info) { | 91 | if (!info) { |
92 | int seq = get_cpu_var(taskstats_seqnum)++; | 92 | int seq = this_cpu_inc_return(taskstats_seqnum) - 1; |
93 | put_cpu_var(taskstats_seqnum); | ||
94 | 93 | ||
95 | reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); | 94 | reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); |
96 | } else | 95 | } else |
@@ -175,22 +174,8 @@ static void send_cpu_listeners(struct sk_buff *skb, | |||
175 | up_write(&listeners->sem); | 174 | up_write(&listeners->sem); |
176 | } | 175 | } |
177 | 176 | ||
178 | static int fill_pid(pid_t pid, struct task_struct *tsk, | 177 | static void fill_stats(struct task_struct *tsk, struct taskstats *stats) |
179 | struct taskstats *stats) | ||
180 | { | 178 | { |
181 | int rc = 0; | ||
182 | |||
183 | if (!tsk) { | ||
184 | rcu_read_lock(); | ||
185 | tsk = find_task_by_vpid(pid); | ||
186 | if (tsk) | ||
187 | get_task_struct(tsk); | ||
188 | rcu_read_unlock(); | ||
189 | if (!tsk) | ||
190 | return -ESRCH; | ||
191 | } else | ||
192 | get_task_struct(tsk); | ||
193 | |||
194 | memset(stats, 0, sizeof(*stats)); | 179 | memset(stats, 0, sizeof(*stats)); |
195 | /* | 180 | /* |
196 | * Each accounting subsystem adds calls to its functions to | 181 | * Each accounting subsystem adds calls to its functions to |
@@ -209,17 +194,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, | |||
209 | 194 | ||
210 | /* fill in extended acct fields */ | 195 | /* fill in extended acct fields */ |
211 | xacct_add_tsk(stats, tsk); | 196 | xacct_add_tsk(stats, tsk); |
197 | } | ||
212 | 198 | ||
213 | /* Define err: label here if needed */ | 199 | static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) |
214 | put_task_struct(tsk); | 200 | { |
215 | return rc; | 201 | struct task_struct *tsk; |
216 | 202 | ||
203 | rcu_read_lock(); | ||
204 | tsk = find_task_by_vpid(pid); | ||
205 | if (tsk) | ||
206 | get_task_struct(tsk); | ||
207 | rcu_read_unlock(); | ||
208 | if (!tsk) | ||
209 | return -ESRCH; | ||
210 | fill_stats(tsk, stats); | ||
211 | put_task_struct(tsk); | ||
212 | return 0; | ||
217 | } | 213 | } |
218 | 214 | ||
219 | static int fill_tgid(pid_t tgid, struct task_struct *first, | 215 | static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) |
220 | struct taskstats *stats) | ||
221 | { | 216 | { |
222 | struct task_struct *tsk; | 217 | struct task_struct *tsk, *first; |
223 | unsigned long flags; | 218 | unsigned long flags; |
224 | int rc = -ESRCH; | 219 | int rc = -ESRCH; |
225 | 220 | ||
@@ -228,8 +223,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, | |||
228 | * leaders who are already counted with the dead tasks | 223 | * leaders who are already counted with the dead tasks |
229 | */ | 224 | */ |
230 | rcu_read_lock(); | 225 | rcu_read_lock(); |
231 | if (!first) | 226 | first = find_task_by_vpid(tgid); |
232 | first = find_task_by_vpid(tgid); | ||
233 | 227 | ||
234 | if (!first || !lock_task_sighand(first, &flags)) | 228 | if (!first || !lock_task_sighand(first, &flags)) |
235 | goto out; | 229 | goto out; |
@@ -268,7 +262,6 @@ out: | |||
268 | return rc; | 262 | return rc; |
269 | } | 263 | } |
270 | 264 | ||
271 | |||
272 | static void fill_tgid_exit(struct task_struct *tsk) | 265 | static void fill_tgid_exit(struct task_struct *tsk) |
273 | { | 266 | { |
274 | unsigned long flags; | 267 | unsigned long flags; |
@@ -292,16 +285,18 @@ ret: | |||
292 | static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | 285 | static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) |
293 | { | 286 | { |
294 | struct listener_list *listeners; | 287 | struct listener_list *listeners; |
295 | struct listener *s, *tmp; | 288 | struct listener *s, *tmp, *s2; |
296 | unsigned int cpu; | 289 | unsigned int cpu; |
297 | 290 | ||
298 | if (!cpumask_subset(mask, cpu_possible_mask)) | 291 | if (!cpumask_subset(mask, cpu_possible_mask)) |
299 | return -EINVAL; | 292 | return -EINVAL; |
300 | 293 | ||
294 | s = NULL; | ||
301 | if (isadd == REGISTER) { | 295 | if (isadd == REGISTER) { |
302 | for_each_cpu(cpu, mask) { | 296 | for_each_cpu(cpu, mask) { |
303 | s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, | 297 | if (!s) |
304 | cpu_to_node(cpu)); | 298 | s = kmalloc_node(sizeof(struct listener), |
299 | GFP_KERNEL, cpu_to_node(cpu)); | ||
305 | if (!s) | 300 | if (!s) |
306 | goto cleanup; | 301 | goto cleanup; |
307 | s->pid = pid; | 302 | s->pid = pid; |
@@ -310,9 +305,16 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | |||
310 | 305 | ||
311 | listeners = &per_cpu(listener_array, cpu); | 306 | listeners = &per_cpu(listener_array, cpu); |
312 | down_write(&listeners->sem); | 307 | down_write(&listeners->sem); |
308 | list_for_each_entry_safe(s2, tmp, &listeners->list, list) { | ||
309 | if (s2->pid == pid) | ||
310 | goto next_cpu; | ||
311 | } | ||
313 | list_add(&s->list, &listeners->list); | 312 | list_add(&s->list, &listeners->list); |
313 | s = NULL; | ||
314 | next_cpu: | ||
314 | up_write(&listeners->sem); | 315 | up_write(&listeners->sem); |
315 | } | 316 | } |
317 | kfree(s); | ||
316 | return 0; | 318 | return 0; |
317 | } | 319 | } |
318 | 320 | ||
@@ -355,6 +357,10 @@ static int parse(struct nlattr *na, struct cpumask *mask) | |||
355 | return ret; | 357 | return ret; |
356 | } | 358 | } |
357 | 359 | ||
360 | #if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | ||
361 | #define TASKSTATS_NEEDS_PADDING 1 | ||
362 | #endif | ||
363 | |||
358 | static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | 364 | static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) |
359 | { | 365 | { |
360 | struct nlattr *na, *ret; | 366 | struct nlattr *na, *ret; |
@@ -364,9 +370,33 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | |||
364 | ? TASKSTATS_TYPE_AGGR_PID | 370 | ? TASKSTATS_TYPE_AGGR_PID |
365 | : TASKSTATS_TYPE_AGGR_TGID; | 371 | : TASKSTATS_TYPE_AGGR_TGID; |
366 | 372 | ||
373 | /* | ||
374 | * The taskstats structure is internally aligned on 8 byte | ||
375 | * boundaries but the layout of the aggregate reply, with | ||
376 | * two NLA headers and the pid (each 4 bytes), actually | ||
377 | * forces the entire structure to be unaligned. This causes | ||
378 | * the kernel to issue unaligned access warnings on some | ||
379 | * architectures like ia64. Unfortunately, some software out there | ||
380 | * doesn't properly unroll the NLA packet and assumes that the start | ||
381 | * of the taskstats structure will always be 20 bytes from the start | ||
382 | * of the netlink payload. Aligning the start of the taskstats | ||
383 | * structure breaks this software, which we don't want. So, for now | ||
384 | * the alignment only happens on architectures that require it | ||
385 | * and those users will have to update to fixed versions of those | ||
386 | * packages. Space is reserved in the packet only when needed. | ||
387 | * This ifdef should be removed in several years e.g. 2012 once | ||
388 | * we can be confident that fixed versions are installed on most | ||
389 | * systems. We add the padding before the aggregate since the | ||
390 | * aggregate is already a defined type. | ||
391 | */ | ||
392 | #ifdef TASKSTATS_NEEDS_PADDING | ||
393 | if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) | ||
394 | goto err; | ||
395 | #endif | ||
367 | na = nla_nest_start(skb, aggr); | 396 | na = nla_nest_start(skb, aggr); |
368 | if (!na) | 397 | if (!na) |
369 | goto err; | 398 | goto err; |
399 | |||
370 | if (nla_put(skb, type, sizeof(pid), &pid) < 0) | 400 | if (nla_put(skb, type, sizeof(pid), &pid) < 0) |
371 | goto err; | 401 | goto err; |
372 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); | 402 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); |
@@ -424,74 +454,122 @@ err: | |||
424 | return rc; | 454 | return rc; |
425 | } | 455 | } |
426 | 456 | ||
427 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | 457 | static int cmd_attr_register_cpumask(struct genl_info *info) |
428 | { | 458 | { |
429 | int rc; | ||
430 | struct sk_buff *rep_skb; | ||
431 | struct taskstats *stats; | ||
432 | size_t size; | ||
433 | cpumask_var_t mask; | 459 | cpumask_var_t mask; |
460 | int rc; | ||
434 | 461 | ||
435 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) | 462 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) |
436 | return -ENOMEM; | 463 | return -ENOMEM; |
437 | |||
438 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); | 464 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); |
439 | if (rc < 0) | 465 | if (rc < 0) |
440 | goto free_return_rc; | 466 | goto out; |
441 | if (rc == 0) { | 467 | rc = add_del_listener(info->snd_pid, mask, REGISTER); |
442 | rc = add_del_listener(info->snd_pid, mask, REGISTER); | 468 | out: |
443 | goto free_return_rc; | 469 | free_cpumask_var(mask); |
444 | } | 470 | return rc; |
471 | } | ||
445 | 472 | ||
473 | static int cmd_attr_deregister_cpumask(struct genl_info *info) | ||
474 | { | ||
475 | cpumask_var_t mask; | ||
476 | int rc; | ||
477 | |||
478 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) | ||
479 | return -ENOMEM; | ||
446 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); | 480 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); |
447 | if (rc < 0) | 481 | if (rc < 0) |
448 | goto free_return_rc; | 482 | goto out; |
449 | if (rc == 0) { | 483 | rc = add_del_listener(info->snd_pid, mask, DEREGISTER); |
450 | rc = add_del_listener(info->snd_pid, mask, DEREGISTER); | 484 | out: |
451 | free_return_rc: | ||
452 | free_cpumask_var(mask); | ||
453 | return rc; | ||
454 | } | ||
455 | free_cpumask_var(mask); | 485 | free_cpumask_var(mask); |
486 | return rc; | ||
487 | } | ||
488 | |||
489 | static size_t taskstats_packet_size(void) | ||
490 | { | ||
491 | size_t size; | ||
456 | 492 | ||
457 | /* | ||
458 | * Size includes space for nested attributes | ||
459 | */ | ||
460 | size = nla_total_size(sizeof(u32)) + | 493 | size = nla_total_size(sizeof(u32)) + |
461 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | 494 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); |
495 | #ifdef TASKSTATS_NEEDS_PADDING | ||
496 | size += nla_total_size(0); /* Padding for alignment */ | ||
497 | #endif | ||
498 | return size; | ||
499 | } | ||
500 | |||
501 | static int cmd_attr_pid(struct genl_info *info) | ||
502 | { | ||
503 | struct taskstats *stats; | ||
504 | struct sk_buff *rep_skb; | ||
505 | size_t size; | ||
506 | u32 pid; | ||
507 | int rc; | ||
508 | |||
509 | size = taskstats_packet_size(); | ||
462 | 510 | ||
463 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); | 511 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); |
464 | if (rc < 0) | 512 | if (rc < 0) |
465 | return rc; | 513 | return rc; |
466 | 514 | ||
467 | rc = -EINVAL; | 515 | rc = -EINVAL; |
468 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { | 516 | pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); |
469 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); | 517 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); |
470 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); | 518 | if (!stats) |
471 | if (!stats) | 519 | goto err; |
472 | goto err; | 520 | |
473 | 521 | rc = fill_stats_for_pid(pid, stats); | |
474 | rc = fill_pid(pid, NULL, stats); | 522 | if (rc < 0) |
475 | if (rc < 0) | 523 | goto err; |
476 | goto err; | 524 | return send_reply(rep_skb, info); |
477 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { | 525 | err: |
478 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | 526 | nlmsg_free(rep_skb); |
479 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); | 527 | return rc; |
480 | if (!stats) | 528 | } |
481 | goto err; | 529 | |
482 | 530 | static int cmd_attr_tgid(struct genl_info *info) | |
483 | rc = fill_tgid(tgid, NULL, stats); | 531 | { |
484 | if (rc < 0) | 532 | struct taskstats *stats; |
485 | goto err; | 533 | struct sk_buff *rep_skb; |
486 | } else | 534 | size_t size; |
535 | u32 tgid; | ||
536 | int rc; | ||
537 | |||
538 | size = taskstats_packet_size(); | ||
539 | |||
540 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); | ||
541 | if (rc < 0) | ||
542 | return rc; | ||
543 | |||
544 | rc = -EINVAL; | ||
545 | tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | ||
546 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); | ||
547 | if (!stats) | ||
487 | goto err; | 548 | goto err; |
488 | 549 | ||
550 | rc = fill_stats_for_tgid(tgid, stats); | ||
551 | if (rc < 0) | ||
552 | goto err; | ||
489 | return send_reply(rep_skb, info); | 553 | return send_reply(rep_skb, info); |
490 | err: | 554 | err: |
491 | nlmsg_free(rep_skb); | 555 | nlmsg_free(rep_skb); |
492 | return rc; | 556 | return rc; |
493 | } | 557 | } |
494 | 558 | ||
559 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | ||
560 | { | ||
561 | if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) | ||
562 | return cmd_attr_register_cpumask(info); | ||
563 | else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) | ||
564 | return cmd_attr_deregister_cpumask(info); | ||
565 | else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) | ||
566 | return cmd_attr_pid(info); | ||
567 | else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) | ||
568 | return cmd_attr_tgid(info); | ||
569 | else | ||
570 | return -EINVAL; | ||
571 | } | ||
572 | |||
495 | static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) | 573 | static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) |
496 | { | 574 | { |
497 | struct signal_struct *sig = tsk->signal; | 575 | struct signal_struct *sig = tsk->signal; |
@@ -532,8 +610,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
532 | /* | 610 | /* |
533 | * Size includes space for nested attributes | 611 | * Size includes space for nested attributes |
534 | */ | 612 | */ |
535 | size = nla_total_size(sizeof(u32)) + | 613 | size = taskstats_packet_size(); |
536 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
537 | 614 | ||
538 | is_thread_group = !!taskstats_tgid_alloc(tsk); | 615 | is_thread_group = !!taskstats_tgid_alloc(tsk); |
539 | if (is_thread_group) { | 616 | if (is_thread_group) { |
@@ -543,7 +620,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
543 | fill_tgid_exit(tsk); | 620 | fill_tgid_exit(tsk); |
544 | } | 621 | } |
545 | 622 | ||
546 | listeners = &__raw_get_cpu_var(listener_array); | 623 | listeners = __this_cpu_ptr(&listener_array); |
547 | if (list_empty(&listeners->list)) | 624 | if (list_empty(&listeners->list)) |
548 | return; | 625 | return; |
549 | 626 | ||
@@ -555,9 +632,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
555 | if (!stats) | 632 | if (!stats) |
556 | goto err; | 633 | goto err; |
557 | 634 | ||
558 | rc = fill_pid(-1, tsk, stats); | 635 | fill_stats(tsk, stats); |
559 | if (rc < 0) | ||
560 | goto err; | ||
561 | 636 | ||
562 | /* | 637 | /* |
563 | * Doesn't matter if tsk is the leader or the last group member leaving | 638 | * Doesn't matter if tsk is the leader or the last group member leaving |
@@ -619,7 +694,7 @@ static int __init taskstats_init(void) | |||
619 | goto err_cgroup_ops; | 694 | goto err_cgroup_ops; |
620 | 695 | ||
621 | family_registered = 1; | 696 | family_registered = 1; |
622 | printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); | 697 | pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); |
623 | return 0; | 698 | return 0; |
624 | err_cgroup_ops: | 699 | err_cgroup_ops: |
625 | genl_unregister_ops(&family, &taskstats_ops); | 700 | genl_unregister_ops(&family, &taskstats_ops); |
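A small idiom in the kernel/taskstats.c changes above is easy to miss: the per-CPU sequence number is now taken with this_cpu_inc_return() instead of a get_cpu_var()/put_cpu_var() pair. A minimal sketch with an invented example_seqnum counter:

    static DEFINE_PER_CPU(int, example_seqnum);     /* invented counter */

    static int next_seq(void)
    {
            /*
             * this_cpu_inc_return() increments the per-CPU variable and
             * returns the new value without an explicit preemption-disable
             * window, so the pre-increment value is (new value - 1).
             * Previously: seq = get_cpu_var(example_seqnum)++;
             *             put_cpu_var(example_seqnum);
             */
            return this_cpu_inc_return(example_seqnum) - 1;
    }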
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 4f104515a19b..f8b11a283171 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c | |||
@@ -115,7 +115,9 @@ static int test_kprobes(void) | |||
115 | int ret; | 115 | int ret; |
116 | struct kprobe *kps[2] = {&kp, &kp2}; | 116 | struct kprobe *kps[2] = {&kp, &kp2}; |
117 | 117 | ||
118 | kp.addr = 0; /* addr should be cleard for reusing kprobe. */ | 118 | /* addr and flags should be cleared for reusing kprobe. */ |
119 | kp.addr = NULL; | ||
120 | kp.flags = 0; | ||
119 | ret = register_kprobes(kps, 2); | 121 | ret = register_kprobes(kps, 2); |
120 | if (ret < 0) { | 122 | if (ret < 0) { |
121 | printk(KERN_ERR "Kprobe smoke test failed: " | 123 | printk(KERN_ERR "Kprobe smoke test failed: " |
@@ -210,7 +212,9 @@ static int test_jprobes(void) | |||
210 | int ret; | 212 | int ret; |
211 | struct jprobe *jps[2] = {&jp, &jp2}; | 213 | struct jprobe *jps[2] = {&jp, &jp2}; |
212 | 214 | ||
213 | jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ | 215 | /* addr and flags should be cleared for reusing kprobe. */ |
216 | jp.kp.addr = NULL; | ||
217 | jp.kp.flags = 0; | ||
214 | ret = register_jprobes(jps, 2); | 218 | ret = register_jprobes(jps, 2); |
215 | if (ret < 0) { | 219 | if (ret < 0) { |
216 | printk(KERN_ERR "Kprobe smoke test failed: " | 220 | printk(KERN_ERR "Kprobe smoke test failed: " |
@@ -323,7 +327,9 @@ static int test_kretprobes(void) | |||
323 | int ret; | 327 | int ret; |
324 | struct kretprobe *rps[2] = {&rp, &rp2}; | 328 | struct kretprobe *rps[2] = {&rp, &rp2}; |
325 | 329 | ||
326 | rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ | 330 | /* addr and flags should be cleared for reusing kprobe. */ |
331 | rp.kp.addr = NULL; | ||
332 | rp.kp.flags = 0; | ||
327 | ret = register_kretprobes(rps, 2); | 333 | ret = register_kretprobes(rps, 2); |
328 | if (ret < 0) { | 334 | if (ret < 0) { |
329 | printk(KERN_ERR "Kprobe smoke test failed: " | 335 | printk(KERN_ERR "Kprobe smoke test failed: " |
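The test_kprobes.c hunks clear kp.flags in addition to kp.addr before every re-registration. A hedged sketch of the reuse pattern the smoke test exercises (probe_twice() is invented; it assumes kp->symbol_name is already set, as in the test):

    static int probe_twice(struct kprobe *kp)
    {
            int ret;

            ret = register_kprobe(kp);
            if (ret < 0)
                    return ret;
            unregister_kprobe(kp);

            /*
             * addr and flags keep stale state after unregistering, so both
             * must be cleared before the same kprobe can be registered again.
             */
            kp->addr = NULL;
            kp->flags = 0;

            ret = register_kprobe(kp);
            if (ret < 0)
                    return ret;
            unregister_kprobe(kp);
            return 0;
    }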
diff --git a/kernel/time.c b/kernel/time.c index ba9b338d1835..8e8dc6d705c9 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -150,7 +150,7 @@ static inline void warp_clock(void) | |||
150 | * various programs will get confused when the clock gets warped. | 150 | * various programs will get confused when the clock gets warped. |
151 | */ | 151 | */ |
152 | 152 | ||
153 | int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) | 153 | int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) |
154 | { | 154 | { |
155 | static int firsttime = 1; | 155 | static int firsttime = 1; |
156 | int error = 0; | 156 | int error = 0; |
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time); | |||
238 | * Avoid unnecessary multiplications/divisions in the | 238 | * Avoid unnecessary multiplications/divisions in the |
239 | * two most common HZ cases: | 239 | * two most common HZ cases: |
240 | */ | 240 | */ |
241 | unsigned int inline jiffies_to_msecs(const unsigned long j) | 241 | inline unsigned int jiffies_to_msecs(const unsigned long j) |
242 | { | 242 | { |
243 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | 243 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) |
244 | return (MSEC_PER_SEC / HZ) * j; | 244 | return (MSEC_PER_SEC / HZ) * j; |
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) | |||
254 | } | 254 | } |
255 | EXPORT_SYMBOL(jiffies_to_msecs); | 255 | EXPORT_SYMBOL(jiffies_to_msecs); |
256 | 256 | ||
257 | unsigned int inline jiffies_to_usecs(const unsigned long j) | 257 | inline unsigned int jiffies_to_usecs(const unsigned long j) |
258 | { | 258 | { |
259 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | 259 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) |
260 | return (USEC_PER_SEC / HZ) * j; | 260 | return (USEC_PER_SEC / HZ) * j; |
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x) | |||
645 | } | 645 | } |
646 | 646 | ||
647 | /** | 647 | /** |
648 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies | 648 | * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 |
649 | * | 649 | * |
650 | * @n: nsecs in u64 | 650 | * @n: nsecs in u64 |
651 | * | 651 | * |
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x) | |||
657 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) | 657 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) |
658 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years | 658 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years |
659 | */ | 659 | */ |
660 | unsigned long nsecs_to_jiffies(u64 n) | 660 | u64 nsecs_to_jiffies64(u64 n) |
661 | { | 661 | { |
662 | #if (NSEC_PER_SEC % HZ) == 0 | 662 | #if (NSEC_PER_SEC % HZ) == 0 |
663 | /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ | 663 | /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ |
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n) | |||
674 | #endif | 674 | #endif |
675 | } | 675 | } |
676 | 676 | ||
677 | #if (BITS_PER_LONG < 64) | 677 | /** |
678 | u64 get_jiffies_64(void) | 678 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies |
679 | * | ||
680 | * @n: nsecs in u64 | ||
681 | * | ||
682 | * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. | ||
683 | * And this doesn't return MAX_JIFFY_OFFSET since this function is designed | ||
684 | * for scheduler, not for use in device drivers to calculate timeout value. | ||
685 | * | ||
686 | * note: | ||
687 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) | ||
688 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years | ||
689 | */ | ||
690 | unsigned long nsecs_to_jiffies(u64 n) | ||
679 | { | 691 | { |
680 | unsigned long seq; | 692 | return (unsigned long)nsecs_to_jiffies64(n); |
681 | u64 ret; | ||
682 | |||
683 | do { | ||
684 | seq = read_seqbegin(&xtime_lock); | ||
685 | ret = jiffies_64; | ||
686 | } while (read_seqretry(&xtime_lock, seq)); | ||
687 | return ret; | ||
688 | } | 693 | } |
689 | EXPORT_SYMBOL(get_jiffies_64); | ||
690 | #endif | ||
691 | |||
692 | EXPORT_SYMBOL(jiffies); | ||
693 | 694 | ||
694 | /* | 695 | /* |
695 | * Add two timespec values and do a safety check for overflow. | 696 | * Add two timespec values and do a safety check for overflow. |
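The kernel/time.c hunk above renames the 64-bit conversion to nsecs_to_jiffies64() and turns nsecs_to_jiffies() into a truncating wrapper around it. A small sketch of how a caller sees the two (example_timeout() is invented):

    static void example_timeout(void)
    {
            u64 ns = 1500 * NSEC_PER_MSEC;            /* 1.5 s in nanoseconds */
            u64 j64 = nsecs_to_jiffies64(ns);         /* full 64-bit result */
            unsigned long j = nsecs_to_jiffies(ns);   /* same value, truncated */

            WARN_ON(j != (unsigned long)j64);         /* holds by the definition above */
    }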
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06c..e2fd74b8e8c2 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,4 +1,5 @@ | |||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o |
2 | obj-y += timeconv.o posix-clock.o alarmtimer.o | ||
2 | 3 | ||
3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 000000000000..59f369f98a04 --- /dev/null +++ b/kernel/time/alarmtimer.c | |||
@@ -0,0 +1,720 @@ | |||
1 | /* | ||
2 | * Alarmtimer interface | ||
3 | * | ||
4 | * This interface provides a timer which is similar to hrtimers, | ||
5 | * but triggers an RTC alarm if the box is suspended. | ||
6 | * | ||
7 | * This interface is influenced by the Android RTC Alarm timer | ||
8 | * interface. | ||
9 | * | ||
10 | * Copyright (C) 2010 IBM Corporation | ||
11 | * | ||
12 | * Author: John Stultz <john.stultz@linaro.org> | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify | ||
15 | * it under the terms of the GNU General Public License version 2 as | ||
16 | * published by the Free Software Foundation. | ||
17 | */ | ||
18 | #include <linux/time.h> | ||
19 | #include <linux/hrtimer.h> | ||
20 | #include <linux/timerqueue.h> | ||
21 | #include <linux/rtc.h> | ||
22 | #include <linux/alarmtimer.h> | ||
23 | #include <linux/mutex.h> | ||
24 | #include <linux/platform_device.h> | ||
25 | #include <linux/posix-timers.h> | ||
26 | #include <linux/workqueue.h> | ||
27 | #include <linux/freezer.h> | ||
28 | |||
29 | /** | ||
30 | * struct alarm_base - Alarm timer bases | ||
31 | * @lock: Lock for synchronized access to the base | ||
32 | * @timerqueue: Timerqueue head managing the list of events | ||
33 | * @timer: hrtimer used to schedule events while running | ||
34 | * @gettime: Function to read the time correlating to the base | ||
35 | * @base_clockid: clockid for the base | ||
36 | */ | ||
37 | static struct alarm_base { | ||
38 | spinlock_t lock; | ||
39 | struct timerqueue_head timerqueue; | ||
40 | struct hrtimer timer; | ||
41 | ktime_t (*gettime)(void); | ||
42 | clockid_t base_clockid; | ||
43 | } alarm_bases[ALARM_NUMTYPE]; | ||
44 | |||
45 | /* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ | ||
46 | static ktime_t freezer_delta; | ||
47 | static DEFINE_SPINLOCK(freezer_delta_lock); | ||
48 | |||
49 | #ifdef CONFIG_RTC_CLASS | ||
50 | /* rtc timer and device for setting alarm wakeups at suspend */ | ||
51 | static struct rtc_timer rtctimer; | ||
52 | static struct rtc_device *rtcdev; | ||
53 | static DEFINE_SPINLOCK(rtcdev_lock); | ||
54 | |||
55 | /** | ||
56 | * has_wakealarm - check rtc device has wakealarm ability | ||
57 | * @dev: current device | ||
58 | * @name_ptr: name to be returned | ||
59 | * | ||
60 | * This helper function checks to see if the rtc device can wake | ||
61 | * from suspend. | ||
62 | */ | ||
63 | static int has_wakealarm(struct device *dev, void *name_ptr) | ||
64 | { | ||
65 | struct rtc_device *candidate = to_rtc_device(dev); | ||
66 | |||
67 | if (!candidate->ops->set_alarm) | ||
68 | return 0; | ||
69 | if (!device_may_wakeup(candidate->dev.parent)) | ||
70 | return 0; | ||
71 | |||
72 | *(const char **)name_ptr = dev_name(dev); | ||
73 | return 1; | ||
74 | } | ||
75 | |||
76 | /** | ||
77 | * alarmtimer_get_rtcdev - Return selected rtcdevice | ||
78 | * | ||
79 | * This function returns the rtc device to use for wakealarms. | ||
80 | * If one has not already been chosen, it checks to see if a | ||
81 | * functional rtc device is available. | ||
82 | */ | ||
83 | static struct rtc_device *alarmtimer_get_rtcdev(void) | ||
84 | { | ||
85 | struct device *dev; | ||
86 | char *str; | ||
87 | unsigned long flags; | ||
88 | struct rtc_device *ret; | ||
89 | |||
90 | spin_lock_irqsave(&rtcdev_lock, flags); | ||
91 | if (!rtcdev) { | ||
92 | /* Find an rtc device and init the rtc_timer */ | ||
93 | dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); | ||
94 | /* If we have a device then str is valid. See has_wakealarm() */ | ||
95 | if (dev) { | ||
96 | rtcdev = rtc_class_open(str); | ||
97 | /* | ||
98 | * Drop the reference we got in class_find_device, | ||
99 | * rtc_open takes its own. | ||
100 | */ | ||
101 | put_device(dev); | ||
102 | rtc_timer_init(&rtctimer, NULL, NULL); | ||
103 | } | ||
104 | } | ||
105 | ret = rtcdev; | ||
106 | spin_unlock_irqrestore(&rtcdev_lock, flags); | ||
107 | |||
108 | return ret; | ||
109 | } | ||
110 | #else | ||
111 | #define alarmtimer_get_rtcdev() (0) | ||
112 | #define rtcdev (0) | ||
113 | #endif | ||
114 | |||
115 | |||
116 | /** | ||
117 | * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue | ||
118 | * @base: pointer to the base where the timer is being run | ||
119 | * @alarm: pointer to alarm being enqueued. | ||
120 | * | ||
121 | * Adds alarm to an alarm_base timerqueue and if necessary sets | ||
122 | * an hrtimer to run. | ||
123 | * | ||
124 | * Must hold base->lock when calling. | ||
125 | */ | ||
126 | static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) | ||
127 | { | ||
128 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
129 | if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { | ||
130 | hrtimer_try_to_cancel(&base->timer); | ||
131 | hrtimer_start(&base->timer, alarm->node.expires, | ||
132 | HRTIMER_MODE_ABS); | ||
133 | } | ||
134 | } | ||
135 | |||
136 | /** | ||
137 | * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue | ||
138 | * @base: pointer to the base where the timer is running | ||
139 | * @alarm: pointer to alarm being removed | ||
140 | * | ||
141 | * Removes alarm from an alarm_base timerqueue and if necessary sets | ||
142 | * a new timer to run. | ||
143 | * | ||
144 | * Must hold base->lock when calling. | ||
145 | */ | ||
146 | static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) | ||
147 | { | ||
148 | struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); | ||
149 | |||
150 | timerqueue_del(&base->timerqueue, &alarm->node); | ||
151 | if (next == &alarm->node) { | ||
152 | hrtimer_try_to_cancel(&base->timer); | ||
153 | next = timerqueue_getnext(&base->timerqueue); | ||
154 | if (!next) | ||
155 | return; | ||
156 | hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); | ||
157 | } | ||
158 | } | ||
159 | |||
160 | |||
161 | /** | ||
162 | * alarmtimer_fired - Handles alarm hrtimer being fired. | ||
163 | * @timer: pointer to hrtimer being run | ||
164 | * | ||
165 | * When an alarm timer fires, this runs through the timerqueue to | ||
166 | * see which alarms expired, and runs those. If there are more alarm | ||
167 | * timers queued for the future, we set the hrtimer to fire | ||
168 | * when the next future alarm timer expires. | ||
169 | */ | ||
170 | static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | ||
171 | { | ||
172 | struct alarm_base *base = container_of(timer, struct alarm_base, timer); | ||
173 | struct timerqueue_node *next; | ||
174 | unsigned long flags; | ||
175 | ktime_t now; | ||
176 | int ret = HRTIMER_NORESTART; | ||
177 | |||
178 | spin_lock_irqsave(&base->lock, flags); | ||
179 | now = base->gettime(); | ||
180 | while ((next = timerqueue_getnext(&base->timerqueue))) { | ||
181 | struct alarm *alarm; | ||
182 | ktime_t expired = next->expires; | ||
183 | |||
184 | if (expired.tv64 >= now.tv64) | ||
185 | break; | ||
186 | |||
187 | alarm = container_of(next, struct alarm, node); | ||
188 | |||
189 | timerqueue_del(&base->timerqueue, &alarm->node); | ||
190 | alarm->enabled = 0; | ||
191 | /* Re-add periodic timers */ | ||
192 | if (alarm->period.tv64) { | ||
193 | alarm->node.expires = ktime_add(expired, alarm->period); | ||
194 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
195 | alarm->enabled = 1; | ||
196 | } | ||
197 | spin_unlock_irqrestore(&base->lock, flags); | ||
198 | if (alarm->function) | ||
199 | alarm->function(alarm); | ||
200 | spin_lock_irqsave(&base->lock, flags); | ||
201 | } | ||
202 | |||
203 | if (next) { | ||
204 | hrtimer_set_expires(&base->timer, next->expires); | ||
205 | ret = HRTIMER_RESTART; | ||
206 | } | ||
207 | spin_unlock_irqrestore(&base->lock, flags); | ||
208 | |||
209 | return ret; | ||
210 | |||
211 | } | ||
212 | |||
213 | #ifdef CONFIG_RTC_CLASS | ||
214 | /** | ||
215 | * alarmtimer_suspend - Suspend time callback | ||
216 | * @dev: unused | ||
217 | * @state: unused | ||
218 | * | ||
219 | * When we are going into suspend, we look through the bases | ||
220 | * to see which is the soonest timer to expire. We then | ||
221 | * set an rtc timer to fire that far into the future, which | ||
222 | * will wake us from suspend. | ||
223 | */ | ||
224 | static int alarmtimer_suspend(struct device *dev) | ||
225 | { | ||
226 | struct rtc_time tm; | ||
227 | ktime_t min, now; | ||
228 | unsigned long flags; | ||
229 | struct rtc_device *rtc; | ||
230 | int i; | ||
231 | |||
232 | spin_lock_irqsave(&freezer_delta_lock, flags); | ||
233 | min = freezer_delta; | ||
234 | freezer_delta = ktime_set(0, 0); | ||
235 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | ||
236 | |||
237 | rtc = rtcdev; | ||
238 | /* If we have no rtcdev, just return */ | ||
239 | if (!rtc) | ||
240 | return 0; | ||
241 | |||
242 | /* Find the soonest timer to expire*/ | ||
243 | for (i = 0; i < ALARM_NUMTYPE; i++) { | ||
244 | struct alarm_base *base = &alarm_bases[i]; | ||
245 | struct timerqueue_node *next; | ||
246 | ktime_t delta; | ||
247 | |||
248 | spin_lock_irqsave(&base->lock, flags); | ||
249 | next = timerqueue_getnext(&base->timerqueue); | ||
250 | spin_unlock_irqrestore(&base->lock, flags); | ||
251 | if (!next) | ||
252 | continue; | ||
253 | delta = ktime_sub(next->expires, base->gettime()); | ||
254 | if (!min.tv64 || (delta.tv64 < min.tv64)) | ||
255 | min = delta; | ||
256 | } | ||
257 | if (min.tv64 == 0) | ||
258 | return 0; | ||
259 | |||
260 | /* XXX - Should we enforce a minimum sleep time? */ | ||
261 | WARN_ON(min.tv64 < NSEC_PER_SEC); | ||
262 | |||
263 | /* Setup an rtc timer to fire that far in the future */ | ||
264 | rtc_timer_cancel(rtc, &rtctimer); | ||
265 | rtc_read_time(rtc, &tm); | ||
266 | now = rtc_tm_to_ktime(tm); | ||
267 | now = ktime_add(now, min); | ||
268 | |||
269 | rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); | ||
270 | |||
271 | return 0; | ||
272 | } | ||
273 | #else | ||
274 | static int alarmtimer_suspend(struct device *dev) | ||
275 | { | ||
276 | return 0; | ||
277 | } | ||
278 | #endif | ||
279 | |||
280 | static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) | ||
281 | { | ||
282 | ktime_t delta; | ||
283 | unsigned long flags; | ||
284 | struct alarm_base *base = &alarm_bases[type]; | ||
285 | |||
286 | delta = ktime_sub(absexp, base->gettime()); | ||
287 | |||
288 | spin_lock_irqsave(&freezer_delta_lock, flags); | ||
289 | if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) | ||
290 | freezer_delta = delta; | ||
291 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | ||
292 | } | ||
293 | |||
294 | |||
295 | /** | ||
296 | * alarm_init - Initialize an alarm structure | ||
297 | * @alarm: ptr to alarm to be initialized | ||
298 | * @type: the type of the alarm | ||
299 | * @function: callback that is run when the alarm fires | ||
300 | */ | ||
301 | void alarm_init(struct alarm *alarm, enum alarmtimer_type type, | ||
302 | void (*function)(struct alarm *)) | ||
303 | { | ||
304 | timerqueue_init(&alarm->node); | ||
305 | alarm->period = ktime_set(0, 0); | ||
306 | alarm->function = function; | ||
307 | alarm->type = type; | ||
308 | alarm->enabled = 0; | ||
309 | } | ||
310 | |||
311 | /** | ||
312 | * alarm_start - Sets an alarm to fire | ||
313 | * @alarm: ptr to alarm to set | ||
314 | * @start: time to run the alarm | ||
315 | * @period: period at which the alarm will recur | ||
316 | */ | ||
317 | void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) | ||
318 | { | ||
319 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
320 | unsigned long flags; | ||
321 | |||
322 | spin_lock_irqsave(&base->lock, flags); | ||
323 | if (alarm->enabled) | ||
324 | alarmtimer_remove(base, alarm); | ||
325 | alarm->node.expires = start; | ||
326 | alarm->period = period; | ||
327 | alarmtimer_enqueue(base, alarm); | ||
328 | alarm->enabled = 1; | ||
329 | spin_unlock_irqrestore(&base->lock, flags); | ||
330 | } | ||
331 | |||
332 | /** | ||
333 | * alarm_cancel - Tries to cancel an alarm timer | ||
334 | * @alarm: ptr to alarm to be canceled | ||
335 | */ | ||
336 | void alarm_cancel(struct alarm *alarm) | ||
337 | { | ||
338 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
339 | unsigned long flags; | ||
340 | |||
341 | spin_lock_irqsave(&base->lock, flags); | ||
342 | if (alarm->enabled) | ||
343 | alarmtimer_remove(base, alarm); | ||
344 | alarm->enabled = 0; | ||
345 | spin_unlock_irqrestore(&base->lock, flags); | ||
346 | } | ||
347 | |||
348 | |||
349 | /** | ||
350 | * clock2alarm - helper that converts from clockid to alarmtypes | ||
351 | * @clockid: clockid. | ||
352 | */ | ||
353 | static enum alarmtimer_type clock2alarm(clockid_t clockid) | ||
354 | { | ||
355 | if (clockid == CLOCK_REALTIME_ALARM) | ||
356 | return ALARM_REALTIME; | ||
357 | if (clockid == CLOCK_BOOTTIME_ALARM) | ||
358 | return ALARM_BOOTTIME; | ||
359 | return -1; | ||
360 | } | ||
361 | |||
362 | /** | ||
363 | * alarm_handle_timer - Callback for posix timers | ||
364 | * @alarm: alarm that fired | ||
365 | * | ||
366 | * Posix timer callback for expired alarm timers. | ||
367 | */ | ||
368 | static void alarm_handle_timer(struct alarm *alarm) | ||
369 | { | ||
370 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, | ||
371 | it.alarmtimer); | ||
372 | if (posix_timer_event(ptr, 0) != 0) | ||
373 | ptr->it_overrun++; | ||
374 | } | ||
375 | |||
376 | /** | ||
377 | * alarm_clock_getres - posix getres interface | ||
378 | * @which_clock: clockid | ||
379 | * @tp: timespec to fill | ||
380 | * | ||
381 | * Returns the granularity of underlying alarm base clock | ||
382 | */ | ||
383 | static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) | ||
384 | { | ||
385 | clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; | ||
386 | |||
387 | if (!alarmtimer_get_rtcdev()) | ||
388 | return -ENOTSUPP; | ||
389 | |||
390 | return hrtimer_get_res(baseid, tp); | ||
391 | } | ||
392 | |||
393 | /** | ||
394 | * alarm_clock_get - posix clock_get interface | ||
395 | * @which_clock: clockid | ||
396 | * @tp: timespec to fill. | ||
397 | * | ||
398 | * Provides the underlying alarm base time. | ||
399 | */ | ||
400 | static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | ||
401 | { | ||
402 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; | ||
403 | |||
404 | if (!alarmtimer_get_rtcdev()) | ||
405 | return -ENOTSUPP; | ||
406 | |||
407 | *tp = ktime_to_timespec(base->gettime()); | ||
408 | return 0; | ||
409 | } | ||
410 | |||
411 | /** | ||
412 | * alarm_timer_create - posix timer_create interface | ||
413 | * @new_timer: k_itimer pointer to manage | ||
414 | * | ||
415 | * Initializes the k_itimer structure. | ||
416 | */ | ||
417 | static int alarm_timer_create(struct k_itimer *new_timer) | ||
418 | { | ||
419 | enum alarmtimer_type type; | ||
420 | struct alarm_base *base; | ||
421 | |||
422 | if (!alarmtimer_get_rtcdev()) | ||
423 | return -ENOTSUPP; | ||
424 | |||
425 | if (!capable(CAP_WAKE_ALARM)) | ||
426 | return -EPERM; | ||
427 | |||
428 | type = clock2alarm(new_timer->it_clock); | ||
429 | base = &alarm_bases[type]; | ||
430 | alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); | ||
431 | return 0; | ||
432 | } | ||
433 | |||
434 | /** | ||
435 | * alarm_timer_get - posix timer_get interface | ||
436 | * @new_timer: k_itimer pointer | ||
437 | * @cur_setting: itimerspec data to fill | ||
438 | * | ||
439 | * Copies the itimerspec data out from the k_itimer | ||
440 | */ | ||
441 | static void alarm_timer_get(struct k_itimer *timr, | ||
442 | struct itimerspec *cur_setting) | ||
443 | { | ||
444 | cur_setting->it_interval = | ||
445 | ktime_to_timespec(timr->it.alarmtimer.period); | ||
446 | cur_setting->it_value = | ||
447 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | ||
448 | return; | ||
449 | } | ||
450 | |||
451 | /** | ||
452 | * alarm_timer_del - posix timer_del interface | ||
453 | * @timr: k_itimer pointer to be deleted | ||
454 | * | ||
455 | * Cancels any programmed alarms for the given timer. | ||
456 | */ | ||
457 | static int alarm_timer_del(struct k_itimer *timr) | ||
458 | { | ||
459 | if (!rtcdev) | ||
460 | return -ENOTSUPP; | ||
461 | |||
462 | alarm_cancel(&timr->it.alarmtimer); | ||
463 | return 0; | ||
464 | } | ||
465 | |||
466 | /** | ||
467 | * alarm_timer_set - posix timer_set interface | ||
468 | * @timr: k_itimer pointer to be set | ||
469 | * @flags: timer flags | ||
470 | * @new_setting: itimerspec to be used | ||
471 | * @old_setting: itimerspec being replaced | ||
472 | * | ||
473 | * Sets the timer to new_setting, and starts the timer. | ||
474 | */ | ||
475 | static int alarm_timer_set(struct k_itimer *timr, int flags, | ||
476 | struct itimerspec *new_setting, | ||
477 | struct itimerspec *old_setting) | ||
478 | { | ||
479 | if (!rtcdev) | ||
480 | return -ENOTSUPP; | ||
481 | |||
482 | /* Save old values */ | ||
483 | old_setting->it_interval = | ||
484 | ktime_to_timespec(timr->it.alarmtimer.period); | ||
485 | old_setting->it_value = | ||
486 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | ||
487 | |||
488 | /* If the timer was already set, cancel it */ | ||
489 | alarm_cancel(&timr->it.alarmtimer); | ||
490 | |||
491 | /* start the timer */ | ||
492 | alarm_start(&timr->it.alarmtimer, | ||
493 | timespec_to_ktime(new_setting->it_value), | ||
494 | timespec_to_ktime(new_setting->it_interval)); | ||
495 | return 0; | ||
496 | } | ||
497 | |||
498 | /** | ||
499 | * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep | ||
500 | * @alarm: ptr to alarm that fired | ||
501 | * | ||
502 | * Wakes up the task that set the alarmtimer | ||
503 | */ | ||
504 | static void alarmtimer_nsleep_wakeup(struct alarm *alarm) | ||
505 | { | ||
506 | struct task_struct *task = (struct task_struct *)alarm->data; | ||
507 | |||
508 | alarm->data = NULL; | ||
509 | if (task) | ||
510 | wake_up_process(task); | ||
511 | } | ||
512 | |||
513 | /** | ||
514 | * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation | ||
515 | * @alarm: ptr to alarmtimer | ||
516 | * @absexp: absolute expiration time | ||
517 | * | ||
518 | * Sets the alarm timer and sleeps until it is fired or interrupted. | ||
519 | */ | ||
520 | static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) | ||
521 | { | ||
522 | alarm->data = (void *)current; | ||
523 | do { | ||
524 | set_current_state(TASK_INTERRUPTIBLE); | ||
525 | alarm_start(alarm, absexp, ktime_set(0, 0)); | ||
526 | if (likely(alarm->data)) | ||
527 | schedule(); | ||
528 | |||
529 | alarm_cancel(alarm); | ||
530 | } while (alarm->data && !signal_pending(current)); | ||
531 | |||
532 | __set_current_state(TASK_RUNNING); | ||
533 | |||
534 | return (alarm->data == NULL); | ||
535 | } | ||
536 | |||
537 | |||
538 | /** | ||
539 | * update_rmtp - Update remaining timespec value | ||
540 | * @exp: expiration time | ||
541 | * @type: timer type | ||
542 | * @rmtp: user pointer to remaining timespec value | ||
543 | * | ||
544 | * Helper function that fills in rmtp value with time between | ||
545 | * now and the exp value | ||
546 | */ | ||
547 | static int update_rmtp(ktime_t exp, enum alarmtimer_type type, | ||
548 | struct timespec __user *rmtp) | ||
549 | { | ||
550 | struct timespec rmt; | ||
551 | ktime_t rem; | ||
552 | |||
553 | rem = ktime_sub(exp, alarm_bases[type].gettime()); | ||
554 | |||
555 | if (rem.tv64 <= 0) | ||
556 | return 0; | ||
557 | rmt = ktime_to_timespec(rem); | ||
558 | |||
559 | if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) | ||
560 | return -EFAULT; | ||
561 | |||
562 | return 1; | ||
563 | |||
564 | } | ||
565 | |||
566 | /** | ||
567 | * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep | ||
568 | * @restart: ptr to restart block | ||
569 | * | ||
570 | * Handles restarted clock_nanosleep calls | ||
571 | */ | ||
572 | static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) | ||
573 | { | ||
574 | enum alarmtimer_type type = restart->nanosleep.clockid; | ||
575 | ktime_t exp; | ||
576 | struct timespec __user *rmtp; | ||
577 | struct alarm alarm; | ||
578 | int ret = 0; | ||
579 | |||
580 | exp.tv64 = restart->nanosleep.expires; | ||
581 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); | ||
582 | |||
583 | if (alarmtimer_do_nsleep(&alarm, exp)) | ||
584 | goto out; | ||
585 | |||
586 | if (freezing(current)) | ||
587 | alarmtimer_freezerset(exp, type); | ||
588 | |||
589 | rmtp = restart->nanosleep.rmtp; | ||
590 | if (rmtp) { | ||
591 | ret = update_rmtp(exp, type, rmtp); | ||
592 | if (ret <= 0) | ||
593 | goto out; | ||
594 | } | ||
595 | |||
596 | |||
597 | /* The other values in restart are already filled in */ | ||
598 | ret = -ERESTART_RESTARTBLOCK; | ||
599 | out: | ||
600 | return ret; | ||
601 | } | ||
602 | |||
603 | /** | ||
604 | * alarm_timer_nsleep - alarmtimer nanosleep | ||
605 | * @which_clock: clockid | ||
606 | * @flags: determines abstime or relative | ||
607 | * @tsreq: requested sleep time (abs or rel) | ||
608 | * @rmtp: remaining sleep time saved | ||
609 | * | ||
610 | * Handles clock_nanosleep calls against _ALARM clockids | ||
611 | */ | ||
612 | static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | ||
613 | struct timespec *tsreq, struct timespec __user *rmtp) | ||
614 | { | ||
615 | enum alarmtimer_type type = clock2alarm(which_clock); | ||
616 | struct alarm alarm; | ||
617 | ktime_t exp; | ||
618 | int ret = 0; | ||
619 | struct restart_block *restart; | ||
620 | |||
621 | if (!alarmtimer_get_rtcdev()) | ||
622 | return -ENOTSUPP; | ||
623 | |||
624 | if (!capable(CAP_WAKE_ALARM)) | ||
625 | return -EPERM; | ||
626 | |||
627 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); | ||
628 | |||
629 | exp = timespec_to_ktime(*tsreq); | ||
630 | /* Convert (if necessary) to absolute time */ | ||
631 | if (flags != TIMER_ABSTIME) { | ||
632 | ktime_t now = alarm_bases[type].gettime(); | ||
633 | exp = ktime_add(now, exp); | ||
634 | } | ||
635 | |||
636 | if (alarmtimer_do_nsleep(&alarm, exp)) | ||
637 | goto out; | ||
638 | |||
639 | if (freezing(current)) | ||
640 | alarmtimer_freezerset(exp, type); | ||
641 | |||
642 | /* abs timers don't set remaining time or restart */ | ||
643 | if (flags == TIMER_ABSTIME) { | ||
644 | ret = -ERESTARTNOHAND; | ||
645 | goto out; | ||
646 | } | ||
647 | |||
648 | if (rmtp) { | ||
649 | ret = update_rmtp(exp, type, rmtp); | ||
650 | if (ret <= 0) | ||
651 | goto out; | ||
652 | } | ||
653 | |||
654 | restart = ¤t_thread_info()->restart_block; | ||
655 | restart->fn = alarm_timer_nsleep_restart; | ||
656 | restart->nanosleep.clockid = type; | ||
657 | restart->nanosleep.expires = exp.tv64; | ||
658 | restart->nanosleep.rmtp = rmtp; | ||
659 | ret = -ERESTART_RESTARTBLOCK; | ||
660 | |||
661 | out: | ||
662 | return ret; | ||
663 | } | ||
664 | |||
665 | |||
666 | /* Suspend hook structures */ | ||
667 | static const struct dev_pm_ops alarmtimer_pm_ops = { | ||
668 | .suspend = alarmtimer_suspend, | ||
669 | }; | ||
670 | |||
671 | static struct platform_driver alarmtimer_driver = { | ||
672 | .driver = { | ||
673 | .name = "alarmtimer", | ||
674 | .pm = &alarmtimer_pm_ops, | ||
675 | } | ||
676 | }; | ||
677 | |||
678 | /** | ||
679 | * alarmtimer_init - Initialize alarm timer code | ||
680 | * | ||
681 | * This function initializes the alarm bases and registers | ||
682 | * the posix clock ids. | ||
683 | */ | ||
684 | static int __init alarmtimer_init(void) | ||
685 | { | ||
686 | int error = 0; | ||
687 | int i; | ||
688 | struct k_clock alarm_clock = { | ||
689 | .clock_getres = alarm_clock_getres, | ||
690 | .clock_get = alarm_clock_get, | ||
691 | .timer_create = alarm_timer_create, | ||
692 | .timer_set = alarm_timer_set, | ||
693 | .timer_del = alarm_timer_del, | ||
694 | .timer_get = alarm_timer_get, | ||
695 | .nsleep = alarm_timer_nsleep, | ||
696 | }; | ||
697 | |||
698 | posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); | ||
699 | posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); | ||
700 | |||
701 | /* Initialize alarm bases */ | ||
702 | alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; | ||
703 | alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; | ||
704 | alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; | ||
705 | alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; | ||
706 | for (i = 0; i < ALARM_NUMTYPE; i++) { | ||
707 | timerqueue_init_head(&alarm_bases[i].timerqueue); | ||
708 | spin_lock_init(&alarm_bases[i].lock); | ||
709 | hrtimer_init(&alarm_bases[i].timer, | ||
710 | alarm_bases[i].base_clockid, | ||
711 | HRTIMER_MODE_ABS); | ||
712 | alarm_bases[i].timer.function = alarmtimer_fired; | ||
713 | } | ||
714 | error = platform_driver_register(&alarmtimer_driver); | ||
715 | platform_device_register_simple("alarmtimer", -1, NULL, 0); | ||
716 | |||
717 | return error; | ||
718 | } | ||
719 | device_initcall(alarmtimer_init); | ||
720 | |||
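For illustration, a minimal userspace sketch (not part of this patch) of how the _ALARM clockids registered above can be exercised through clock_nanosleep(); the CLOCK_BOOTTIME_ALARM constant is assumed to match this kernel's <linux/time.h>, and CAP_WAKE_ALARM is required by alarm_timer_nsleep().

/* Hypothetical userspace sketch: a relative 30 s sleep on CLOCK_BOOTTIME_ALARM,
 * which alarm_timer_nsleep() converts to an absolute alarm that can wake the
 * system from suspend. */
#include <stdio.h>
#include <string.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME_ALARM
#define CLOCK_BOOTTIME_ALARM 9	/* assumed value from <linux/time.h> */
#endif

int main(void)
{
	struct timespec req = { .tv_sec = 30, .tv_nsec = 0 };
	struct timespec rem;
	int err = clock_nanosleep(CLOCK_BOOTTIME_ALARM, 0, &req, &rem);

	if (err)
		fprintf(stderr, "clock_nanosleep: %s\n", strerror(err));
	return err ? 1 : 0;
}

Without CAP_WAKE_ALARM the call fails with EPERM, and without a usable RTC it fails with ENOTSUPP, matching the checks in alarm_timer_nsleep().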
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d7395fdfb9f3..e4c699dfa4e8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/notifier.h> | 18 | #include <linux/notifier.h> |
19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
20 | #include <linux/sysdev.h> | 20 | #include <linux/sysdev.h> |
21 | #include <linux/tick.h> | ||
22 | 21 | ||
23 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
24 | 23 | ||
@@ -183,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
183 | unsigned long flags; | 182 | unsigned long flags; |
184 | 183 | ||
185 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 184 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); |
186 | BUG_ON(!dev->cpumask); | 185 | if (!dev->cpumask) { |
186 | WARN_ON(num_possible_cpus() > 1); | ||
187 | dev->cpumask = cpumask_of(smp_processor_id()); | ||
188 | } | ||
187 | 189 | ||
188 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 190 | raw_spin_lock_irqsave(&clockevents_lock, flags); |
189 | 191 | ||
@@ -195,6 +197,70 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
195 | } | 197 | } |
196 | EXPORT_SYMBOL_GPL(clockevents_register_device); | 198 | EXPORT_SYMBOL_GPL(clockevents_register_device); |
197 | 199 | ||
200 | static void clockevents_config(struct clock_event_device *dev, | ||
201 | u32 freq) | ||
202 | { | ||
203 | u64 sec; | ||
204 | |||
205 | if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
206 | return; | ||
207 | |||
208 | /* | ||
209 | * Calculate the maximum number of seconds we can sleep. Limit | ||
210 | * to 10 minutes for hardware which can program more than | ||
211 | * 32bit ticks so we still get reasonable conversion values. | ||
212 | */ | ||
213 | sec = dev->max_delta_ticks; | ||
214 | do_div(sec, freq); | ||
215 | if (!sec) | ||
216 | sec = 1; | ||
217 | else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) | ||
218 | sec = 600; | ||
219 | |||
220 | clockevents_calc_mult_shift(dev, freq, sec); | ||
221 | dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); | ||
222 | dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); | ||
223 | } | ||
224 | |||
225 | /** | ||
226 | * clockevents_config_and_register - Configure and register a clock event device | ||
227 | * @dev: device to register | ||
228 | * @freq: The clock frequency | ||
229 | * @min_delta: The minimum clock ticks to program in oneshot mode | ||
230 | * @max_delta: The maximum clock ticks to program in oneshot mode | ||
231 | * | ||
232 | * min/max_delta can be 0 for devices which do not support oneshot mode. | ||
233 | */ | ||
234 | void clockevents_config_and_register(struct clock_event_device *dev, | ||
235 | u32 freq, unsigned long min_delta, | ||
236 | unsigned long max_delta) | ||
237 | { | ||
238 | dev->min_delta_ticks = min_delta; | ||
239 | dev->max_delta_ticks = max_delta; | ||
240 | clockevents_config(dev, freq); | ||
241 | clockevents_register_device(dev); | ||
242 | } | ||
243 | |||
244 | /** | ||
245 | * clockevents_update_freq - Update frequency and reprogram a clock event device. | ||
246 | * @dev: device to modify | ||
247 | * @freq: new device frequency | ||
248 | * | ||
249 | * Reconfigure and reprogram a clock event device in oneshot | ||
250 | * mode. Must be called on the cpu for which the device delivers per | ||
251 | * cpu timer events with interrupts disabled! Returns 0 on success, | ||
252 | * -ETIME when the event is in the past. | ||
253 | */ | ||
254 | int clockevents_update_freq(struct clock_event_device *dev, u32 freq) | ||
255 | { | ||
256 | clockevents_config(dev, freq); | ||
257 | |||
258 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | ||
259 | return 0; | ||
260 | |||
261 | return clockevents_program_event(dev, dev->next_event, ktime_get()); | ||
262 | } | ||
263 | |||
198 | /* | 264 | /* |
199 | * Noop handler when we shut down an event device | 265 | * Noop handler when we shut down an event device |
200 | */ | 266 | */ |
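As a usage sketch only (device name, frequency and callbacks are invented), the new helper lets a timer driver hand over raw min/max tick limits and a frequency instead of precomputing mult/shift itself:

#include <linux/clockchips.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/smp.h>

/* Stub callbacks standing in for real hardware programming. */
static void example_set_mode(enum clock_event_mode mode,
			     struct clock_event_device *evt) { }
static int example_set_next_event(unsigned long delta,
				  struct clock_event_device *evt) { return 0; }

static struct clock_event_device example_evt = {
	.name		= "example-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	.set_mode	= example_set_mode,
	.set_next_event	= example_set_next_event,
};

static void __init example_timer_init(void)
{
	example_evt.cpumask = cpumask_of(smp_processor_id());
	/* 13 MHz clock, at least 0xf ticks, at most 0x7fffffff ticks per event */
	clockevents_config_and_register(&example_evt, 13000000, 0xf, 0x7fffffff);
}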
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7efa1b4b..e0980f0d9a0a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
113 | * @shift: pointer to shift variable | 113 | * @shift: pointer to shift variable |
114 | * @from: frequency to convert from | 114 | * @from: frequency to convert from |
115 | * @to: frequency to convert to | 115 | * @to: frequency to convert to |
116 | * @minsec: guaranteed runtime conversion range in seconds | 116 | * @maxsec: guaranteed runtime conversion range in seconds |
117 | * | 117 | * |
118 | * The function evaluates the shift/mult pair for the scaled math | 118 | * The function evaluates the shift/mult pair for the scaled math |
119 | * operations of clocksources and clockevents. | 119 | * operations of clocksources and clockevents. |
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
122 | * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock | 122 | * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock |
123 | * event @to is the counter frequency and @from is NSEC_PER_SEC. | 123 | * event @to is the counter frequency and @from is NSEC_PER_SEC. |
124 | * | 124 | * |
125 | * The @minsec conversion range argument controls the time frame in | 125 | * The @maxsec conversion range argument controls the time frame in |
126 | * seconds which must be covered by the runtime conversion with the | 126 | * seconds which must be covered by the runtime conversion with the |
127 | * calculated mult and shift factors. This guarantees that no 64bit | 127 | * calculated mult and shift factors. This guarantees that no 64bit |
128 | * overflow happens when the input value of the conversion is | 128 | * overflow happens when the input value of the conversion is |
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); | |||
131 | * factors. | 131 | * factors. |
132 | */ | 132 | */ |
133 | void | 133 | void |
134 | clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | 134 | clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) |
135 | { | 135 | { |
136 | u64 tmp; | 136 | u64 tmp; |
137 | u32 sft, sftacc= 32; | 137 | u32 sft, sftacc= 32; |
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | |||
140 | * Calculate the shift factor which is limiting the conversion | 140 | * Calculate the shift factor which is limiting the conversion |
141 | * range: | 141 | * range: |
142 | */ | 142 | */ |
143 | tmp = ((u64)minsec * from) >> 32; | 143 | tmp = ((u64)maxsec * from) >> 32; |
144 | while (tmp) { | 144 | while (tmp) { |
145 | tmp >>=1; | 145 | tmp >>=1; |
146 | sftacc--; | 146 | sftacc--; |
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) | |||
152 | */ | 152 | */ |
153 | for (sft = 32; sft > 0; sft--) { | 153 | for (sft = 32; sft > 0; sft--) { |
154 | tmp = (u64) to << sft; | 154 | tmp = (u64) to << sft; |
155 | tmp += from / 2; | ||
155 | do_div(tmp, from); | 156 | do_div(tmp, from); |
156 | if ((tmp >> sftacc) == 0) | 157 | if ((tmp >> sftacc) == 0) |
157 | break; | 158 | break; |
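The mult/shift pair that clocks_calc_mult_shift() produces is consumed by fixed-point conversions of the form (value * mult) >> shift; a worked example with assumed numbers:

#include <linux/types.h>

/* Worked example (illustrative figures): for a 19.2 MHz counter converted to
 * nanoseconds with shift = 24, the loop above settles on roughly
 *	mult = (NSEC_PER_SEC << 24) / 19200000 ~= 873813333
 * so one second's worth of cycles maps back to ~1e9 ns:
 *	(19200000 * 873813333ULL) >> 24 = 999999999
 */
static inline u64 example_cyc2ns(u64 cycles, u32 mult, u32 shift)
{
	return (cycles * mult) >> shift;
}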
@@ -184,7 +185,6 @@ static struct clocksource *watchdog; | |||
184 | static struct timer_list watchdog_timer; | 185 | static struct timer_list watchdog_timer; |
185 | static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); | 186 | static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); |
186 | static DEFINE_SPINLOCK(watchdog_lock); | 187 | static DEFINE_SPINLOCK(watchdog_lock); |
187 | static cycle_t watchdog_last; | ||
188 | static int watchdog_running; | 188 | static int watchdog_running; |
189 | 189 | ||
190 | static int clocksource_watchdog_kthread(void *data); | 190 | static int clocksource_watchdog_kthread(void *data); |
@@ -253,11 +253,6 @@ static void clocksource_watchdog(unsigned long data) | |||
253 | if (!watchdog_running) | 253 | if (!watchdog_running) |
254 | goto out; | 254 | goto out; |
255 | 255 | ||
256 | wdnow = watchdog->read(watchdog); | ||
257 | wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, | ||
258 | watchdog->mult, watchdog->shift); | ||
259 | watchdog_last = wdnow; | ||
260 | |||
261 | list_for_each_entry(cs, &watchdog_list, wd_list) { | 256 | list_for_each_entry(cs, &watchdog_list, wd_list) { |
262 | 257 | ||
263 | /* Clocksource already marked unstable? */ | 258 | /* Clocksource already marked unstable? */ |
@@ -267,19 +262,28 @@ static void clocksource_watchdog(unsigned long data) | |||
267 | continue; | 262 | continue; |
268 | } | 263 | } |
269 | 264 | ||
265 | local_irq_disable(); | ||
270 | csnow = cs->read(cs); | 266 | csnow = cs->read(cs); |
267 | wdnow = watchdog->read(watchdog); | ||
268 | local_irq_enable(); | ||
271 | 269 | ||
272 | /* Clocksource initialized ? */ | 270 | /* Clocksource initialized ? */ |
273 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { | 271 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { |
274 | cs->flags |= CLOCK_SOURCE_WATCHDOG; | 272 | cs->flags |= CLOCK_SOURCE_WATCHDOG; |
275 | cs->wd_last = csnow; | 273 | cs->wd_last = wdnow; |
274 | cs->cs_last = csnow; | ||
276 | continue; | 275 | continue; |
277 | } | 276 | } |
278 | 277 | ||
279 | /* Check the deviation from the watchdog clocksource. */ | 278 | wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, |
280 | cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & | 279 | watchdog->mult, watchdog->shift); |
280 | |||
281 | cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & | ||
281 | cs->mask, cs->mult, cs->shift); | 282 | cs->mask, cs->mult, cs->shift); |
282 | cs->wd_last = csnow; | 283 | cs->cs_last = csnow; |
284 | cs->wd_last = wdnow; | ||
285 | |||
286 | /* Check the deviation from the watchdog clocksource. */ | ||
283 | if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { | 287 | if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { |
284 | clocksource_unstable(cs, cs_nsec - wd_nsec); | 288 | clocksource_unstable(cs, cs_nsec - wd_nsec); |
285 | continue; | 289 | continue; |
@@ -317,7 +321,6 @@ static inline void clocksource_start_watchdog(void) | |||
317 | return; | 321 | return; |
318 | init_timer(&watchdog_timer); | 322 | init_timer(&watchdog_timer); |
319 | watchdog_timer.function = clocksource_watchdog; | 323 | watchdog_timer.function = clocksource_watchdog; |
320 | watchdog_last = watchdog->read(watchdog); | ||
321 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | 324 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; |
322 | add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); | 325 | add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); |
323 | watchdog_running = 1; | 326 | watchdog_running = 1; |
@@ -625,19 +628,6 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
625 | list_add(&cs->list, entry); | 628 | list_add(&cs->list, entry); |
626 | } | 629 | } |
627 | 630 | ||
628 | |||
629 | /* | ||
630 | * Maximum time we expect to go between ticks. This includes idle | ||
631 | * tickless time. It provides the trade off between selecting a | ||
632 | * mult/shift pair that is very precise but can only handle a short | ||
633 | * period of time, vs. a mult/shift pair that can handle long periods | ||
634 | * of time but isn't as precise. | ||
635 | * | ||
636 | * This is a subsystem constant, and actual hardware limitations | ||
637 | * may override it (ie: clocksources that wrap every 3 seconds). | ||
638 | */ | ||
639 | #define MAX_UPDATE_LENGTH 5 /* Seconds */ | ||
640 | |||
641 | /** | 631 | /** |
642 | * __clocksource_updatefreq_scale - Used update clocksource with new freq | 632 | * __clocksource_updatefreq_scale - Used update clocksource with new freq |
643 | * @t: clocksource to be registered | 633 | * @t: clocksource to be registered |
@@ -651,15 +641,28 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
651 | */ | 641 | */ |
652 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 642 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
653 | { | 643 | { |
644 | u64 sec; | ||
645 | |||
654 | /* | 646 | /* |
655 | * Ideally we want to use some of the limits used in | 647 | * Calc the maximum number of seconds which we can run before |
656 | * clocksource_max_deferment, to provide a more informed | 648 | * wrapping around. For clocksources which have a mask > 32bit |
657 | * MAX_UPDATE_LENGTH. But for now this just gets the | 649 | * we need to limit the max sleep time to have a good |
658 | * register interface working properly. | 650 | * conversion precision. 10 minutes is still a reasonable |
651 | * amount. That results in a shift value of 24 for a | ||
652 | * clocksource with mask >= 40bit and f >= 4GHz. That maps to | ||
653 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | ||
654 | * margin as we do in clocksource_max_deferment() | ||
659 | */ | 655 | */ |
656 | sec = (cs->mask - (cs->mask >> 5)); | ||
657 | do_div(sec, freq); | ||
658 | do_div(sec, scale); | ||
659 | if (!sec) | ||
660 | sec = 1; | ||
661 | else if (sec > 600 && cs->mask > UINT_MAX) | ||
662 | sec = 600; | ||
663 | |||
660 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 664 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, |
661 | NSEC_PER_SEC/scale, | 665 | NSEC_PER_SEC / scale, sec * scale); |
662 | MAX_UPDATE_LENGTH*scale); | ||
663 | cs->max_idle_ns = clocksource_max_deferment(cs); | 666 | cs->max_idle_ns = clocksource_max_deferment(cs); |
664 | } | 667 | } |
665 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 668 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); |
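To make the new sleep-range clamp concrete, a worked example with assumed hardware figures (not taken from the patch):

/*
 * A 32-bit clocksource (mask = 0xffffffff) running at 19.2 MHz, scale = 1:
 *
 *	sec  = 0xffffffff - (0xffffffff >> 5)	= 4160749568 cycles
 *	sec /= 19200000				= 216 seconds (truncated)
 *
 * 216 < 600, so the mult/shift pair is sized for a 216 s conversion range:
 *
 *	clocks_calc_mult_shift(&cs->mult, &cs->shift, 19200000, NSEC_PER_SEC, 216);
 *
 * A wider counter (mask > 32 bits) at a few GHz instead hits the 600 s clamp,
 * which is the "shift value of 24 ... ~0.06ppm" case the comment refers to.
 */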
@@ -678,14 +681,14 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | |||
678 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | 681 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) |
679 | { | 682 | { |
680 | 683 | ||
681 | /* Intialize mult/shift and max_idle_ns */ | 684 | /* Initialize mult/shift and max_idle_ns */ |
682 | __clocksource_updatefreq_scale(cs, scale, freq); | 685 | __clocksource_updatefreq_scale(cs, scale, freq); |
683 | 686 | ||
684 | /* Add clocksource to the clcoksource list */ | 687 | /* Add clocksource to the clcoksource list */ |
685 | mutex_lock(&clocksource_mutex); | 688 | mutex_lock(&clocksource_mutex); |
686 | clocksource_enqueue(cs); | 689 | clocksource_enqueue(cs); |
687 | clocksource_select(); | ||
688 | clocksource_enqueue_watchdog(cs); | 690 | clocksource_enqueue_watchdog(cs); |
691 | clocksource_select(); | ||
689 | mutex_unlock(&clocksource_mutex); | 692 | mutex_unlock(&clocksource_mutex); |
690 | return 0; | 693 | return 0; |
691 | } | 694 | } |
@@ -705,8 +708,8 @@ int clocksource_register(struct clocksource *cs) | |||
705 | 708 | ||
706 | mutex_lock(&clocksource_mutex); | 709 | mutex_lock(&clocksource_mutex); |
707 | clocksource_enqueue(cs); | 710 | clocksource_enqueue(cs); |
708 | clocksource_select(); | ||
709 | clocksource_enqueue_watchdog(cs); | 711 | clocksource_enqueue_watchdog(cs); |
712 | clocksource_select(); | ||
710 | mutex_unlock(&clocksource_mutex); | 713 | mutex_unlock(&clocksource_mutex); |
711 | return 0; | 714 | return 0; |
712 | } | 715 | } |
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a8456909..a470154e0408 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -22,8 +22,11 @@ | |||
22 | ************************************************************************/ | 22 | ************************************************************************/ |
23 | #include <linux/clocksource.h> | 23 | #include <linux/clocksource.h> |
24 | #include <linux/jiffies.h> | 24 | #include <linux/jiffies.h> |
25 | #include <linux/module.h> | ||
25 | #include <linux/init.h> | 26 | #include <linux/init.h> |
26 | 27 | ||
28 | #include "tick-internal.h" | ||
29 | |||
27 | /* The Jiffies based clocksource is the lowest common | 30 | /* The Jiffies based clocksource is the lowest common |
28 | * denominator clock source which should function on | 31 | * denominator clock source which should function on |
29 | * all systems. It has the same coarse resolution as | 32 | * all systems. It has the same coarse resolution as |
@@ -31,7 +34,7 @@ | |||
31 | * inaccuracies caused by missed or lost timer | 34 | * inaccuracies caused by missed or lost timer |
32 | * interrupts and the inability for the timer | 35 | * interrupts and the inability for the timer |
33 | * interrupt hardware to accuratly tick at the | 36 | * interrupt hardware to accuratly tick at the |
34 | * requested HZ value. It is also not reccomended | 37 | * requested HZ value. It is also not recommended |
35 | * for "tick-less" systems. | 38 | * for "tick-less" systems. |
36 | */ | 39 | */ |
37 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) | 40 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) |
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = { | |||
64 | .shift = JIFFIES_SHIFT, | 67 | .shift = JIFFIES_SHIFT, |
65 | }; | 68 | }; |
66 | 69 | ||
70 | #if (BITS_PER_LONG < 64) | ||
71 | u64 get_jiffies_64(void) | ||
72 | { | ||
73 | unsigned long seq; | ||
74 | u64 ret; | ||
75 | |||
76 | do { | ||
77 | seq = read_seqbegin(&xtime_lock); | ||
78 | ret = jiffies_64; | ||
79 | } while (read_seqretry(&xtime_lock, seq)); | ||
80 | return ret; | ||
81 | } | ||
82 | EXPORT_SYMBOL(get_jiffies_64); | ||
83 | #endif | ||
84 | |||
85 | EXPORT_SYMBOL(jiffies); | ||
86 | |||
67 | static int __init init_jiffies_clocksource(void) | 87 | static int __init init_jiffies_clocksource(void) |
68 | { | 88 | { |
69 | return clocksource_register(&clocksource_jiffies); | 89 | return clocksource_register(&clocksource_jiffies); |
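A small illustrative caller (not from this patch): since jiffies_64 effectively never wraps, deadlines kept as u64 can lean on get_jiffies_64() plus the 64-bit comparison helpers.

#include <linux/jiffies.h>
#include <linux/types.h>

/* On 32-bit kernels a plain load of jiffies_64 could tear; get_jiffies_64()
 * retries under xtime_lock as shown above, so this comparison is safe. */
static int example_deadline_passed(u64 deadline)
{
	return time_after64(get_jiffies_64(), deadline);
}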
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c63116863a80..f6117a4c7cb8 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -14,6 +14,9 @@ | |||
14 | #include <linux/timex.h> | 14 | #include <linux/timex.h> |
15 | #include <linux/time.h> | 15 | #include <linux/time.h> |
16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
17 | #include <linux/module.h> | ||
18 | |||
19 | #include "tick-internal.h" | ||
17 | 20 | ||
18 | /* | 21 | /* |
19 | * NTP timekeeping variables: | 22 | * NTP timekeeping variables: |
@@ -74,6 +77,162 @@ static long time_adjust; | |||
74 | /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ | 77 | /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ |
75 | static s64 ntp_tick_adj; | 78 | static s64 ntp_tick_adj; |
76 | 79 | ||
80 | #ifdef CONFIG_NTP_PPS | ||
81 | |||
82 | /* | ||
83 | * The following variables are used when a pulse-per-second (PPS) signal | ||
84 | * is available. They establish the engineering parameters of the clock | ||
85 | * discipline loop when controlled by the PPS signal. | ||
86 | */ | ||
87 | #define PPS_VALID 10 /* PPS signal watchdog max (s) */ | ||
88 | #define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ | ||
89 | #define PPS_INTMIN 2 /* min freq interval (s) (shift) */ | ||
90 | #define PPS_INTMAX 8 /* max freq interval (s) (shift) */ | ||
91 | #define PPS_INTCOUNT 4 /* number of consecutive good intervals to | ||
92 | increase pps_shift or consecutive bad | ||
93 | intervals to decrease it */ | ||
94 | #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ | ||
95 | |||
96 | static int pps_valid; /* signal watchdog counter */ | ||
97 | static long pps_tf[3]; /* phase median filter */ | ||
98 | static long pps_jitter; /* current jitter (ns) */ | ||
99 | static struct timespec pps_fbase; /* beginning of the last freq interval */ | ||
100 | static int pps_shift; /* current interval duration (s) (shift) */ | ||
101 | static int pps_intcnt; /* interval counter */ | ||
102 | static s64 pps_freq; /* frequency offset (scaled ns/s) */ | ||
103 | static long pps_stabil; /* current stability (scaled ns/s) */ | ||
104 | |||
105 | /* | ||
106 | * PPS signal quality monitors | ||
107 | */ | ||
108 | static long pps_calcnt; /* calibration intervals */ | ||
109 | static long pps_jitcnt; /* jitter limit exceeded */ | ||
110 | static long pps_stbcnt; /* stability limit exceeded */ | ||
111 | static long pps_errcnt; /* calibration errors */ | ||
112 | |||
113 | |||
114 | /* PPS kernel consumer compensates the whole phase error immediately. | ||
115 | * Otherwise, reduce the offset by a fixed factor times the time constant. | ||
116 | */ | ||
117 | static inline s64 ntp_offset_chunk(s64 offset) | ||
118 | { | ||
119 | if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) | ||
120 | return offset; | ||
121 | else | ||
122 | return shift_right(offset, SHIFT_PLL + time_constant); | ||
123 | } | ||
124 | |||
125 | static inline void pps_reset_freq_interval(void) | ||
126 | { | ||
127 | /* the PPS calibration interval may end | ||
128 | surprisingly early */ | ||
129 | pps_shift = PPS_INTMIN; | ||
130 | pps_intcnt = 0; | ||
131 | } | ||
132 | |||
133 | /** | ||
134 | * pps_clear - Clears the PPS state variables | ||
135 | * | ||
136 | * Must be called while holding a write on the xtime_lock | ||
137 | */ | ||
138 | static inline void pps_clear(void) | ||
139 | { | ||
140 | pps_reset_freq_interval(); | ||
141 | pps_tf[0] = 0; | ||
142 | pps_tf[1] = 0; | ||
143 | pps_tf[2] = 0; | ||
144 | pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; | ||
145 | pps_freq = 0; | ||
146 | } | ||
147 | |||
148 | /* Decrease pps_valid to indicate that another second has passed since | ||
149 | * the last PPS signal. When it reaches 0, indicate that PPS signal is | ||
150 | * missing. | ||
151 | * | ||
152 | * Must be called while holding a write on the xtime_lock | ||
153 | */ | ||
154 | static inline void pps_dec_valid(void) | ||
155 | { | ||
156 | if (pps_valid > 0) | ||
157 | pps_valid--; | ||
158 | else { | ||
159 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | ||
160 | STA_PPSWANDER | STA_PPSERROR); | ||
161 | pps_clear(); | ||
162 | } | ||
163 | } | ||
164 | |||
165 | static inline void pps_set_freq(s64 freq) | ||
166 | { | ||
167 | pps_freq = freq; | ||
168 | } | ||
169 | |||
170 | static inline int is_error_status(int status) | ||
171 | { | ||
172 | return (time_status & (STA_UNSYNC|STA_CLOCKERR)) | ||
173 | /* PPS signal lost when either PPS time or | ||
174 | * PPS frequency synchronization requested | ||
175 | */ | ||
176 | || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) | ||
177 | && !(time_status & STA_PPSSIGNAL)) | ||
178 | /* PPS jitter exceeded when | ||
179 | * PPS time synchronization requested */ | ||
180 | || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) | ||
181 | == (STA_PPSTIME|STA_PPSJITTER)) | ||
182 | /* PPS wander exceeded or calibration error when | ||
183 | * PPS frequency synchronization requested | ||
184 | */ | ||
185 | || ((time_status & STA_PPSFREQ) | ||
186 | && (time_status & (STA_PPSWANDER|STA_PPSERROR))); | ||
187 | } | ||
188 | |||
189 | static inline void pps_fill_timex(struct timex *txc) | ||
190 | { | ||
191 | txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * | ||
192 | PPM_SCALE_INV, NTP_SCALE_SHIFT); | ||
193 | txc->jitter = pps_jitter; | ||
194 | if (!(time_status & STA_NANO)) | ||
195 | txc->jitter /= NSEC_PER_USEC; | ||
196 | txc->shift = pps_shift; | ||
197 | txc->stabil = pps_stabil; | ||
198 | txc->jitcnt = pps_jitcnt; | ||
199 | txc->calcnt = pps_calcnt; | ||
200 | txc->errcnt = pps_errcnt; | ||
201 | txc->stbcnt = pps_stbcnt; | ||
202 | } | ||
203 | |||
204 | #else /* !CONFIG_NTP_PPS */ | ||
205 | |||
206 | static inline s64 ntp_offset_chunk(s64 offset) | ||
207 | { | ||
208 | return shift_right(offset, SHIFT_PLL + time_constant); | ||
209 | } | ||
210 | |||
211 | static inline void pps_reset_freq_interval(void) {} | ||
212 | static inline void pps_clear(void) {} | ||
213 | static inline void pps_dec_valid(void) {} | ||
214 | static inline void pps_set_freq(s64 freq) {} | ||
215 | |||
216 | static inline int is_error_status(int status) | ||
217 | { | ||
218 | return status & (STA_UNSYNC|STA_CLOCKERR); | ||
219 | } | ||
220 | |||
221 | static inline void pps_fill_timex(struct timex *txc) | ||
222 | { | ||
223 | /* PPS is not implemented, so these are zero */ | ||
224 | txc->ppsfreq = 0; | ||
225 | txc->jitter = 0; | ||
226 | txc->shift = 0; | ||
227 | txc->stabil = 0; | ||
228 | txc->jitcnt = 0; | ||
229 | txc->calcnt = 0; | ||
230 | txc->errcnt = 0; | ||
231 | txc->stbcnt = 0; | ||
232 | } | ||
233 | |||
234 | #endif /* CONFIG_NTP_PPS */ | ||
235 | |||
77 | /* | 236 | /* |
78 | * NTP methods: | 237 | * NTP methods: |
79 | */ | 238 | */ |
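With CONFIG_NTP_PPS enabled, the counters filled in by pps_fill_timex() above become visible to userspace through adjtimex(); a minimal sketch, with error handling omitted:

/* Hypothetical userspace sketch: query PPS statistics via adjtimex(). */
#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* read-only query */
	int state = adjtimex(&tx);

	printf("state=%d jitter=%ld stabil=%ld shift=%d calcnt=%ld errcnt=%ld\n",
	       state, tx.jitter, tx.stabil, tx.shift, tx.calcnt, tx.errcnt);
	return 0;
}

A return value of TIME_ERROR now also covers the PPS error conditions enumerated in is_error_status().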
@@ -149,10 +308,18 @@ static void ntp_update_offset(long offset) | |||
149 | time_reftime = get_seconds(); | 308 | time_reftime = get_seconds(); |
150 | 309 | ||
151 | offset64 = offset; | 310 | offset64 = offset; |
152 | freq_adj = (offset64 * secs) << | 311 | freq_adj = ntp_update_offset_fll(offset64, secs); |
153 | (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); | ||
154 | 312 | ||
155 | freq_adj += ntp_update_offset_fll(offset64, secs); | 313 | /* |
314 | * Clamp update interval to reduce PLL gain with low | ||
315 | * sampling rate (e.g. intermittent network connection) | ||
316 | * to avoid instability. | ||
317 | */ | ||
318 | if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) | ||
319 | secs = 1 << (SHIFT_PLL + 1 + time_constant); | ||
320 | |||
321 | freq_adj += (offset64 * secs) << | ||
322 | (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); | ||
156 | 323 | ||
157 | freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); | 324 | freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); |
158 | 325 | ||
@@ -177,6 +344,9 @@ void ntp_clear(void) | |||
177 | 344 | ||
178 | tick_length = tick_length_base; | 345 | tick_length = tick_length_base; |
179 | time_offset = 0; | 346 | time_offset = 0; |
347 | |||
348 | /* Clear PPS state variables */ | ||
349 | pps_clear(); | ||
180 | } | 350 | } |
181 | 351 | ||
182 | /* | 352 | /* |
@@ -242,16 +412,16 @@ void second_overflow(void) | |||
242 | time_status |= STA_UNSYNC; | 412 | time_status |= STA_UNSYNC; |
243 | } | 413 | } |
244 | 414 | ||
245 | /* | 415 | /* Compute the phase adjustment for the next second */ |
246 | * Compute the phase adjustment for the next second. The offset is | ||
247 | * reduced by a fixed factor times the time constant. | ||
248 | */ | ||
249 | tick_length = tick_length_base; | 416 | tick_length = tick_length_base; |
250 | 417 | ||
251 | delta = shift_right(time_offset, SHIFT_PLL + time_constant); | 418 | delta = ntp_offset_chunk(time_offset); |
252 | time_offset -= delta; | 419 | time_offset -= delta; |
253 | tick_length += delta; | 420 | tick_length += delta; |
254 | 421 | ||
422 | /* Check PPS signal */ | ||
423 | pps_dec_valid(); | ||
424 | |||
255 | if (!time_adjust) | 425 | if (!time_adjust) |
256 | return; | 426 | return; |
257 | 427 | ||
@@ -361,6 +531,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
361 | if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { | 531 | if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { |
362 | time_state = TIME_OK; | 532 | time_state = TIME_OK; |
363 | time_status = STA_UNSYNC; | 533 | time_status = STA_UNSYNC; |
534 | /* restart PPS frequency calibration */ | ||
535 | pps_reset_freq_interval(); | ||
364 | } | 536 | } |
365 | 537 | ||
366 | /* | 538 | /* |
@@ -410,6 +582,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts | |||
410 | time_freq = txc->freq * PPM_SCALE; | 582 | time_freq = txc->freq * PPM_SCALE; |
411 | time_freq = min(time_freq, MAXFREQ_SCALED); | 583 | time_freq = min(time_freq, MAXFREQ_SCALED); |
412 | time_freq = max(time_freq, -MAXFREQ_SCALED); | 584 | time_freq = max(time_freq, -MAXFREQ_SCALED); |
585 | /* update pps_freq */ | ||
586 | pps_set_freq(time_freq); | ||
413 | } | 587 | } |
414 | 588 | ||
415 | if (txc->modes & ADJ_MAXERROR) | 589 | if (txc->modes & ADJ_MAXERROR) |
@@ -474,6 +648,19 @@ int do_adjtimex(struct timex *txc) | |||
474 | hrtimer_cancel(&leap_timer); | 648 | hrtimer_cancel(&leap_timer); |
475 | } | 649 | } |
476 | 650 | ||
651 | if (txc->modes & ADJ_SETOFFSET) { | ||
652 | struct timespec delta; | ||
653 | delta.tv_sec = txc->time.tv_sec; | ||
654 | delta.tv_nsec = txc->time.tv_usec; | ||
655 | if (!capable(CAP_SYS_TIME)) | ||
656 | return -EPERM; | ||
657 | if (!(txc->modes & ADJ_NANO)) | ||
658 | delta.tv_nsec *= 1000; | ||
659 | result = timekeeping_inject_offset(&delta); | ||
660 | if (result) | ||
661 | return result; | ||
662 | } | ||
663 | |||
477 | getnstimeofday(&ts); | 664 | getnstimeofday(&ts); |
478 | 665 | ||
479 | write_seqlock_irq(&xtime_lock); | 666 | write_seqlock_irq(&xtime_lock); |
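The new ADJ_SETOFFSET mode lets a privileged caller step the clock by a signed delta rather than computing and writing an absolute time; a hedged userspace sketch (the ADJ_SETOFFSET and ADJ_NANO values are assumed from this kernel's <linux/timex.h>):

/* Hypothetical userspace sketch: step the clock forward by 1.5 ms
 * (requires CAP_SYS_TIME). */
#include <stdio.h>
#include <sys/timex.h>

#ifndef ADJ_SETOFFSET
#define ADJ_SETOFFSET 0x0100	/* assumed value, added by this series */
#endif
#ifndef ADJ_NANO
#define ADJ_NANO 0x2000		/* assumed value from <linux/timex.h> */
#endif

int main(void)
{
	struct timex tx = {
		.modes		= ADJ_SETOFFSET | ADJ_NANO,
		.time.tv_sec	= 0,
		.time.tv_usec	= 1500000,	/* nanoseconds, since ADJ_NANO is set */
	};

	if (adjtimex(&tx) < 0)
		perror("adjtimex");
	return 0;
}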
@@ -500,7 +687,8 @@ int do_adjtimex(struct timex *txc) | |||
500 | } | 687 | } |
501 | 688 | ||
502 | result = time_state; /* mostly `TIME_OK' */ | 689 | result = time_state; /* mostly `TIME_OK' */ |
503 | if (time_status & (STA_UNSYNC|STA_CLOCKERR)) | 690 | /* check for errors */ |
691 | if (is_error_status(time_status)) | ||
504 | result = TIME_ERROR; | 692 | result = TIME_ERROR; |
505 | 693 | ||
506 | txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * | 694 | txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * |
@@ -514,15 +702,8 @@ int do_adjtimex(struct timex *txc) | |||
514 | txc->tick = tick_usec; | 702 | txc->tick = tick_usec; |
515 | txc->tai = time_tai; | 703 | txc->tai = time_tai; |
516 | 704 | ||
517 | /* PPS is not implemented, so these are zero */ | 705 | /* fill PPS status fields */ |
518 | txc->ppsfreq = 0; | 706 | pps_fill_timex(txc); |
519 | txc->jitter = 0; | ||
520 | txc->shift = 0; | ||
521 | txc->stabil = 0; | ||
522 | txc->jitcnt = 0; | ||
523 | txc->calcnt = 0; | ||
524 | txc->errcnt = 0; | ||
525 | txc->stbcnt = 0; | ||
526 | 707 | ||
527 | write_sequnlock_irq(&xtime_lock); | 708 | write_sequnlock_irq(&xtime_lock); |
528 | 709 | ||
@@ -536,6 +717,243 @@ int do_adjtimex(struct timex *txc) | |||
536 | return result; | 717 | return result; |
537 | } | 718 | } |
538 | 719 | ||
720 | #ifdef CONFIG_NTP_PPS | ||
721 | |||
722 | /* actually struct pps_normtime is good old struct timespec, but it is | ||
723 | * semantically different (and it is the reason why it was invented): | ||
724 | * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] | ||
725 | * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ | ||
726 | struct pps_normtime { | ||
727 | __kernel_time_t sec; /* seconds */ | ||
728 | long nsec; /* nanoseconds */ | ||
729 | }; | ||
730 | |||
731 | /* normalize the timestamp so that nsec is in the | ||
732 | ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ | ||
733 | static inline struct pps_normtime pps_normalize_ts(struct timespec ts) | ||
734 | { | ||
735 | struct pps_normtime norm = { | ||
736 | .sec = ts.tv_sec, | ||
737 | .nsec = ts.tv_nsec | ||
738 | }; | ||
739 | |||
740 | if (norm.nsec > (NSEC_PER_SEC >> 1)) { | ||
741 | norm.nsec -= NSEC_PER_SEC; | ||
742 | norm.sec++; | ||
743 | } | ||
744 | |||
745 | return norm; | ||
746 | } | ||
747 | |||
748 | /* get current phase correction and jitter */ | ||
749 | static inline long pps_phase_filter_get(long *jitter) | ||
750 | { | ||
751 | *jitter = pps_tf[0] - pps_tf[1]; | ||
752 | if (*jitter < 0) | ||
753 | *jitter = -*jitter; | ||
754 | |||
755 | /* TODO: test various filters */ | ||
756 | return pps_tf[0]; | ||
757 | } | ||
758 | |||
759 | /* add the sample to the phase filter */ | ||
760 | static inline void pps_phase_filter_add(long err) | ||
761 | { | ||
762 | pps_tf[2] = pps_tf[1]; | ||
763 | pps_tf[1] = pps_tf[0]; | ||
764 | pps_tf[0] = err; | ||
765 | } | ||
766 | |||
767 | /* decrease frequency calibration interval length. | ||
768 | * It is halved after four consecutive unstable intervals. | ||
769 | */ | ||
770 | static inline void pps_dec_freq_interval(void) | ||
771 | { | ||
772 | if (--pps_intcnt <= -PPS_INTCOUNT) { | ||
773 | pps_intcnt = -PPS_INTCOUNT; | ||
774 | if (pps_shift > PPS_INTMIN) { | ||
775 | pps_shift--; | ||
776 | pps_intcnt = 0; | ||
777 | } | ||
778 | } | ||
779 | } | ||
780 | |||
781 | /* increase frequency calibration interval length. | ||
782 | * It is doubled after four consecutive stable intervals. | ||
783 | */ | ||
784 | static inline void pps_inc_freq_interval(void) | ||
785 | { | ||
786 | if (++pps_intcnt >= PPS_INTCOUNT) { | ||
787 | pps_intcnt = PPS_INTCOUNT; | ||
788 | if (pps_shift < PPS_INTMAX) { | ||
789 | pps_shift++; | ||
790 | pps_intcnt = 0; | ||
791 | } | ||
792 | } | ||
793 | } | ||
794 | |||
795 | /* update clock frequency based on MONOTONIC_RAW clock PPS signal | ||
796 | * timestamps | ||
797 | * | ||
798 | * At the end of the calibration interval the difference between the | ||
799 | * first and last MONOTONIC_RAW clock timestamps divided by the length | ||
800 | * of the interval becomes the frequency update. If the interval was | ||
801 | * too long, the data are discarded. | ||
802 | * Returns the difference between old and new frequency values. | ||
803 | */ | ||
804 | static long hardpps_update_freq(struct pps_normtime freq_norm) | ||
805 | { | ||
806 | long delta, delta_mod; | ||
807 | s64 ftemp; | ||
808 | |||
809 | /* check if the frequency interval was too long */ | ||
810 | if (freq_norm.sec > (2 << pps_shift)) { | ||
811 | time_status |= STA_PPSERROR; | ||
812 | pps_errcnt++; | ||
813 | pps_dec_freq_interval(); | ||
814 | pr_err("hardpps: PPSERROR: interval too long - %ld s\n", | ||
815 | freq_norm.sec); | ||
816 | return 0; | ||
817 | } | ||
818 | |||
819 | /* here the raw frequency offset and wander (stability) is | ||
820 | * calculated. If the wander is less than the wander threshold | ||
821 | * the interval is increased; otherwise it is decreased. | ||
822 | */ | ||
823 | ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, | ||
824 | freq_norm.sec); | ||
825 | delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); | ||
826 | pps_freq = ftemp; | ||
827 | if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { | ||
828 | pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); | ||
829 | time_status |= STA_PPSWANDER; | ||
830 | pps_stbcnt++; | ||
831 | pps_dec_freq_interval(); | ||
832 | } else { /* good sample */ | ||
833 | pps_inc_freq_interval(); | ||
834 | } | ||
835 | |||
836 | /* the stability metric is calculated as the average of recent | ||
837 | * frequency changes, but is used only for performance | ||
838 | * monitoring | ||
839 | */ | ||
840 | delta_mod = delta; | ||
841 | if (delta_mod < 0) | ||
842 | delta_mod = -delta_mod; | ||
843 | pps_stabil += (div_s64(((s64)delta_mod) << | ||
844 | (NTP_SCALE_SHIFT - SHIFT_USEC), | ||
845 | NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; | ||
846 | |||
847 | /* if enabled, the system clock frequency is updated */ | ||
848 | if ((time_status & STA_PPSFREQ) != 0 && | ||
849 | (time_status & STA_FREQHOLD) == 0) { | ||
850 | time_freq = pps_freq; | ||
851 | ntp_update_frequency(); | ||
852 | } | ||
853 | |||
854 | return delta; | ||
855 | } | ||
856 | |||
857 | /* correct REALTIME clock phase error against PPS signal */ | ||
858 | static void hardpps_update_phase(long error) | ||
859 | { | ||
860 | long correction = -error; | ||
861 | long jitter; | ||
862 | |||
863 | /* add the sample to the median filter */ | ||
864 | pps_phase_filter_add(correction); | ||
865 | correction = pps_phase_filter_get(&jitter); | ||
866 | |||
867 | /* Nominal jitter is due to PPS signal noise. If it exceeds the | ||
868 | * threshold, the sample is discarded; otherwise, if so enabled, | ||
869 | * the time offset is updated. | ||
870 | */ | ||
871 | if (jitter > (pps_jitter << PPS_POPCORN)) { | ||
872 | pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", | ||
873 | jitter, (pps_jitter << PPS_POPCORN)); | ||
874 | time_status |= STA_PPSJITTER; | ||
875 | pps_jitcnt++; | ||
876 | } else if (time_status & STA_PPSTIME) { | ||
877 | /* correct the time using the phase offset */ | ||
878 | time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, | ||
879 | NTP_INTERVAL_FREQ); | ||
880 | /* cancel running adjtime() */ | ||
881 | time_adjust = 0; | ||
882 | } | ||
883 | /* update jitter */ | ||
884 | pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; | ||
885 | } | ||
886 | |||
887 | /* | ||
888 | * hardpps() - discipline CPU clock oscillator to external PPS signal | ||
889 | * | ||
890 | * This routine is called at each PPS signal arrival in order to | ||
891 | * discipline the CPU clock oscillator to the PPS signal. It takes two | ||
892 | * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former | ||
893 | * is used to correct clock phase error and the latter is used to | ||
894 | * correct the frequency. | ||
895 | * | ||
896 | * This code is based on David Mills's reference nanokernel | ||
897 | * implementation. It was mostly rewritten but keeps the same idea. | ||
898 | */ | ||
899 | void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | ||
900 | { | ||
901 | struct pps_normtime pts_norm, freq_norm; | ||
902 | unsigned long flags; | ||
903 | |||
904 | pts_norm = pps_normalize_ts(*phase_ts); | ||
905 | |||
906 | write_seqlock_irqsave(&xtime_lock, flags); | ||
907 | |||
908 | /* clear the error bits, they will be set again if needed */ | ||
909 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); | ||
910 | |||
911 | /* indicate signal presence */ | ||
912 | time_status |= STA_PPSSIGNAL; | ||
913 | pps_valid = PPS_VALID; | ||
914 | |||
915 | /* when called for the first time, | ||
916 | * just start the frequency interval */ | ||
917 | if (unlikely(pps_fbase.tv_sec == 0)) { | ||
918 | pps_fbase = *raw_ts; | ||
919 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
920 | return; | ||
921 | } | ||
922 | |||
923 | /* ok, now we have a base for frequency calculation */ | ||
924 | freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); | ||
925 | |||
926 | /* check that the signal is in the range | ||
927 | * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ | ||
928 | if ((freq_norm.sec == 0) || | ||
929 | (freq_norm.nsec > MAXFREQ * freq_norm.sec) || | ||
930 | (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { | ||
931 | time_status |= STA_PPSJITTER; | ||
932 | /* restart the frequency calibration interval */ | ||
933 | pps_fbase = *raw_ts; | ||
934 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
935 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | ||
936 | return; | ||
937 | } | ||
938 | |||
939 | /* signal is ok */ | ||
940 | |||
941 | /* check if the current frequency interval is finished */ | ||
942 | if (freq_norm.sec >= (1 << pps_shift)) { | ||
943 | pps_calcnt++; | ||
944 | /* restart the frequency calibration interval */ | ||
945 | pps_fbase = *raw_ts; | ||
946 | hardpps_update_freq(freq_norm); | ||
947 | } | ||
948 | |||
949 | hardpps_update_phase(pts_norm.nsec); | ||
950 | |||
951 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
952 | } | ||
953 | EXPORT_SYMBOL(hardpps); | ||
954 | |||
955 | #endif /* CONFIG_NTP_PPS */ | ||
956 | |||
539 | static int __init ntp_tick_adj_setup(char *str) | 957 | static int __init ntp_tick_adj_setup(char *str) |
540 | { | 958 | { |
541 | ntp_tick_adj = simple_strtol(str, NULL, 0); | 959 | ntp_tick_adj = simple_strtol(str, NULL, 0); |
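hardpps() is exported for the PPS kernel consumer to call on every pulse; a hedged sketch of the calling convention only, with the timestamp capture simplified (the real consumer snapshots both clocks much closer to the interrupt):

#include <linux/time.h>
#include <linux/timex.h>	/* hardpps() declaration assumed to live here */

/* Illustrative pulse handler: REALTIME drives phase correction,
 * MONOTONIC_RAW drives frequency calibration, as documented above. */
static void example_pps_event(void)
{
	struct timespec real_ts, raw_ts;

	getnstimeofday(&real_ts);
	getrawmonotonic(&raw_ts);
	hardpps(&real_ts, &raw_ts);
}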
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 000000000000..c340ca658f37 --- /dev/null +++ b/kernel/time/posix-clock.c | |||
@@ -0,0 +1,445 @@ | |||
1 | /* | ||
2 | * posix-clock.c - support for dynamic clock devices | ||
3 | * | ||
4 | * Copyright (C) 2010 OMICRON electronics GmbH | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
19 | */ | ||
20 | #include <linux/device.h> | ||
21 | #include <linux/file.h> | ||
22 | #include <linux/posix-clock.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/syscalls.h> | ||
25 | #include <linux/uaccess.h> | ||
26 | |||
27 | static void delete_clock(struct kref *kref); | ||
28 | |||
29 | /* | ||
30 | * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. | ||
31 | */ | ||
32 | static struct posix_clock *get_posix_clock(struct file *fp) | ||
33 | { | ||
34 | struct posix_clock *clk = fp->private_data; | ||
35 | |||
36 | down_read(&clk->rwsem); | ||
37 | |||
38 | if (!clk->zombie) | ||
39 | return clk; | ||
40 | |||
41 | up_read(&clk->rwsem); | ||
42 | |||
43 | return NULL; | ||
44 | } | ||
45 | |||
46 | static void put_posix_clock(struct posix_clock *clk) | ||
47 | { | ||
48 | up_read(&clk->rwsem); | ||
49 | } | ||
50 | |||
51 | static ssize_t posix_clock_read(struct file *fp, char __user *buf, | ||
52 | size_t count, loff_t *ppos) | ||
53 | { | ||
54 | struct posix_clock *clk = get_posix_clock(fp); | ||
55 | int err = -EINVAL; | ||
56 | |||
57 | if (!clk) | ||
58 | return -ENODEV; | ||
59 | |||
60 | if (clk->ops.read) | ||
61 | err = clk->ops.read(clk, fp->f_flags, buf, count); | ||
62 | |||
63 | put_posix_clock(clk); | ||
64 | |||
65 | return err; | ||
66 | } | ||
67 | |||
68 | static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) | ||
69 | { | ||
70 | struct posix_clock *clk = get_posix_clock(fp); | ||
71 | int result = 0; | ||
72 | |||
73 | if (!clk) | ||
74 | return -ENODEV; | ||
75 | |||
76 | if (clk->ops.poll) | ||
77 | result = clk->ops.poll(clk, fp, wait); | ||
78 | |||
79 | put_posix_clock(clk); | ||
80 | |||
81 | return result; | ||
82 | } | ||
83 | |||
84 | static int posix_clock_fasync(int fd, struct file *fp, int on) | ||
85 | { | ||
86 | struct posix_clock *clk = get_posix_clock(fp); | ||
87 | int err = 0; | ||
88 | |||
89 | if (!clk) | ||
90 | return -ENODEV; | ||
91 | |||
92 | if (clk->ops.fasync) | ||
93 | err = clk->ops.fasync(clk, fd, fp, on); | ||
94 | |||
95 | put_posix_clock(clk); | ||
96 | |||
97 | return err; | ||
98 | } | ||
99 | |||
100 | static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) | ||
101 | { | ||
102 | struct posix_clock *clk = get_posix_clock(fp); | ||
103 | int err = -ENODEV; | ||
104 | |||
105 | if (!clk) | ||
106 | return -ENODEV; | ||
107 | |||
108 | if (clk->ops.mmap) | ||
109 | err = clk->ops.mmap(clk, vma); | ||
110 | |||
111 | put_posix_clock(clk); | ||
112 | |||
113 | return err; | ||
114 | } | ||
115 | |||
116 | static long posix_clock_ioctl(struct file *fp, | ||
117 | unsigned int cmd, unsigned long arg) | ||
118 | { | ||
119 | struct posix_clock *clk = get_posix_clock(fp); | ||
120 | int err = -ENOTTY; | ||
121 | |||
122 | if (!clk) | ||
123 | return -ENODEV; | ||
124 | |||
125 | if (clk->ops.ioctl) | ||
126 | err = clk->ops.ioctl(clk, cmd, arg); | ||
127 | |||
128 | put_posix_clock(clk); | ||
129 | |||
130 | return err; | ||
131 | } | ||
132 | |||
133 | #ifdef CONFIG_COMPAT | ||
134 | static long posix_clock_compat_ioctl(struct file *fp, | ||
135 | unsigned int cmd, unsigned long arg) | ||
136 | { | ||
137 | struct posix_clock *clk = get_posix_clock(fp); | ||
138 | int err = -ENOTTY; | ||
139 | |||
140 | if (!clk) | ||
141 | return -ENODEV; | ||
142 | |||
143 | if (clk->ops.ioctl) | ||
144 | err = clk->ops.ioctl(clk, cmd, arg); | ||
145 | |||
146 | put_posix_clock(clk); | ||
147 | |||
148 | return err; | ||
149 | } | ||
150 | #endif | ||
151 | |||
152 | static int posix_clock_open(struct inode *inode, struct file *fp) | ||
153 | { | ||
154 | int err; | ||
155 | struct posix_clock *clk = | ||
156 | container_of(inode->i_cdev, struct posix_clock, cdev); | ||
157 | |||
158 | down_read(&clk->rwsem); | ||
159 | |||
160 | if (clk->zombie) { | ||
161 | err = -ENODEV; | ||
162 | goto out; | ||
163 | } | ||
164 | if (clk->ops.open) | ||
165 | err = clk->ops.open(clk, fp->f_mode); | ||
166 | else | ||
167 | err = 0; | ||
168 | |||
169 | if (!err) { | ||
170 | kref_get(&clk->kref); | ||
171 | fp->private_data = clk; | ||
172 | } | ||
173 | out: | ||
174 | up_read(&clk->rwsem); | ||
175 | return err; | ||
176 | } | ||
177 | |||
178 | static int posix_clock_release(struct inode *inode, struct file *fp) | ||
179 | { | ||
180 | struct posix_clock *clk = fp->private_data; | ||
181 | int err = 0; | ||
182 | |||
183 | if (clk->ops.release) | ||
184 | err = clk->ops.release(clk); | ||
185 | |||
186 | kref_put(&clk->kref, delete_clock); | ||
187 | |||
188 | fp->private_data = NULL; | ||
189 | |||
190 | return err; | ||
191 | } | ||
192 | |||
193 | static const struct file_operations posix_clock_file_operations = { | ||
194 | .owner = THIS_MODULE, | ||
195 | .llseek = no_llseek, | ||
196 | .read = posix_clock_read, | ||
197 | .poll = posix_clock_poll, | ||
198 | .unlocked_ioctl = posix_clock_ioctl, | ||
199 | .open = posix_clock_open, | ||
200 | .release = posix_clock_release, | ||
201 | .fasync = posix_clock_fasync, | ||
202 | .mmap = posix_clock_mmap, | ||
203 | #ifdef CONFIG_COMPAT | ||
204 | .compat_ioctl = posix_clock_compat_ioctl, | ||
205 | #endif | ||
206 | }; | ||
207 | |||
208 | int posix_clock_register(struct posix_clock *clk, dev_t devid) | ||
209 | { | ||
210 | int err; | ||
211 | |||
212 | kref_init(&clk->kref); | ||
213 | init_rwsem(&clk->rwsem); | ||
214 | |||
215 | cdev_init(&clk->cdev, &posix_clock_file_operations); | ||
216 | clk->cdev.owner = clk->ops.owner; | ||
217 | err = cdev_add(&clk->cdev, devid, 1); | ||
218 | |||
219 | return err; | ||
220 | } | ||
221 | EXPORT_SYMBOL_GPL(posix_clock_register); | ||
222 | |||
223 | static void delete_clock(struct kref *kref) | ||
224 | { | ||
225 | struct posix_clock *clk = container_of(kref, struct posix_clock, kref); | ||
226 | |||
227 | if (clk->release) | ||
228 | clk->release(clk); | ||
229 | } | ||
230 | |||
231 | void posix_clock_unregister(struct posix_clock *clk) | ||
232 | { | ||
233 | cdev_del(&clk->cdev); | ||
234 | |||
235 | down_write(&clk->rwsem); | ||
236 | clk->zombie = true; | ||
237 | up_write(&clk->rwsem); | ||
238 | |||
239 | kref_put(&clk->kref, delete_clock); | ||
240 | } | ||
241 | EXPORT_SYMBOL_GPL(posix_clock_unregister); | ||
242 | |||
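A hedged sketch of the provider side (the names, chardev allocation and trivial gettime are invented; the real user is the PTP hardware clock class driver): embed a struct posix_clock, fill in ops, then register it against a device number.

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/posix-clock.h>
#include <linux/time.h>

static int example_clock_gettime(struct posix_clock *pc, struct timespec *ts)
{
	getnstimeofday(ts);	/* stand-in for reading real hardware */
	return 0;
}

static struct posix_clock example_clock = {
	.ops = {
		.owner		= THIS_MODULE,
		.clock_gettime	= example_clock_gettime,
	},
};

static dev_t example_devid;

static int __init example_clock_init(void)
{
	int err = alloc_chrdev_region(&example_devid, 0, 1, "example-clock");

	if (err)
		return err;
	return posix_clock_register(&example_clock, example_devid);
}

static void __exit example_clock_exit(void)
{
	posix_clock_unregister(&example_clock);
	unregister_chrdev_region(example_devid, 1);
}

module_init(example_clock_init);
module_exit(example_clock_exit);
MODULE_LICENSE("GPL");

Creating the /dev node is left out of the sketch; unregistering marks the clock zombie so in-flight users see -ENODEV instead of touching freed state, per get_posix_clock() above.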
243 | struct posix_clock_desc { | ||
244 | struct file *fp; | ||
245 | struct posix_clock *clk; | ||
246 | }; | ||
247 | |||
248 | static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) | ||
249 | { | ||
250 | struct file *fp = fget(CLOCKID_TO_FD(id)); | ||
251 | int err = -EINVAL; | ||
252 | |||
253 | if (!fp) | ||
254 | return err; | ||
255 | |||
256 | if (fp->f_op->open != posix_clock_open || !fp->private_data) | ||
257 | goto out; | ||
258 | |||
259 | cd->fp = fp; | ||
260 | cd->clk = get_posix_clock(fp); | ||
261 | |||
262 | err = cd->clk ? 0 : -ENODEV; | ||
263 | out: | ||
264 | if (err) | ||
265 | fput(fp); | ||
266 | return err; | ||
267 | } | ||
268 | |||
269 | static void put_clock_desc(struct posix_clock_desc *cd) | ||
270 | { | ||
271 | put_posix_clock(cd->clk); | ||
272 | fput(cd->fp); | ||
273 | } | ||
274 | |||
275 | static int pc_clock_adjtime(clockid_t id, struct timex *tx) | ||
276 | { | ||
277 | struct posix_clock_desc cd; | ||
278 | int err; | ||
279 | |||
280 | err = get_clock_desc(id, &cd); | ||
281 | if (err) | ||
282 | return err; | ||
283 | |||
284 | if ((cd.fp->f_mode & FMODE_WRITE) == 0) { | ||
285 | err = -EACCES; | ||
286 | goto out; | ||
287 | } | ||
288 | |||
289 | if (cd.clk->ops.clock_adjtime) | ||
290 | err = cd.clk->ops.clock_adjtime(cd.clk, tx); | ||
291 | else | ||
292 | err = -EOPNOTSUPP; | ||
293 | out: | ||
294 | put_clock_desc(&cd); | ||
295 | |||
296 | return err; | ||
297 | } | ||
298 | |||
299 | static int pc_clock_gettime(clockid_t id, struct timespec *ts) | ||
300 | { | ||
301 | struct posix_clock_desc cd; | ||
302 | int err; | ||
303 | |||
304 | err = get_clock_desc(id, &cd); | ||
305 | if (err) | ||
306 | return err; | ||
307 | |||
308 | if (cd.clk->ops.clock_gettime) | ||
309 | err = cd.clk->ops.clock_gettime(cd.clk, ts); | ||
310 | else | ||
311 | err = -EOPNOTSUPP; | ||
312 | |||
313 | put_clock_desc(&cd); | ||
314 | |||
315 | return err; | ||
316 | } | ||
317 | |||
318 | static int pc_clock_getres(clockid_t id, struct timespec *ts) | ||
319 | { | ||
320 | struct posix_clock_desc cd; | ||
321 | int err; | ||
322 | |||
323 | err = get_clock_desc(id, &cd); | ||
324 | if (err) | ||
325 | return err; | ||
326 | |||
327 | if (cd.clk->ops.clock_getres) | ||
328 | err = cd.clk->ops.clock_getres(cd.clk, ts); | ||
329 | else | ||
330 | err = -EOPNOTSUPP; | ||
331 | |||
332 | put_clock_desc(&cd); | ||
333 | |||
334 | return err; | ||
335 | } | ||
336 | |||
337 | static int pc_clock_settime(clockid_t id, const struct timespec *ts) | ||
338 | { | ||
339 | struct posix_clock_desc cd; | ||
340 | int err; | ||
341 | |||
342 | err = get_clock_desc(id, &cd); | ||
343 | if (err) | ||
344 | return err; | ||
345 | |||
346 | if ((cd.fp->f_mode & FMODE_WRITE) == 0) { | ||
347 | err = -EACCES; | ||
348 | goto out; | ||
349 | } | ||
350 | |||
351 | if (cd.clk->ops.clock_settime) | ||
352 | err = cd.clk->ops.clock_settime(cd.clk, ts); | ||
353 | else | ||
354 | err = -EOPNOTSUPP; | ||
355 | out: | ||
356 | put_clock_desc(&cd); | ||
357 | |||
358 | return err; | ||
359 | } | ||
360 | |||
361 | static int pc_timer_create(struct k_itimer *kit) | ||
362 | { | ||
363 | clockid_t id = kit->it_clock; | ||
364 | struct posix_clock_desc cd; | ||
365 | int err; | ||
366 | |||
367 | err = get_clock_desc(id, &cd); | ||
368 | if (err) | ||
369 | return err; | ||
370 | |||
371 | if (cd.clk->ops.timer_create) | ||
372 | err = cd.clk->ops.timer_create(cd.clk, kit); | ||
373 | else | ||
374 | err = -EOPNOTSUPP; | ||
375 | |||
376 | put_clock_desc(&cd); | ||
377 | |||
378 | return err; | ||
379 | } | ||
380 | |||
381 | static int pc_timer_delete(struct k_itimer *kit) | ||
382 | { | ||
383 | clockid_t id = kit->it_clock; | ||
384 | struct posix_clock_desc cd; | ||
385 | int err; | ||
386 | |||
387 | err = get_clock_desc(id, &cd); | ||
388 | if (err) | ||
389 | return err; | ||
390 | |||
391 | if (cd.clk->ops.timer_delete) | ||
392 | err = cd.clk->ops.timer_delete(cd.clk, kit); | ||
393 | else | ||
394 | err = -EOPNOTSUPP; | ||
395 | |||
396 | put_clock_desc(&cd); | ||
397 | |||
398 | return err; | ||
399 | } | ||
400 | |||
401 | static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) | ||
402 | { | ||
403 | clockid_t id = kit->it_clock; | ||
404 | struct posix_clock_desc cd; | ||
405 | |||
406 | if (get_clock_desc(id, &cd)) | ||
407 | return; | ||
408 | |||
409 | if (cd.clk->ops.timer_gettime) | ||
410 | cd.clk->ops.timer_gettime(cd.clk, kit, ts); | ||
411 | |||
412 | put_clock_desc(&cd); | ||
413 | } | ||
414 | |||
415 | static int pc_timer_settime(struct k_itimer *kit, int flags, | ||
416 | struct itimerspec *ts, struct itimerspec *old) | ||
417 | { | ||
418 | clockid_t id = kit->it_clock; | ||
419 | struct posix_clock_desc cd; | ||
420 | int err; | ||
421 | |||
422 | err = get_clock_desc(id, &cd); | ||
423 | if (err) | ||
424 | return err; | ||
425 | |||
426 | if (cd.clk->ops.timer_settime) | ||
427 | err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); | ||
428 | else | ||
429 | err = -EOPNOTSUPP; | ||
430 | |||
431 | put_clock_desc(&cd); | ||
432 | |||
433 | return err; | ||
434 | } | ||
435 | |||
436 | struct k_clock clock_posix_dynamic = { | ||
437 | .clock_getres = pc_clock_getres, | ||
438 | .clock_set = pc_clock_settime, | ||
439 | .clock_get = pc_clock_gettime, | ||
440 | .clock_adj = pc_clock_adjtime, | ||
441 | .timer_create = pc_timer_create, | ||
442 | .timer_set = pc_timer_settime, | ||
443 | .timer_del = pc_timer_delete, | ||
444 | .timer_get = pc_timer_gettime, | ||
445 | }; | ||
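Annotation: the clock_posix_dynamic table above routes the ordinary POSIX clock and timer syscalls to a clock that lives behind a character device: the clockid encodes an open file descriptor, get_clock_desc() turns it back into the device's posix_clock, and the per-clock ops are invoked if present, with the FMODE_WRITE check gating clock_adjtime()/clock_settime(). A minimal userspace sketch of consuming such a clock follows; the /dev/ptp0 node and the FD_TO_CLOCKID()/CLOCKFD encoding are assumptions drawn from the dynamic-clock series, not part of this hunk.

    /* Sketch: read a dynamic POSIX clock through its character device.
     * Assumes a PTP-style node at /dev/ptp0; error handling kept minimal. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    #define CLOCKFD 3
    #define FD_TO_CLOCKID(fd) ((~(clockid_t)(fd) << 3) | CLOCKFD)

    int main(void)
    {
            int fd = open("/dev/ptp0", O_RDWR); /* write access for adjtime/settime */
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            clockid_t clkid = FD_TO_CLOCKID(fd);
            struct timespec ts;
            if (clock_gettime(clkid, &ts))      /* dispatched via pc_clock_gettime() */
                    perror("clock_gettime");
            else
                    printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
            close(fd);
            return 0;
    }

Opening the device O_RDWR rather than O_RDONLY is what satisfies the FMODE_WRITE checks in pc_clock_adjtime() and pc_clock_settime() above.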
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 48b2761b5668..c7218d132738 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/tick.h> | ||
22 | 21 | ||
23 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
24 | 23 | ||
@@ -457,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
457 | unsigned long flags; | 456 | unsigned long flags; |
458 | int cpu; | 457 | int cpu; |
459 | 458 | ||
460 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
461 | |||
462 | /* | 459 | /* |
463 | * Periodic mode does not care about the enter/exit of power | 460 | * Periodic mode does not care about the enter/exit of power |
464 | * states | 461 | * states |
465 | */ | 462 | */ |
466 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) | 463 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) |
467 | goto out; | 464 | return; |
468 | 465 | ||
469 | bc = tick_broadcast_device.evtdev; | 466 | /* |
467 | * We are called with preemption disabled from the depth of the | ||
468 | * idle code, so we can't be moved away. | ||
469 | */ | ||
470 | cpu = smp_processor_id(); | 470 | cpu = smp_processor_id(); |
471 | td = &per_cpu(tick_cpu_device, cpu); | 471 | td = &per_cpu(tick_cpu_device, cpu); |
472 | dev = td->evtdev; | 472 | dev = td->evtdev; |
473 | 473 | ||
474 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 474 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
475 | goto out; | 475 | return; |
476 | |||
477 | bc = tick_broadcast_device.evtdev; | ||
476 | 478 | ||
479 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
477 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | 480 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { |
478 | if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { | 481 | if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { |
479 | cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); | 482 | cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); |
@@ -490,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
490 | tick_program_event(dev->next_event, 1); | 493 | tick_program_event(dev->next_event, 1); |
491 | } | 494 | } |
492 | } | 495 | } |
493 | |||
494 | out: | ||
495 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 496 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
496 | } | 497 | } |
497 | 498 | ||
@@ -523,10 +524,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, | |||
523 | */ | 524 | */ |
524 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 525 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
525 | { | 526 | { |
527 | int cpu = smp_processor_id(); | ||
528 | |||
526 | /* Set it up only once ! */ | 529 | /* Set it up only once ! */ |
527 | if (bc->event_handler != tick_handle_oneshot_broadcast) { | 530 | if (bc->event_handler != tick_handle_oneshot_broadcast) { |
528 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; | 531 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; |
529 | int cpu = smp_processor_id(); | ||
530 | 532 | ||
531 | bc->event_handler = tick_handle_oneshot_broadcast; | 533 | bc->event_handler = tick_handle_oneshot_broadcast; |
532 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 534 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); |
@@ -552,6 +554,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
552 | tick_broadcast_set_event(tick_next_period, 1); | 554 | tick_broadcast_set_event(tick_next_period, 1); |
553 | } else | 555 | } else |
554 | bc->next_event.tv64 = KTIME_MAX; | 556 | bc->next_event.tv64 = KTIME_MAX; |
557 | } else { | ||
558 | /* | ||
559 | * The first cpu which switches to oneshot mode sets | ||
560 | * the bit for all other cpus which are in the general | ||
561 | * (periodic) broadcast mask. So the bit is set and | ||
562 | * would prevent the first broadcast enter after this | ||
563 | * from programming the bc device. | ||
564 | */ | ||
565 | tick_broadcast_clear_oneshot(cpu); | ||
555 | } | 566 | } |
556 | } | 567 | } |
557 | 568 | ||
@@ -600,4 +611,14 @@ int tick_broadcast_oneshot_active(void) | |||
600 | return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; | 611 | return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; |
601 | } | 612 | } |
602 | 613 | ||
614 | /* | ||
615 | * Check whether the broadcast device supports oneshot. | ||
616 | */ | ||
617 | bool tick_broadcast_oneshot_available(void) | ||
618 | { | ||
619 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | ||
620 | |||
621 | return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; | ||
622 | } | ||
623 | |||
603 | #endif | 624 | #endif |
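Annotation: tick_broadcast_oneshot_control() now performs the cheap per-cpu checks (periodic mode, missing CLOCK_EVT_FEAT_C3STOP) before taking tick_broadcast_lock, and per the new comment it runs with preemption disabled from the idle path. A hedged sketch of how an idle driver typically hands its stopped local timer to the broadcast device via clockevents_notify(); the surrounding idle routine is hypothetical.

    #include <linux/clockchips.h>
    #include <linux/smp.h>

    /* Sketch: hand the stopped per-cpu timer to the broadcast device around a
     * deep C-state. Called with preemption disabled from idle, which is why
     * smp_processor_id() is safe here. */
    static void enter_deep_idle(void)
    {
            int cpu = smp_processor_id();

            clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

            /* ... architecture-specific entry into the C-state goes here ... */

            clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
    }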
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b6b898d2eeef..119528de8235 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/tick.h> | ||
22 | 21 | ||
23 | #include <asm/irq_regs.h> | 22 | #include <asm/irq_regs.h> |
24 | 23 | ||
@@ -49,9 +48,13 @@ struct tick_device *tick_get_device(int cpu) | |||
49 | */ | 48 | */ |
50 | int tick_is_oneshot_available(void) | 49 | int tick_is_oneshot_available(void) |
51 | { | 50 | { |
52 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 51 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
53 | 52 | ||
54 | return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); | 53 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) |
54 | return 0; | ||
55 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
56 | return 1; | ||
57 | return tick_broadcast_oneshot_available(); | ||
55 | } | 58 | } |
56 | 59 | ||
57 | /* | 60 | /* |
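Annotation: tick_is_oneshot_available() now refuses oneshot/NOHZ mode for a local clock event device that stops in deep C-states unless a oneshot-capable broadcast device exists. A sketch of the driver side of that contract, i.e. how a per-cpu timer advertises the relevant feature flags at registration; the mydev_* names and values are illustrative only.

    #include <linux/clockchips.h>

    static int mydev_set_next_event(unsigned long delta, struct clock_event_device *evt)
    {
            /* program the hardware comparator 'delta' ticks ahead (hypothetical) */
            return 0;
    }

    static void mydev_set_mode(enum clock_event_mode mode, struct clock_event_device *evt)
    {
            /* switch the hardware between periodic/oneshot/shutdown (hypothetical) */
    }

    static struct clock_event_device mydev_clockevent = {
            .name           = "mydev",
            /* C3STOP: this timer dies in deep idle, so oneshot mode is only
             * offered if tick_broadcast_oneshot_available() says a broadcast
             * device can cover for it. */
            .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
                              CLOCK_EVT_FEAT_C3STOP,
            .rating         = 300,
            .set_next_event = mydev_set_next_event,
            .set_mode       = mydev_set_mode,
    };
    /* ...mult/shift/cpumask filled in before clockevents_register_device() */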
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefbc1f60..1009b06d6f89 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -1,6 +1,10 @@ | |||
1 | /* | 1 | /* |
2 | * tick internal variable and functions used by low/high res code | 2 | * tick internal variable and functions used by low/high res code |
3 | */ | 3 | */ |
4 | #include <linux/hrtimer.h> | ||
5 | #include <linux/tick.h> | ||
6 | |||
7 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD | ||
4 | 8 | ||
5 | #define TICK_DO_TIMER_NONE -1 | 9 | #define TICK_DO_TIMER_NONE -1 |
6 | #define TICK_DO_TIMER_BOOT -2 | 10 | #define TICK_DO_TIMER_BOOT -2 |
@@ -36,6 +40,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | |||
36 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); | 40 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); |
37 | extern int tick_broadcast_oneshot_active(void); | 41 | extern int tick_broadcast_oneshot_active(void); |
38 | extern void tick_check_oneshot_broadcast(int cpu); | 42 | extern void tick_check_oneshot_broadcast(int cpu); |
43 | bool tick_broadcast_oneshot_available(void); | ||
39 | # else /* BROADCAST */ | 44 | # else /* BROADCAST */ |
40 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 45 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
41 | { | 46 | { |
@@ -46,6 +51,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { } | |||
46 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | 51 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } |
47 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 52 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
48 | static inline void tick_check_oneshot_broadcast(int cpu) { } | 53 | static inline void tick_check_oneshot_broadcast(int cpu) { } |
54 | static inline bool tick_broadcast_oneshot_available(void) { return true; } | ||
49 | # endif /* !BROADCAST */ | 55 | # endif /* !BROADCAST */ |
50 | 56 | ||
51 | #else /* !ONESHOT */ | 57 | #else /* !ONESHOT */ |
@@ -76,6 +82,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
76 | return 0; | 82 | return 0; |
77 | } | 83 | } |
78 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 84 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
85 | static inline bool tick_broadcast_oneshot_available(void) { return false; } | ||
79 | #endif /* !TICK_ONESHOT */ | 86 | #endif /* !TICK_ONESHOT */ |
80 | 87 | ||
81 | /* | 88 | /* |
@@ -132,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) | |||
132 | { | 139 | { |
133 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | 140 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); |
134 | } | 141 | } |
142 | |||
143 | #endif | ||
144 | |||
145 | extern void do_timer(unsigned long ticks); | ||
146 | extern seqlock_t xtime_lock; | ||
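Annotation: the header now pulls in <linux/hrtimer.h> and <linux/tick.h> itself, wraps the clockevents-specific part in CONFIG_GENERIC_CLOCKEVENTS_BUILD, and pairs the new tick_broadcast_oneshot_available() declaration with static inline fallbacks so call sites stay free of #ifdefs. The idiom, reduced to a hypothetical feature:

    #include <linux/types.h>

    /* One declaration when the feature is built in, a trivial static inline
     * otherwise; callers compile either way without #ifdefs. */
    #ifdef CONFIG_MYFEATURE_BROADCAST
    extern bool my_broadcast_available(void);
    #else
    static inline bool my_broadcast_available(void) { return false; }
    #endif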
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index aada0e52680a..2d04411a5f05 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/tick.h> | ||
22 | 21 | ||
23 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
24 | 23 | ||
@@ -95,7 +94,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, | |||
95 | */ | 94 | */ |
96 | int tick_program_event(ktime_t expires, int force) | 95 | int tick_program_event(ktime_t expires, int force) |
97 | { | 96 | { |
98 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 97 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
99 | 98 | ||
100 | return tick_dev_program_event(dev, expires, force); | 99 | return tick_dev_program_event(dev, expires, force); |
101 | } | 100 | } |
@@ -167,7 +166,7 @@ int tick_oneshot_mode_active(void) | |||
167 | int ret; | 166 | int ret; |
168 | 167 | ||
169 | local_irq_save(flags); | 168 | local_irq_save(flags); |
170 | ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; | 169 | ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; |
171 | local_irq_restore(flags); | 170 | local_irq_restore(flags); |
172 | 171 | ||
173 | return ret; | 172 | return ret; |
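Annotation: the __get_cpu_var(tick_cpu_device).evtdev / .mode reads here and in tick-common.c become __this_cpu_read(tick_cpu_device.evtdev) / (.mode): instead of forming the address of the whole per-cpu struct and then dereferencing it, a single member is read with one per-cpu operation (typically one segment-prefixed load on x86). A minimal sketch of the two idioms side by side, using a hypothetical per-cpu struct:

    #include <linux/percpu.h>

    struct my_state {
            int mode;
            void *dev;
    };
    static DEFINE_PER_CPU(struct my_state, my_state);

    static int my_mode_old(void)
    {
            /* old idiom: compute this CPU's copy of the whole struct, then load */
            return __get_cpu_var(my_state).mode;
    }

    static int my_mode_new(void)
    {
            /* new idiom: read just the one member with a single per-cpu access */
            return __this_cpu_read(my_state.mode);
    }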
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index bb2d8b7850a3..0c0e02f1b819 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -19,7 +19,6 @@ | |||
19 | #include <linux/percpu.h> | 19 | #include <linux/percpu.h> |
20 | #include <linux/profile.h> | 20 | #include <linux/profile.h> |
21 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
22 | #include <linux/tick.h> | ||
23 | #include <linux/module.h> | 22 | #include <linux/module.h> |
24 | 23 | ||
25 | #include <asm/irq_regs.h> | 24 | #include <asm/irq_regs.h> |
@@ -642,8 +641,7 @@ static void tick_nohz_switch_to_nohz(void) | |||
642 | } | 641 | } |
643 | local_irq_enable(); | 642 | local_irq_enable(); |
644 | 643 | ||
645 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", | 644 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); |
646 | smp_processor_id()); | ||
647 | } | 645 | } |
648 | 646 | ||
649 | /* | 647 | /* |
@@ -842,8 +840,10 @@ void tick_setup_sched_timer(void) | |||
842 | } | 840 | } |
843 | 841 | ||
844 | #ifdef CONFIG_NO_HZ | 842 | #ifdef CONFIG_NO_HZ |
845 | if (tick_nohz_enabled) | 843 | if (tick_nohz_enabled) { |
846 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | 844 | ts->nohz_mode = NOHZ_MODE_HIGHRES; |
845 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); | ||
846 | } | ||
847 | #endif | 847 | #endif |
848 | } | 848 | } |
849 | #endif /* HIGH_RES_TIMERS */ | 849 | #endif /* HIGH_RES_TIMERS */ |
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index ac38fbb176cc..a9ae369925ce 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
22 | #include <linux/slab.h> | 22 | #include <linux/slab.h> |
23 | #include <linux/math64.h> | 23 | #include <linux/math64.h> |
24 | #include <linux/kernel.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * fixed point arithmetic scale factor for skew | 27 | * fixed point arithmetic scale factor for skew |
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync, | |||
57 | int index; | 58 | int index; |
58 | int num_samples = sync->num_samples; | 59 | int num_samples = sync->num_samples; |
59 | 60 | ||
60 | if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { | 61 | if (num_samples > ARRAY_SIZE(buffer)) { |
61 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); | 62 | samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); |
62 | if (!samples) { | 63 | if (!samples) { |
63 | samples = buffer; | 64 | samples = buffer; |
64 | num_samples = sizeof(buffer)/sizeof(buffer[0]); | 65 | num_samples = ARRAY_SIZE(buffer); |
65 | } | 66 | } |
66 | } else { | 67 | } else { |
67 | samples = buffer; | 68 | samples = buffer; |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 49010d822f72..342408cf68dd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -14,7 +14,7 @@ | |||
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/sysdev.h> | 17 | #include <linux/syscore_ops.h> |
18 | #include <linux/clocksource.h> | 18 | #include <linux/clocksource.h> |
19 | #include <linux/jiffies.h> | 19 | #include <linux/jiffies.h> |
20 | #include <linux/time.h> | 20 | #include <linux/time.h> |
@@ -32,6 +32,8 @@ struct timekeeper { | |||
32 | cycle_t cycle_interval; | 32 | cycle_t cycle_interval; |
33 | /* Number of clock shifted nano seconds in one NTP interval. */ | 33 | /* Number of clock shifted nano seconds in one NTP interval. */ |
34 | u64 xtime_interval; | 34 | u64 xtime_interval; |
35 | /* shifted nano seconds left over when rounding cycle_interval */ | ||
36 | s64 xtime_remainder; | ||
35 | /* Raw nano seconds accumulated per NTP interval. */ | 37 | /* Raw nano seconds accumulated per NTP interval. */ |
36 | u32 raw_interval; | 38 | u32 raw_interval; |
37 | 39 | ||
@@ -47,7 +49,7 @@ struct timekeeper { | |||
47 | u32 mult; | 49 | u32 mult; |
48 | }; | 50 | }; |
49 | 51 | ||
50 | struct timekeeper timekeeper; | 52 | static struct timekeeper timekeeper; |
51 | 53 | ||
52 | /** | 54 | /** |
53 | * timekeeper_setup_internals - Set up internals to use clocksource clock. | 55 | * timekeeper_setup_internals - Set up internals to use clocksource clock. |
@@ -62,7 +64,7 @@ struct timekeeper timekeeper; | |||
62 | static void timekeeper_setup_internals(struct clocksource *clock) | 64 | static void timekeeper_setup_internals(struct clocksource *clock) |
63 | { | 65 | { |
64 | cycle_t interval; | 66 | cycle_t interval; |
65 | u64 tmp; | 67 | u64 tmp, ntpinterval; |
66 | 68 | ||
67 | timekeeper.clock = clock; | 69 | timekeeper.clock = clock; |
68 | clock->cycle_last = clock->read(clock); | 70 | clock->cycle_last = clock->read(clock); |
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) | |||
70 | /* Do the ns -> cycle conversion first, using original mult */ | 72 | /* Do the ns -> cycle conversion first, using original mult */ |
71 | tmp = NTP_INTERVAL_LENGTH; | 73 | tmp = NTP_INTERVAL_LENGTH; |
72 | tmp <<= clock->shift; | 74 | tmp <<= clock->shift; |
75 | ntpinterval = tmp; | ||
73 | tmp += clock->mult/2; | 76 | tmp += clock->mult/2; |
74 | do_div(tmp, clock->mult); | 77 | do_div(tmp, clock->mult); |
75 | if (tmp == 0) | 78 | if (tmp == 0) |
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) | |||
80 | 83 | ||
81 | /* Go back from cycles -> shifted ns */ | 84 | /* Go back from cycles -> shifted ns */ |
82 | timekeeper.xtime_interval = (u64) interval * clock->mult; | 85 | timekeeper.xtime_interval = (u64) interval * clock->mult; |
86 | timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; | ||
83 | timekeeper.raw_interval = | 87 | timekeeper.raw_interval = |
84 | ((u64) interval * clock->mult) >> clock->shift; | 88 | ((u64) interval * clock->mult) >> clock->shift; |
85 | 89 | ||
@@ -160,7 +164,7 @@ static struct timespec total_sleep_time; | |||
160 | /* | 164 | /* |
161 | * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. | 165 | * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. |
162 | */ | 166 | */ |
163 | struct timespec raw_time; | 167 | static struct timespec raw_time; |
164 | 168 | ||
165 | /* flag for if timekeeping is suspended */ | 169 | /* flag for if timekeeping is suspended */ |
166 | int __read_mostly timekeeping_suspended; | 170 | int __read_mostly timekeeping_suspended; |
@@ -284,6 +288,49 @@ void ktime_get_ts(struct timespec *ts) | |||
284 | } | 288 | } |
285 | EXPORT_SYMBOL_GPL(ktime_get_ts); | 289 | EXPORT_SYMBOL_GPL(ktime_get_ts); |
286 | 290 | ||
291 | #ifdef CONFIG_NTP_PPS | ||
292 | |||
293 | /** | ||
294 | * getnstime_raw_and_real - get time of day and raw monotonic time in timespec format | ||
295 | * @ts_raw: pointer to the timespec to be set to raw monotonic time | ||
296 | * @ts_real: pointer to the timespec to be set to the time of day | ||
297 | * | ||
298 | * This function reads both the time of day and raw monotonic time at the | ||
299 | * same time atomically and stores the resulting timestamps in timespec | ||
300 | * format. | ||
301 | */ | ||
302 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | ||
303 | { | ||
304 | unsigned long seq; | ||
305 | s64 nsecs_raw, nsecs_real; | ||
306 | |||
307 | WARN_ON_ONCE(timekeeping_suspended); | ||
308 | |||
309 | do { | ||
310 | u32 arch_offset; | ||
311 | |||
312 | seq = read_seqbegin(&xtime_lock); | ||
313 | |||
314 | *ts_raw = raw_time; | ||
315 | *ts_real = xtime; | ||
316 | |||
317 | nsecs_raw = timekeeping_get_ns_raw(); | ||
318 | nsecs_real = timekeeping_get_ns(); | ||
319 | |||
320 | /* If arch requires, add in gettimeoffset() */ | ||
321 | arch_offset = arch_gettimeoffset(); | ||
322 | nsecs_raw += arch_offset; | ||
323 | nsecs_real += arch_offset; | ||
324 | |||
325 | } while (read_seqretry(&xtime_lock, seq)); | ||
326 | |||
327 | timespec_add_ns(ts_raw, nsecs_raw); | ||
328 | timespec_add_ns(ts_real, nsecs_real); | ||
329 | } | ||
330 | EXPORT_SYMBOL(getnstime_raw_and_real); | ||
331 | |||
332 | #endif /* CONFIG_NTP_PPS */ | ||
333 | |||
287 | /** | 334 | /** |
288 | * do_gettimeofday - Returns the time of day in a timeval | 335 | * do_gettimeofday - Returns the time of day in a timeval |
289 | * @tv: pointer to the timeval to be set | 336 | * @tv: pointer to the timeval to be set |
@@ -306,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
306 | * | 353 | * |
307 | * Sets the time of day to the new time and update NTP and notify hrtimers | 354 | * Sets the time of day to the new time and update NTP and notify hrtimers |
308 | */ | 355 | */ |
309 | int do_settimeofday(struct timespec *tv) | 356 | int do_settimeofday(const struct timespec *tv) |
310 | { | 357 | { |
311 | struct timespec ts_delta; | 358 | struct timespec ts_delta; |
312 | unsigned long flags; | 359 | unsigned long flags; |
@@ -340,6 +387,42 @@ int do_settimeofday(struct timespec *tv) | |||
340 | 387 | ||
341 | EXPORT_SYMBOL(do_settimeofday); | 388 | EXPORT_SYMBOL(do_settimeofday); |
342 | 389 | ||
390 | |||
391 | /** | ||
392 | * timekeeping_inject_offset - Adds or subtracts from the current time. | ||
393 | * @tv: pointer to the timespec variable containing the offset | ||
394 | * | ||
395 | * Adds or subtracts an offset value from the current time. | ||
396 | */ | ||
397 | int timekeeping_inject_offset(struct timespec *ts) | ||
398 | { | ||
399 | unsigned long flags; | ||
400 | |||
401 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) | ||
402 | return -EINVAL; | ||
403 | |||
404 | write_seqlock_irqsave(&xtime_lock, flags); | ||
405 | |||
406 | timekeeping_forward_now(); | ||
407 | |||
408 | xtime = timespec_add(xtime, *ts); | ||
409 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); | ||
410 | |||
411 | timekeeper.ntp_error = 0; | ||
412 | ntp_clear(); | ||
413 | |||
414 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | ||
415 | timekeeper.mult); | ||
416 | |||
417 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
418 | |||
419 | /* signal hrtimers about time change */ | ||
420 | clock_was_set(); | ||
421 | |||
422 | return 0; | ||
423 | } | ||
424 | EXPORT_SYMBOL(timekeeping_inject_offset); | ||
425 | |||
343 | /** | 426 | /** |
344 | * change_clocksource - Swaps clocksources if a new one is available | 427 | * change_clocksource - Swaps clocksources if a new one is available |
345 | * | 428 | * |
@@ -513,14 +596,65 @@ void __init timekeeping_init(void) | |||
513 | static struct timespec timekeeping_suspend_time; | 596 | static struct timespec timekeeping_suspend_time; |
514 | 597 | ||
515 | /** | 598 | /** |
599 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | ||
600 | * @delta: pointer to a timespec delta value | ||
601 | * | ||
602 | * Takes a timespec offset measuring a suspend interval and properly | ||
603 | * adds the sleep offset to the timekeeping variables. | ||
604 | */ | ||
605 | static void __timekeeping_inject_sleeptime(struct timespec *delta) | ||
606 | { | ||
607 | xtime = timespec_add(xtime, *delta); | ||
608 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); | ||
609 | total_sleep_time = timespec_add(total_sleep_time, *delta); | ||
610 | } | ||
611 | |||
612 | |||
613 | /** | ||
614 | * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values | ||

615 | * @delta: pointer to a timespec delta value | ||
616 | * | ||
617 | * This hook is for architectures that cannot support read_persistent_clock | ||
618 | * because their RTC/persistent clock is only accessible when irqs are enabled. | ||
619 | * | ||
620 | * This function should only be called by rtc_resume(), and allows | ||
621 | * a suspend offset to be injected into the timekeeping values. | ||
622 | */ | ||
623 | void timekeeping_inject_sleeptime(struct timespec *delta) | ||
624 | { | ||
625 | unsigned long flags; | ||
626 | struct timespec ts; | ||
627 | |||
628 | /* Make sure we don't set the clock twice */ | ||
629 | read_persistent_clock(&ts); | ||
630 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) | ||
631 | return; | ||
632 | |||
633 | write_seqlock_irqsave(&xtime_lock, flags); | ||
634 | timekeeping_forward_now(); | ||
635 | |||
636 | __timekeeping_inject_sleeptime(delta); | ||
637 | |||
638 | timekeeper.ntp_error = 0; | ||
639 | ntp_clear(); | ||
640 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | ||
641 | timekeeper.mult); | ||
642 | |||
643 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
644 | |||
645 | /* signal hrtimers about time change */ | ||
646 | clock_was_set(); | ||
647 | } | ||
648 | |||
649 | |||
650 | /** | ||
516 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 651 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
517 | * @dev: unused | ||
518 | * | 652 | * |
519 | * This is for the generic clocksource timekeeping. | 653 | * This is for the generic clocksource timekeeping. |
520 | * xtime/wall_to_monotonic/jiffies/etc are | 654 | * xtime/wall_to_monotonic/jiffies/etc are |
521 | * still managed by arch specific suspend/resume code. | 655 | * still managed by arch specific suspend/resume code. |
522 | */ | 656 | */ |
523 | static int timekeeping_resume(struct sys_device *dev) | 657 | static void timekeeping_resume(void) |
524 | { | 658 | { |
525 | unsigned long flags; | 659 | unsigned long flags; |
526 | struct timespec ts; | 660 | struct timespec ts; |
@@ -533,9 +667,7 @@ static int timekeeping_resume(struct sys_device *dev) | |||
533 | 667 | ||
534 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 668 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
535 | ts = timespec_sub(ts, timekeeping_suspend_time); | 669 | ts = timespec_sub(ts, timekeeping_suspend_time); |
536 | xtime = timespec_add(xtime, ts); | 670 | __timekeeping_inject_sleeptime(&ts); |
537 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); | ||
538 | total_sleep_time = timespec_add(total_sleep_time, ts); | ||
539 | } | 671 | } |
540 | /* re-base the last cycle value */ | 672 | /* re-base the last cycle value */ |
541 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 673 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
@@ -548,12 +680,10 @@ static int timekeeping_resume(struct sys_device *dev) | |||
548 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | 680 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); |
549 | 681 | ||
550 | /* Resume hrtimers */ | 682 | /* Resume hrtimers */ |
551 | hres_timers_resume(); | 683 | hrtimers_resume(); |
552 | |||
553 | return 0; | ||
554 | } | 684 | } |
555 | 685 | ||
556 | static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | 686 | static int timekeeping_suspend(void) |
557 | { | 687 | { |
558 | unsigned long flags; | 688 | unsigned long flags; |
559 | 689 | ||
@@ -571,26 +701,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
571 | } | 701 | } |
572 | 702 | ||
573 | /* sysfs resume/suspend bits for timekeeping */ | 703 | /* sysfs resume/suspend bits for timekeeping */ |
574 | static struct sysdev_class timekeeping_sysclass = { | 704 | static struct syscore_ops timekeeping_syscore_ops = { |
575 | .name = "timekeeping", | ||
576 | .resume = timekeeping_resume, | 705 | .resume = timekeeping_resume, |
577 | .suspend = timekeeping_suspend, | 706 | .suspend = timekeeping_suspend, |
578 | }; | 707 | }; |
579 | 708 | ||
580 | static struct sys_device device_timer = { | 709 | static int __init timekeeping_init_ops(void) |
581 | .id = 0, | ||
582 | .cls = &timekeeping_sysclass, | ||
583 | }; | ||
584 | |||
585 | static int __init timekeeping_init_device(void) | ||
586 | { | 710 | { |
587 | int error = sysdev_class_register(&timekeeping_sysclass); | 711 | register_syscore_ops(&timekeeping_syscore_ops); |
588 | if (!error) | 712 | return 0; |
589 | error = sysdev_register(&device_timer); | ||
590 | return error; | ||
591 | } | 713 | } |
592 | 714 | ||
593 | device_initcall(timekeeping_init_device); | 715 | device_initcall(timekeeping_init_ops); |
594 | 716 | ||
595 | /* | 717 | /* |
596 | * If the error is already larger, we look ahead even further | 718 | * If the error is already larger, we look ahead even further |
@@ -719,7 +841,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
719 | 841 | ||
720 | /* Accumulate error between NTP and clock interval */ | 842 | /* Accumulate error between NTP and clock interval */ |
721 | timekeeper.ntp_error += tick_length << shift; | 843 | timekeeper.ntp_error += tick_length << shift; |
722 | timekeeper.ntp_error -= timekeeper.xtime_interval << | 844 | timekeeper.ntp_error -= |
845 | (timekeeper.xtime_interval + timekeeper.xtime_remainder) << | ||
723 | (timekeeper.ntp_error_shift + shift); | 846 | (timekeeper.ntp_error_shift + shift); |
724 | 847 | ||
725 | return offset; | 848 | return offset; |
@@ -731,7 +854,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
731 | * | 854 | * |
732 | * Called from the timer interrupt, must hold a write on xtime_lock. | 855 | * Called from the timer interrupt, must hold a write on xtime_lock. |
733 | */ | 856 | */ |
734 | void update_wall_time(void) | 857 | static void update_wall_time(void) |
735 | { | 858 | { |
736 | struct clocksource *clock; | 859 | struct clocksource *clock; |
737 | cycle_t offset; | 860 | cycle_t offset; |
@@ -823,7 +946,7 @@ void update_wall_time(void) | |||
823 | * getboottime - Return the real time of system boot. | 946 | * getboottime - Return the real time of system boot. |
824 | * @ts: pointer to the timespec to be set | 947 | * @ts: pointer to the timespec to be set |
825 | * | 948 | * |
826 | * Returns the time of day in a timespec. | 949 | * Returns the wall-time of boot in a timespec. |
827 | * | 950 | * |
828 | * This is based on the wall_to_monotonic offset and the total suspend | 951 | * This is based on the wall_to_monotonic offset and the total suspend |
829 | * time. Calls to settimeofday will affect the value returned (which | 952 | * time. Calls to settimeofday will affect the value returned (which |
@@ -841,6 +964,55 @@ void getboottime(struct timespec *ts) | |||
841 | } | 964 | } |
842 | EXPORT_SYMBOL_GPL(getboottime); | 965 | EXPORT_SYMBOL_GPL(getboottime); |
843 | 966 | ||
967 | |||
968 | /** | ||
969 | * get_monotonic_boottime - Returns monotonic time since boot | ||
970 | * @ts: pointer to the timespec to be set | ||
971 | * | ||
972 | * Returns the monotonic time since boot in a timespec. | ||
973 | * | ||
974 | * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also | ||
975 | * includes the time spent in suspend. | ||
976 | */ | ||
977 | void get_monotonic_boottime(struct timespec *ts) | ||
978 | { | ||
979 | struct timespec tomono, sleep; | ||
980 | unsigned int seq; | ||
981 | s64 nsecs; | ||
982 | |||
983 | WARN_ON(timekeeping_suspended); | ||
984 | |||
985 | do { | ||
986 | seq = read_seqbegin(&xtime_lock); | ||
987 | *ts = xtime; | ||
988 | tomono = wall_to_monotonic; | ||
989 | sleep = total_sleep_time; | ||
990 | nsecs = timekeeping_get_ns(); | ||
991 | |||
992 | } while (read_seqretry(&xtime_lock, seq)); | ||
993 | |||
994 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, | ||
995 | ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); | ||
996 | } | ||
997 | EXPORT_SYMBOL_GPL(get_monotonic_boottime); | ||
998 | |||
999 | /** | ||
1000 | * ktime_get_boottime - Returns monotonic time since boot in a ktime | ||
1001 | * | ||
1002 | * Returns the monotonic time since boot in a ktime | ||
1003 | * | ||
1004 | * This is similar to CLOCK_MONOTONIC/ktime_get, but also | ||
1005 | * includes the time spent in suspend. | ||
1006 | */ | ||
1007 | ktime_t ktime_get_boottime(void) | ||
1008 | { | ||
1009 | struct timespec ts; | ||
1010 | |||
1011 | get_monotonic_boottime(&ts); | ||
1012 | return timespec_to_ktime(ts); | ||
1013 | } | ||
1014 | EXPORT_SYMBOL_GPL(ktime_get_boottime); | ||
1015 | |||
844 | /** | 1016 | /** |
845 | * monotonic_to_bootbased - Convert the monotonic time to boot based. | 1017 | * monotonic_to_bootbased - Convert the monotonic time to boot based. |
846 | * @ts: pointer to the timespec to be converted | 1018 | * @ts: pointer to the timespec to be converted |
@@ -862,11 +1034,6 @@ struct timespec __current_kernel_time(void) | |||
862 | return xtime; | 1034 | return xtime; |
863 | } | 1035 | } |
864 | 1036 | ||
865 | struct timespec __get_wall_to_monotonic(void) | ||
866 | { | ||
867 | return wall_to_monotonic; | ||
868 | } | ||
869 | |||
870 | struct timespec current_kernel_time(void) | 1037 | struct timespec current_kernel_time(void) |
871 | { | 1038 | { |
872 | struct timespec now; | 1039 | struct timespec now; |
@@ -898,3 +1065,63 @@ struct timespec get_monotonic_coarse(void) | |||
898 | now.tv_nsec + mono.tv_nsec); | 1065 | now.tv_nsec + mono.tv_nsec); |
899 | return now; | 1066 | return now; |
900 | } | 1067 | } |
1068 | |||
1069 | /* | ||
1070 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | ||
1071 | * without sampling the sequence number in xtime_lock. | ||
1072 | * jiffies is defined in the linker script... | ||
1073 | */ | ||
1074 | void do_timer(unsigned long ticks) | ||
1075 | { | ||
1076 | jiffies_64 += ticks; | ||
1077 | update_wall_time(); | ||
1078 | calc_global_load(ticks); | ||
1079 | } | ||
1080 | |||
1081 | /** | ||
1082 | * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, | ||
1083 | * and sleep offsets. | ||
1084 | * @xtim: pointer to timespec to be set with xtime | ||
1085 | * @wtom: pointer to timespec to be set with wall_to_monotonic | ||
1086 | * @sleep: pointer to timespec to be set with time in suspend | ||
1087 | */ | ||
1088 | void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | ||
1089 | struct timespec *wtom, struct timespec *sleep) | ||
1090 | { | ||
1091 | unsigned long seq; | ||
1092 | |||
1093 | do { | ||
1094 | seq = read_seqbegin(&xtime_lock); | ||
1095 | *xtim = xtime; | ||
1096 | *wtom = wall_to_monotonic; | ||
1097 | *sleep = total_sleep_time; | ||
1098 | } while (read_seqretry(&xtime_lock, seq)); | ||
1099 | } | ||
1100 | |||
1101 | /** | ||
1102 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format | ||
1103 | */ | ||
1104 | ktime_t ktime_get_monotonic_offset(void) | ||
1105 | { | ||
1106 | unsigned long seq; | ||
1107 | struct timespec wtom; | ||
1108 | |||
1109 | do { | ||
1110 | seq = read_seqbegin(&xtime_lock); | ||
1111 | wtom = wall_to_monotonic; | ||
1112 | } while (read_seqretry(&xtime_lock, seq)); | ||
1113 | return timespec_to_ktime(wtom); | ||
1114 | } | ||
1115 | |||
1116 | /** | ||
1117 | * xtime_update() - advances the timekeeping infrastructure | ||
1118 | * @ticks: number of ticks that have elapsed since the last call. | ||
1119 | * | ||
1120 | * Must be called with interrupts disabled. | ||
1121 | */ | ||
1122 | void xtime_update(unsigned long ticks) | ||
1123 | { | ||
1124 | write_seqlock(&xtime_lock); | ||
1125 | do_timer(ticks); | ||
1126 | write_sequnlock(&xtime_lock); | ||
1127 | } | ||
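Annotation: the new accessors above (getnstime_raw_and_real(), get_monotonic_boottime(), get_xtime_and_monotonic_and_sleep_offset(), ktime_get_monotonic_offset()) all use the same xtime_lock seqlock read loop: snapshot the protected values, then retry if xtime_update() -> do_timer() wrote concurrently. A stripped-down sketch of that reader/writer pairing over a hypothetical pair of timespecs:

    #include <linux/seqlock.h>
    #include <linux/time.h>

    static DEFINE_SEQLOCK(sample_lock);
    static struct timespec sample_wall;     /* written only under write_seqlock() */
    static struct timespec sample_sleep;

    /* Writer side, e.g. from a tick with interrupts disabled. */
    static void sample_update(const struct timespec *wall, const struct timespec *sleep)
    {
            write_seqlock(&sample_lock);
            sample_wall = *wall;
            sample_sleep = *sleep;
            write_sequnlock(&sample_lock);
    }

    /* Reader side: lock-free, retries if a writer interleaved. */
    static void sample_get(struct timespec *wall, struct timespec *sleep)
    {
            unsigned long seq;

            do {
                    seq = read_seqbegin(&sample_lock);
                    *wall = sample_wall;
                    *sleep = sample_sleep;
            } while (read_seqretry(&sample_lock, seq));
    }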
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ab8f5e33fa92..3258455549f4 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym) | |||
41 | char symname[KSYM_NAME_LEN]; | 41 | char symname[KSYM_NAME_LEN]; |
42 | 42 | ||
43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) | 43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) |
44 | SEQ_printf(m, "<%p>", sym); | 44 | SEQ_printf(m, "<%pK>", sym); |
45 | else | 45 | else |
46 | SEQ_printf(m, "%s", symname); | 46 | SEQ_printf(m, "%s", symname); |
47 | } | 47 | } |
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, | |||
79 | { | 79 | { |
80 | struct hrtimer *timer, tmp; | 80 | struct hrtimer *timer, tmp; |
81 | unsigned long next = 0, i; | 81 | unsigned long next = 0, i; |
82 | struct rb_node *curr; | 82 | struct timerqueue_node *curr; |
83 | unsigned long flags; | 83 | unsigned long flags; |
84 | 84 | ||
85 | next_one: | 85 | next_one: |
86 | i = 0; | 86 | i = 0; |
87 | raw_spin_lock_irqsave(&base->cpu_base->lock, flags); | 87 | raw_spin_lock_irqsave(&base->cpu_base->lock, flags); |
88 | 88 | ||
89 | curr = base->first; | 89 | curr = timerqueue_getnext(&base->active); |
90 | /* | 90 | /* |
91 | * Crude but we have to do this O(N*N) thing, because | 91 | * Crude but we have to do this O(N*N) thing, because |
92 | * we have to unlock the base when printing: | 92 | * we have to unlock the base when printing: |
93 | */ | 93 | */ |
94 | while (curr && i < next) { | 94 | while (curr && i < next) { |
95 | curr = rb_next(curr); | 95 | curr = timerqueue_iterate_next(curr); |
96 | i++; | 96 | i++; |
97 | } | 97 | } |
98 | 98 | ||
99 | if (curr) { | 99 | if (curr) { |
100 | 100 | ||
101 | timer = rb_entry(curr, struct hrtimer, node); | 101 | timer = container_of(curr, struct hrtimer, node); |
102 | tmp = *timer; | 102 | tmp = *timer; |
103 | raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); | 103 | raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); |
104 | 104 | ||
@@ -112,7 +112,7 @@ next_one: | |||
112 | static void | 112 | static void |
113 | print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) | 113 | print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) |
114 | { | 114 | { |
115 | SEQ_printf(m, " .base: %p\n", base); | 115 | SEQ_printf(m, " .base: %pK\n", base); |
116 | SEQ_printf(m, " .index: %d\n", | 116 | SEQ_printf(m, " .index: %d\n", |
117 | base->index); | 117 | base->index); |
118 | SEQ_printf(m, " .resolution: %Lu nsecs\n", | 118 | SEQ_printf(m, " .resolution: %Lu nsecs\n", |
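Annotation: the timer_list.c hunk mirrors the hrtimer switch from a raw rbtree (base->first, rb_next(), rb_entry()) to the timerqueue helpers: the next expiring node comes from timerqueue_getnext(), in-order traversal uses timerqueue_iterate_next(), and the owning hrtimer is recovered with container_of() on its embedded node. A minimal sketch of such a walk over a clock base's queue (e.g. &clock_base->active); locking is elided here, while the real code holds cpu_base->lock:

    #include <linux/hrtimer.h>
    #include <linux/kernel.h>
    #include <linux/timerqueue.h>

    static void walk_hrtimers(struct timerqueue_head *active)
    {
            struct timerqueue_node *node;

            for (node = timerqueue_getnext(active); node;
                 node = timerqueue_iterate_next(node)) {
                    struct hrtimer *timer = container_of(node, struct hrtimer, node);

                    pr_info("hrtimer %p expires at %lld ns\n",
                            timer, ktime_to_ns(hrtimer_get_expires(timer)));
            }
    }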
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 2f3b585b8d7d..a5d0a3a85dd8 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, | |||
236 | unsigned int timer_flag) | 236 | unsigned int timer_flag) |
237 | { | 237 | { |
238 | /* | 238 | /* |
239 | * It doesnt matter which lock we take: | 239 | * It doesn't matter which lock we take: |
240 | */ | 240 | */ |
241 | raw_spinlock_t *lock; | 241 | raw_spinlock_t *lock; |
242 | struct entry *entry, input; | 242 | struct entry *entry, input; |
diff --git a/kernel/timer.c b/kernel/timer.c index 97bf05baade7..8cff36119e4d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -37,7 +37,7 @@ | |||
37 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
38 | #include <linux/tick.h> | 38 | #include <linux/tick.h> |
39 | #include <linux/kallsyms.h> | 39 | #include <linux/kallsyms.h> |
40 | #include <linux/perf_event.h> | 40 | #include <linux/irq_work.h> |
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
43 | 43 | ||
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases; | |||
88 | EXPORT_SYMBOL(boot_tvec_bases); | 88 | EXPORT_SYMBOL(boot_tvec_bases); |
89 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 89 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; |
90 | 90 | ||
91 | /* | ||
92 | * Note that all tvec_bases are 2 byte aligned and lower bit of | ||
93 | * base in timer_list is guaranteed to be zero. Use the LSB to | ||
94 | * indicate whether the timer is deferrable. | ||
95 | * | ||
96 | * A deferrable timer will work normally when the system is busy, but | ||
97 | * will not cause a CPU to come out of idle just to service it; instead, | ||
98 | * the timer will be serviced when the CPU eventually wakes up with a | ||
99 | * subsequent non-deferrable timer. | ||
100 | */ | ||
101 | #define TBASE_DEFERRABLE_FLAG (0x1) | ||
102 | |||
103 | /* Functions below help us manage 'deferrable' flag */ | 91 | /* Functions below help us manage 'deferrable' flag */ |
104 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) | 92 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) |
105 | { | 93 | { |
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base) | |||
113 | 101 | ||
114 | static inline void timer_set_deferrable(struct timer_list *timer) | 102 | static inline void timer_set_deferrable(struct timer_list *timer) |
115 | { | 103 | { |
116 | timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | | 104 | timer->base = TBASE_MAKE_DEFERRED(timer->base); |
117 | TBASE_DEFERRABLE_FLAG)); | ||
118 | } | 105 | } |
119 | 106 | ||
120 | static inline void | 107 | static inline void |
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) | |||
343 | } | 330 | } |
344 | EXPORT_SYMBOL_GPL(set_timer_slack); | 331 | EXPORT_SYMBOL_GPL(set_timer_slack); |
345 | 332 | ||
346 | |||
347 | static inline void set_running_timer(struct tvec_base *base, | ||
348 | struct timer_list *timer) | ||
349 | { | ||
350 | #ifdef CONFIG_SMP | ||
351 | base->running_timer = timer; | ||
352 | #endif | ||
353 | } | ||
354 | |||
355 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | 333 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) |
356 | { | 334 | { |
357 | unsigned long expires = timer->expires; | 335 | unsigned long expires = timer->expires; |
@@ -426,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {} | |||
426 | 404 | ||
427 | static struct debug_obj_descr timer_debug_descr; | 405 | static struct debug_obj_descr timer_debug_descr; |
428 | 406 | ||
407 | static void *timer_debug_hint(void *addr) | ||
408 | { | ||
409 | return ((struct timer_list *) addr)->function; | ||
410 | } | ||
411 | |||
429 | /* | 412 | /* |
430 | * fixup_init is called when: | 413 | * fixup_init is called when: |
431 | * - an active object is initialized | 414 | * - an active object is initialized |
@@ -499,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) | |||
499 | 482 | ||
500 | static struct debug_obj_descr timer_debug_descr = { | 483 | static struct debug_obj_descr timer_debug_descr = { |
501 | .name = "timer_list", | 484 | .name = "timer_list", |
485 | .debug_hint = timer_debug_hint, | ||
502 | .fixup_init = timer_fixup_init, | 486 | .fixup_init = timer_fixup_init, |
503 | .fixup_activate = timer_fixup_activate, | 487 | .fixup_activate = timer_fixup_activate, |
504 | .fixup_free = timer_fixup_free, | 488 | .fixup_free = timer_fixup_free, |
@@ -765,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | |||
765 | unsigned long expires_limit, mask; | 749 | unsigned long expires_limit, mask; |
766 | int bit; | 750 | int bit; |
767 | 751 | ||
768 | expires_limit = expires; | ||
769 | |||
770 | if (timer->slack >= 0) { | 752 | if (timer->slack >= 0) { |
771 | expires_limit = expires + timer->slack; | 753 | expires_limit = expires + timer->slack; |
772 | } else { | 754 | } else { |
773 | unsigned long now = jiffies; | 755 | long delta = expires - jiffies; |
756 | |||
757 | if (delta < 256) | ||
758 | return expires; | ||
774 | 759 | ||
775 | /* No slack, if already expired else auto slack 0.4% */ | 760 | expires_limit = expires + delta / 256; |
776 | if (time_after(expires, now)) | ||
777 | expires_limit = expires + (expires - now)/256; | ||
778 | } | 761 | } |
779 | mask = expires ^ expires_limit; | 762 | mask = expires ^ expires_limit; |
780 | if (mask == 0) | 763 | if (mask == 0) |
@@ -811,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | |||
811 | */ | 794 | */ |
812 | int mod_timer(struct timer_list *timer, unsigned long expires) | 795 | int mod_timer(struct timer_list *timer, unsigned long expires) |
813 | { | 796 | { |
797 | expires = apply_slack(timer, expires); | ||
798 | |||
814 | /* | 799 | /* |
815 | * This is a common optimization triggered by the | 800 | * This is a common optimization triggered by the |
816 | * networking code - if the timer is re-modified | 801 | * networking code - if the timer is re-modified |
@@ -819,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) | |||
819 | if (timer_pending(timer) && timer->expires == expires) | 804 | if (timer_pending(timer) && timer->expires == expires) |
820 | return 1; | 805 | return 1; |
821 | 806 | ||
822 | expires = apply_slack(timer, expires); | ||
823 | |||
824 | return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); | 807 | return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); |
825 | } | 808 | } |
826 | EXPORT_SYMBOL(mod_timer); | 809 | EXPORT_SYMBOL(mod_timer); |
@@ -936,15 +919,12 @@ int del_timer(struct timer_list *timer) | |||
936 | } | 919 | } |
937 | EXPORT_SYMBOL(del_timer); | 920 | EXPORT_SYMBOL(del_timer); |
938 | 921 | ||
939 | #ifdef CONFIG_SMP | ||
940 | /** | 922 | /** |
941 | * try_to_del_timer_sync - Try to deactivate a timer | 923 | * try_to_del_timer_sync - Try to deactivate a timer |
942 | * @timer: timer to delete | 924 |
943 | * | 925 | * |
944 | * This function tries to deactivate a timer. Upon successful (ret >= 0) | 926 | * This function tries to deactivate a timer. Upon successful (ret >= 0) |
945 | * exit the timer is not queued and the handler is not running on any CPU. | 927 | * exit the timer is not queued and the handler is not running on any CPU. |
946 | * | ||
947 | * It must not be called from interrupt contexts. | ||
948 | */ | 928 | */ |
949 | int try_to_del_timer_sync(struct timer_list *timer) | 929 | int try_to_del_timer_sync(struct timer_list *timer) |
950 | { | 930 | { |
@@ -973,6 +953,7 @@ out: | |||
973 | } | 953 | } |
974 | EXPORT_SYMBOL(try_to_del_timer_sync); | 954 | EXPORT_SYMBOL(try_to_del_timer_sync); |
975 | 955 | ||
956 | #ifdef CONFIG_SMP | ||
976 | /** | 957 | /** |
977 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 958 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
978 | * @timer: the timer to be deactivated | 959 | * @timer: the timer to be deactivated |
@@ -988,6 +969,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync); | |||
988 | * add_timer_on(). Upon exit the timer is not queued and the handler is | 969 | * add_timer_on(). Upon exit the timer is not queued and the handler is |
989 | * not running on any CPU. | 970 | * not running on any CPU. |
990 | * | 971 | * |
972 | * Note: You must not hold locks that are held in interrupt context | ||
973 | * while calling this function. Even if the lock has nothing to do | ||
974 | * with the timer in question. Here's why: | ||
975 | * | ||
976 | * CPU0 CPU1 | ||
977 | * ---- ---- | ||
978 | * <SOFTIRQ> | ||
979 | * call_timer_fn(); | ||
980 | * base->running_timer = mytimer; | ||
981 | * spin_lock_irq(somelock); | ||
982 | * <IRQ> | ||
983 | * spin_lock(somelock); | ||
984 | * del_timer_sync(mytimer); | ||
985 | * while (base->running_timer == mytimer); | ||
986 | * | ||
987 | * Now del_timer_sync() will never return and never release somelock. | ||
988 | * The interrupt on the other CPU is waiting to grab somelock but | ||
989 | * it has interrupted the softirq that CPU0 is waiting to finish. | ||
990 | * | ||
991 | * The function returns whether it has deactivated a pending timer or not. | 991 | * The function returns whether it has deactivated a pending timer or not. |
992 | */ | 992 | */ |
993 | int del_timer_sync(struct timer_list *timer) | 993 | int del_timer_sync(struct timer_list *timer) |
@@ -995,12 +995,20 @@ int del_timer_sync(struct timer_list *timer) | |||
995 | #ifdef CONFIG_LOCKDEP | 995 | #ifdef CONFIG_LOCKDEP |
996 | unsigned long flags; | 996 | unsigned long flags; |
997 | 997 | ||
998 | /* | ||
999 | * If lockdep gives a backtrace here, please reference | ||
1000 | * the synchronization rules above. | ||
1001 | */ | ||
998 | local_irq_save(flags); | 1002 | local_irq_save(flags); |
999 | lock_map_acquire(&timer->lockdep_map); | 1003 | lock_map_acquire(&timer->lockdep_map); |
1000 | lock_map_release(&timer->lockdep_map); | 1004 | lock_map_release(&timer->lockdep_map); |
1001 | local_irq_restore(flags); | 1005 | local_irq_restore(flags); |
1002 | #endif | 1006 | #endif |
1003 | 1007 | /* | |
1008 | * don't use it in hardirq context, because it | ||
1009 | * could lead to deadlock. | ||
1010 | */ | ||
1011 | WARN_ON(in_irq()); | ||
1004 | for (;;) { | 1012 | for (;;) { |
1005 | int ret = try_to_del_timer_sync(timer); | 1013 | int ret = try_to_del_timer_sync(timer); |
1006 | if (ret >= 0) | 1014 | if (ret >= 0) |
@@ -1111,7 +1119,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
1111 | 1119 | ||
1112 | timer_stats_account_timer(timer); | 1120 | timer_stats_account_timer(timer); |
1113 | 1121 | ||
1114 | set_running_timer(base, timer); | 1122 | base->running_timer = timer; |
1115 | detach_timer(timer, 1); | 1123 | detach_timer(timer, 1); |
1116 | 1124 | ||
1117 | spin_unlock_irq(&base->lock); | 1125 | spin_unlock_irq(&base->lock); |
@@ -1119,7 +1127,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
1119 | spin_lock_irq(&base->lock); | 1127 | spin_lock_irq(&base->lock); |
1120 | } | 1128 | } |
1121 | } | 1129 | } |
1122 | set_running_timer(base, NULL); | 1130 | base->running_timer = NULL; |
1123 | spin_unlock_irq(&base->lock); | 1131 | spin_unlock_irq(&base->lock); |
1124 | } | 1132 | } |
1125 | 1133 | ||
@@ -1249,9 +1257,15 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, | |||
1249 | */ | 1257 | */ |
1250 | unsigned long get_next_timer_interrupt(unsigned long now) | 1258 | unsigned long get_next_timer_interrupt(unsigned long now) |
1251 | { | 1259 | { |
1252 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1260 | struct tvec_base *base = __this_cpu_read(tvec_bases); |
1253 | unsigned long expires; | 1261 | unsigned long expires; |
1254 | 1262 | ||
1263 | /* | ||
1264 | * Pretend that there is no timer pending if the cpu is offline. | ||
1265 | * Possible pending timers will be migrated later to an active cpu. | ||
1266 | */ | ||
1267 | if (cpu_is_offline(smp_processor_id())) | ||
1268 | return now + NEXT_TIMER_MAX_DELTA; | ||
1255 | spin_lock(&base->lock); | 1269 | spin_lock(&base->lock); |
1256 | if (time_before_eq(base->next_timer, base->timer_jiffies)) | 1270 | if (time_before_eq(base->next_timer, base->timer_jiffies)) |
1257 | base->next_timer = __next_timer_interrupt(base); | 1271 | base->next_timer = __next_timer_interrupt(base); |
@@ -1279,7 +1293,10 @@ void update_process_times(int user_tick) | |||
1279 | run_local_timers(); | 1293 | run_local_timers(); |
1280 | rcu_check_callbacks(cpu, user_tick); | 1294 | rcu_check_callbacks(cpu, user_tick); |
1281 | printk_tick(); | 1295 | printk_tick(); |
1282 | perf_event_do_pending(); | 1296 | #ifdef CONFIG_IRQ_WORK |
1297 | if (in_irq()) | ||
1298 | irq_work_run(); | ||
1299 | #endif | ||
1283 | scheduler_tick(); | 1300 | scheduler_tick(); |
1284 | run_posix_cpu_timers(p); | 1301 | run_posix_cpu_timers(p); |
1285 | } | 1302 | } |
@@ -1289,7 +1306,7 @@ void update_process_times(int user_tick) | |||
1289 | */ | 1306 | */ |
1290 | static void run_timer_softirq(struct softirq_action *h) | 1307 | static void run_timer_softirq(struct softirq_action *h) |
1291 | { | 1308 | { |
1292 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1309 | struct tvec_base *base = __this_cpu_read(tvec_bases); |
1293 | 1310 | ||
1294 | hrtimer_run_pending(); | 1311 | hrtimer_run_pending(); |
1295 | 1312 | ||
@@ -1306,19 +1323,6 @@ void run_local_timers(void) | |||
1306 | raise_softirq(TIMER_SOFTIRQ); | 1323 | raise_softirq(TIMER_SOFTIRQ); |
1307 | } | 1324 | } |
1308 | 1325 | ||
1309 | /* | ||
1310 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | ||
1311 | * without sampling the sequence number in xtime_lock. | ||
1312 | * jiffies is defined in the linker script... | ||
1313 | */ | ||
1314 | |||
1315 | void do_timer(unsigned long ticks) | ||
1316 | { | ||
1317 | jiffies_64 += ticks; | ||
1318 | update_wall_time(); | ||
1319 | calc_global_load(); | ||
1320 | } | ||
1321 | |||
1322 | #ifdef __ARCH_WANT_SYS_ALARM | 1326 | #ifdef __ARCH_WANT_SYS_ALARM |
1323 | 1327 | ||
1324 | /* | 1328 | /* |
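Annotation: apply_slack() now grants timers without an explicit slack roughly 0.4% of their delay (delta/256) and rounds the expiry down to the largest power-of-two boundary that still falls inside [expires, expires + delta/256]; because mod_timer() applies this before its unchanged-expiry short-circuit, repeatedly re-armed timers collapse onto the same rounded jiffy and can be serviced together. A standalone re-derivation of the rounding with a worked value (the helper name is local to this sketch, not kernel API):

    #include <stdio.h>

    static unsigned long slack_round(unsigned long expires, unsigned long now)
    {
            long delta = (long)(expires - now);
            unsigned long limit, mask;
            int bit;

            if (delta < 256)
                    return expires;                 /* too close: keep it exact */

            limit = expires + delta / 256;          /* ~0.4% of the delay as slack */
            mask = expires ^ limit;                 /* bits free to vary in the window */
            if (mask == 0)
                    return expires;

            /* highest differing bit, then clear everything below it */
            bit = (int)(sizeof(unsigned long) * 8 - 1) - __builtin_clzl(mask);
            return limit & ~((1UL << bit) - 1);
    }

    int main(void)
    {
            /* A timer armed 10000 jiffies out gets a 39-jiffy window and lands
             * on jiffy 10016, a 32-jiffy boundary shared with its neighbours. */
            printf("%lu\n", slack_round(10000, 0));
            return 0;
    }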
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 538501c6ea50..2ad39e556cb4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS | |||
49 | help | 49 | help |
50 | See Documentation/trace/ftrace-design.txt | 50 | See Documentation/trace/ftrace-design.txt |
51 | 51 | ||
52 | config HAVE_C_RECORDMCOUNT | ||
53 | bool | ||
54 | help | ||
55 | C version of recordmcount available? | ||
56 | |||
52 | config TRACER_MAX_TRACE | 57 | config TRACER_MAX_TRACE |
53 | bool | 58 | bool |
54 | 59 | ||
@@ -64,6 +69,21 @@ config EVENT_TRACING | |||
64 | select CONTEXT_SWITCH_TRACER | 69 | select CONTEXT_SWITCH_TRACER |
65 | bool | 70 | bool |
66 | 71 | ||
72 | config EVENT_POWER_TRACING_DEPRECATED | ||
73 | depends on EVENT_TRACING | ||
74 | bool "Deprecated power event trace API, to be removed" | ||
75 | default y | ||
76 | help | ||
77 | Provides old power event types: | ||
78 | C-state/idle accounting events: | ||
79 | power:power_start | ||
80 | power:power_end | ||
81 | and old cpufreq accounting event: | ||
82 | power:power_frequency | ||
83 | This is for userspace compatibility | ||
84 | and will vanish after 5 kernel iterations, | ||
85 | namely 2.6.41. | ||
86 | |||
67 | config CONTEXT_SWITCH_TRACER | 87 | config CONTEXT_SWITCH_TRACER |
68 | bool | 88 | bool |
69 | 89 | ||
@@ -121,7 +141,7 @@ if FTRACE | |||
121 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
122 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
123 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
124 | select FRAME_POINTER | 144 | select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE |
125 | select KALLSYMS | 145 | select KALLSYMS |
126 | select GENERIC_TRACER | 146 | select GENERIC_TRACER |
127 | select CONTEXT_SWITCH_TRACER | 147 | select CONTEXT_SWITCH_TRACER |
@@ -255,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES | |||
255 | This tracer profiles all the likely and unlikely macros | 275 | This tracer profiles all the likely and unlikely macros |
256 | in the kernel. It will display the results in: | 276 | in the kernel. It will display the results in: |
257 | 277 | ||
258 | /sys/kernel/debug/tracing/profile_annotated_branch | 278 | /sys/kernel/debug/tracing/trace_stat/branch_annotated |
259 | 279 | ||
260 | Note: this will add a significant overhead; only turn this | 280 | Note: this will add a significant overhead; only turn this |
261 | on if you need to profile the system's use of these macros. | 281 | on if you need to profile the system's use of these macros. |
@@ -268,7 +288,7 @@ config PROFILE_ALL_BRANCHES | |||
268 | taken in the kernel is recorded whether it hit or miss. | 288 | taken in the kernel is recorded whether it hit or miss. |
269 | The results will be displayed in: | 289 | The results will be displayed in: |
270 | 290 | ||
271 | /sys/kernel/debug/tracing/profile_branch | 291 | /sys/kernel/debug/tracing/trace_stat/branch_all |
272 | 292 | ||
273 | This option also enables the likely/unlikely profiler. | 293 | This option also enables the likely/unlikely profiler. |
274 | 294 | ||
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f338190b26..761c510a06c5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | |||
52 | endif | 52 | endif |
53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
55 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o | 55 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o |
56 | ifeq ($(CONFIG_TRACING),y) | 56 | ifeq ($(CONFIG_TRACING),y) |
57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
58 | endif | 58 | endif |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 959f8d6c8cc1..6957aa298dfa 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -23,7 +23,6 @@ | |||
23 | #include <linux/mutex.h> | 23 | #include <linux/mutex.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/debugfs.h> | 25 | #include <linux/debugfs.h> |
26 | #include <linux/smp_lock.h> | ||
27 | #include <linux/time.h> | 26 | #include <linux/time.h> |
28 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
29 | 28 | ||
@@ -139,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) | |||
139 | !blk_tracer_enabled)) | 138 | !blk_tracer_enabled)) |
140 | return; | 139 | return; |
141 | 140 | ||
141 | /* | ||
142 | * If the BLK_TC_NOTIFY action mask isn't set, don't send any note | ||
143 | * message to the trace. | ||
144 | */ | ||
145 | if (!(bt->act_mask & BLK_TC_NOTIFY)) | ||
146 | return; | ||
147 | |||
142 | local_irq_save(flags); | 148 | local_irq_save(flags); |
143 | buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); | 149 | buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); |
144 | va_start(args, fmt); | 150 | va_start(args, fmt); |
@@ -169,7 +175,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, | |||
169 | static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), | 175 | static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), |
170 | BLK_TC_ACT(BLK_TC_WRITE) }; | 176 | BLK_TC_ACT(BLK_TC_WRITE) }; |
171 | 177 | ||
172 | #define BLK_TC_HARDBARRIER BLK_TC_BARRIER | ||
173 | #define BLK_TC_RAHEAD BLK_TC_AHEAD | 178 | #define BLK_TC_RAHEAD BLK_TC_AHEAD |
174 | 179 | ||
175 | /* The ilog2() calls fall out because they're constant */ | 180 | /* The ilog2() calls fall out because they're constant */ |
@@ -197,7 +202,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
197 | return; | 202 | return; |
198 | 203 | ||
199 | what |= ddir_act[rw & WRITE]; | 204 | what |= ddir_act[rw & WRITE]; |
200 | what |= MASK_TC_BIT(rw, HARDBARRIER); | ||
201 | what |= MASK_TC_BIT(rw, SYNC); | 205 | what |= MASK_TC_BIT(rw, SYNC); |
202 | what |= MASK_TC_BIT(rw, RAHEAD); | 206 | what |= MASK_TC_BIT(rw, RAHEAD); |
203 | what |= MASK_TC_BIT(rw, META); | 207 | what |= MASK_TC_BIT(rw, META); |
@@ -326,6 +330,7 @@ static const struct file_operations blk_dropped_fops = { | |||
326 | .owner = THIS_MODULE, | 330 | .owner = THIS_MODULE, |
327 | .open = blk_dropped_open, | 331 | .open = blk_dropped_open, |
328 | .read = blk_dropped_read, | 332 | .read = blk_dropped_read, |
333 | .llseek = default_llseek, | ||
329 | }; | 334 | }; |
330 | 335 | ||
331 | static int blk_msg_open(struct inode *inode, struct file *filp) | 336 | static int blk_msg_open(struct inode *inode, struct file *filp) |
@@ -365,6 +370,7 @@ static const struct file_operations blk_msg_fops = { | |||
365 | .owner = THIS_MODULE, | 370 | .owner = THIS_MODULE, |
366 | .open = blk_msg_open, | 371 | .open = blk_msg_open, |
367 | .write = blk_msg_write, | 372 | .write = blk_msg_write, |
373 | .llseek = noop_llseek, | ||
368 | }; | 374 | }; |
369 | 375 | ||
370 | /* | 376 | /* |
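
The two hunks above give the blktrace debugfs files an explicit .llseek now that the old BKL-era fallback is gone. A minimal sketch of the same pattern for a new debugfs file; this is illustrative kernel-module code written against the stock <linux/debugfs.h> and <linux/fs.h> interfaces of this era, not something taken from the patch:

    #include <linux/debugfs.h>
    #include <linux/errno.h>
    #include <linux/fs.h>
    #include <linux/module.h>

    static ssize_t demo_read(struct file *file, char __user *buf,
                             size_t count, loff_t *ppos)
    {
        static const char msg[] = "hello\n";

        /* The seek position is honoured, so the fops must say how seeking works. */
        return simple_read_from_buffer(buf, count, ppos, msg, sizeof(msg) - 1);
    }

    static const struct file_operations demo_fops = {
        .owner  = THIS_MODULE,
        .read   = demo_read,
        .llseek = default_llseek,   /* explicit policy, no implicit fallback */
    };

    static struct dentry *demo_dentry;

    static int __init demo_init(void)
    {
        demo_dentry = debugfs_create_file("llseek-demo", 0444, NULL, NULL,
                                          &demo_fops);
        return demo_dentry ? 0 : -ENOMEM;
    }

    static void __exit demo_exit(void)
    {
        debugfs_remove(demo_dentry);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
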
@@ -639,7 +645,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
639 | if (!q) | 645 | if (!q) |
640 | return -ENXIO; | 646 | return -ENXIO; |
641 | 647 | ||
642 | lock_kernel(); | ||
643 | mutex_lock(&bdev->bd_mutex); | 648 | mutex_lock(&bdev->bd_mutex); |
644 | 649 | ||
645 | switch (cmd) { | 650 | switch (cmd) { |
@@ -667,7 +672,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
667 | } | 672 | } |
668 | 673 | ||
669 | mutex_unlock(&bdev->bd_mutex); | 674 | mutex_unlock(&bdev->bd_mutex); |
670 | unlock_kernel(); | ||
671 | return ret; | 675 | return ret; |
672 | } | 676 | } |
673 | 677 | ||
@@ -699,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q) | |||
699 | * | 703 | * |
700 | **/ | 704 | **/ |
701 | static void blk_add_trace_rq(struct request_queue *q, struct request *rq, | 705 | static void blk_add_trace_rq(struct request_queue *q, struct request *rq, |
702 | u32 what) | 706 | u32 what) |
703 | { | 707 | { |
704 | struct blk_trace *bt = q->blk_trace; | 708 | struct blk_trace *bt = q->blk_trace; |
705 | int rw = rq->cmd_flags & 0x03; | ||
706 | 709 | ||
707 | if (likely(!bt)) | 710 | if (likely(!bt)) |
708 | return; | 711 | return; |
709 | 712 | ||
710 | if (rq->cmd_flags & REQ_DISCARD) | ||
711 | rw |= REQ_DISCARD; | ||
712 | |||
713 | if (rq->cmd_flags & REQ_SECURE) | ||
714 | rw |= REQ_SECURE; | ||
715 | |||
716 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { | 713 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { |
717 | what |= BLK_TC_ACT(BLK_TC_PC); | 714 | what |= BLK_TC_ACT(BLK_TC_PC); |
718 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, | 715 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, |
719 | what, rq->errors, rq->cmd_len, rq->cmd); | 716 | what, rq->errors, rq->cmd_len, rq->cmd); |
720 | } else { | 717 | } else { |
721 | what |= BLK_TC_ACT(BLK_TC_FS); | 718 | what |= BLK_TC_ACT(BLK_TC_FS); |
722 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, | 719 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), |
723 | what, rq->errors, 0, NULL); | 720 | rq->cmd_flags, what, rq->errors, 0, NULL); |
724 | } | 721 | } |
725 | } | 722 | } |
726 | 723 | ||
@@ -761,53 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore, | |||
761 | * @q: queue the io is for | 758 | * @q: queue the io is for |
762 | * @bio: the source bio | 759 | * @bio: the source bio |
763 | * @what: the action | 760 | * @what: the action |
761 | * @error: error, if any | ||
764 | * | 762 | * |
765 | * Description: | 763 | * Description: |
766 | * Records an action against a bio. Will log the bio offset + size. | 764 | * Records an action against a bio. Will log the bio offset + size. |
767 | * | 765 | * |
768 | **/ | 766 | **/ |
769 | static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, | 767 | static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, |
770 | u32 what) | 768 | u32 what, int error) |
771 | { | 769 | { |
772 | struct blk_trace *bt = q->blk_trace; | 770 | struct blk_trace *bt = q->blk_trace; |
773 | 771 | ||
774 | if (likely(!bt)) | 772 | if (likely(!bt)) |
775 | return; | 773 | return; |
776 | 774 | ||
775 | if (!error && !bio_flagged(bio, BIO_UPTODATE)) | ||
776 | error = EIO; | ||
777 | |||
777 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, | 778 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, |
778 | !bio_flagged(bio, BIO_UPTODATE), 0, NULL); | 779 | error, 0, NULL); |
779 | } | 780 | } |
780 | 781 | ||
781 | static void blk_add_trace_bio_bounce(void *ignore, | 782 | static void blk_add_trace_bio_bounce(void *ignore, |
782 | struct request_queue *q, struct bio *bio) | 783 | struct request_queue *q, struct bio *bio) |
783 | { | 784 | { |
784 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); | 785 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); |
785 | } | 786 | } |
786 | 787 | ||
787 | static void blk_add_trace_bio_complete(void *ignore, | 788 | static void blk_add_trace_bio_complete(void *ignore, |
788 | struct request_queue *q, struct bio *bio) | 789 | struct request_queue *q, struct bio *bio, |
790 | int error) | ||
789 | { | 791 | { |
790 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); | 792 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); |
791 | } | 793 | } |
792 | 794 | ||
793 | static void blk_add_trace_bio_backmerge(void *ignore, | 795 | static void blk_add_trace_bio_backmerge(void *ignore, |
794 | struct request_queue *q, | 796 | struct request_queue *q, |
795 | struct bio *bio) | 797 | struct bio *bio) |
796 | { | 798 | { |
797 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); | 799 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); |
798 | } | 800 | } |
799 | 801 | ||
800 | static void blk_add_trace_bio_frontmerge(void *ignore, | 802 | static void blk_add_trace_bio_frontmerge(void *ignore, |
801 | struct request_queue *q, | 803 | struct request_queue *q, |
802 | struct bio *bio) | 804 | struct bio *bio) |
803 | { | 805 | { |
804 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); | 806 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); |
805 | } | 807 | } |
806 | 808 | ||
807 | static void blk_add_trace_bio_queue(void *ignore, | 809 | static void blk_add_trace_bio_queue(void *ignore, |
808 | struct request_queue *q, struct bio *bio) | 810 | struct request_queue *q, struct bio *bio) |
809 | { | 811 | { |
810 | blk_add_trace_bio(q, bio, BLK_TA_QUEUE); | 812 | blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); |
811 | } | 813 | } |
812 | 814 | ||
813 | static void blk_add_trace_getrq(void *ignore, | 815 | static void blk_add_trace_getrq(void *ignore, |
@@ -815,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore, | |||
815 | struct bio *bio, int rw) | 817 | struct bio *bio, int rw) |
816 | { | 818 | { |
817 | if (bio) | 819 | if (bio) |
818 | blk_add_trace_bio(q, bio, BLK_TA_GETRQ); | 820 | blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); |
819 | else { | 821 | else { |
820 | struct blk_trace *bt = q->blk_trace; | 822 | struct blk_trace *bt = q->blk_trace; |
821 | 823 | ||
@@ -830,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore, | |||
830 | struct bio *bio, int rw) | 832 | struct bio *bio, int rw) |
831 | { | 833 | { |
832 | if (bio) | 834 | if (bio) |
833 | blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); | 835 | blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); |
834 | else { | 836 | else { |
835 | struct blk_trace *bt = q->blk_trace; | 837 | struct blk_trace *bt = q->blk_trace; |
836 | 838 | ||
@@ -848,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) | |||
848 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); | 850 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); |
849 | } | 851 | } |
850 | 852 | ||
851 | static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) | 853 | static void blk_add_trace_unplug(void *ignore, struct request_queue *q, |
854 | unsigned int depth, bool explicit) | ||
852 | { | 855 | { |
853 | struct blk_trace *bt = q->blk_trace; | 856 | struct blk_trace *bt = q->blk_trace; |
854 | 857 | ||
855 | if (bt) { | 858 | if (bt) { |
856 | unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; | 859 | __be64 rpdu = cpu_to_be64(depth); |
857 | __be64 rpdu = cpu_to_be64(pdu); | 860 | u32 what; |
858 | 861 | ||
859 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, | 862 | if (explicit) |
860 | sizeof(rpdu), &rpdu); | 863 | what = BLK_TA_UNPLUG_IO; |
861 | } | 864 | else |
862 | } | 865 | what = BLK_TA_UNPLUG_TIMER; |
863 | |||
864 | static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) | ||
865 | { | ||
866 | struct blk_trace *bt = q->blk_trace; | ||
867 | |||
868 | if (bt) { | ||
869 | unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; | ||
870 | __be64 rpdu = cpu_to_be64(pdu); | ||
871 | 866 | ||
872 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, | 867 | __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); |
873 | sizeof(rpdu), &rpdu); | ||
874 | } | 868 | } |
875 | } | 869 | } |
876 | 870 | ||
@@ -890,7 +884,7 @@ static void blk_add_trace_split(void *ignore, | |||
890 | } | 884 | } |
891 | 885 | ||
892 | /** | 886 | /** |
893 | * blk_add_trace_remap - Add a trace for a remap operation | 887 | * blk_add_trace_bio_remap - Add a trace for a bio-remap operation |
894 | * @ignore: trace callback data parameter (not used) | 888 | * @ignore: trace callback data parameter (not used) |
895 | * @q: queue the io is for | 889 | * @q: queue the io is for |
896 | * @bio: the source bio | 890 | * @bio: the source bio |
@@ -902,9 +896,9 @@ static void blk_add_trace_split(void *ignore, | |||
902 | * it spans a stripe (or similar). Add a trace for that action. | 896 | * it spans a stripe (or similar). Add a trace for that action. |
903 | * | 897 | * |
904 | **/ | 898 | **/ |
905 | static void blk_add_trace_remap(void *ignore, | 899 | static void blk_add_trace_bio_remap(void *ignore, |
906 | struct request_queue *q, struct bio *bio, | 900 | struct request_queue *q, struct bio *bio, |
907 | dev_t dev, sector_t from) | 901 | dev_t dev, sector_t from) |
908 | { | 902 | { |
909 | struct blk_trace *bt = q->blk_trace; | 903 | struct blk_trace *bt = q->blk_trace; |
910 | struct blk_io_trace_remap r; | 904 | struct blk_io_trace_remap r; |
@@ -1013,13 +1007,11 @@ static void blk_register_tracepoints(void) | |||
1013 | WARN_ON(ret); | 1007 | WARN_ON(ret); |
1014 | ret = register_trace_block_plug(blk_add_trace_plug, NULL); | 1008 | ret = register_trace_block_plug(blk_add_trace_plug, NULL); |
1015 | WARN_ON(ret); | 1009 | WARN_ON(ret); |
1016 | ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); | 1010 | ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); |
1017 | WARN_ON(ret); | ||
1018 | ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); | ||
1019 | WARN_ON(ret); | 1011 | WARN_ON(ret); |
1020 | ret = register_trace_block_split(blk_add_trace_split, NULL); | 1012 | ret = register_trace_block_split(blk_add_trace_split, NULL); |
1021 | WARN_ON(ret); | 1013 | WARN_ON(ret); |
1022 | ret = register_trace_block_remap(blk_add_trace_remap, NULL); | 1014 | ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); |
1023 | WARN_ON(ret); | 1015 | WARN_ON(ret); |
1024 | ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); | 1016 | ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); |
1025 | WARN_ON(ret); | 1017 | WARN_ON(ret); |
@@ -1028,10 +1020,9 @@ static void blk_register_tracepoints(void) | |||
1028 | static void blk_unregister_tracepoints(void) | 1020 | static void blk_unregister_tracepoints(void) |
1029 | { | 1021 | { |
1030 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); | 1022 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); |
1031 | unregister_trace_block_remap(blk_add_trace_remap, NULL); | 1023 | unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); |
1032 | unregister_trace_block_split(blk_add_trace_split, NULL); | 1024 | unregister_trace_block_split(blk_add_trace_split, NULL); |
1033 | unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); | 1025 | unregister_trace_block_unplug(blk_add_trace_unplug, NULL); |
1034 | unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); | ||
1035 | unregister_trace_block_plug(blk_add_trace_plug, NULL); | 1026 | unregister_trace_block_plug(blk_add_trace_plug, NULL); |
1036 | unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); | 1027 | unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); |
1037 | unregister_trace_block_getrq(blk_add_trace_getrq, NULL); | 1028 | unregister_trace_block_getrq(blk_add_trace_getrq, NULL); |
@@ -1652,10 +1643,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, | |||
1652 | struct block_device *bdev; | 1643 | struct block_device *bdev; |
1653 | ssize_t ret = -ENXIO; | 1644 | ssize_t ret = -ENXIO; |
1654 | 1645 | ||
1655 | lock_kernel(); | ||
1656 | bdev = bdget(part_devt(p)); | 1646 | bdev = bdget(part_devt(p)); |
1657 | if (bdev == NULL) | 1647 | if (bdev == NULL) |
1658 | goto out_unlock_kernel; | 1648 | goto out; |
1659 | 1649 | ||
1660 | q = blk_trace_get_queue(bdev); | 1650 | q = blk_trace_get_queue(bdev); |
1661 | if (q == NULL) | 1651 | if (q == NULL) |
@@ -1683,8 +1673,7 @@ out_unlock_bdev: | |||
1683 | mutex_unlock(&bdev->bd_mutex); | 1673 | mutex_unlock(&bdev->bd_mutex); |
1684 | out_bdput: | 1674 | out_bdput: |
1685 | bdput(bdev); | 1675 | bdput(bdev); |
1686 | out_unlock_kernel: | 1676 | out: |
1687 | unlock_kernel(); | ||
1688 | return ret; | 1677 | return ret; |
1689 | } | 1678 | } |
1690 | 1679 | ||
@@ -1714,11 +1703,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, | |||
1714 | 1703 | ||
1715 | ret = -ENXIO; | 1704 | ret = -ENXIO; |
1716 | 1705 | ||
1717 | lock_kernel(); | ||
1718 | p = dev_to_part(dev); | 1706 | p = dev_to_part(dev); |
1719 | bdev = bdget(part_devt(p)); | 1707 | bdev = bdget(part_devt(p)); |
1720 | if (bdev == NULL) | 1708 | if (bdev == NULL) |
1721 | goto out_unlock_kernel; | 1709 | goto out; |
1722 | 1710 | ||
1723 | q = blk_trace_get_queue(bdev); | 1711 | q = blk_trace_get_queue(bdev); |
1724 | if (q == NULL) | 1712 | if (q == NULL) |
@@ -1753,8 +1741,6 @@ out_unlock_bdev: | |||
1753 | mutex_unlock(&bdev->bd_mutex); | 1741 | mutex_unlock(&bdev->bd_mutex); |
1754 | out_bdput: | 1742 | out_bdput: |
1755 | bdput(bdev); | 1743 | bdput(bdev); |
1756 | out_unlock_kernel: | ||
1757 | unlock_kernel(); | ||
1758 | out: | 1744 | out: |
1759 | return ret ? ret : count; | 1745 | return ret ? ret : count; |
1760 | } | 1746 | } |
@@ -1813,8 +1799,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
1813 | 1799 | ||
1814 | if (rw & REQ_RAHEAD) | 1800 | if (rw & REQ_RAHEAD) |
1815 | rwbs[i++] = 'A'; | 1801 | rwbs[i++] = 'A'; |
1816 | if (rw & REQ_HARDBARRIER) | ||
1817 | rwbs[i++] = 'B'; | ||
1818 | if (rw & REQ_SYNC) | 1802 | if (rw & REQ_SYNC) |
1819 | rwbs[i++] = 'S'; | 1803 | rwbs[i++] = 'S'; |
1820 | if (rw & REQ_META) | 1804 | if (rw & REQ_META) |
@@ -1825,21 +1809,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
1825 | rwbs[i] = '\0'; | 1809 | rwbs[i] = '\0'; |
1826 | } | 1810 | } |
1827 | 1811 | ||
1828 | void blk_fill_rwbs_rq(char *rwbs, struct request *rq) | ||
1829 | { | ||
1830 | int rw = rq->cmd_flags & 0x03; | ||
1831 | int bytes; | ||
1832 | |||
1833 | if (rq->cmd_flags & REQ_DISCARD) | ||
1834 | rw |= REQ_DISCARD; | ||
1835 | |||
1836 | if (rq->cmd_flags & REQ_SECURE) | ||
1837 | rw |= REQ_SECURE; | ||
1838 | |||
1839 | bytes = blk_rq_bytes(rq); | ||
1840 | |||
1841 | blk_fill_rwbs(rwbs, rw, bytes); | ||
1842 | } | ||
1843 | |||
1844 | #endif /* CONFIG_EVENT_TRACING */ | 1812 | #endif /* CONFIG_EVENT_TRACING */ |
1845 | 1813 | ||
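
With REQ_HARDBARRIER gone, blk_fill_rwbs() no longer emits a 'B' character, and the request variant is dropped because callers can now pass rq->cmd_flags directly. A self-contained sketch of the flag-to-letters idea; the bit definitions are stand-ins for illustration, not the kernel's REQ_* values:

    #include <stdio.h>

    /* Stand-in flag bits; the real REQ_* values live in the block headers. */
    #define RWBS_WRITE   (1u << 0)
    #define RWBS_RAHEAD  (1u << 1)
    #define RWBS_SYNC    (1u << 2)
    #define RWBS_META    (1u << 3)

    /* Build a short "rwbs" string: direction first, then modifier letters. */
    static void fill_rwbs(char *rwbs, unsigned int rw, int bytes)
    {
        int i = 0;

        if (rw & RWBS_WRITE)
            rwbs[i++] = 'W';
        else if (bytes)
            rwbs[i++] = 'R';
        else
            rwbs[i++] = 'N';

        if (rw & RWBS_RAHEAD)
            rwbs[i++] = 'A';
        if (rw & RWBS_SYNC)
            rwbs[i++] = 'S';
        if (rw & RWBS_META)
            rwbs[i++] = 'M';
        rwbs[i] = '\0';
    }

    int main(void)
    {
        char rwbs[8];

        fill_rwbs(rwbs, RWBS_WRITE | RWBS_SYNC, 4096);
        printf("%s\n", rwbs);   /* prints "WS" */
        return 0;
    }
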
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa7ece649fe1..908038f57440 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -39,20 +39,26 @@ | |||
39 | #include "trace_stat.h" | 39 | #include "trace_stat.h" |
40 | 40 | ||
41 | #define FTRACE_WARN_ON(cond) \ | 41 | #define FTRACE_WARN_ON(cond) \ |
42 | do { \ | 42 | ({ \ |
43 | if (WARN_ON(cond)) \ | 43 | int ___r = cond; \ |
44 | if (WARN_ON(___r)) \ | ||
44 | ftrace_kill(); \ | 45 | ftrace_kill(); \ |
45 | } while (0) | 46 | ___r; \ |
47 | }) | ||
46 | 48 | ||
47 | #define FTRACE_WARN_ON_ONCE(cond) \ | 49 | #define FTRACE_WARN_ON_ONCE(cond) \ |
48 | do { \ | 50 | ({ \ |
49 | if (WARN_ON_ONCE(cond)) \ | 51 | int ___r = cond; \ |
52 | if (WARN_ON_ONCE(___r)) \ | ||
50 | ftrace_kill(); \ | 53 | ftrace_kill(); \ |
51 | } while (0) | 54 | ___r; \ |
55 | }) | ||
52 | 56 | ||
53 | /* hash bits for specific function selection */ | 57 | /* hash bits for specific function selection */ |
54 | #define FTRACE_HASH_BITS 7 | 58 | #define FTRACE_HASH_BITS 7 |
55 | #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) | 59 | #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) |
60 | #define FTRACE_HASH_DEFAULT_BITS 10 | ||
61 | #define FTRACE_HASH_MAX_BITS 12 | ||
56 | 62 | ||
57 | /* ftrace_enabled is a method to turn ftrace on or off */ | 63 | /* ftrace_enabled is a method to turn ftrace on or off */ |
58 | int ftrace_enabled __read_mostly; | 64 | int ftrace_enabled __read_mostly; |
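
FTRACE_WARN_ON() and FTRACE_WARN_ON_ONCE() are rewritten as GNU C statement expressions so they keep their warn-and-kill side effect while also evaluating to the condition, which lets later code use them inside an if (). A standalone sketch of that idiom (it relies on the GCC/Clang statement-expression extension), with a plain fprintf() standing in for WARN_ON():

    #include <stdio.h>

    /* Evaluate cond once, report it if true, and yield its value so the
     * macro can sit inside an if () like an ordinary expression. */
    #define CHECK_WARN_ON(cond)                              \
        ({                                                   \
            int ___r = !!(cond);                             \
            if (___r)                                        \
                fprintf(stderr, "warning: %s\n", #cond);     \
            ___r;                                            \
        })

    int main(void)
    {
        int broken = 1;

        if (CHECK_WARN_ON(broken))   /* warns and takes the branch */
            return 1;
        return 0;
    }
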
@@ -81,28 +87,40 @@ static struct ftrace_ops ftrace_list_end __read_mostly = | |||
81 | .func = ftrace_stub, | 87 | .func = ftrace_stub, |
82 | }; | 88 | }; |
83 | 89 | ||
84 | static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; | 90 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; |
91 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | ||
85 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 92 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
86 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; | 93 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; |
87 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | 94 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; |
95 | static struct ftrace_ops global_ops; | ||
96 | |||
97 | static void | ||
98 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); | ||
88 | 99 | ||
89 | /* | 100 | /* |
90 | * Traverse the ftrace_list, invoking all entries. The reason that we | 101 | * Traverse the ftrace_global_list, invoking all entries. The reason that we |
91 | * can use rcu_dereference_raw() is that elements removed from this list | 102 | * can use rcu_dereference_raw() is that elements removed from this list |
92 | * are simply leaked, so there is no need to interact with a grace-period | 103 | * are simply leaked, so there is no need to interact with a grace-period |
93 | * mechanism. The rcu_dereference_raw() calls are needed to handle | 104 | * mechanism. The rcu_dereference_raw() calls are needed to handle |
94 | * concurrent insertions into the ftrace_list. | 105 | * concurrent insertions into the ftrace_global_list. |
95 | * | 106 | * |
96 | * Silly Alpha and silly pointer-speculation compiler optimizations! | 107 | * Silly Alpha and silly pointer-speculation compiler optimizations! |
97 | */ | 108 | */ |
98 | static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) | 109 | static void ftrace_global_list_func(unsigned long ip, |
110 | unsigned long parent_ip) | ||
99 | { | 111 | { |
100 | struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ | 112 | struct ftrace_ops *op; |
101 | 113 | ||
114 | if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) | ||
115 | return; | ||
116 | |||
117 | trace_recursion_set(TRACE_GLOBAL_BIT); | ||
118 | op = rcu_dereference_raw(ftrace_global_list); /*see above*/ | ||
102 | while (op != &ftrace_list_end) { | 119 | while (op != &ftrace_list_end) { |
103 | op->func(ip, parent_ip); | 120 | op->func(ip, parent_ip); |
104 | op = rcu_dereference_raw(op->next); /*see above*/ | 121 | op = rcu_dereference_raw(op->next); /*see above*/ |
105 | }; | 122 | }; |
123 | trace_recursion_clear(TRACE_GLOBAL_BIT); | ||
106 | } | 124 | } |
107 | 125 | ||
108 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) | 126 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) |
@@ -147,46 +165,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) | |||
147 | } | 165 | } |
148 | #endif | 166 | #endif |
149 | 167 | ||
150 | static int __register_ftrace_function(struct ftrace_ops *ops) | 168 | static void update_global_ops(void) |
151 | { | 169 | { |
152 | ops->next = ftrace_list; | 170 | ftrace_func_t func; |
171 | |||
153 | /* | 172 | /* |
154 | * We are entering ops into the ftrace_list but another | 173 | * If there's only one function registered, then call that |
155 | * CPU might be walking that list. We need to make sure | 174 | * function directly. Otherwise, we need to iterate over the |
156 | * the ops->next pointer is valid before another CPU sees | 175 | * registered callers. |
157 | * the ops pointer included into the ftrace_list. | ||
158 | */ | 176 | */ |
159 | rcu_assign_pointer(ftrace_list, ops); | 177 | if (ftrace_global_list == &ftrace_list_end || |
178 | ftrace_global_list->next == &ftrace_list_end) | ||
179 | func = ftrace_global_list->func; | ||
180 | else | ||
181 | func = ftrace_global_list_func; | ||
160 | 182 | ||
161 | if (ftrace_enabled) { | 183 | /* If we filter on pids, update to use the pid function */ |
162 | ftrace_func_t func; | 184 | if (!list_empty(&ftrace_pids)) { |
185 | set_ftrace_pid_function(func); | ||
186 | func = ftrace_pid_func; | ||
187 | } | ||
163 | 188 | ||
164 | if (ops->next == &ftrace_list_end) | 189 | global_ops.func = func; |
165 | func = ops->func; | 190 | } |
166 | else | ||
167 | func = ftrace_list_func; | ||
168 | 191 | ||
169 | if (!list_empty(&ftrace_pids)) { | 192 | static void update_ftrace_function(void) |
170 | set_ftrace_pid_function(func); | 193 | { |
171 | func = ftrace_pid_func; | 194 | ftrace_func_t func; |
172 | } | 195 | |
196 | update_global_ops(); | ||
197 | |||
198 | /* | ||
199 | * If we are at the end of the list and this ops is | ||
200 | * not dynamic, then have the mcount trampoline call | ||
201 | * the function directly | ||
202 | */ | ||
203 | if (ftrace_ops_list == &ftrace_list_end || | ||
204 | (ftrace_ops_list->next == &ftrace_list_end && | ||
205 | !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) | ||
206 | func = ftrace_ops_list->func; | ||
207 | else | ||
208 | func = ftrace_ops_list_func; | ||
173 | 209 | ||
174 | /* | ||
175 | * For one func, simply call it directly. | ||
176 | * For more than one func, call the chain. | ||
177 | */ | ||
178 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 210 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST |
179 | ftrace_trace_function = func; | 211 | ftrace_trace_function = func; |
180 | #else | 212 | #else |
181 | __ftrace_trace_function = func; | 213 | __ftrace_trace_function = func; |
182 | ftrace_trace_function = ftrace_test_stop_func; | 214 | ftrace_trace_function = ftrace_test_stop_func; |
183 | #endif | 215 | #endif |
184 | } | 216 | } |
185 | 217 | ||
186 | return 0; | 218 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
219 | { | ||
220 | ops->next = *list; | ||
221 | /* | ||
222 | * We are entering ops into the list but another | ||
223 | * CPU might be walking that list. We need to make sure | ||
224 | * the ops->next pointer is valid before another CPU sees | ||
225 | * the ops pointer included into the list. | ||
226 | */ | ||
227 | rcu_assign_pointer(*list, ops); | ||
187 | } | 228 | } |
188 | 229 | ||
189 | static int __unregister_ftrace_function(struct ftrace_ops *ops) | 230 | static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
190 | { | 231 | { |
191 | struct ftrace_ops **p; | 232 | struct ftrace_ops **p; |
192 | 233 | ||
@@ -194,13 +235,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
194 | * If we are removing the last function, then simply point | 235 | * If we are removing the last function, then simply point |
195 | * to the ftrace_stub. | 236 | * to the ftrace_stub. |
196 | */ | 237 | */ |
197 | if (ftrace_list == ops && ops->next == &ftrace_list_end) { | 238 | if (*list == ops && ops->next == &ftrace_list_end) { |
198 | ftrace_trace_function = ftrace_stub; | 239 | *list = &ftrace_list_end; |
199 | ftrace_list = &ftrace_list_end; | ||
200 | return 0; | 240 | return 0; |
201 | } | 241 | } |
202 | 242 | ||
203 | for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) | 243 | for (p = list; *p != &ftrace_list_end; p = &(*p)->next) |
204 | if (*p == ops) | 244 | if (*p == ops) |
205 | break; | 245 | break; |
206 | 246 | ||
@@ -208,53 +248,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
208 | return -1; | 248 | return -1; |
209 | 249 | ||
210 | *p = (*p)->next; | 250 | *p = (*p)->next; |
251 | return 0; | ||
252 | } | ||
211 | 253 | ||
212 | if (ftrace_enabled) { | 254 | static int __register_ftrace_function(struct ftrace_ops *ops) |
213 | /* If we only have one func left, then call that directly */ | 255 | { |
214 | if (ftrace_list->next == &ftrace_list_end) { | 256 | if (ftrace_disabled) |
215 | ftrace_func_t func = ftrace_list->func; | 257 | return -ENODEV; |
216 | 258 | ||
217 | if (!list_empty(&ftrace_pids)) { | 259 | if (FTRACE_WARN_ON(ops == &global_ops)) |
218 | set_ftrace_pid_function(func); | 260 | return -EINVAL; |
219 | func = ftrace_pid_func; | 261 | |
220 | } | 262 | if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) |
221 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 263 | return -EBUSY; |
222 | ftrace_trace_function = func; | 264 | |
223 | #else | 265 | if (!core_kernel_data((unsigned long)ops)) |
224 | __ftrace_trace_function = func; | 266 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; |
225 | #endif | 267 | |
226 | } | 268 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
227 | } | 269 | int first = ftrace_global_list == &ftrace_list_end; |
270 | add_ftrace_ops(&ftrace_global_list, ops); | ||
271 | ops->flags |= FTRACE_OPS_FL_ENABLED; | ||
272 | if (first) | ||
273 | add_ftrace_ops(&ftrace_ops_list, &global_ops); | ||
274 | } else | ||
275 | add_ftrace_ops(&ftrace_ops_list, ops); | ||
276 | |||
277 | if (ftrace_enabled) | ||
278 | update_ftrace_function(); | ||
228 | 279 | ||
229 | return 0; | 280 | return 0; |
230 | } | 281 | } |
231 | 282 | ||
232 | static void ftrace_update_pid_func(void) | 283 | static int __unregister_ftrace_function(struct ftrace_ops *ops) |
233 | { | 284 | { |
234 | ftrace_func_t func; | 285 | int ret; |
235 | 286 | ||
236 | if (ftrace_trace_function == ftrace_stub) | 287 | if (ftrace_disabled) |
237 | return; | 288 | return -ENODEV; |
238 | 289 | ||
239 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 290 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) |
240 | func = ftrace_trace_function; | 291 | return -EBUSY; |
241 | #else | ||
242 | func = __ftrace_trace_function; | ||
243 | #endif | ||
244 | 292 | ||
245 | if (!list_empty(&ftrace_pids)) { | 293 | if (FTRACE_WARN_ON(ops == &global_ops)) |
246 | set_ftrace_pid_function(func); | 294 | return -EINVAL; |
247 | func = ftrace_pid_func; | ||
248 | } else { | ||
249 | if (func == ftrace_pid_func) | ||
250 | func = ftrace_pid_function; | ||
251 | } | ||
252 | 295 | ||
253 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 296 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
254 | ftrace_trace_function = func; | 297 | ret = remove_ftrace_ops(&ftrace_global_list, ops); |
255 | #else | 298 | if (!ret && ftrace_global_list == &ftrace_list_end) |
256 | __ftrace_trace_function = func; | 299 | ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); |
257 | #endif | 300 | if (!ret) |
301 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
302 | } else | ||
303 | ret = remove_ftrace_ops(&ftrace_ops_list, ops); | ||
304 | |||
305 | if (ret < 0) | ||
306 | return ret; | ||
307 | |||
308 | if (ftrace_enabled) | ||
309 | update_ftrace_function(); | ||
310 | |||
311 | /* | ||
312 | * Dynamic ops may be freed, we must make sure that all | ||
313 | * callers are done before leaving this function. | ||
314 | */ | ||
315 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC) | ||
316 | synchronize_sched(); | ||
317 | |||
318 | return 0; | ||
319 | } | ||
320 | |||
321 | static void ftrace_update_pid_func(void) | ||
322 | { | ||
323 | /* Only do something if we are tracing something */ | ||
324 | if (ftrace_trace_function == ftrace_stub) | ||
325 | return; | ||
326 | |||
327 | update_ftrace_function(); | ||
258 | } | 328 | } |
259 | 329 | ||
260 | #ifdef CONFIG_FUNCTION_PROFILER | 330 | #ifdef CONFIG_FUNCTION_PROFILER |
@@ -800,6 +870,7 @@ static const struct file_operations ftrace_profile_fops = { | |||
800 | .open = tracing_open_generic, | 870 | .open = tracing_open_generic, |
801 | .read = ftrace_profile_read, | 871 | .read = ftrace_profile_read, |
802 | .write = ftrace_profile_write, | 872 | .write = ftrace_profile_write, |
873 | .llseek = default_llseek, | ||
803 | }; | 874 | }; |
804 | 875 | ||
805 | /* used to initialize the real stat files */ | 876 | /* used to initialize the real stat files */ |
@@ -884,13 +955,38 @@ enum { | |||
884 | FTRACE_ENABLE_CALLS = (1 << 0), | 955 | FTRACE_ENABLE_CALLS = (1 << 0), |
885 | FTRACE_DISABLE_CALLS = (1 << 1), | 956 | FTRACE_DISABLE_CALLS = (1 << 1), |
886 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), | 957 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), |
887 | FTRACE_ENABLE_MCOUNT = (1 << 3), | 958 | FTRACE_START_FUNC_RET = (1 << 3), |
888 | FTRACE_DISABLE_MCOUNT = (1 << 4), | 959 | FTRACE_STOP_FUNC_RET = (1 << 4), |
889 | FTRACE_START_FUNC_RET = (1 << 5), | 960 | }; |
890 | FTRACE_STOP_FUNC_RET = (1 << 6), | 961 | struct ftrace_func_entry { |
962 | struct hlist_node hlist; | ||
963 | unsigned long ip; | ||
964 | }; | ||
965 | |||
966 | struct ftrace_hash { | ||
967 | unsigned long size_bits; | ||
968 | struct hlist_head *buckets; | ||
969 | unsigned long count; | ||
970 | struct rcu_head rcu; | ||
971 | }; | ||
972 | |||
973 | /* | ||
974 | * We make these constant because no one should touch them, | ||
975 | * but they are used as the default "empty hash", to avoid allocating | ||
976 | * it all the time. These are in a read only section such that if | ||
977 | * anyone does try to modify it, it will cause an exception. | ||
978 | */ | ||
979 | static const struct hlist_head empty_buckets[1]; | ||
980 | static const struct ftrace_hash empty_hash = { | ||
981 | .buckets = (struct hlist_head *)empty_buckets, | ||
891 | }; | 982 | }; |
983 | #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) | ||
892 | 984 | ||
893 | static int ftrace_filtered; | 985 | static struct ftrace_ops global_ops = { |
986 | .func = ftrace_stub, | ||
987 | .notrace_hash = EMPTY_HASH, | ||
988 | .filter_hash = EMPTY_HASH, | ||
989 | }; | ||
894 | 990 | ||
895 | static struct dyn_ftrace *ftrace_new_addrs; | 991 | static struct dyn_ftrace *ftrace_new_addrs; |
896 | 992 | ||
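
As the comment above explains, a single constant empty hash is shared as the default for every ops instead of allocating a fresh one each time, and keeping it const means an accidental write faults instead of silently corrupting shared state. A tiny standalone sketch of the shared read-only sentinel pattern; the structure and names here are illustrative only:

    #include <stdio.h>

    struct demo_hash {
        unsigned long size_bits;
        unsigned long count;
    };

    /* One shared, read-only default; it typically lands in a read-only
     * section, so a stray write through a cast pointer faults rather
     * than corrupting the sentinel every user relies on. */
    static const struct demo_hash empty_hash;
    #define EMPTY_HASH ((struct demo_hash *)&empty_hash)

    struct demo_ops {
        struct demo_hash *filter_hash;
    };

    int main(void)
    {
        struct demo_ops ops = { .filter_hash = EMPTY_HASH };

        printf("entries: %lu\n", ops.filter_hash->count);   /* 0, no allocation */
        return 0;
    }
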
@@ -913,6 +1009,269 @@ static struct ftrace_page *ftrace_pages; | |||
913 | 1009 | ||
914 | static struct dyn_ftrace *ftrace_free_records; | 1010 | static struct dyn_ftrace *ftrace_free_records; |
915 | 1011 | ||
1012 | static struct ftrace_func_entry * | ||
1013 | ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | ||
1014 | { | ||
1015 | unsigned long key; | ||
1016 | struct ftrace_func_entry *entry; | ||
1017 | struct hlist_head *hhd; | ||
1018 | struct hlist_node *n; | ||
1019 | |||
1020 | if (!hash->count) | ||
1021 | return NULL; | ||
1022 | |||
1023 | if (hash->size_bits > 0) | ||
1024 | key = hash_long(ip, hash->size_bits); | ||
1025 | else | ||
1026 | key = 0; | ||
1027 | |||
1028 | hhd = &hash->buckets[key]; | ||
1029 | |||
1030 | hlist_for_each_entry_rcu(entry, n, hhd, hlist) { | ||
1031 | if (entry->ip == ip) | ||
1032 | return entry; | ||
1033 | } | ||
1034 | return NULL; | ||
1035 | } | ||
1036 | |||
1037 | static void __add_hash_entry(struct ftrace_hash *hash, | ||
1038 | struct ftrace_func_entry *entry) | ||
1039 | { | ||
1040 | struct hlist_head *hhd; | ||
1041 | unsigned long key; | ||
1042 | |||
1043 | if (hash->size_bits) | ||
1044 | key = hash_long(entry->ip, hash->size_bits); | ||
1045 | else | ||
1046 | key = 0; | ||
1047 | |||
1048 | hhd = &hash->buckets[key]; | ||
1049 | hlist_add_head(&entry->hlist, hhd); | ||
1050 | hash->count++; | ||
1051 | } | ||
1052 | |||
1053 | static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) | ||
1054 | { | ||
1055 | struct ftrace_func_entry *entry; | ||
1056 | |||
1057 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | ||
1058 | if (!entry) | ||
1059 | return -ENOMEM; | ||
1060 | |||
1061 | entry->ip = ip; | ||
1062 | __add_hash_entry(hash, entry); | ||
1063 | |||
1064 | return 0; | ||
1065 | } | ||
1066 | |||
1067 | static void | ||
1068 | free_hash_entry(struct ftrace_hash *hash, | ||
1069 | struct ftrace_func_entry *entry) | ||
1070 | { | ||
1071 | hlist_del(&entry->hlist); | ||
1072 | kfree(entry); | ||
1073 | hash->count--; | ||
1074 | } | ||
1075 | |||
1076 | static void | ||
1077 | remove_hash_entry(struct ftrace_hash *hash, | ||
1078 | struct ftrace_func_entry *entry) | ||
1079 | { | ||
1080 | hlist_del(&entry->hlist); | ||
1081 | hash->count--; | ||
1082 | } | ||
1083 | |||
1084 | static void ftrace_hash_clear(struct ftrace_hash *hash) | ||
1085 | { | ||
1086 | struct hlist_head *hhd; | ||
1087 | struct hlist_node *tp, *tn; | ||
1088 | struct ftrace_func_entry *entry; | ||
1089 | int size = 1 << hash->size_bits; | ||
1090 | int i; | ||
1091 | |||
1092 | if (!hash->count) | ||
1093 | return; | ||
1094 | |||
1095 | for (i = 0; i < size; i++) { | ||
1096 | hhd = &hash->buckets[i]; | ||
1097 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) | ||
1098 | free_hash_entry(hash, entry); | ||
1099 | } | ||
1100 | FTRACE_WARN_ON(hash->count); | ||
1101 | } | ||
1102 | |||
1103 | static void free_ftrace_hash(struct ftrace_hash *hash) | ||
1104 | { | ||
1105 | if (!hash || hash == EMPTY_HASH) | ||
1106 | return; | ||
1107 | ftrace_hash_clear(hash); | ||
1108 | kfree(hash->buckets); | ||
1109 | kfree(hash); | ||
1110 | } | ||
1111 | |||
1112 | static void __free_ftrace_hash_rcu(struct rcu_head *rcu) | ||
1113 | { | ||
1114 | struct ftrace_hash *hash; | ||
1115 | |||
1116 | hash = container_of(rcu, struct ftrace_hash, rcu); | ||
1117 | free_ftrace_hash(hash); | ||
1118 | } | ||
1119 | |||
1120 | static void free_ftrace_hash_rcu(struct ftrace_hash *hash) | ||
1121 | { | ||
1122 | if (!hash || hash == EMPTY_HASH) | ||
1123 | return; | ||
1124 | call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); | ||
1125 | } | ||
1126 | |||
1127 | static struct ftrace_hash *alloc_ftrace_hash(int size_bits) | ||
1128 | { | ||
1129 | struct ftrace_hash *hash; | ||
1130 | int size; | ||
1131 | |||
1132 | hash = kzalloc(sizeof(*hash), GFP_KERNEL); | ||
1133 | if (!hash) | ||
1134 | return NULL; | ||
1135 | |||
1136 | size = 1 << size_bits; | ||
1137 | hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); | ||
1138 | |||
1139 | if (!hash->buckets) { | ||
1140 | kfree(hash); | ||
1141 | return NULL; | ||
1142 | } | ||
1143 | |||
1144 | hash->size_bits = size_bits; | ||
1145 | |||
1146 | return hash; | ||
1147 | } | ||
1148 | |||
1149 | static struct ftrace_hash * | ||
1150 | alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | ||
1151 | { | ||
1152 | struct ftrace_func_entry *entry; | ||
1153 | struct ftrace_hash *new_hash; | ||
1154 | struct hlist_node *tp; | ||
1155 | int size; | ||
1156 | int ret; | ||
1157 | int i; | ||
1158 | |||
1159 | new_hash = alloc_ftrace_hash(size_bits); | ||
1160 | if (!new_hash) | ||
1161 | return NULL; | ||
1162 | |||
1163 | /* Empty hash? */ | ||
1164 | if (!hash || !hash->count) | ||
1165 | return new_hash; | ||
1166 | |||
1167 | size = 1 << hash->size_bits; | ||
1168 | for (i = 0; i < size; i++) { | ||
1169 | hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { | ||
1170 | ret = add_hash_entry(new_hash, entry->ip); | ||
1171 | if (ret < 0) | ||
1172 | goto free_hash; | ||
1173 | } | ||
1174 | } | ||
1175 | |||
1176 | FTRACE_WARN_ON(new_hash->count != hash->count); | ||
1177 | |||
1178 | return new_hash; | ||
1179 | |||
1180 | free_hash: | ||
1181 | free_ftrace_hash(new_hash); | ||
1182 | return NULL; | ||
1183 | } | ||
1184 | |||
1185 | static int | ||
1186 | ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) | ||
1187 | { | ||
1188 | struct ftrace_func_entry *entry; | ||
1189 | struct hlist_node *tp, *tn; | ||
1190 | struct hlist_head *hhd; | ||
1191 | struct ftrace_hash *old_hash; | ||
1192 | struct ftrace_hash *new_hash; | ||
1193 | unsigned long key; | ||
1194 | int size = src->count; | ||
1195 | int bits = 0; | ||
1196 | int i; | ||
1197 | |||
1198 | /* | ||
1199 | * If the new source is empty, just free dst and assign it | ||
1200 | * the empty_hash. | ||
1201 | */ | ||
1202 | if (!src->count) { | ||
1203 | free_ftrace_hash_rcu(*dst); | ||
1204 | rcu_assign_pointer(*dst, EMPTY_HASH); | ||
1205 | return 0; | ||
1206 | } | ||
1207 | |||
1208 | /* | ||
1209 | * Make the hash size about 1/2 the # found | ||
1210 | */ | ||
1211 | for (size /= 2; size; size >>= 1) | ||
1212 | bits++; | ||
1213 | |||
1214 | /* Don't allocate too much */ | ||
1215 | if (bits > FTRACE_HASH_MAX_BITS) | ||
1216 | bits = FTRACE_HASH_MAX_BITS; | ||
1217 | |||
1218 | new_hash = alloc_ftrace_hash(bits); | ||
1219 | if (!new_hash) | ||
1220 | return -ENOMEM; | ||
1221 | |||
1222 | size = 1 << src->size_bits; | ||
1223 | for (i = 0; i < size; i++) { | ||
1224 | hhd = &src->buckets[i]; | ||
1225 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { | ||
1226 | if (bits > 0) | ||
1227 | key = hash_long(entry->ip, bits); | ||
1228 | else | ||
1229 | key = 0; | ||
1230 | remove_hash_entry(src, entry); | ||
1231 | __add_hash_entry(new_hash, entry); | ||
1232 | } | ||
1233 | } | ||
1234 | |||
1235 | old_hash = *dst; | ||
1236 | rcu_assign_pointer(*dst, new_hash); | ||
1237 | free_ftrace_hash_rcu(old_hash); | ||
1238 | |||
1239 | return 0; | ||
1240 | } | ||
1241 | |||
1242 | /* | ||
1243 | * Test the hashes for this ops to see if we want to call | ||
1244 | * the ops->func or not. | ||
1245 | * | ||
1246 | * It's a match if the ip is in the ops->filter_hash or | ||
1247 | * the filter_hash does not exist or is empty, | ||
1248 | * AND | ||
1249 | * the ip is not in the ops->notrace_hash. | ||
1250 | * | ||
1251 | * This needs to be called with preemption disabled as | ||
1252 | * the hashes are freed with call_rcu_sched(). | ||
1253 | */ | ||
1254 | static int | ||
1255 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | ||
1256 | { | ||
1257 | struct ftrace_hash *filter_hash; | ||
1258 | struct ftrace_hash *notrace_hash; | ||
1259 | int ret; | ||
1260 | |||
1261 | filter_hash = rcu_dereference_raw(ops->filter_hash); | ||
1262 | notrace_hash = rcu_dereference_raw(ops->notrace_hash); | ||
1263 | |||
1264 | if ((!filter_hash || !filter_hash->count || | ||
1265 | ftrace_lookup_ip(filter_hash, ip)) && | ||
1266 | (!notrace_hash || !notrace_hash->count || | ||
1267 | !ftrace_lookup_ip(notrace_hash, ip))) | ||
1268 | ret = 1; | ||
1269 | else | ||
1270 | ret = 0; | ||
1271 | |||
1272 | return ret; | ||
1273 | } | ||
1274 | |||
916 | /* | 1275 | /* |
917 | * This is a double for. Do not use 'break' to break out of the loop, | 1276 | * This is a double for. Do not use 'break' to break out of the loop, |
918 | * you must use a goto. | 1277 | * you must use a goto. |
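
Two pieces of the hunk above are easy to miss: ftrace_ops_test() treats an empty filter hash as "trace everything" while the notrace hash always wins, and ftrace_hash_move() sizes the new table to roughly half the entry count, capped at FTRACE_HASH_MAX_BITS. A self-contained sketch of both rules, with flat arrays standing in for the real hash:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Linear lookup stands in for ftrace_lookup_ip(). */
    static bool in_set(const unsigned long *set, size_t n, unsigned long ip)
    {
        for (size_t i = 0; i < n; i++)
            if (set[i] == ip)
                return true;
        return false;
    }

    /* Match rule from the comment above: an empty filter means "trace all",
     * and the notrace set always excludes. */
    static bool ops_test(const unsigned long *filter, size_t nf,
                         const unsigned long *notrace, size_t nn,
                         unsigned long ip)
    {
        return (nf == 0 || in_set(filter, nf, ip)) &&
               !in_set(notrace, nn, ip);
    }

    /* Sizing rule: about count/2 buckets, i.e. bits grows with log2(count/2),
     * capped at a maximum (12 in the hunk above). */
    static int hash_bits(int count, int max_bits)
    {
        int bits = 0;

        for (count /= 2; count; count >>= 1)
            bits++;
        return bits > max_bits ? max_bits : bits;
    }

    int main(void)
    {
        unsigned long filter[]  = { 0x1000 };
        unsigned long notrace[] = { 0x2000 };

        printf("%d %d %d\n",
               ops_test(filter, 1, notrace, 1, 0x1000),   /* 1: in the filter */
               ops_test(filter, 1, notrace, 1, 0x3000),   /* 0: not in the filter */
               ops_test(NULL, 0, notrace, 1, 0x2000));    /* 0: notrace wins */
        printf("bits for 300 entries: %d\n", hash_bits(300, 12));   /* 8 */
        return 0;
    }
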
@@ -927,6 +1286,105 @@ static struct dyn_ftrace *ftrace_free_records; | |||
927 | } \ | 1286 | } \ |
928 | } | 1287 | } |
929 | 1288 | ||
1289 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | ||
1290 | int filter_hash, | ||
1291 | bool inc) | ||
1292 | { | ||
1293 | struct ftrace_hash *hash; | ||
1294 | struct ftrace_hash *other_hash; | ||
1295 | struct ftrace_page *pg; | ||
1296 | struct dyn_ftrace *rec; | ||
1297 | int count = 0; | ||
1298 | int all = 0; | ||
1299 | |||
1300 | /* Only update if the ops has been registered */ | ||
1301 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) | ||
1302 | return; | ||
1303 | |||
1304 | /* | ||
1305 | * In the filter_hash case: | ||
1306 | * If the count is zero, we update all records. | ||
1307 | * Otherwise we just update the items in the hash. | ||
1308 | * | ||
1309 | * In the notrace_hash case: | ||
1310 | * We enable the update in the hash. | ||
1311 | * As disabling notrace means enabling the tracing, | ||
1312 | * and enabling notrace means disabling, the inc variable | ||
1313 | * gets inversed. | ||
1314 | */ | ||
1315 | if (filter_hash) { | ||
1316 | hash = ops->filter_hash; | ||
1317 | other_hash = ops->notrace_hash; | ||
1318 | if (!hash || !hash->count) | ||
1319 | all = 1; | ||
1320 | } else { | ||
1321 | inc = !inc; | ||
1322 | hash = ops->notrace_hash; | ||
1323 | other_hash = ops->filter_hash; | ||
1324 | /* | ||
1325 | * If the notrace hash has no items, | ||
1326 | * then there's nothing to do. | ||
1327 | */ | ||
1328 | if (hash && !hash->count) | ||
1329 | return; | ||
1330 | } | ||
1331 | |||
1332 | do_for_each_ftrace_rec(pg, rec) { | ||
1333 | int in_other_hash = 0; | ||
1334 | int in_hash = 0; | ||
1335 | int match = 0; | ||
1336 | |||
1337 | if (all) { | ||
1338 | /* | ||
1339 | * Only the filter_hash affects all records. | ||
1340 | * Update if the record is not in the notrace hash. | ||
1341 | */ | ||
1342 | if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) | ||
1343 | match = 1; | ||
1344 | } else { | ||
1345 | in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); | ||
1346 | in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); | ||
1347 | |||
1348 | /* | ||
1349 | * The record is affected only when the hash being walked selects it and the other hash would still let it be traced. | ||
1350 | */ | ||
1351 | if (filter_hash && in_hash && !in_other_hash) | ||
1352 | match = 1; | ||
1353 | else if (!filter_hash && in_hash && | ||
1354 | (in_other_hash || !other_hash->count)) | ||
1355 | match = 1; | ||
1356 | } | ||
1357 | if (!match) | ||
1358 | continue; | ||
1359 | |||
1360 | if (inc) { | ||
1361 | rec->flags++; | ||
1362 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) | ||
1363 | return; | ||
1364 | } else { | ||
1365 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) | ||
1366 | return; | ||
1367 | rec->flags--; | ||
1368 | } | ||
1369 | count++; | ||
1370 | /* Shortcut, if we handled all records, we are done. */ | ||
1371 | if (!all && count == hash->count) | ||
1372 | return; | ||
1373 | } while_for_each_ftrace_rec(); | ||
1374 | } | ||
1375 | |||
1376 | static void ftrace_hash_rec_disable(struct ftrace_ops *ops, | ||
1377 | int filter_hash) | ||
1378 | { | ||
1379 | __ftrace_hash_rec_update(ops, filter_hash, 0); | ||
1380 | } | ||
1381 | |||
1382 | static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | ||
1383 | int filter_hash) | ||
1384 | { | ||
1385 | __ftrace_hash_rec_update(ops, filter_hash, 1); | ||
1386 | } | ||
1387 | |||
930 | static void ftrace_free_rec(struct dyn_ftrace *rec) | 1388 | static void ftrace_free_rec(struct dyn_ftrace *rec) |
931 | { | 1389 | { |
932 | rec->freelist = ftrace_free_records; | 1390 | rec->freelist = ftrace_free_records; |
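
__ftrace_hash_rec_update() above keeps a per-record use count: walking a filter hash bumps the records it selects (or every record not in notrace when the filter is empty), while walking a notrace hash has the opposite effect, which is why inc is inverted for that case. A compact standalone sketch of the per-record decision, with the hash memberships collapsed to booleans for clarity:

    #include <stdbool.h>
    #include <stdio.h>

    /* Decide whether one record is affected when an ops' hash is applied.
     * filter_hash selects which of the two hashes is being walked. */
    static bool rec_matches(bool filter_hash, bool in_hash, bool in_other_hash,
                            bool other_hash_empty)
    {
        if (filter_hash)
            /* Walking the filter hash: count it unless notrace excludes it. */
            return in_hash && !in_other_hash;
        /* Walking the notrace hash: only records the filter would otherwise
         * trace are touched (the caller inverts inc for this case). */
        return in_hash && (in_other_hash || other_hash_empty);
    }

    int main(void)
    {
        printf("%d\n", rec_matches(true,  true, false, false));   /* 1 */
        printf("%d\n", rec_matches(true,  true, true,  false));   /* 0 */
        printf("%d\n", rec_matches(false, true, false, true));    /* 1 */
        return 0;
    }
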
@@ -1048,18 +1506,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1048 | ftrace_addr = (unsigned long)FTRACE_ADDR; | 1506 | ftrace_addr = (unsigned long)FTRACE_ADDR; |
1049 | 1507 | ||
1050 | /* | 1508 | /* |
1051 | * If this record is not to be traced or we want to disable it, | 1509 | * If we are enabling tracing: |
1052 | * then disable it. | ||
1053 | * | 1510 | * |
1054 | * If we want to enable it and filtering is off, then enable it. | 1511 | * If the record has a ref count, then we need to enable it |
1512 | * because someone is using it. | ||
1055 | * | 1513 | * |
1056 | * If we want to enable it and filtering is on, enable it only if | 1514 | * Otherwise we make sure its disabled. |
1057 | * it's filtered | 1515 | * |
1516 | * If we are disabling tracing, then disable all records that | ||
1517 | * are enabled. | ||
1058 | */ | 1518 | */ |
1059 | if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { | 1519 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) |
1060 | if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) | 1520 | flag = FTRACE_FL_ENABLED; |
1061 | flag = FTRACE_FL_ENABLED; | ||
1062 | } | ||
1063 | 1521 | ||
1064 | /* If the state of this record hasn't changed, then do nothing */ | 1522 | /* If the state of this record hasn't changed, then do nothing */ |
1065 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) | 1523 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) |
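
__ftrace_replace_code() now enables a record whenever the reference count packed into the low bits of rec->flags (everything outside FTRACE_FL_MASK) is non-zero. A minimal sketch of counting in the low bits of a flags word; the bit layout here is made up for illustration:

    #include <stdio.h>

    /* Illustrative layout: top bits are state flags, low bits hold a count. */
    #define FL_ENABLED   (1u << 31)
    #define FL_FREE      (1u << 30)
    #define FL_MASK      (FL_ENABLED | FL_FREE)
    #define REF_MAX      ((1u << 30) - 1)

    static int ref_inc(unsigned int *flags)
    {
        if ((*flags & ~FL_MASK) == REF_MAX)
            return -1;              /* would overflow into the flag bits */
        (*flags)++;
        return 0;
    }

    static void ref_dec(unsigned int *flags)
    {
        if (*flags & ~FL_MASK)
            (*flags)--;
    }

    int main(void)
    {
        unsigned int flags = 0;

        ref_inc(&flags);
        ref_inc(&flags);
        printf("refs=%u enabled=%d\n",
               flags & ~FL_MASK,            /* 2 */
               !!(flags & FL_ENABLED));     /* 0 until the update pass runs */
        ref_dec(&flags);
        return 0;
    }
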
@@ -1080,19 +1538,16 @@ static void ftrace_replace_code(int enable) | |||
1080 | struct ftrace_page *pg; | 1538 | struct ftrace_page *pg; |
1081 | int failed; | 1539 | int failed; |
1082 | 1540 | ||
1541 | if (unlikely(ftrace_disabled)) | ||
1542 | return; | ||
1543 | |||
1083 | do_for_each_ftrace_rec(pg, rec) { | 1544 | do_for_each_ftrace_rec(pg, rec) { |
1084 | /* | 1545 | /* Skip over free records */ |
1085 | * Skip over free records, records that have | 1546 | if (rec->flags & FTRACE_FL_FREE) |
1086 | * failed and not converted. | ||
1087 | */ | ||
1088 | if (rec->flags & FTRACE_FL_FREE || | ||
1089 | rec->flags & FTRACE_FL_FAILED || | ||
1090 | !(rec->flags & FTRACE_FL_CONVERTED)) | ||
1091 | continue; | 1547 | continue; |
1092 | 1548 | ||
1093 | failed = __ftrace_replace_code(rec, enable); | 1549 | failed = __ftrace_replace_code(rec, enable); |
1094 | if (failed) { | 1550 | if (failed) { |
1095 | rec->flags |= FTRACE_FL_FAILED; | ||
1096 | ftrace_bug(failed, rec->ip); | 1551 | ftrace_bug(failed, rec->ip); |
1097 | /* Stop processing */ | 1552 | /* Stop processing */ |
1098 | return; | 1553 | return; |
@@ -1108,10 +1563,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) | |||
1108 | 1563 | ||
1109 | ip = rec->ip; | 1564 | ip = rec->ip; |
1110 | 1565 | ||
1566 | if (unlikely(ftrace_disabled)) | ||
1567 | return 0; | ||
1568 | |||
1111 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); | 1569 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); |
1112 | if (ret) { | 1570 | if (ret) { |
1113 | ftrace_bug(ret, ip); | 1571 | ftrace_bug(ret, ip); |
1114 | rec->flags |= FTRACE_FL_FAILED; | ||
1115 | return 0; | 1572 | return 0; |
1116 | } | 1573 | } |
1117 | return 1; | 1574 | return 1; |
@@ -1172,6 +1629,7 @@ static void ftrace_run_update_code(int command) | |||
1172 | 1629 | ||
1173 | static ftrace_func_t saved_ftrace_func; | 1630 | static ftrace_func_t saved_ftrace_func; |
1174 | static int ftrace_start_up; | 1631 | static int ftrace_start_up; |
1632 | static int global_start_up; | ||
1175 | 1633 | ||
1176 | static void ftrace_startup_enable(int command) | 1634 | static void ftrace_startup_enable(int command) |
1177 | { | 1635 | { |
@@ -1186,19 +1644,38 @@ static void ftrace_startup_enable(int command) | |||
1186 | ftrace_run_update_code(command); | 1644 | ftrace_run_update_code(command); |
1187 | } | 1645 | } |
1188 | 1646 | ||
1189 | static void ftrace_startup(int command) | 1647 | static int ftrace_startup(struct ftrace_ops *ops, int command) |
1190 | { | 1648 | { |
1649 | bool hash_enable = true; | ||
1650 | |||
1191 | if (unlikely(ftrace_disabled)) | 1651 | if (unlikely(ftrace_disabled)) |
1192 | return; | 1652 | return -ENODEV; |
1193 | 1653 | ||
1194 | ftrace_start_up++; | 1654 | ftrace_start_up++; |
1195 | command |= FTRACE_ENABLE_CALLS; | 1655 | command |= FTRACE_ENABLE_CALLS; |
1196 | 1656 | ||
1657 | /* ops marked global share the filter hashes */ | ||
1658 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
1659 | ops = &global_ops; | ||
1660 | /* Don't update hash if global is already set */ | ||
1661 | if (global_start_up) | ||
1662 | hash_enable = false; | ||
1663 | global_start_up++; | ||
1664 | } | ||
1665 | |||
1666 | ops->flags |= FTRACE_OPS_FL_ENABLED; | ||
1667 | if (hash_enable) | ||
1668 | ftrace_hash_rec_enable(ops, 1); | ||
1669 | |||
1197 | ftrace_startup_enable(command); | 1670 | ftrace_startup_enable(command); |
1671 | |||
1672 | return 0; | ||
1198 | } | 1673 | } |
1199 | 1674 | ||
1200 | static void ftrace_shutdown(int command) | 1675 | static void ftrace_shutdown(struct ftrace_ops *ops, int command) |
1201 | { | 1676 | { |
1677 | bool hash_disable = true; | ||
1678 | |||
1202 | if (unlikely(ftrace_disabled)) | 1679 | if (unlikely(ftrace_disabled)) |
1203 | return; | 1680 | return; |
1204 | 1681 | ||
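
ftrace_startup() lets every FTRACE_OPS_FL_GLOBAL ops share the global hash, so the per-record accounting is only touched when global_start_up goes from zero to one (and, in ftrace_shutdown() below, back to zero). A small standalone sketch of acting only on those transitions; the names are illustrative:

    #include <stdio.h>

    static int global_users;

    static void hash_rec_enable(void)  { puts("accounting: enable");  }
    static void hash_rec_disable(void) { puts("accounting: disable"); }

    static void global_startup(void)
    {
        if (global_users++ == 0)     /* first sharer updates the records */
            hash_rec_enable();
    }

    static void global_shutdown(void)
    {
        if (--global_users == 0)     /* last sharer undoes the update */
            hash_rec_disable();
    }

    int main(void)
    {
        global_startup();    /* prints once */
        global_startup();    /* shared, no second update */
        global_shutdown();
        global_shutdown();   /* prints once */
        return 0;
    }
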
@@ -1210,6 +1687,23 @@ static void ftrace_shutdown(int command) | |||
1210 | */ | 1687 | */ |
1211 | WARN_ON_ONCE(ftrace_start_up < 0); | 1688 | WARN_ON_ONCE(ftrace_start_up < 0); |
1212 | 1689 | ||
1690 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
1691 | ops = &global_ops; | ||
1692 | global_start_up--; | ||
1693 | WARN_ON_ONCE(global_start_up < 0); | ||
1694 | /* Don't update hash if global still has users */ | ||
1695 | if (global_start_up) { | ||
1696 | WARN_ON_ONCE(!ftrace_start_up); | ||
1697 | hash_disable = false; | ||
1698 | } | ||
1699 | } | ||
1700 | |||
1701 | if (hash_disable) | ||
1702 | ftrace_hash_rec_disable(ops, 1); | ||
1703 | |||
1704 | if (ops != &global_ops || !global_start_up) | ||
1705 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
1706 | |||
1213 | if (!ftrace_start_up) | 1707 | if (!ftrace_start_up) |
1214 | command |= FTRACE_DISABLE_CALLS; | 1708 | command |= FTRACE_DISABLE_CALLS; |
1215 | 1709 | ||
@@ -1226,8 +1720,6 @@ static void ftrace_shutdown(int command) | |||
1226 | 1720 | ||
1227 | static void ftrace_startup_sysctl(void) | 1721 | static void ftrace_startup_sysctl(void) |
1228 | { | 1722 | { |
1229 | int command = FTRACE_ENABLE_MCOUNT; | ||
1230 | |||
1231 | if (unlikely(ftrace_disabled)) | 1723 | if (unlikely(ftrace_disabled)) |
1232 | return; | 1724 | return; |
1233 | 1725 | ||
@@ -1235,23 +1727,17 @@ static void ftrace_startup_sysctl(void) | |||
1235 | saved_ftrace_func = NULL; | 1727 | saved_ftrace_func = NULL; |
1236 | /* ftrace_start_up is true if we want ftrace running */ | 1728 | /* ftrace_start_up is true if we want ftrace running */ |
1237 | if (ftrace_start_up) | 1729 | if (ftrace_start_up) |
1238 | command |= FTRACE_ENABLE_CALLS; | 1730 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); |
1239 | |||
1240 | ftrace_run_update_code(command); | ||
1241 | } | 1731 | } |
1242 | 1732 | ||
1243 | static void ftrace_shutdown_sysctl(void) | 1733 | static void ftrace_shutdown_sysctl(void) |
1244 | { | 1734 | { |
1245 | int command = FTRACE_DISABLE_MCOUNT; | ||
1246 | |||
1247 | if (unlikely(ftrace_disabled)) | 1735 | if (unlikely(ftrace_disabled)) |
1248 | return; | 1736 | return; |
1249 | 1737 | ||
1250 | /* ftrace_start_up is true if ftrace is running */ | 1738 | /* ftrace_start_up is true if ftrace is running */ |
1251 | if (ftrace_start_up) | 1739 | if (ftrace_start_up) |
1252 | command |= FTRACE_DISABLE_CALLS; | 1740 | ftrace_run_update_code(FTRACE_DISABLE_CALLS); |
1253 | |||
1254 | ftrace_run_update_code(command); | ||
1255 | } | 1741 | } |
1256 | 1742 | ||
1257 | static cycle_t ftrace_update_time; | 1743 | static cycle_t ftrace_update_time; |
@@ -1277,15 +1763,15 @@ static int ftrace_update_code(struct module *mod) | |||
1277 | p->flags = 0L; | 1763 | p->flags = 0L; |
1278 | 1764 | ||
1279 | /* | 1765 | /* |
1280 | * Do the initial record convertion from mcount jump | 1766 | * Do the initial record conversion from mcount jump |
1281 | * to the NOP instructions. | 1767 | * to the NOP instructions. |
1282 | */ | 1768 | */ |
1283 | if (!ftrace_code_disable(mod, p)) { | 1769 | if (!ftrace_code_disable(mod, p)) { |
1284 | ftrace_free_rec(p); | 1770 | ftrace_free_rec(p); |
1285 | continue; | 1771 | /* Game over */ |
1772 | break; | ||
1286 | } | 1773 | } |
1287 | 1774 | ||
1288 | p->flags |= FTRACE_FL_CONVERTED; | ||
1289 | ftrace_update_cnt++; | 1775 | ftrace_update_cnt++; |
1290 | 1776 | ||
1291 | /* | 1777 | /* |
@@ -1360,32 +1846,39 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) | |||
1360 | enum { | 1846 | enum { |
1361 | FTRACE_ITER_FILTER = (1 << 0), | 1847 | FTRACE_ITER_FILTER = (1 << 0), |
1362 | FTRACE_ITER_NOTRACE = (1 << 1), | 1848 | FTRACE_ITER_NOTRACE = (1 << 1), |
1363 | FTRACE_ITER_FAILURES = (1 << 2), | 1849 | FTRACE_ITER_PRINTALL = (1 << 2), |
1364 | FTRACE_ITER_PRINTALL = (1 << 3), | 1850 | FTRACE_ITER_HASH = (1 << 3), |
1365 | FTRACE_ITER_HASH = (1 << 4), | 1851 | FTRACE_ITER_ENABLED = (1 << 4), |
1366 | }; | 1852 | }; |
1367 | 1853 | ||
1368 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ | 1854 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ |
1369 | 1855 | ||
1370 | struct ftrace_iterator { | 1856 | struct ftrace_iterator { |
1371 | struct ftrace_page *pg; | 1857 | loff_t pos; |
1372 | int hidx; | 1858 | loff_t func_pos; |
1373 | int idx; | 1859 | struct ftrace_page *pg; |
1374 | unsigned flags; | 1860 | struct dyn_ftrace *func; |
1375 | struct trace_parser parser; | 1861 | struct ftrace_func_probe *probe; |
1862 | struct trace_parser parser; | ||
1863 | struct ftrace_hash *hash; | ||
1864 | struct ftrace_ops *ops; | ||
1865 | int hidx; | ||
1866 | int idx; | ||
1867 | unsigned flags; | ||
1376 | }; | 1868 | }; |
1377 | 1869 | ||
1378 | static void * | 1870 | static void * |
1379 | t_hash_next(struct seq_file *m, void *v, loff_t *pos) | 1871 | t_hash_next(struct seq_file *m, loff_t *pos) |
1380 | { | 1872 | { |
1381 | struct ftrace_iterator *iter = m->private; | 1873 | struct ftrace_iterator *iter = m->private; |
1382 | struct hlist_node *hnd = v; | 1874 | struct hlist_node *hnd = NULL; |
1383 | struct hlist_head *hhd; | 1875 | struct hlist_head *hhd; |
1384 | 1876 | ||
1385 | WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); | ||
1386 | |||
1387 | (*pos)++; | 1877 | (*pos)++; |
1878 | iter->pos = *pos; | ||
1388 | 1879 | ||
1880 | if (iter->probe) | ||
1881 | hnd = &iter->probe->node; | ||
1389 | retry: | 1882 | retry: |
1390 | if (iter->hidx >= FTRACE_FUNC_HASHSIZE) | 1883 | if (iter->hidx >= FTRACE_FUNC_HASHSIZE) |
1391 | return NULL; | 1884 | return NULL; |
@@ -1408,7 +1901,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) | |||
1408 | } | 1901 | } |
1409 | } | 1902 | } |
1410 | 1903 | ||
1411 | return hnd; | 1904 | if (WARN_ON_ONCE(!hnd)) |
1905 | return NULL; | ||
1906 | |||
1907 | iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); | ||
1908 | |||
1909 | return iter; | ||
1412 | } | 1910 | } |
1413 | 1911 | ||
1414 | static void *t_hash_start(struct seq_file *m, loff_t *pos) | 1912 | static void *t_hash_start(struct seq_file *m, loff_t *pos) |
@@ -1417,26 +1915,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) | |||
1417 | void *p = NULL; | 1915 | void *p = NULL; |
1418 | loff_t l; | 1916 | loff_t l; |
1419 | 1917 | ||
1420 | if (!(iter->flags & FTRACE_ITER_HASH)) | 1918 | if (iter->func_pos > *pos) |
1421 | *pos = 0; | 1919 | return NULL; |
1422 | |||
1423 | iter->flags |= FTRACE_ITER_HASH; | ||
1424 | 1920 | ||
1425 | iter->hidx = 0; | 1921 | iter->hidx = 0; |
1426 | for (l = 0; l <= *pos; ) { | 1922 | for (l = 0; l <= (*pos - iter->func_pos); ) { |
1427 | p = t_hash_next(m, p, &l); | 1923 | p = t_hash_next(m, &l); |
1428 | if (!p) | 1924 | if (!p) |
1429 | break; | 1925 | break; |
1430 | } | 1926 | } |
1431 | return p; | 1927 | if (!p) |
1928 | return NULL; | ||
1929 | |||
1930 | /* Only set this if we have an item */ | ||
1931 | iter->flags |= FTRACE_ITER_HASH; | ||
1932 | |||
1933 | return iter; | ||
1432 | } | 1934 | } |
1433 | 1935 | ||
1434 | static int t_hash_show(struct seq_file *m, void *v) | 1936 | static int |
1937 | t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) | ||
1435 | { | 1938 | { |
1436 | struct ftrace_func_probe *rec; | 1939 | struct ftrace_func_probe *rec; |
1437 | struct hlist_node *hnd = v; | ||
1438 | 1940 | ||
1439 | rec = hlist_entry(hnd, struct ftrace_func_probe, node); | 1941 | rec = iter->probe; |
1942 | if (WARN_ON_ONCE(!rec)) | ||
1943 | return -EIO; | ||
1440 | 1944 | ||
1441 | if (rec->ops->print) | 1945 | if (rec->ops->print) |
1442 | return rec->ops->print(m, rec->ip, rec->ops, rec->data); | 1946 | return rec->ops->print(m, rec->ip, rec->ops, rec->data); |
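Note: the iterator rework above changes the seq_file contract for the hash entries: t_hash_next()/t_hash_start() now cache the current probe inside the iterator and hand the iterator itself back to seq_file, instead of threading raw hlist nodes through the opaque 'v' argument. A minimal sketch of that pattern with made-up names (my_iter, my_next, my_show and my_nr_items are hypothetical, not part of this patch):

    #include <linux/seq_file.h>

    static int my_nr_items;                 /* hypothetical bound on the backing data */

    struct my_iter {
            loff_t pos;                     /* like iter->pos / iter->func_pos */
            int idx;                        /* like iter->idx, indexes the backing data */
    };

    static void *my_next(struct seq_file *m, void *v, loff_t *pos)
    {
            struct my_iter *iter = m->private;

            (*pos)++;
            iter->pos = *pos;               /* remember where we are, for lseek */
            if (++iter->idx >= my_nr_items)
                    return NULL;
            return iter;                    /* always hand back the iterator itself */
    }

    static int my_show(struct seq_file *m, void *v)
    {
            struct my_iter *iter = v;       /* v is the iterator, not an element */

            seq_printf(m, "item %d\n", iter->idx);
            return 0;
    }

With the position cached in the iterator, t_start() further down can compare *pos against iter->pos and only rewind when an lseek actually happened (see reset_iter_read()).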
@@ -1454,15 +1958,20 @@ static void * | |||
1454 | t_next(struct seq_file *m, void *v, loff_t *pos) | 1958 | t_next(struct seq_file *m, void *v, loff_t *pos) |
1455 | { | 1959 | { |
1456 | struct ftrace_iterator *iter = m->private; | 1960 | struct ftrace_iterator *iter = m->private; |
1961 | struct ftrace_ops *ops = &global_ops; | ||
1457 | struct dyn_ftrace *rec = NULL; | 1962 | struct dyn_ftrace *rec = NULL; |
1458 | 1963 | ||
1964 | if (unlikely(ftrace_disabled)) | ||
1965 | return NULL; | ||
1966 | |||
1459 | if (iter->flags & FTRACE_ITER_HASH) | 1967 | if (iter->flags & FTRACE_ITER_HASH) |
1460 | return t_hash_next(m, v, pos); | 1968 | return t_hash_next(m, pos); |
1461 | 1969 | ||
1462 | (*pos)++; | 1970 | (*pos)++; |
1971 | iter->pos = iter->func_pos = *pos; | ||
1463 | 1972 | ||
1464 | if (iter->flags & FTRACE_ITER_PRINTALL) | 1973 | if (iter->flags & FTRACE_ITER_PRINTALL) |
1465 | return NULL; | 1974 | return t_hash_start(m, pos); |
1466 | 1975 | ||
1467 | retry: | 1976 | retry: |
1468 | if (iter->idx >= iter->pg->index) { | 1977 | if (iter->idx >= iter->pg->index) { |
@@ -1475,38 +1984,59 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
1475 | rec = &iter->pg->records[iter->idx++]; | 1984 | rec = &iter->pg->records[iter->idx++]; |
1476 | if ((rec->flags & FTRACE_FL_FREE) || | 1985 | if ((rec->flags & FTRACE_FL_FREE) || |
1477 | 1986 | ||
1478 | (!(iter->flags & FTRACE_ITER_FAILURES) && | ||
1479 | (rec->flags & FTRACE_FL_FAILED)) || | ||
1480 | |||
1481 | ((iter->flags & FTRACE_ITER_FAILURES) && | ||
1482 | !(rec->flags & FTRACE_FL_FAILED)) || | ||
1483 | |||
1484 | ((iter->flags & FTRACE_ITER_FILTER) && | 1987 | ((iter->flags & FTRACE_ITER_FILTER) && |
1485 | !(rec->flags & FTRACE_FL_FILTER)) || | 1988 | !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || |
1486 | 1989 | ||
1487 | ((iter->flags & FTRACE_ITER_NOTRACE) && | 1990 | ((iter->flags & FTRACE_ITER_NOTRACE) && |
1488 | !(rec->flags & FTRACE_FL_NOTRACE))) { | 1991 | !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || |
1992 | |||
1993 | ((iter->flags & FTRACE_ITER_ENABLED) && | ||
1994 | !(rec->flags & ~FTRACE_FL_MASK))) { | ||
1995 | |||
1489 | rec = NULL; | 1996 | rec = NULL; |
1490 | goto retry; | 1997 | goto retry; |
1491 | } | 1998 | } |
1492 | } | 1999 | } |
1493 | 2000 | ||
1494 | return rec; | 2001 | if (!rec) |
2002 | return t_hash_start(m, pos); | ||
2003 | |||
2004 | iter->func = rec; | ||
2005 | |||
2006 | return iter; | ||
2007 | } | ||
2008 | |||
2009 | static void reset_iter_read(struct ftrace_iterator *iter) | ||
2010 | { | ||
2011 | iter->pos = 0; | ||
2012 | iter->func_pos = 0; | ||
2013 | iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); | ||
1495 | } | 2014 | } |
1496 | 2015 | ||
1497 | static void *t_start(struct seq_file *m, loff_t *pos) | 2016 | static void *t_start(struct seq_file *m, loff_t *pos) |
1498 | { | 2017 | { |
1499 | struct ftrace_iterator *iter = m->private; | 2018 | struct ftrace_iterator *iter = m->private; |
2019 | struct ftrace_ops *ops = &global_ops; | ||
1500 | void *p = NULL; | 2020 | void *p = NULL; |
1501 | loff_t l; | 2021 | loff_t l; |
1502 | 2022 | ||
1503 | mutex_lock(&ftrace_lock); | 2023 | mutex_lock(&ftrace_lock); |
2024 | |||
2025 | if (unlikely(ftrace_disabled)) | ||
2026 | return NULL; | ||
2027 | |||
2028 | /* | ||
2029 | * If an lseek was done, then reset and start from beginning. | ||
2030 | */ | ||
2031 | if (*pos < iter->pos) | ||
2032 | reset_iter_read(iter); | ||
2033 | |||
1504 | /* | 2034 | /* |
1505 | * For set_ftrace_filter reading, if we have the filter | 2035 | * For set_ftrace_filter reading, if we have the filter |
1506 | * off, we can short cut and just print out that all | 2036 | * off, we can short cut and just print out that all |
1507 | * functions are enabled. | 2037 | * functions are enabled. |
1508 | */ | 2038 | */ |
1509 | if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { | 2039 | if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { |
1510 | if (*pos > 0) | 2040 | if (*pos > 0) |
1511 | return t_hash_start(m, pos); | 2041 | return t_hash_start(m, pos); |
1512 | iter->flags |= FTRACE_ITER_PRINTALL; | 2042 | iter->flags |= FTRACE_ITER_PRINTALL; |
@@ -1518,6 +2048,11 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1518 | if (iter->flags & FTRACE_ITER_HASH) | 2048 | if (iter->flags & FTRACE_ITER_HASH) |
1519 | return t_hash_start(m, pos); | 2049 | return t_hash_start(m, pos); |
1520 | 2050 | ||
2051 | /* | ||
2052 | * Unfortunately, we need to restart at ftrace_pages_start | ||
2053 | * every time we let go of the ftrace_mutex. This is because | ||
2054 | * those pointers can change without the lock. | ||
2055 | */ | ||
1521 | iter->pg = ftrace_pages_start; | 2056 | iter->pg = ftrace_pages_start; |
1522 | iter->idx = 0; | 2057 | iter->idx = 0; |
1523 | for (l = 0; l <= *pos; ) { | 2058 | for (l = 0; l <= *pos; ) { |
@@ -1526,10 +2061,14 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1526 | break; | 2061 | break; |
1527 | } | 2062 | } |
1528 | 2063 | ||
1529 | if (!p && iter->flags & FTRACE_ITER_FILTER) | 2064 | if (!p) { |
1530 | return t_hash_start(m, pos); | 2065 | if (iter->flags & FTRACE_ITER_FILTER) |
2066 | return t_hash_start(m, pos); | ||
1531 | 2067 | ||
1532 | return p; | 2068 | return NULL; |
2069 | } | ||
2070 | |||
2071 | return iter; | ||
1533 | } | 2072 | } |
1534 | 2073 | ||
1535 | static void t_stop(struct seq_file *m, void *p) | 2074 | static void t_stop(struct seq_file *m, void *p) |
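Note: with FTRACE_FL_FAILED/FTRACE_ITER_FAILURES gone, t_next() now decides visibility from the per-ops hashes and from the reference count kept in rec->flags (the count that enabled_functions prints further down). The combined skip condition above is equivalent to this predicate (a restating sketch, using only symbols that appear in the hunk):

    static bool rec_visible(struct ftrace_iterator *iter, struct ftrace_ops *ops,
                            struct dyn_ftrace *rec)
    {
            if (rec->flags & FTRACE_FL_FREE)
                    return false;                           /* freed slot */
            if ((iter->flags & FTRACE_ITER_FILTER) &&
                !ftrace_lookup_ip(ops->filter_hash, rec->ip))
                    return false;                           /* not in set_ftrace_filter */
            if ((iter->flags & FTRACE_ITER_NOTRACE) &&
                !ftrace_lookup_ip(ops->notrace_hash, rec->ip))
                    return false;                           /* not in set_ftrace_notrace */
            if ((iter->flags & FTRACE_ITER_ENABLED) &&
                !(rec->flags & ~FTRACE_FL_MASK))
                    return false;                           /* reference count is zero */
            return true;
    }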
@@ -1540,20 +2079,26 @@ static void t_stop(struct seq_file *m, void *p) | |||
1540 | static int t_show(struct seq_file *m, void *v) | 2079 | static int t_show(struct seq_file *m, void *v) |
1541 | { | 2080 | { |
1542 | struct ftrace_iterator *iter = m->private; | 2081 | struct ftrace_iterator *iter = m->private; |
1543 | struct dyn_ftrace *rec = v; | 2082 | struct dyn_ftrace *rec; |
1544 | 2083 | ||
1545 | if (iter->flags & FTRACE_ITER_HASH) | 2084 | if (iter->flags & FTRACE_ITER_HASH) |
1546 | return t_hash_show(m, v); | 2085 | return t_hash_show(m, iter); |
1547 | 2086 | ||
1548 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 2087 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
1549 | seq_printf(m, "#### all functions enabled ####\n"); | 2088 | seq_printf(m, "#### all functions enabled ####\n"); |
1550 | return 0; | 2089 | return 0; |
1551 | } | 2090 | } |
1552 | 2091 | ||
2092 | rec = iter->func; | ||
2093 | |||
1553 | if (!rec) | 2094 | if (!rec) |
1554 | return 0; | 2095 | return 0; |
1555 | 2096 | ||
1556 | seq_printf(m, "%ps\n", (void *)rec->ip); | 2097 | seq_printf(m, "%ps", (void *)rec->ip); |
2098 | if (iter->flags & FTRACE_ITER_ENABLED) | ||
2099 | seq_printf(m, " (%ld)", | ||
2100 | rec->flags & ~FTRACE_FL_MASK); | ||
2101 | seq_printf(m, "\n"); | ||
1557 | 2102 | ||
1558 | return 0; | 2103 | return 0; |
1559 | } | 2104 | } |
@@ -1593,44 +2138,46 @@ ftrace_avail_open(struct inode *inode, struct file *file) | |||
1593 | } | 2138 | } |
1594 | 2139 | ||
1595 | static int | 2140 | static int |
1596 | ftrace_failures_open(struct inode *inode, struct file *file) | 2141 | ftrace_enabled_open(struct inode *inode, struct file *file) |
1597 | { | 2142 | { |
1598 | int ret; | ||
1599 | struct seq_file *m; | ||
1600 | struct ftrace_iterator *iter; | 2143 | struct ftrace_iterator *iter; |
2144 | int ret; | ||
2145 | |||
2146 | if (unlikely(ftrace_disabled)) | ||
2147 | return -ENODEV; | ||
2148 | |||
2149 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | ||
2150 | if (!iter) | ||
2151 | return -ENOMEM; | ||
2152 | |||
2153 | iter->pg = ftrace_pages_start; | ||
2154 | iter->flags = FTRACE_ITER_ENABLED; | ||
1601 | 2155 | ||
1602 | ret = ftrace_avail_open(inode, file); | 2156 | ret = seq_open(file, &show_ftrace_seq_ops); |
1603 | if (!ret) { | 2157 | if (!ret) { |
1604 | m = (struct seq_file *)file->private_data; | 2158 | struct seq_file *m = file->private_data; |
1605 | iter = (struct ftrace_iterator *)m->private; | 2159 | |
1606 | iter->flags = FTRACE_ITER_FAILURES; | 2160 | m->private = iter; |
2161 | } else { | ||
2162 | kfree(iter); | ||
1607 | } | 2163 | } |
1608 | 2164 | ||
1609 | return ret; | 2165 | return ret; |
1610 | } | 2166 | } |
1611 | 2167 | ||
1612 | 2168 | static void ftrace_filter_reset(struct ftrace_hash *hash) | |
1613 | static void ftrace_filter_reset(int enable) | ||
1614 | { | 2169 | { |
1615 | struct ftrace_page *pg; | ||
1616 | struct dyn_ftrace *rec; | ||
1617 | unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | ||
1618 | |||
1619 | mutex_lock(&ftrace_lock); | 2170 | mutex_lock(&ftrace_lock); |
1620 | if (enable) | 2171 | ftrace_hash_clear(hash); |
1621 | ftrace_filtered = 0; | ||
1622 | do_for_each_ftrace_rec(pg, rec) { | ||
1623 | if (rec->flags & FTRACE_FL_FAILED) | ||
1624 | continue; | ||
1625 | rec->flags &= ~type; | ||
1626 | } while_for_each_ftrace_rec(); | ||
1627 | mutex_unlock(&ftrace_lock); | 2172 | mutex_unlock(&ftrace_lock); |
1628 | } | 2173 | } |
1629 | 2174 | ||
1630 | static int | 2175 | static int |
1631 | ftrace_regex_open(struct inode *inode, struct file *file, int enable) | 2176 | ftrace_regex_open(struct ftrace_ops *ops, int flag, |
2177 | struct inode *inode, struct file *file) | ||
1632 | { | 2178 | { |
1633 | struct ftrace_iterator *iter; | 2179 | struct ftrace_iterator *iter; |
2180 | struct ftrace_hash *hash; | ||
1634 | int ret = 0; | 2181 | int ret = 0; |
1635 | 2182 | ||
1636 | if (unlikely(ftrace_disabled)) | 2183 | if (unlikely(ftrace_disabled)) |
@@ -1645,21 +2192,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) | |||
1645 | return -ENOMEM; | 2192 | return -ENOMEM; |
1646 | } | 2193 | } |
1647 | 2194 | ||
2195 | if (flag & FTRACE_ITER_NOTRACE) | ||
2196 | hash = ops->notrace_hash; | ||
2197 | else | ||
2198 | hash = ops->filter_hash; | ||
2199 | |||
2200 | iter->ops = ops; | ||
2201 | iter->flags = flag; | ||
2202 | |||
2203 | if (file->f_mode & FMODE_WRITE) { | ||
2204 | mutex_lock(&ftrace_lock); | ||
2205 | iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); | ||
2206 | mutex_unlock(&ftrace_lock); | ||
2207 | |||
2208 | if (!iter->hash) { | ||
2209 | trace_parser_put(&iter->parser); | ||
2210 | kfree(iter); | ||
2211 | return -ENOMEM; | ||
2212 | } | ||
2213 | } | ||
2214 | |||
1648 | mutex_lock(&ftrace_regex_lock); | 2215 | mutex_lock(&ftrace_regex_lock); |
2216 | |||
1649 | if ((file->f_mode & FMODE_WRITE) && | 2217 | if ((file->f_mode & FMODE_WRITE) && |
1650 | (file->f_flags & O_TRUNC)) | 2218 | (file->f_flags & O_TRUNC)) |
1651 | ftrace_filter_reset(enable); | 2219 | ftrace_filter_reset(iter->hash); |
1652 | 2220 | ||
1653 | if (file->f_mode & FMODE_READ) { | 2221 | if (file->f_mode & FMODE_READ) { |
1654 | iter->pg = ftrace_pages_start; | 2222 | iter->pg = ftrace_pages_start; |
1655 | iter->flags = enable ? FTRACE_ITER_FILTER : | ||
1656 | FTRACE_ITER_NOTRACE; | ||
1657 | 2223 | ||
1658 | ret = seq_open(file, &show_ftrace_seq_ops); | 2224 | ret = seq_open(file, &show_ftrace_seq_ops); |
1659 | if (!ret) { | 2225 | if (!ret) { |
1660 | struct seq_file *m = file->private_data; | 2226 | struct seq_file *m = file->private_data; |
1661 | m->private = iter; | 2227 | m->private = iter; |
1662 | } else { | 2228 | } else { |
2229 | /* Failed */ | ||
2230 | free_ftrace_hash(iter->hash); | ||
1663 | trace_parser_put(&iter->parser); | 2231 | trace_parser_put(&iter->parser); |
1664 | kfree(iter); | 2232 | kfree(iter); |
1665 | } | 2233 | } |
@@ -1673,13 +2241,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) | |||
1673 | static int | 2241 | static int |
1674 | ftrace_filter_open(struct inode *inode, struct file *file) | 2242 | ftrace_filter_open(struct inode *inode, struct file *file) |
1675 | { | 2243 | { |
1676 | return ftrace_regex_open(inode, file, 1); | 2244 | return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, |
2245 | inode, file); | ||
1677 | } | 2246 | } |
1678 | 2247 | ||
1679 | static int | 2248 | static int |
1680 | ftrace_notrace_open(struct inode *inode, struct file *file) | 2249 | ftrace_notrace_open(struct inode *inode, struct file *file) |
1681 | { | 2250 | { |
1682 | return ftrace_regex_open(inode, file, 0); | 2251 | return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, |
2252 | inode, file); | ||
1683 | } | 2253 | } |
1684 | 2254 | ||
1685 | static loff_t | 2255 | static loff_t |
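Note: ftrace_regex_open() is where the copy-on-write filter handling starts. A writer gets a private copy of the live hash in iter->hash; every write() edits only that copy, and the live ops->filter_hash / ops->notrace_hash is swapped in one step at release time (see ftrace_regex_release() further down). A sketch of the whole lifecycle for the filter case, error handling and locking details elided, names as in this patch:

    static void filter_write_lifecycle(struct ftrace_ops *ops,
                                       struct ftrace_iterator *iter,
                                       char *buf, int len)
    {
            int filter_hash = 1;            /* as set from FTRACE_ITER_FILTER at release */

            /* open(): take a private copy of the live hash */
            iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
                                                    ops->filter_hash);

            /* write(): edit only the copy */
            ftrace_match_records(iter->hash, buf, len);

            /* release(): publish the copy in one step, under ftrace_lock */
            ftrace_hash_rec_disable(ops, filter_hash);
            if (!ftrace_hash_move(&ops->filter_hash, iter->hash))
                    ftrace_hash_rec_enable(ops, filter_hash);
    }

Readers that opened the file before the swap keep iterating over data that is still valid, and a failed write never corrupts the live filter.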
@@ -1724,86 +2294,99 @@ static int ftrace_match(char *str, char *regex, int len, int type) | |||
1724 | } | 2294 | } |
1725 | 2295 | ||
1726 | static int | 2296 | static int |
1727 | ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) | 2297 | enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) |
2298 | { | ||
2299 | struct ftrace_func_entry *entry; | ||
2300 | int ret = 0; | ||
2301 | |||
2302 | entry = ftrace_lookup_ip(hash, rec->ip); | ||
2303 | if (not) { | ||
2304 | /* Do nothing if it doesn't exist */ | ||
2305 | if (!entry) | ||
2306 | return 0; | ||
2307 | |||
2308 | free_hash_entry(hash, entry); | ||
2309 | } else { | ||
2310 | /* Do nothing if it exists */ | ||
2311 | if (entry) | ||
2312 | return 0; | ||
2313 | |||
2314 | ret = add_hash_entry(hash, rec->ip); | ||
2315 | } | ||
2316 | return ret; | ||
2317 | } | ||
2318 | |||
2319 | static int | ||
2320 | ftrace_match_record(struct dyn_ftrace *rec, char *mod, | ||
2321 | char *regex, int len, int type) | ||
1728 | { | 2322 | { |
1729 | char str[KSYM_SYMBOL_LEN]; | 2323 | char str[KSYM_SYMBOL_LEN]; |
2324 | char *modname; | ||
2325 | |||
2326 | kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); | ||
2327 | |||
2328 | if (mod) { | ||
2329 | /* module lookup requires matching the module */ | ||
2330 | if (!modname || strcmp(modname, mod)) | ||
2331 | return 0; | ||
2332 | |||
2333 | /* blank search means to match all funcs in the mod */ | ||
2334 | if (!len) | ||
2335 | return 1; | ||
2336 | } | ||
1730 | 2337 | ||
1731 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); | ||
1732 | return ftrace_match(str, regex, len, type); | 2338 | return ftrace_match(str, regex, len, type); |
1733 | } | 2339 | } |
1734 | 2340 | ||
1735 | static int ftrace_match_records(char *buff, int len, int enable) | 2341 | static int |
2342 | match_records(struct ftrace_hash *hash, char *buff, | ||
2343 | int len, char *mod, int not) | ||
1736 | { | 2344 | { |
1737 | unsigned int search_len; | 2345 | unsigned search_len = 0; |
1738 | struct ftrace_page *pg; | 2346 | struct ftrace_page *pg; |
1739 | struct dyn_ftrace *rec; | 2347 | struct dyn_ftrace *rec; |
1740 | unsigned long flag; | 2348 | int type = MATCH_FULL; |
1741 | char *search; | 2349 | char *search = buff; |
1742 | int type; | ||
1743 | int not; | ||
1744 | int found = 0; | 2350 | int found = 0; |
2351 | int ret; | ||
1745 | 2352 | ||
1746 | flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | 2353 | if (len) { |
1747 | type = filter_parse_regex(buff, len, &search, ¬); | 2354 | type = filter_parse_regex(buff, len, &search, ¬); |
1748 | 2355 | search_len = strlen(search); | |
1749 | search_len = strlen(search); | 2356 | } |
1750 | 2357 | ||
1751 | mutex_lock(&ftrace_lock); | 2358 | mutex_lock(&ftrace_lock); |
1752 | do_for_each_ftrace_rec(pg, rec) { | ||
1753 | 2359 | ||
1754 | if (rec->flags & FTRACE_FL_FAILED) | 2360 | if (unlikely(ftrace_disabled)) |
1755 | continue; | 2361 | goto out_unlock; |
1756 | 2362 | ||
1757 | if (ftrace_match_record(rec, search, search_len, type)) { | 2363 | do_for_each_ftrace_rec(pg, rec) { |
1758 | if (not) | 2364 | |
1759 | rec->flags &= ~flag; | 2365 | if (ftrace_match_record(rec, mod, search, search_len, type)) { |
1760 | else | 2366 | ret = enter_record(hash, rec, not); |
1761 | rec->flags |= flag; | 2367 | if (ret < 0) { |
2368 | found = ret; | ||
2369 | goto out_unlock; | ||
2370 | } | ||
1762 | found = 1; | 2371 | found = 1; |
1763 | } | 2372 | } |
1764 | /* | ||
1765 | * Only enable filtering if we have a function that | ||
1766 | * is filtered on. | ||
1767 | */ | ||
1768 | if (enable && (rec->flags & FTRACE_FL_FILTER)) | ||
1769 | ftrace_filtered = 1; | ||
1770 | } while_for_each_ftrace_rec(); | 2373 | } while_for_each_ftrace_rec(); |
2374 | out_unlock: | ||
1771 | mutex_unlock(&ftrace_lock); | 2375 | mutex_unlock(&ftrace_lock); |
1772 | 2376 | ||
1773 | return found; | 2377 | return found; |
1774 | } | 2378 | } |
1775 | 2379 | ||
1776 | static int | 2380 | static int |
1777 | ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, | 2381 | ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) |
1778 | char *regex, int len, int type) | ||
1779 | { | 2382 | { |
1780 | char str[KSYM_SYMBOL_LEN]; | 2383 | return match_records(hash, buff, len, NULL, 0); |
1781 | char *modname; | ||
1782 | |||
1783 | kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); | ||
1784 | |||
1785 | if (!modname || strcmp(modname, mod)) | ||
1786 | return 0; | ||
1787 | |||
1788 | /* blank search means to match all funcs in the mod */ | ||
1789 | if (len) | ||
1790 | return ftrace_match(str, regex, len, type); | ||
1791 | else | ||
1792 | return 1; | ||
1793 | } | 2384 | } |
1794 | 2385 | ||
1795 | static int ftrace_match_module_records(char *buff, char *mod, int enable) | 2386 | static int |
2387 | ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) | ||
1796 | { | 2388 | { |
1797 | unsigned search_len = 0; | ||
1798 | struct ftrace_page *pg; | ||
1799 | struct dyn_ftrace *rec; | ||
1800 | int type = MATCH_FULL; | ||
1801 | char *search = buff; | ||
1802 | unsigned long flag; | ||
1803 | int not = 0; | 2389 | int not = 0; |
1804 | int found = 0; | ||
1805 | |||
1806 | flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | ||
1807 | 2390 | ||
1808 | /* blank or '*' mean the same */ | 2391 | /* blank or '*' mean the same */ |
1809 | if (strcmp(buff, "*") == 0) | 2392 | if (strcmp(buff, "*") == 0) |
@@ -1815,32 +2398,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) | |||
1815 | not = 1; | 2398 | not = 1; |
1816 | } | 2399 | } |
1817 | 2400 | ||
1818 | if (strlen(buff)) { | 2401 | return match_records(hash, buff, strlen(buff), mod, not); |
1819 | type = filter_parse_regex(buff, strlen(buff), &search, ¬); | ||
1820 | search_len = strlen(search); | ||
1821 | } | ||
1822 | |||
1823 | mutex_lock(&ftrace_lock); | ||
1824 | do_for_each_ftrace_rec(pg, rec) { | ||
1825 | |||
1826 | if (rec->flags & FTRACE_FL_FAILED) | ||
1827 | continue; | ||
1828 | |||
1829 | if (ftrace_match_module_record(rec, mod, | ||
1830 | search, search_len, type)) { | ||
1831 | if (not) | ||
1832 | rec->flags &= ~flag; | ||
1833 | else | ||
1834 | rec->flags |= flag; | ||
1835 | found = 1; | ||
1836 | } | ||
1837 | if (enable && (rec->flags & FTRACE_FL_FILTER)) | ||
1838 | ftrace_filtered = 1; | ||
1839 | |||
1840 | } while_for_each_ftrace_rec(); | ||
1841 | mutex_unlock(&ftrace_lock); | ||
1842 | |||
1843 | return found; | ||
1844 | } | 2402 | } |
1845 | 2403 | ||
1846 | /* | 2404 | /* |
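Note: match_records() is now the single path both the plain and the module variants funnel through: the pattern is parsed once, each record is matched by symbol (and optionally module) name, and enter_record() treats the hash as a set of instruction pointers, where 'not' removes an entry, otherwise one is added, and already-present or missing entries are a no-op. As a hedged example ("do_fork" is just an illustrative symbol), a write of "!do_fork" against the current filter copy boils down to roughly this, under ftrace_lock:

    struct ftrace_page *pg;
    struct dyn_ftrace *rec;
    char *search;
    int not;
    int type = filter_parse_regex(buf, len, &search, &not); /* "!do_fork": not=1, search="do_fork" */

    do_for_each_ftrace_rec(pg, rec) {
            if (ftrace_match_record(rec, NULL, search, strlen(search), type))
                    enter_record(hash, rec, not);   /* not=1: drop rec->ip from the hash */
    } while_for_each_ftrace_rec();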
@@ -1851,7 +2409,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) | |||
1851 | static int | 2409 | static int |
1852 | ftrace_mod_callback(char *func, char *cmd, char *param, int enable) | 2410 | ftrace_mod_callback(char *func, char *cmd, char *param, int enable) |
1853 | { | 2411 | { |
2412 | struct ftrace_ops *ops = &global_ops; | ||
2413 | struct ftrace_hash *hash; | ||
1854 | char *mod; | 2414 | char *mod; |
2415 | int ret = -EINVAL; | ||
1855 | 2416 | ||
1856 | /* | 2417 | /* |
1857 | * cmd == 'mod' because we only registered this func | 2418 | * cmd == 'mod' because we only registered this func |
@@ -1863,15 +2424,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) | |||
1863 | 2424 | ||
1864 | /* we must have a module name */ | 2425 | /* we must have a module name */ |
1865 | if (!param) | 2426 | if (!param) |
1866 | return -EINVAL; | 2427 | return ret; |
1867 | 2428 | ||
1868 | mod = strsep(¶m, ":"); | 2429 | mod = strsep(¶m, ":"); |
1869 | if (!strlen(mod)) | 2430 | if (!strlen(mod)) |
1870 | return -EINVAL; | 2431 | return ret; |
1871 | 2432 | ||
1872 | if (ftrace_match_module_records(func, mod, enable)) | 2433 | if (enable) |
1873 | return 0; | 2434 | hash = ops->filter_hash; |
1874 | return -EINVAL; | 2435 | else |
2436 | hash = ops->notrace_hash; | ||
2437 | |||
2438 | ret = ftrace_match_module_records(hash, func, mod); | ||
2439 | if (!ret) | ||
2440 | ret = -EINVAL; | ||
2441 | if (ret < 0) | ||
2442 | return ret; | ||
2443 | |||
2444 | return 0; | ||
1875 | } | 2445 | } |
1876 | 2446 | ||
1877 | static struct ftrace_func_command ftrace_mod_cmd = { | 2447 | static struct ftrace_func_command ftrace_mod_cmd = { |
@@ -1922,6 +2492,7 @@ static int ftrace_probe_registered; | |||
1922 | 2492 | ||
1923 | static void __enable_ftrace_function_probe(void) | 2493 | static void __enable_ftrace_function_probe(void) |
1924 | { | 2494 | { |
2495 | int ret; | ||
1925 | int i; | 2496 | int i; |
1926 | 2497 | ||
1927 | if (ftrace_probe_registered) | 2498 | if (ftrace_probe_registered) |
@@ -1936,13 +2507,16 @@ static void __enable_ftrace_function_probe(void) | |||
1936 | if (i == FTRACE_FUNC_HASHSIZE) | 2507 | if (i == FTRACE_FUNC_HASHSIZE) |
1937 | return; | 2508 | return; |
1938 | 2509 | ||
1939 | __register_ftrace_function(&trace_probe_ops); | 2510 | ret = __register_ftrace_function(&trace_probe_ops); |
1940 | ftrace_startup(0); | 2511 | if (!ret) |
2512 | ret = ftrace_startup(&trace_probe_ops, 0); | ||
2513 | |||
1941 | ftrace_probe_registered = 1; | 2514 | ftrace_probe_registered = 1; |
1942 | } | 2515 | } |
1943 | 2516 | ||
1944 | static void __disable_ftrace_function_probe(void) | 2517 | static void __disable_ftrace_function_probe(void) |
1945 | { | 2518 | { |
2519 | int ret; | ||
1946 | int i; | 2520 | int i; |
1947 | 2521 | ||
1948 | if (!ftrace_probe_registered) | 2522 | if (!ftrace_probe_registered) |
@@ -1955,8 +2529,10 @@ static void __disable_ftrace_function_probe(void) | |||
1955 | } | 2529 | } |
1956 | 2530 | ||
1957 | /* no more funcs left */ | 2531 | /* no more funcs left */ |
1958 | __unregister_ftrace_function(&trace_probe_ops); | 2532 | ret = __unregister_ftrace_function(&trace_probe_ops); |
1959 | ftrace_shutdown(0); | 2533 | if (!ret) |
2534 | ftrace_shutdown(&trace_probe_ops, 0); | ||
2535 | |||
1960 | ftrace_probe_registered = 0; | 2536 | ftrace_probe_registered = 0; |
1961 | } | 2537 | } |
1962 | 2538 | ||
@@ -1992,12 +2568,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
1992 | return -EINVAL; | 2568 | return -EINVAL; |
1993 | 2569 | ||
1994 | mutex_lock(&ftrace_lock); | 2570 | mutex_lock(&ftrace_lock); |
1995 | do_for_each_ftrace_rec(pg, rec) { | ||
1996 | 2571 | ||
1997 | if (rec->flags & FTRACE_FL_FAILED) | 2572 | if (unlikely(ftrace_disabled)) |
1998 | continue; | 2573 | goto out_unlock; |
2574 | |||
2575 | do_for_each_ftrace_rec(pg, rec) { | ||
1999 | 2576 | ||
2000 | if (!ftrace_match_record(rec, search, len, type)) | 2577 | if (!ftrace_match_record(rec, NULL, search, len, type)) |
2001 | continue; | 2578 | continue; |
2002 | 2579 | ||
2003 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | 2580 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); |
@@ -2158,7 +2735,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) | |||
2158 | return ret; | 2735 | return ret; |
2159 | } | 2736 | } |
2160 | 2737 | ||
2161 | static int ftrace_process_regex(char *buff, int len, int enable) | 2738 | static int ftrace_process_regex(struct ftrace_hash *hash, |
2739 | char *buff, int len, int enable) | ||
2162 | { | 2740 | { |
2163 | char *func, *command, *next = buff; | 2741 | char *func, *command, *next = buff; |
2164 | struct ftrace_func_command *p; | 2742 | struct ftrace_func_command *p; |
@@ -2167,9 +2745,12 @@ static int ftrace_process_regex(char *buff, int len, int enable) | |||
2167 | func = strsep(&next, ":"); | 2745 | func = strsep(&next, ":"); |
2168 | 2746 | ||
2169 | if (!next) { | 2747 | if (!next) { |
2170 | if (ftrace_match_records(func, len, enable)) | 2748 | ret = ftrace_match_records(hash, func, len); |
2171 | return 0; | 2749 | if (!ret) |
2172 | return ret; | 2750 | ret = -EINVAL; |
2751 | if (ret < 0) | ||
2752 | return ret; | ||
2753 | return 0; | ||
2173 | } | 2754 | } |
2174 | 2755 | ||
2175 | /* command found */ | 2756 | /* command found */ |
@@ -2202,6 +2783,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
2202 | 2783 | ||
2203 | mutex_lock(&ftrace_regex_lock); | 2784 | mutex_lock(&ftrace_regex_lock); |
2204 | 2785 | ||
2786 | ret = -ENODEV; | ||
2787 | if (unlikely(ftrace_disabled)) | ||
2788 | goto out_unlock; | ||
2789 | |||
2205 | if (file->f_mode & FMODE_READ) { | 2790 | if (file->f_mode & FMODE_READ) { |
2206 | struct seq_file *m = file->private_data; | 2791 | struct seq_file *m = file->private_data; |
2207 | iter = m->private; | 2792 | iter = m->private; |
@@ -2213,7 +2798,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
2213 | 2798 | ||
2214 | if (read >= 0 && trace_parser_loaded(parser) && | 2799 | if (read >= 0 && trace_parser_loaded(parser) && |
2215 | !trace_parser_cont(parser)) { | 2800 | !trace_parser_cont(parser)) { |
2216 | ret = ftrace_process_regex(parser->buffer, | 2801 | ret = ftrace_process_regex(iter->hash, parser->buffer, |
2217 | parser->idx, enable); | 2802 | parser->idx, enable); |
2218 | trace_parser_clear(parser); | 2803 | trace_parser_clear(parser); |
2219 | if (ret) | 2804 | if (ret) |
@@ -2241,22 +2826,49 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, | |||
2241 | return ftrace_regex_write(file, ubuf, cnt, ppos, 0); | 2826 | return ftrace_regex_write(file, ubuf, cnt, ppos, 0); |
2242 | } | 2827 | } |
2243 | 2828 | ||
2244 | static void | 2829 | static int |
2245 | ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) | 2830 | ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, |
2831 | int reset, int enable) | ||
2246 | { | 2832 | { |
2833 | struct ftrace_hash **orig_hash; | ||
2834 | struct ftrace_hash *hash; | ||
2835 | int ret; | ||
2836 | |||
2837 | /* All global ops uses the global ops filters */ | ||
2838 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) | ||
2839 | ops = &global_ops; | ||
2840 | |||
2247 | if (unlikely(ftrace_disabled)) | 2841 | if (unlikely(ftrace_disabled)) |
2248 | return; | 2842 | return -ENODEV; |
2843 | |||
2844 | if (enable) | ||
2845 | orig_hash = &ops->filter_hash; | ||
2846 | else | ||
2847 | orig_hash = &ops->notrace_hash; | ||
2848 | |||
2849 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | ||
2850 | if (!hash) | ||
2851 | return -ENOMEM; | ||
2249 | 2852 | ||
2250 | mutex_lock(&ftrace_regex_lock); | 2853 | mutex_lock(&ftrace_regex_lock); |
2251 | if (reset) | 2854 | if (reset) |
2252 | ftrace_filter_reset(enable); | 2855 | ftrace_filter_reset(hash); |
2253 | if (buf) | 2856 | if (buf) |
2254 | ftrace_match_records(buf, len, enable); | 2857 | ftrace_match_records(hash, buf, len); |
2858 | |||
2859 | mutex_lock(&ftrace_lock); | ||
2860 | ret = ftrace_hash_move(orig_hash, hash); | ||
2861 | mutex_unlock(&ftrace_lock); | ||
2862 | |||
2255 | mutex_unlock(&ftrace_regex_lock); | 2863 | mutex_unlock(&ftrace_regex_lock); |
2864 | |||
2865 | free_ftrace_hash(hash); | ||
2866 | return ret; | ||
2256 | } | 2867 | } |
2257 | 2868 | ||
2258 | /** | 2869 | /** |
2259 | * ftrace_set_filter - set a function to filter on in ftrace | 2870 | * ftrace_set_filter - set a function to filter on in ftrace |
2871 | * @ops - the ops to set the filter with | ||
2260 | * @buf - the string that holds the function filter text. | 2872 | * @buf - the string that holds the function filter text. |
2261 | * @len - the length of the string. | 2873 | * @len - the length of the string. |
2262 | * @reset - non zero to reset all filters before applying this filter. | 2874 | * @reset - non zero to reset all filters before applying this filter. |
@@ -2264,13 +2876,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) | |||
2264 | * Filters denote which functions should be enabled when tracing is enabled. | 2876 | * Filters denote which functions should be enabled when tracing is enabled. |
2265 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. | 2877 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. |
2266 | */ | 2878 | */ |
2267 | void ftrace_set_filter(unsigned char *buf, int len, int reset) | 2879 | void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, |
2880 | int len, int reset) | ||
2268 | { | 2881 | { |
2269 | ftrace_set_regex(buf, len, reset, 1); | 2882 | ftrace_set_regex(ops, buf, len, reset, 1); |
2270 | } | 2883 | } |
2884 | EXPORT_SYMBOL_GPL(ftrace_set_filter); | ||
2271 | 2885 | ||
2272 | /** | 2886 | /** |
2273 | * ftrace_set_notrace - set a function to not trace in ftrace | 2887 | * ftrace_set_notrace - set a function to not trace in ftrace |
2888 | * @ops - the ops to set the notrace filter with | ||
2274 | * @buf - the string that holds the function notrace text. | 2889 | * @buf - the string that holds the function notrace text. |
2275 | * @len - the length of the string. | 2890 | * @len - the length of the string. |
2276 | * @reset - non zero to reset all filters before applying this filter. | 2891 | * @reset - non zero to reset all filters before applying this filter. |
@@ -2279,10 +2894,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) | |||
2279 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled | 2894 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled |
2280 | * for tracing. | 2895 | * for tracing. |
2281 | */ | 2896 | */ |
2282 | void ftrace_set_notrace(unsigned char *buf, int len, int reset) | 2897 | void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, |
2898 | int len, int reset) | ||
2283 | { | 2899 | { |
2284 | ftrace_set_regex(buf, len, reset, 0); | 2900 | ftrace_set_regex(ops, buf, len, reset, 0); |
2285 | } | 2901 | } |
2902 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); | ||
2903 | /** | ||
2904 | * ftrace_set_filter - set a function to filter on in ftrace | ||
2905 | * @ops - the ops to set the filter with | ||
2906 | * @buf - the string that holds the function filter text. | ||
2907 | * @len - the length of the string. | ||
2908 | * @reset - non zero to reset all filters before applying this filter. | ||
2909 | * | ||
2910 | * Filters denote which functions should be enabled when tracing is enabled. | ||
2911 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. | ||
2912 | */ | ||
2913 | void ftrace_set_global_filter(unsigned char *buf, int len, int reset) | ||
2914 | { | ||
2915 | ftrace_set_regex(&global_ops, buf, len, reset, 1); | ||
2916 | } | ||
2917 | EXPORT_SYMBOL_GPL(ftrace_set_global_filter); | ||
2918 | |||
2919 | /** | ||
2920 | * ftrace_set_notrace - set a function to not trace in ftrace | ||
2921 | * @ops - the ops to set the notrace filter with | ||
2922 | * @buf - the string that holds the function notrace text. | ||
2923 | * @len - the length of the string. | ||
2924 | * @reset - non zero to reset all filters before applying this filter. | ||
2925 | * | ||
2926 | * Notrace Filters denote which functions should not be enabled when tracing | ||
2927 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled | ||
2928 | * for tracing. | ||
2929 | */ | ||
2930 | void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) | ||
2931 | { | ||
2932 | ftrace_set_regex(&global_ops, buf, len, reset, 0); | ||
2933 | } | ||
2934 | EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); | ||
2286 | 2935 | ||
2287 | /* | 2936 | /* |
2288 | * command line interface to allow users to set filters on boot up. | 2937 | * command line interface to allow users to set filters on boot up. |
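Note: the new ftrace_set_filter()/ftrace_set_notrace() signatures plus the EXPORT_SYMBOL_GPL()s on register/unregister_ftrace_function() (added further down) are what make per-ops filtering usable from other code; the old global behaviour moves to ftrace_set_global_filter()/ftrace_set_global_notrace(). A hedged usage sketch of the intended API from a module (my_ops, my_callback and "kfree" as the target function are illustrative, not taken from this patch):

    #include <linux/module.h>
    #include <linux/ftrace.h>
    #include <linux/string.h>

    /* called for every traced function that passes my_ops' filter hash */
    static void my_callback(unsigned long ip, unsigned long parent_ip)
    {
    }

    static struct ftrace_ops my_ops = {
            .func = my_callback,
    };

    static int __init my_tracer_init(void)
    {
            /* filter only this ops; global_ops and other tracers are untouched */
            ftrace_set_filter(&my_ops, (unsigned char *)"kfree", strlen("kfree"), 1);
            return register_ftrace_function(&my_ops);
    }

    static void __exit my_tracer_exit(void)
    {
            unregister_ftrace_function(&my_ops);
    }

    module_init(my_tracer_init);
    module_exit(my_tracer_exit);
    MODULE_LICENSE("GPL");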
@@ -2333,22 +2982,23 @@ static void __init set_ftrace_early_graph(char *buf) | |||
2333 | } | 2982 | } |
2334 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 2983 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
2335 | 2984 | ||
2336 | static void __init set_ftrace_early_filter(char *buf, int enable) | 2985 | static void __init |
2986 | set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) | ||
2337 | { | 2987 | { |
2338 | char *func; | 2988 | char *func; |
2339 | 2989 | ||
2340 | while (buf) { | 2990 | while (buf) { |
2341 | func = strsep(&buf, ","); | 2991 | func = strsep(&buf, ","); |
2342 | ftrace_set_regex(func, strlen(func), 0, enable); | 2992 | ftrace_set_regex(ops, func, strlen(func), 0, enable); |
2343 | } | 2993 | } |
2344 | } | 2994 | } |
2345 | 2995 | ||
2346 | static void __init set_ftrace_early_filters(void) | 2996 | static void __init set_ftrace_early_filters(void) |
2347 | { | 2997 | { |
2348 | if (ftrace_filter_buf[0]) | 2998 | if (ftrace_filter_buf[0]) |
2349 | set_ftrace_early_filter(ftrace_filter_buf, 1); | 2999 | set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); |
2350 | if (ftrace_notrace_buf[0]) | 3000 | if (ftrace_notrace_buf[0]) |
2351 | set_ftrace_early_filter(ftrace_notrace_buf, 0); | 3001 | set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); |
2352 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3002 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
2353 | if (ftrace_graph_buf[0]) | 3003 | if (ftrace_graph_buf[0]) |
2354 | set_ftrace_early_graph(ftrace_graph_buf); | 3004 | set_ftrace_early_graph(ftrace_graph_buf); |
@@ -2356,11 +3006,14 @@ static void __init set_ftrace_early_filters(void) | |||
2356 | } | 3006 | } |
2357 | 3007 | ||
2358 | static int | 3008 | static int |
2359 | ftrace_regex_release(struct inode *inode, struct file *file, int enable) | 3009 | ftrace_regex_release(struct inode *inode, struct file *file) |
2360 | { | 3010 | { |
2361 | struct seq_file *m = (struct seq_file *)file->private_data; | 3011 | struct seq_file *m = (struct seq_file *)file->private_data; |
2362 | struct ftrace_iterator *iter; | 3012 | struct ftrace_iterator *iter; |
3013 | struct ftrace_hash **orig_hash; | ||
2363 | struct trace_parser *parser; | 3014 | struct trace_parser *parser; |
3015 | int filter_hash; | ||
3016 | int ret; | ||
2364 | 3017 | ||
2365 | mutex_lock(&ftrace_regex_lock); | 3018 | mutex_lock(&ftrace_regex_lock); |
2366 | if (file->f_mode & FMODE_READ) { | 3019 | if (file->f_mode & FMODE_READ) { |
@@ -2373,33 +3026,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) | |||
2373 | parser = &iter->parser; | 3026 | parser = &iter->parser; |
2374 | if (trace_parser_loaded(parser)) { | 3027 | if (trace_parser_loaded(parser)) { |
2375 | parser->buffer[parser->idx] = 0; | 3028 | parser->buffer[parser->idx] = 0; |
2376 | ftrace_match_records(parser->buffer, parser->idx, enable); | 3029 | ftrace_match_records(iter->hash, parser->buffer, parser->idx); |
2377 | } | 3030 | } |
2378 | 3031 | ||
2379 | mutex_lock(&ftrace_lock); | ||
2380 | if (ftrace_start_up && ftrace_enabled) | ||
2381 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | ||
2382 | mutex_unlock(&ftrace_lock); | ||
2383 | |||
2384 | trace_parser_put(parser); | 3032 | trace_parser_put(parser); |
3033 | |||
3034 | if (file->f_mode & FMODE_WRITE) { | ||
3035 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); | ||
3036 | |||
3037 | if (filter_hash) | ||
3038 | orig_hash = &iter->ops->filter_hash; | ||
3039 | else | ||
3040 | orig_hash = &iter->ops->notrace_hash; | ||
3041 | |||
3042 | mutex_lock(&ftrace_lock); | ||
3043 | /* | ||
3044 | * Remove the current set, update the hash and add | ||
3045 | * them back. | ||
3046 | */ | ||
3047 | ftrace_hash_rec_disable(iter->ops, filter_hash); | ||
3048 | ret = ftrace_hash_move(orig_hash, iter->hash); | ||
3049 | if (!ret) { | ||
3050 | ftrace_hash_rec_enable(iter->ops, filter_hash); | ||
3051 | if (iter->ops->flags & FTRACE_OPS_FL_ENABLED | ||
3052 | && ftrace_enabled) | ||
3053 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | ||
3054 | } | ||
3055 | mutex_unlock(&ftrace_lock); | ||
3056 | } | ||
3057 | free_ftrace_hash(iter->hash); | ||
2385 | kfree(iter); | 3058 | kfree(iter); |
2386 | 3059 | ||
2387 | mutex_unlock(&ftrace_regex_lock); | 3060 | mutex_unlock(&ftrace_regex_lock); |
2388 | return 0; | 3061 | return 0; |
2389 | } | 3062 | } |
2390 | 3063 | ||
2391 | static int | ||
2392 | ftrace_filter_release(struct inode *inode, struct file *file) | ||
2393 | { | ||
2394 | return ftrace_regex_release(inode, file, 1); | ||
2395 | } | ||
2396 | |||
2397 | static int | ||
2398 | ftrace_notrace_release(struct inode *inode, struct file *file) | ||
2399 | { | ||
2400 | return ftrace_regex_release(inode, file, 0); | ||
2401 | } | ||
2402 | |||
2403 | static const struct file_operations ftrace_avail_fops = { | 3064 | static const struct file_operations ftrace_avail_fops = { |
2404 | .open = ftrace_avail_open, | 3065 | .open = ftrace_avail_open, |
2405 | .read = seq_read, | 3066 | .read = seq_read, |
@@ -2407,8 +3068,8 @@ static const struct file_operations ftrace_avail_fops = { | |||
2407 | .release = seq_release_private, | 3068 | .release = seq_release_private, |
2408 | }; | 3069 | }; |
2409 | 3070 | ||
2410 | static const struct file_operations ftrace_failures_fops = { | 3071 | static const struct file_operations ftrace_enabled_fops = { |
2411 | .open = ftrace_failures_open, | 3072 | .open = ftrace_enabled_open, |
2412 | .read = seq_read, | 3073 | .read = seq_read, |
2413 | .llseek = seq_lseek, | 3074 | .llseek = seq_lseek, |
2414 | .release = seq_release_private, | 3075 | .release = seq_release_private, |
@@ -2418,8 +3079,8 @@ static const struct file_operations ftrace_filter_fops = { | |||
2418 | .open = ftrace_filter_open, | 3079 | .open = ftrace_filter_open, |
2419 | .read = seq_read, | 3080 | .read = seq_read, |
2420 | .write = ftrace_filter_write, | 3081 | .write = ftrace_filter_write, |
2421 | .llseek = no_llseek, | 3082 | .llseek = ftrace_regex_lseek, |
2422 | .release = ftrace_filter_release, | 3083 | .release = ftrace_regex_release, |
2423 | }; | 3084 | }; |
2424 | 3085 | ||
2425 | static const struct file_operations ftrace_notrace_fops = { | 3086 | static const struct file_operations ftrace_notrace_fops = { |
@@ -2427,7 +3088,7 @@ static const struct file_operations ftrace_notrace_fops = { | |||
2427 | .read = seq_read, | 3088 | .read = seq_read, |
2428 | .write = ftrace_notrace_write, | 3089 | .write = ftrace_notrace_write, |
2429 | .llseek = ftrace_regex_lseek, | 3090 | .llseek = ftrace_regex_lseek, |
2430 | .release = ftrace_notrace_release, | 3091 | .release = ftrace_regex_release, |
2431 | }; | 3092 | }; |
2432 | 3093 | ||
2433 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3094 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
@@ -2536,9 +3197,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
2536 | bool exists; | 3197 | bool exists; |
2537 | int i; | 3198 | int i; |
2538 | 3199 | ||
2539 | if (ftrace_disabled) | ||
2540 | return -ENODEV; | ||
2541 | |||
2542 | /* decode regex */ | 3200 | /* decode regex */ |
2543 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); | 3201 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); |
2544 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) | 3202 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) |
@@ -2547,12 +3205,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
2547 | search_len = strlen(search); | 3205 | search_len = strlen(search); |
2548 | 3206 | ||
2549 | mutex_lock(&ftrace_lock); | 3207 | mutex_lock(&ftrace_lock); |
3208 | |||
3209 | if (unlikely(ftrace_disabled)) { | ||
3210 | mutex_unlock(&ftrace_lock); | ||
3211 | return -ENODEV; | ||
3212 | } | ||
3213 | |||
2550 | do_for_each_ftrace_rec(pg, rec) { | 3214 | do_for_each_ftrace_rec(pg, rec) { |
2551 | 3215 | ||
2552 | if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) | 3216 | if (rec->flags & FTRACE_FL_FREE) |
2553 | continue; | 3217 | continue; |
2554 | 3218 | ||
2555 | if (ftrace_match_record(rec, search, search_len, type)) { | 3219 | if (ftrace_match_record(rec, NULL, search, search_len, type)) { |
2556 | /* if it is in the array */ | 3220 | /* if it is in the array */ |
2557 | exists = false; | 3221 | exists = false; |
2558 | for (i = 0; i < *idx; i++) { | 3222 | for (i = 0; i < *idx; i++) { |
@@ -2632,6 +3296,7 @@ static const struct file_operations ftrace_graph_fops = { | |||
2632 | .read = seq_read, | 3296 | .read = seq_read, |
2633 | .write = ftrace_graph_write, | 3297 | .write = ftrace_graph_write, |
2634 | .release = ftrace_graph_release, | 3298 | .release = ftrace_graph_release, |
3299 | .llseek = seq_lseek, | ||
2635 | }; | 3300 | }; |
2636 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 3301 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
2637 | 3302 | ||
@@ -2641,8 +3306,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
2641 | trace_create_file("available_filter_functions", 0444, | 3306 | trace_create_file("available_filter_functions", 0444, |
2642 | d_tracer, NULL, &ftrace_avail_fops); | 3307 | d_tracer, NULL, &ftrace_avail_fops); |
2643 | 3308 | ||
2644 | trace_create_file("failures", 0444, | 3309 | trace_create_file("enabled_functions", 0444, |
2645 | d_tracer, NULL, &ftrace_failures_fops); | 3310 | d_tracer, NULL, &ftrace_enabled_fops); |
2646 | 3311 | ||
2647 | trace_create_file("set_ftrace_filter", 0644, d_tracer, | 3312 | trace_create_file("set_ftrace_filter", 0644, d_tracer, |
2648 | NULL, &ftrace_filter_fops); | 3313 | NULL, &ftrace_filter_fops); |
@@ -2682,7 +3347,10 @@ static int ftrace_process_locs(struct module *mod, | |||
2682 | ftrace_record_ip(addr); | 3347 | ftrace_record_ip(addr); |
2683 | } | 3348 | } |
2684 | 3349 | ||
2685 | /* disable interrupts to prevent kstop machine */ | 3350 | /* |
3351 | * Disable interrupts to prevent interrupts from executing | ||
3352 | * code that is being modified. | ||
3353 | */ | ||
2686 | local_irq_save(flags); | 3354 | local_irq_save(flags); |
2687 | ftrace_update_code(mod); | 3355 | ftrace_update_code(mod); |
2688 | local_irq_restore(flags); | 3356 | local_irq_restore(flags); |
@@ -2697,10 +3365,11 @@ void ftrace_release_mod(struct module *mod) | |||
2697 | struct dyn_ftrace *rec; | 3365 | struct dyn_ftrace *rec; |
2698 | struct ftrace_page *pg; | 3366 | struct ftrace_page *pg; |
2699 | 3367 | ||
3368 | mutex_lock(&ftrace_lock); | ||
3369 | |||
2700 | if (ftrace_disabled) | 3370 | if (ftrace_disabled) |
2701 | return; | 3371 | goto out_unlock; |
2702 | 3372 | ||
2703 | mutex_lock(&ftrace_lock); | ||
2704 | do_for_each_ftrace_rec(pg, rec) { | 3373 | do_for_each_ftrace_rec(pg, rec) { |
2705 | if (within_module_core(rec->ip, mod)) { | 3374 | if (within_module_core(rec->ip, mod)) { |
2706 | /* | 3375 | /* |
@@ -2711,6 +3380,7 @@ void ftrace_release_mod(struct module *mod) | |||
2711 | ftrace_free_rec(rec); | 3380 | ftrace_free_rec(rec); |
2712 | } | 3381 | } |
2713 | } while_for_each_ftrace_rec(); | 3382 | } while_for_each_ftrace_rec(); |
3383 | out_unlock: | ||
2714 | mutex_unlock(&ftrace_lock); | 3384 | mutex_unlock(&ftrace_lock); |
2715 | } | 3385 | } |
2716 | 3386 | ||
@@ -2797,6 +3467,10 @@ void __init ftrace_init(void) | |||
2797 | 3467 | ||
2798 | #else | 3468 | #else |
2799 | 3469 | ||
3470 | static struct ftrace_ops global_ops = { | ||
3471 | .func = ftrace_stub, | ||
3472 | }; | ||
3473 | |||
2800 | static int __init ftrace_nodyn_init(void) | 3474 | static int __init ftrace_nodyn_init(void) |
2801 | { | 3475 | { |
2802 | ftrace_enabled = 1; | 3476 | ftrace_enabled = 1; |
@@ -2807,12 +3481,47 @@ device_initcall(ftrace_nodyn_init); | |||
2807 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 3481 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
2808 | static inline void ftrace_startup_enable(int command) { } | 3482 | static inline void ftrace_startup_enable(int command) { } |
2809 | /* Keep as macros so we do not need to define the commands */ | 3483 | /* Keep as macros so we do not need to define the commands */ |
2810 | # define ftrace_startup(command) do { } while (0) | 3484 | # define ftrace_startup(ops, command) \ |
2811 | # define ftrace_shutdown(command) do { } while (0) | 3485 | ({ \ |
3486 | (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ | ||
3487 | 0; \ | ||
3488 | }) | ||
3489 | # define ftrace_shutdown(ops, command) do { } while (0) | ||
2812 | # define ftrace_startup_sysctl() do { } while (0) | 3490 | # define ftrace_startup_sysctl() do { } while (0) |
2813 | # define ftrace_shutdown_sysctl() do { } while (0) | 3491 | # define ftrace_shutdown_sysctl() do { } while (0) |
3492 | |||
3493 | static inline int | ||
3494 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | ||
3495 | { | ||
3496 | return 1; | ||
3497 | } | ||
3498 | |||
2814 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 3499 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
2815 | 3500 | ||
3501 | static void | ||
3502 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) | ||
3503 | { | ||
3504 | struct ftrace_ops *op; | ||
3505 | |||
3506 | if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) | ||
3507 | return; | ||
3508 | |||
3509 | trace_recursion_set(TRACE_INTERNAL_BIT); | ||
3510 | /* | ||
3511 | * Some of the ops may be dynamically allocated, | ||
3512 | * they must be freed after a synchronize_sched(). | ||
3513 | */ | ||
3514 | preempt_disable_notrace(); | ||
3515 | op = rcu_dereference_raw(ftrace_ops_list); | ||
3516 | while (op != &ftrace_list_end) { | ||
3517 | if (ftrace_ops_test(op, ip)) | ||
3518 | op->func(ip, parent_ip); | ||
3519 | op = rcu_dereference_raw(op->next); | ||
3520 | }; | ||
3521 | preempt_enable_notrace(); | ||
3522 | trace_recursion_clear(TRACE_INTERNAL_BIT); | ||
3523 | } | ||
3524 | |||
2816 | static void clear_ftrace_swapper(void) | 3525 | static void clear_ftrace_swapper(void) |
2817 | { | 3526 | { |
2818 | struct task_struct *p; | 3527 | struct task_struct *p; |
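Note: ftrace_ops_list_func() above is the new multiplexer used when more than one ftrace_ops is registered; it walks ftrace_ops_list with a recursion guard and with preemption disabled. The in-code comment about synchronize_sched() implies the following teardown rule for dynamically allocated ops (a sketch of the assumed contract, not code from this patch):

    static void release_dynamic_ops(struct ftrace_ops *ops)
    {
            /* unlink from ftrace_ops_list; walkers may still be inside op->func */
            unregister_ftrace_function(ops);

            /* wait until every preempt-disabled walker has left the list */
            synchronize_sched();

            /* no CPU can still dereference it, safe to free (ops was kmalloc'd) */
            kfree(ops);
    }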
@@ -3105,19 +3814,23 @@ void ftrace_kill(void) | |||
3105 | */ | 3814 | */ |
3106 | int register_ftrace_function(struct ftrace_ops *ops) | 3815 | int register_ftrace_function(struct ftrace_ops *ops) |
3107 | { | 3816 | { |
3108 | int ret; | 3817 | int ret = -1; |
3109 | |||
3110 | if (unlikely(ftrace_disabled)) | ||
3111 | return -1; | ||
3112 | 3818 | ||
3113 | mutex_lock(&ftrace_lock); | 3819 | mutex_lock(&ftrace_lock); |
3114 | 3820 | ||
3821 | if (unlikely(ftrace_disabled)) | ||
3822 | goto out_unlock; | ||
3823 | |||
3115 | ret = __register_ftrace_function(ops); | 3824 | ret = __register_ftrace_function(ops); |
3116 | ftrace_startup(0); | 3825 | if (!ret) |
3826 | ret = ftrace_startup(ops, 0); | ||
3117 | 3827 | ||
3828 | |||
3829 | out_unlock: | ||
3118 | mutex_unlock(&ftrace_lock); | 3830 | mutex_unlock(&ftrace_lock); |
3119 | return ret; | 3831 | return ret; |
3120 | } | 3832 | } |
3833 | EXPORT_SYMBOL_GPL(register_ftrace_function); | ||
3121 | 3834 | ||
3122 | /** | 3835 | /** |
3123 | * unregister_ftrace_function - unregister a function for profiling. | 3836 | * unregister_ftrace_function - unregister a function for profiling. |
@@ -3131,25 +3844,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops) | |||
3131 | 3844 | ||
3132 | mutex_lock(&ftrace_lock); | 3845 | mutex_lock(&ftrace_lock); |
3133 | ret = __unregister_ftrace_function(ops); | 3846 | ret = __unregister_ftrace_function(ops); |
3134 | ftrace_shutdown(0); | 3847 | if (!ret) |
3848 | ftrace_shutdown(ops, 0); | ||
3135 | mutex_unlock(&ftrace_lock); | 3849 | mutex_unlock(&ftrace_lock); |
3136 | 3850 | ||
3137 | return ret; | 3851 | return ret; |
3138 | } | 3852 | } |
3853 | EXPORT_SYMBOL_GPL(unregister_ftrace_function); | ||
3139 | 3854 | ||
3140 | int | 3855 | int |
3141 | ftrace_enable_sysctl(struct ctl_table *table, int write, | 3856 | ftrace_enable_sysctl(struct ctl_table *table, int write, |
3142 | void __user *buffer, size_t *lenp, | 3857 | void __user *buffer, size_t *lenp, |
3143 | loff_t *ppos) | 3858 | loff_t *ppos) |
3144 | { | 3859 | { |
3145 | int ret; | 3860 | int ret = -ENODEV; |
3146 | |||
3147 | if (unlikely(ftrace_disabled)) | ||
3148 | return -ENODEV; | ||
3149 | 3861 | ||
3150 | mutex_lock(&ftrace_lock); | 3862 | mutex_lock(&ftrace_lock); |
3151 | 3863 | ||
3152 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 3864 | if (unlikely(ftrace_disabled)) |
3865 | goto out; | ||
3866 | |||
3867 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
3153 | 3868 | ||
3154 | if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) | 3869 | if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) |
3155 | goto out; | 3870 | goto out; |
@@ -3161,11 +3876,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, | |||
3161 | ftrace_startup_sysctl(); | 3876 | ftrace_startup_sysctl(); |
3162 | 3877 | ||
3163 | /* we are starting ftrace again */ | 3878 | /* we are starting ftrace again */ |
3164 | if (ftrace_list != &ftrace_list_end) { | 3879 | if (ftrace_ops_list != &ftrace_list_end) { |
3165 | if (ftrace_list->next == &ftrace_list_end) | 3880 | if (ftrace_ops_list->next == &ftrace_list_end) |
3166 | ftrace_trace_function = ftrace_list->func; | 3881 | ftrace_trace_function = ftrace_ops_list->func; |
3167 | else | 3882 | else |
3168 | ftrace_trace_function = ftrace_list_func; | 3883 | ftrace_trace_function = ftrace_ops_list_func; |
3169 | } | 3884 | } |
3170 | 3885 | ||
3171 | } else { | 3886 | } else { |
@@ -3289,7 +4004,7 @@ static int start_graph_tracing(void) | |||
3289 | /* The cpu_boot init_task->ret_stack will never be freed */ | 4004 | /* The cpu_boot init_task->ret_stack will never be freed */ |
3290 | for_each_online_cpu(cpu) { | 4005 | for_each_online_cpu(cpu) { |
3291 | if (!idle_task(cpu)->ret_stack) | 4006 | if (!idle_task(cpu)->ret_stack) |
3292 | ftrace_graph_init_task(idle_task(cpu)); | 4007 | ftrace_graph_init_idle_task(idle_task(cpu), cpu); |
3293 | } | 4008 | } |
3294 | 4009 | ||
3295 | do { | 4010 | do { |
@@ -3354,7 +4069,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
3354 | ftrace_graph_return = retfunc; | 4069 | ftrace_graph_return = retfunc; |
3355 | ftrace_graph_entry = entryfunc; | 4070 | ftrace_graph_entry = entryfunc; |
3356 | 4071 | ||
3357 | ftrace_startup(FTRACE_START_FUNC_RET); | 4072 | ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); |
3358 | 4073 | ||
3359 | out: | 4074 | out: |
3360 | mutex_unlock(&ftrace_lock); | 4075 | mutex_unlock(&ftrace_lock); |
@@ -3371,7 +4086,7 @@ void unregister_ftrace_graph(void) | |||
3371 | ftrace_graph_active--; | 4086 | ftrace_graph_active--; |
3372 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; | 4087 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; |
3373 | ftrace_graph_entry = ftrace_graph_entry_stub; | 4088 | ftrace_graph_entry = ftrace_graph_entry_stub; |
3374 | ftrace_shutdown(FTRACE_STOP_FUNC_RET); | 4089 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); |
3375 | unregister_pm_notifier(&ftrace_suspend_notifier); | 4090 | unregister_pm_notifier(&ftrace_suspend_notifier); |
3376 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 4091 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
3377 | 4092 | ||
@@ -3379,6 +4094,49 @@ void unregister_ftrace_graph(void) | |||
3379 | mutex_unlock(&ftrace_lock); | 4094 | mutex_unlock(&ftrace_lock); |
3380 | } | 4095 | } |
3381 | 4096 | ||
4097 | static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); | ||
4098 | |||
4099 | static void | ||
4100 | graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) | ||
4101 | { | ||
4102 | atomic_set(&t->tracing_graph_pause, 0); | ||
4103 | atomic_set(&t->trace_overrun, 0); | ||
4104 | t->ftrace_timestamp = 0; | ||
4105 | /* make curr_ret_stack visible before we add the ret_stack */ | ||
4106 | smp_wmb(); | ||
4107 | t->ret_stack = ret_stack; | ||
4108 | } | ||
4109 | |||
4110 | /* | ||
4111 | * Allocate a return stack for the idle task. May be the first | ||
4112 | * time through, or it may be done by CPU hotplug online. | ||
4113 | */ | ||
4114 | void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) | ||
4115 | { | ||
4116 | t->curr_ret_stack = -1; | ||
4117 | /* | ||
4118 | * The idle task has no parent, it either has its own | ||
4119 | * stack or no stack at all. | ||
4120 | */ | ||
4121 | if (t->ret_stack) | ||
4122 | WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); | ||
4123 | |||
4124 | if (ftrace_graph_active) { | ||
4125 | struct ftrace_ret_stack *ret_stack; | ||
4126 | |||
4127 | ret_stack = per_cpu(idle_ret_stack, cpu); | ||
4128 | if (!ret_stack) { | ||
4129 | ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH | ||
4130 | * sizeof(struct ftrace_ret_stack), | ||
4131 | GFP_KERNEL); | ||
4132 | if (!ret_stack) | ||
4133 | return; | ||
4134 | per_cpu(idle_ret_stack, cpu) = ret_stack; | ||
4135 | } | ||
4136 | graph_init_task(t, ret_stack); | ||
4137 | } | ||
4138 | } | ||
4139 | |||
3382 | /* Allocate a return stack for newly created task */ | 4140 | /* Allocate a return stack for newly created task */ |
3383 | void ftrace_graph_init_task(struct task_struct *t) | 4141 | void ftrace_graph_init_task(struct task_struct *t) |
3384 | { | 4142 | { |
@@ -3394,12 +4152,7 @@ void ftrace_graph_init_task(struct task_struct *t) | |||
3394 | GFP_KERNEL); | 4152 | GFP_KERNEL); |
3395 | if (!ret_stack) | 4153 | if (!ret_stack) |
3396 | return; | 4154 | return; |
3397 | atomic_set(&t->tracing_graph_pause, 0); | 4155 | graph_init_task(t, ret_stack); |
3398 | atomic_set(&t->trace_overrun, 0); | ||
3399 | t->ftrace_timestamp = 0; | ||
3400 | /* make curr_ret_stack visable before we add the ret_stack */ | ||
3401 | smp_wmb(); | ||
3402 | t->ret_stack = ret_stack; | ||
3403 | } | 4156 | } |
3404 | } | 4157 | } |
3405 | 4158 | ||
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index a22582a06161..f55fcf61b223 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c | |||
@@ -13,5 +13,8 @@ | |||
13 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
14 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
15 | 15 | ||
16 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); | 16 | #ifdef EVENT_POWER_TRACING_DEPRECATED |
17 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); | ||
18 | #endif | ||
19 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); | ||
17 | 20 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bca96377fd4e..b0c7aa407943 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -5,7 +5,6 @@ | |||
5 | */ | 5 | */ |
6 | #include <linux/ring_buffer.h> | 6 | #include <linux/ring_buffer.h> |
7 | #include <linux/trace_clock.h> | 7 | #include <linux/trace_clock.h> |
8 | #include <linux/ftrace_irq.h> | ||
9 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
10 | #include <linux/debugfs.h> | 9 | #include <linux/debugfs.h> |
11 | #include <linux/uaccess.h> | 10 | #include <linux/uaccess.h> |
@@ -224,6 +223,9 @@ enum { | |||
224 | RB_LEN_TIME_STAMP = 16, | 223 | RB_LEN_TIME_STAMP = 16, |
225 | }; | 224 | }; |
226 | 225 | ||
226 | #define skip_time_extend(event) \ | ||
227 | ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) | ||
228 | |||
227 | static inline int rb_null_event(struct ring_buffer_event *event) | 229 | static inline int rb_null_event(struct ring_buffer_event *event) |
228 | { | 230 | { |
229 | return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; | 231 | return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; |
@@ -248,8 +250,12 @@ rb_event_data_length(struct ring_buffer_event *event) | |||
248 | return length + RB_EVNT_HDR_SIZE; | 250 | return length + RB_EVNT_HDR_SIZE; |
249 | } | 251 | } |
250 | 252 | ||
251 | /* inline for ring buffer fast paths */ | 253 | /* |
252 | static unsigned | 254 | * Return the length of the given event. Will return |
255 | * the length of the time extend if the event is a | ||
256 | * time extend. | ||
257 | */ | ||
258 | static inline unsigned | ||
253 | rb_event_length(struct ring_buffer_event *event) | 259 | rb_event_length(struct ring_buffer_event *event) |
254 | { | 260 | { |
255 | switch (event->type_len) { | 261 | switch (event->type_len) { |
@@ -274,13 +280,41 @@ rb_event_length(struct ring_buffer_event *event) | |||
274 | return 0; | 280 | return 0; |
275 | } | 281 | } |
276 | 282 | ||
283 | /* | ||
284 | * Return total length of time extend and data, | ||
285 | * or just the event length for all other events. | ||
286 | */ | ||
287 | static inline unsigned | ||
288 | rb_event_ts_length(struct ring_buffer_event *event) | ||
289 | { | ||
290 | unsigned len = 0; | ||
291 | |||
292 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { | ||
293 | /* time extends include the data event after it */ | ||
294 | len = RB_LEN_TIME_EXTEND; | ||
295 | event = skip_time_extend(event); | ||
296 | } | ||
297 | return len + rb_event_length(event); | ||
298 | } | ||
299 | |||
277 | /** | 300 | /** |
278 | * ring_buffer_event_length - return the length of the event | 301 | * ring_buffer_event_length - return the length of the event |
279 | * @event: the event to get the length of | 302 | * @event: the event to get the length of |
303 | * | ||
304 | * Returns the size of the data load of a data event. | ||
305 | * If the event is something other than a data event, it | ||
306 | * returns the size of the event itself. With the exception | ||
307 | * of a TIME EXTEND, where it still returns the size of the | ||
308 | * data load of the data event after it. | ||
280 | */ | 309 | */ |
281 | unsigned ring_buffer_event_length(struct ring_buffer_event *event) | 310 | unsigned ring_buffer_event_length(struct ring_buffer_event *event) |
282 | { | 311 | { |
283 | unsigned length = rb_event_length(event); | 312 | unsigned length; |
313 | |||
314 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) | ||
315 | event = skip_time_extend(event); | ||
316 | |||
317 | length = rb_event_length(event); | ||
284 | if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) | 318 | if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) |
285 | return length; | 319 | return length; |
286 | length -= RB_EVNT_HDR_SIZE; | 320 | length -= RB_EVNT_HDR_SIZE; |
@@ -294,6 +328,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); | |||
294 | static void * | 328 | static void * |
295 | rb_event_data(struct ring_buffer_event *event) | 329 | rb_event_data(struct ring_buffer_event *event) |
296 | { | 330 | { |
331 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) | ||
332 | event = skip_time_extend(event); | ||
297 | BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); | 333 | BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); |
298 | /* If length is in len field, then array[0] has the data */ | 334 | /* If length is in len field, then array[0] has the data */ |
299 | if (event->type_len) | 335 | if (event->type_len) |
@@ -404,9 +440,6 @@ static inline int test_time_stamp(u64 delta) | |||
404 | /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ | 440 | /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ |
405 | #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) | 441 | #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) |
406 | 442 | ||
407 | /* Max number of timestamps that can fit on a page */ | ||
408 | #define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) | ||
409 | |||
410 | int ring_buffer_print_page_header(struct trace_seq *s) | 443 | int ring_buffer_print_page_header(struct trace_seq *s) |
411 | { | 444 | { |
412 | struct buffer_data_page field; | 445 | struct buffer_data_page field; |
@@ -635,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list) | |||
635 | * the reader page). But if the next page is a header page, | 668 | * the reader page). But if the next page is a header page, |
636 | * its flags will be non zero. | 669 | * its flags will be non zero. |
637 | */ | 670 | */ |
638 | static int inline | 671 | static inline int |
639 | rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, | 672 | rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, |
640 | struct buffer_page *page, struct list_head *list) | 673 | struct buffer_page *page, struct list_head *list) |
641 | { | 674 | { |
@@ -1395,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
1395 | } | 1428 | } |
1396 | EXPORT_SYMBOL_GPL(ring_buffer_resize); | 1429 | EXPORT_SYMBOL_GPL(ring_buffer_resize); |
1397 | 1430 | ||
1431 | void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) | ||
1432 | { | ||
1433 | mutex_lock(&buffer->mutex); | ||
1434 | if (val) | ||
1435 | buffer->flags |= RB_FL_OVERWRITE; | ||
1436 | else | ||
1437 | buffer->flags &= ~RB_FL_OVERWRITE; | ||
1438 | mutex_unlock(&buffer->mutex); | ||
1439 | } | ||
1440 | EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); | ||
1441 | |||
1398 | static inline void * | 1442 | static inline void * |
1399 | __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) | 1443 | __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) |
1400 | { | 1444 | { |
@@ -1434,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage) | |||
1434 | return local_read(&bpage->entries) & RB_WRITE_MASK; | 1478 | return local_read(&bpage->entries) & RB_WRITE_MASK; |
1435 | } | 1479 | } |
1436 | 1480 | ||
1437 | /* Size is determined by what has been commited */ | 1481 | /* Size is determined by what has been committed */ |
1438 | static inline unsigned rb_page_size(struct buffer_page *bpage) | 1482 | static inline unsigned rb_page_size(struct buffer_page *bpage) |
1439 | { | 1483 | { |
1440 | return rb_page_commit(bpage); | 1484 | return rb_page_commit(bpage); |
@@ -1546,6 +1590,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) | |||
1546 | iter->head = 0; | 1590 | iter->head = 0; |
1547 | } | 1591 | } |
1548 | 1592 | ||
1593 | /* Slow path, do not inline */ | ||
1594 | static noinline struct ring_buffer_event * | ||
1595 | rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) | ||
1596 | { | ||
1597 | event->type_len = RINGBUF_TYPE_TIME_EXTEND; | ||
1598 | |||
1599 | /* Not the first event on the page? */ | ||
1600 | if (rb_event_index(event)) { | ||
1601 | event->time_delta = delta & TS_MASK; | ||
1602 | event->array[0] = delta >> TS_SHIFT; | ||
1603 | } else { | ||
1604 | /* nope, just zero it */ | ||
1605 | event->time_delta = 0; | ||
1606 | event->array[0] = 0; | ||
1607 | } | ||
1608 | |||
1609 | return skip_time_extend(event); | ||
1610 | } | ||
1611 | |||
1549 | /** | 1612 | /** |
1550 | * ring_buffer_update_event - update event type and data | 1613 | * ring_buffer_update_event - update event type and data |
1551 | * @event: the event to update | 1614 |
@@ -1558,28 +1621,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) | |||
1558 | * data field. | 1621 | * data field. |
1559 | */ | 1622 | */ |
1560 | static void | 1623 | static void |
1561 | rb_update_event(struct ring_buffer_event *event, | 1624 | rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, |
1562 | unsigned type, unsigned length) | 1625 | struct ring_buffer_event *event, unsigned length, |
1626 | int add_timestamp, u64 delta) | ||
1563 | { | 1627 | { |
1564 | event->type_len = type; | 1628 | /* Only a commit updates the timestamp */ |
1565 | 1629 | if (unlikely(!rb_event_is_commit(cpu_buffer, event))) | |
1566 | switch (type) { | 1630 | delta = 0; |
1567 | |||
1568 | case RINGBUF_TYPE_PADDING: | ||
1569 | case RINGBUF_TYPE_TIME_EXTEND: | ||
1570 | case RINGBUF_TYPE_TIME_STAMP: | ||
1571 | break; | ||
1572 | 1631 | ||
1573 | case 0: | 1632 | /* |
1574 | length -= RB_EVNT_HDR_SIZE; | 1633 | * If we need to add a timestamp, then we |
1575 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) | 1634 | * add it to the start of the reserved space. |
1576 | event->array[0] = length; | 1635 | */ |
1577 | else | 1636 | if (unlikely(add_timestamp)) { |
1578 | event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); | 1637 | event = rb_add_time_stamp(event, delta); |
1579 | break; | 1638 | length -= RB_LEN_TIME_EXTEND; |
1580 | default: | 1639 | delta = 0; |
1581 | BUG(); | ||
1582 | } | 1640 | } |
1641 | |||
1642 | event->time_delta = delta; | ||
1643 | length -= RB_EVNT_HDR_SIZE; | ||
1644 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { | ||
1645 | event->type_len = 0; | ||
1646 | event->array[0] = length; | ||
1647 | } else | ||
1648 | event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); | ||
1583 | } | 1649 | } |
1584 | 1650 | ||
1585 | /* | 1651 | /* |
@@ -1823,10 +1889,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1823 | local_sub(length, &tail_page->write); | 1889 | local_sub(length, &tail_page->write); |
1824 | } | 1890 | } |
1825 | 1891 | ||
1826 | static struct ring_buffer_event * | 1892 | /* |
1893 | * This is the slow path, force gcc not to inline it. | ||
1894 | */ | ||
1895 | static noinline struct ring_buffer_event * | ||
1827 | rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | 1896 | rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, |
1828 | unsigned long length, unsigned long tail, | 1897 | unsigned long length, unsigned long tail, |
1829 | struct buffer_page *tail_page, u64 *ts) | 1898 | struct buffer_page *tail_page, u64 ts) |
1830 | { | 1899 | { |
1831 | struct buffer_page *commit_page = cpu_buffer->commit_page; | 1900 | struct buffer_page *commit_page = cpu_buffer->commit_page; |
1832 | struct ring_buffer *buffer = cpu_buffer->buffer; | 1901 | struct ring_buffer *buffer = cpu_buffer->buffer; |
@@ -1909,8 +1978,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1909 | * Nested commits always have zero deltas, so | 1978 | * Nested commits always have zero deltas, so |
1910 | * just reread the time stamp | 1979 | * just reread the time stamp |
1911 | */ | 1980 | */ |
1912 | *ts = rb_time_stamp(buffer); | 1981 | ts = rb_time_stamp(buffer); |
1913 | next_page->page->time_stamp = *ts; | 1982 | next_page->page->time_stamp = ts; |
1914 | } | 1983 | } |
1915 | 1984 | ||
1916 | out_again: | 1985 | out_again: |
@@ -1929,12 +1998,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1929 | 1998 | ||
1930 | static struct ring_buffer_event * | 1999 | static struct ring_buffer_event * |
1931 | __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | 2000 | __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, |
1932 | unsigned type, unsigned long length, u64 *ts) | 2001 | unsigned long length, u64 ts, |
2002 | u64 delta, int add_timestamp) | ||
1933 | { | 2003 | { |
1934 | struct buffer_page *tail_page; | 2004 | struct buffer_page *tail_page; |
1935 | struct ring_buffer_event *event; | 2005 | struct ring_buffer_event *event; |
1936 | unsigned long tail, write; | 2006 | unsigned long tail, write; |
1937 | 2007 | ||
2008 | /* | ||
2009 | * If the time delta since the last event is too big to | ||
2010 | * hold in the time field of the event, then we append a | ||
2011 | * TIME EXTEND event ahead of the data event. | ||
2012 | */ | ||
2013 | if (unlikely(add_timestamp)) | ||
2014 | length += RB_LEN_TIME_EXTEND; | ||
2015 | |||
1938 | tail_page = cpu_buffer->tail_page; | 2016 | tail_page = cpu_buffer->tail_page; |
1939 | write = local_add_return(length, &tail_page->write); | 2017 | write = local_add_return(length, &tail_page->write); |
1940 | 2018 | ||
@@ -1943,7 +2021,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
1943 | tail = write - length; | 2021 | tail = write - length; |
1944 | 2022 | ||
1945 | /* See if we shot past the end of this buffer page */ | 2023 |
1946 | if (write > BUF_PAGE_SIZE) | 2024 | if (unlikely(write > BUF_PAGE_SIZE)) |
1947 | return rb_move_tail(cpu_buffer, length, tail, | 2025 | return rb_move_tail(cpu_buffer, length, tail, |
1948 | tail_page, ts); | 2026 | tail_page, ts); |
1949 | 2027 | ||
@@ -1951,18 +2029,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
1951 | 2029 | ||
1952 | event = __rb_page_index(tail_page, tail); | 2030 | event = __rb_page_index(tail_page, tail); |
1953 | kmemcheck_annotate_bitfield(event, bitfield); | 2031 | kmemcheck_annotate_bitfield(event, bitfield); |
1954 | rb_update_event(event, type, length); | 2032 | rb_update_event(cpu_buffer, event, length, add_timestamp, delta); |
1955 | 2033 | ||
1956 | /* The passed in type is zero for DATA */ | 2034 | local_inc(&tail_page->entries); |
1957 | if (likely(!type)) | ||
1958 | local_inc(&tail_page->entries); | ||
1959 | 2035 | ||
1960 | /* | 2036 | /* |
1961 | * If this is the first commit on the page, then update | 2037 | * If this is the first commit on the page, then update |
1962 | * its timestamp. | 2038 | * its timestamp. |
1963 | */ | 2039 | */ |
1964 | if (!tail) | 2040 | if (!tail) |
1965 | tail_page->page->time_stamp = *ts; | 2041 | tail_page->page->time_stamp = ts; |
1966 | 2042 | ||
1967 | return event; | 2043 | return event; |
1968 | } | 2044 | } |
@@ -1977,7 +2053,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
1977 | unsigned long addr; | 2053 | unsigned long addr; |
1978 | 2054 | ||
1979 | new_index = rb_event_index(event); | 2055 | new_index = rb_event_index(event); |
1980 | old_index = new_index + rb_event_length(event); | 2056 | old_index = new_index + rb_event_ts_length(event); |
1981 | addr = (unsigned long)event; | 2057 | addr = (unsigned long)event; |
1982 | addr &= PAGE_MASK; | 2058 | addr &= PAGE_MASK; |
1983 | 2059 | ||
@@ -2003,76 +2079,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
2003 | return 0; | 2079 | return 0; |
2004 | } | 2080 | } |
2005 | 2081 | ||
2006 | static int | ||
2007 | rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, | ||
2008 | u64 *ts, u64 *delta) | ||
2009 | { | ||
2010 | struct ring_buffer_event *event; | ||
2011 | int ret; | ||
2012 | |||
2013 | WARN_ONCE(*delta > (1ULL << 59), | ||
2014 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", | ||
2015 | (unsigned long long)*delta, | ||
2016 | (unsigned long long)*ts, | ||
2017 | (unsigned long long)cpu_buffer->write_stamp); | ||
2018 | |||
2019 | /* | ||
2020 | * The delta is too big, we to add a | ||
2021 | * new timestamp. | ||
2022 | */ | ||
2023 | event = __rb_reserve_next(cpu_buffer, | ||
2024 | RINGBUF_TYPE_TIME_EXTEND, | ||
2025 | RB_LEN_TIME_EXTEND, | ||
2026 | ts); | ||
2027 | if (!event) | ||
2028 | return -EBUSY; | ||
2029 | |||
2030 | if (PTR_ERR(event) == -EAGAIN) | ||
2031 | return -EAGAIN; | ||
2032 | |||
2033 | /* Only a commited time event can update the write stamp */ | ||
2034 | if (rb_event_is_commit(cpu_buffer, event)) { | ||
2035 | /* | ||
2036 | * If this is the first on the page, then it was | ||
2037 | * updated with the page itself. Try to discard it | ||
2038 | * and if we can't just make it zero. | ||
2039 | */ | ||
2040 | if (rb_event_index(event)) { | ||
2041 | event->time_delta = *delta & TS_MASK; | ||
2042 | event->array[0] = *delta >> TS_SHIFT; | ||
2043 | } else { | ||
2044 | /* try to discard, since we do not need this */ | ||
2045 | if (!rb_try_to_discard(cpu_buffer, event)) { | ||
2046 | /* nope, just zero it */ | ||
2047 | event->time_delta = 0; | ||
2048 | event->array[0] = 0; | ||
2049 | } | ||
2050 | } | ||
2051 | cpu_buffer->write_stamp = *ts; | ||
2052 | /* let the caller know this was the commit */ | ||
2053 | ret = 1; | ||
2054 | } else { | ||
2055 | /* Try to discard the event */ | ||
2056 | if (!rb_try_to_discard(cpu_buffer, event)) { | ||
2057 | /* Darn, this is just wasted space */ | ||
2058 | event->time_delta = 0; | ||
2059 | event->array[0] = 0; | ||
2060 | } | ||
2061 | ret = 0; | ||
2062 | } | ||
2063 | |||
2064 | *delta = 0; | ||
2065 | |||
2066 | return ret; | ||
2067 | } | ||
2068 | |||
2069 | static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) | 2082 | static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) |
2070 | { | 2083 | { |
2071 | local_inc(&cpu_buffer->committing); | 2084 | local_inc(&cpu_buffer->committing); |
2072 | local_inc(&cpu_buffer->commits); | 2085 | local_inc(&cpu_buffer->commits); |
2073 | } | 2086 | } |
2074 | 2087 | ||
2075 | static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) | 2088 | static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) |
2076 | { | 2089 | { |
2077 | unsigned long commits; | 2090 | unsigned long commits; |
2078 | 2091 | ||
@@ -2110,9 +2123,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2110 | unsigned long length) | 2123 | unsigned long length) |
2111 | { | 2124 | { |
2112 | struct ring_buffer_event *event; | 2125 | struct ring_buffer_event *event; |
2113 | u64 ts, delta = 0; | 2126 | u64 ts, delta; |
2114 | int commit = 0; | ||
2115 | int nr_loops = 0; | 2127 | int nr_loops = 0; |
2128 | int add_timestamp; | ||
2129 | u64 diff; | ||
2116 | 2130 | ||
2117 | rb_start_commit(cpu_buffer); | 2131 | rb_start_commit(cpu_buffer); |
2118 | 2132 | ||
@@ -2133,6 +2147,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2133 | 2147 | ||
2134 | length = rb_calculate_event_length(length); | 2148 | length = rb_calculate_event_length(length); |
2135 | again: | 2149 | again: |
2150 | add_timestamp = 0; | ||
2151 | delta = 0; | ||
2152 | |||
2136 | /* | 2153 | /* |
2137 | * We allow for interrupts to reenter here and do a trace. | 2154 | * We allow for interrupts to reenter here and do a trace. |
2138 | * If one does, it will cause this original code to loop | 2155 | * If one does, it will cause this original code to loop |
@@ -2146,56 +2163,40 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2146 | goto out_fail; | 2163 | goto out_fail; |
2147 | 2164 | ||
2148 | ts = rb_time_stamp(cpu_buffer->buffer); | 2165 | ts = rb_time_stamp(cpu_buffer->buffer); |
2166 | diff = ts - cpu_buffer->write_stamp; | ||
2149 | 2167 | ||
2150 | /* | 2168 | /* make sure this diff is calculated here */ |
2151 | * Only the first commit can update the timestamp. | 2169 | barrier(); |
2152 | * Yes there is a race here. If an interrupt comes in | ||
2153 | * just after the conditional and it traces too, then it | ||
2154 | * will also check the deltas. More than one timestamp may | ||
2155 | * also be made. But only the entry that did the actual | ||
2156 | * commit will be something other than zero. | ||
2157 | */ | ||
2158 | if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && | ||
2159 | rb_page_write(cpu_buffer->tail_page) == | ||
2160 | rb_commit_index(cpu_buffer))) { | ||
2161 | u64 diff; | ||
2162 | |||
2163 | diff = ts - cpu_buffer->write_stamp; | ||
2164 | |||
2165 | /* make sure this diff is calculated here */ | ||
2166 | barrier(); | ||
2167 | |||
2168 | /* Did the write stamp get updated already? */ | ||
2169 | if (unlikely(ts < cpu_buffer->write_stamp)) | ||
2170 | goto get_event; | ||
2171 | 2170 | ||
2171 | /* Did the write stamp get updated already? */ | ||
2172 | if (likely(ts >= cpu_buffer->write_stamp)) { | ||
2172 | delta = diff; | 2173 | delta = diff; |
2173 | if (unlikely(test_time_stamp(delta))) { | 2174 | if (unlikely(test_time_stamp(delta))) { |
2174 | 2175 | int local_clock_stable = 1; | |
2175 | commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); | 2176 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
2176 | if (commit == -EBUSY) | 2177 | local_clock_stable = sched_clock_stable; |
2177 | goto out_fail; | 2178 | #endif |
2178 | 2179 | WARN_ONCE(delta > (1ULL << 59), | |
2179 | if (commit == -EAGAIN) | 2180 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", |
2180 | goto again; | 2181 | (unsigned long long)delta, |
2181 | 2182 | (unsigned long long)ts, | |
2182 | RB_WARN_ON(cpu_buffer, commit < 0); | 2183 | (unsigned long long)cpu_buffer->write_stamp, |
2184 | local_clock_stable ? "" : | ||
2185 | "If you just came from a suspend/resume,\n" | ||
2186 | "please switch to the trace global clock:\n" | ||
2187 | " echo global > /sys/kernel/debug/tracing/trace_clock\n"); | ||
2188 | add_timestamp = 1; | ||
2183 | } | 2189 | } |
2184 | } | 2190 | } |
2185 | 2191 | ||
2186 | get_event: | 2192 | event = __rb_reserve_next(cpu_buffer, length, ts, |
2187 | event = __rb_reserve_next(cpu_buffer, 0, length, &ts); | 2193 | delta, add_timestamp); |
2188 | if (unlikely(PTR_ERR(event) == -EAGAIN)) | 2194 | if (unlikely(PTR_ERR(event) == -EAGAIN)) |
2189 | goto again; | 2195 | goto again; |
2190 | 2196 | ||
2191 | if (!event) | 2197 | if (!event) |
2192 | goto out_fail; | 2198 | goto out_fail; |
2193 | 2199 | ||
2194 | if (!rb_event_is_commit(cpu_buffer, event)) | ||
2195 | delta = 0; | ||
2196 | |||
2197 | event->time_delta = delta; | ||
2198 | |||
2199 | return event; | 2200 | return event; |
2200 | 2201 | ||
2201 | out_fail: | 2202 | out_fail: |
@@ -2207,32 +2208,39 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2207 | 2208 | ||
2208 | #define TRACE_RECURSIVE_DEPTH 16 | 2209 | #define TRACE_RECURSIVE_DEPTH 16 |
2209 | 2210 | ||
2210 | static int trace_recursive_lock(void) | 2211 | /* Keep this code out of the fast path cache */ |
2212 | static noinline void trace_recursive_fail(void) | ||
2211 | { | 2213 | { |
2212 | current->trace_recursion++; | ||
2213 | |||
2214 | if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) | ||
2215 | return 0; | ||
2216 | |||
2217 | /* Disable all tracing before we do anything else */ | 2214 | /* Disable all tracing before we do anything else */ |
2218 | tracing_off_permanent(); | 2215 | tracing_off_permanent(); |
2219 | 2216 | ||
2220 | printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" | 2217 | printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" |
2221 | "HC[%lu]:SC[%lu]:NMI[%lu]\n", | 2218 | "HC[%lu]:SC[%lu]:NMI[%lu]\n", |
2222 | current->trace_recursion, | 2219 | trace_recursion_buffer(), |
2223 | hardirq_count() >> HARDIRQ_SHIFT, | 2220 | hardirq_count() >> HARDIRQ_SHIFT, |
2224 | softirq_count() >> SOFTIRQ_SHIFT, | 2221 | softirq_count() >> SOFTIRQ_SHIFT, |
2225 | in_nmi()); | 2222 | in_nmi()); |
2226 | 2223 | ||
2227 | WARN_ON_ONCE(1); | 2224 | WARN_ON_ONCE(1); |
2225 | } | ||
2226 | |||
2227 | static inline int trace_recursive_lock(void) | ||
2228 | { | ||
2229 | trace_recursion_inc(); | ||
2230 | |||
2231 | if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) | ||
2232 | return 0; | ||
2233 | |||
2234 | trace_recursive_fail(); | ||
2235 | |||
2228 | return -1; | 2236 | return -1; |
2229 | } | 2237 | } |
2230 | 2238 | ||
2231 | static void trace_recursive_unlock(void) | 2239 | static inline void trace_recursive_unlock(void) |
2232 | { | 2240 | { |
2233 | WARN_ON_ONCE(!current->trace_recursion); | 2241 | WARN_ON_ONCE(!trace_recursion_buffer()); |
2234 | 2242 | ||
2235 | current->trace_recursion--; | 2243 | trace_recursion_dec(); |
2236 | } | 2244 | } |
2237 | 2245 | ||
2238 | #else | 2246 | #else |
@@ -2308,12 +2316,28 @@ static void | |||
2308 | rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, | 2316 | rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, |
2309 | struct ring_buffer_event *event) | 2317 | struct ring_buffer_event *event) |
2310 | { | 2318 | { |
2319 | u64 delta; | ||
2320 | |||
2311 | /* | 2321 | /* |
2312 | * The event first in the commit queue updates the | 2322 | * The event first in the commit queue updates the |
2313 | * time stamp. | 2323 | * time stamp. |
2314 | */ | 2324 | */ |
2315 | if (rb_event_is_commit(cpu_buffer, event)) | 2325 | if (rb_event_is_commit(cpu_buffer, event)) { |
2316 | cpu_buffer->write_stamp += event->time_delta; | 2326 | /* |
2327 | * A commit event that is first on a page | ||
2328 | * updates the write timestamp with the page stamp | ||
2329 | */ | ||
2330 | if (!rb_event_index(event)) | ||
2331 | cpu_buffer->write_stamp = | ||
2332 | cpu_buffer->commit_page->page->time_stamp; | ||
2333 | else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { | ||
2334 | delta = event->array[0]; | ||
2335 | delta <<= TS_SHIFT; | ||
2336 | delta += event->time_delta; | ||
2337 | cpu_buffer->write_stamp += delta; | ||
2338 | } else | ||
2339 | cpu_buffer->write_stamp += event->time_delta; | ||
2340 | } | ||
2317 | } | 2341 | } |
2318 | 2342 | ||
2319 | static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, | 2343 | static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, |
@@ -2353,6 +2377,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); | |||
2353 | 2377 | ||
2354 | static inline void rb_event_discard(struct ring_buffer_event *event) | 2378 | static inline void rb_event_discard(struct ring_buffer_event *event) |
2355 | { | 2379 | { |
2380 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) | ||
2381 | event = skip_time_extend(event); | ||
2382 | |||
2356 | /* array[0] holds the actual length for the discarded event */ | 2383 | /* array[0] holds the actual length for the discarded event */ |
2357 | event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; | 2384 | event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; |
2358 | event->type_len = RINGBUF_TYPE_PADDING; | 2385 | event->type_len = RINGBUF_TYPE_PADDING; |
@@ -2606,6 +2633,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) | |||
2606 | } | 2633 | } |
2607 | EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); | 2634 | EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); |
2608 | 2635 | ||
2636 | /* | ||
2637 | * The total entries in the ring buffer is the running counter | ||
2638 | * of entries entered into the ring buffer, minus the sum of | ||
2639 | * the entries read from the ring buffer and the number of | ||
2640 | * entries that were overwritten. | ||
2641 | */ | ||
2642 | static inline unsigned long | ||
2643 | rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) | ||
2644 | { | ||
2645 | return local_read(&cpu_buffer->entries) - | ||
2646 | (local_read(&cpu_buffer->overrun) + cpu_buffer->read); | ||
2647 | } | ||
2648 | |||
2609 | /** | 2649 | /** |
2610 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer | 2650 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer |
2611 | * @buffer: The ring buffer | 2651 | * @buffer: The ring buffer |
@@ -2614,16 +2654,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); | |||
2614 | unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) | 2654 | unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) |
2615 | { | 2655 | { |
2616 | struct ring_buffer_per_cpu *cpu_buffer; | 2656 | struct ring_buffer_per_cpu *cpu_buffer; |
2617 | unsigned long ret; | ||
2618 | 2657 | ||
2619 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 2658 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
2620 | return 0; | 2659 | return 0; |
2621 | 2660 | ||
2622 | cpu_buffer = buffer->buffers[cpu]; | 2661 | cpu_buffer = buffer->buffers[cpu]; |
2623 | ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) | ||
2624 | - cpu_buffer->read; | ||
2625 | 2662 | ||
2626 | return ret; | 2663 | return rb_num_of_entries(cpu_buffer); |
2627 | } | 2664 | } |
2628 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); | 2665 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); |
2629 | 2666 | ||
@@ -2684,8 +2721,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) | |||
2684 | /* if you care about this being correct, lock the buffer */ | 2721 | /* if you care about this being correct, lock the buffer */ |
2685 | for_each_buffer_cpu(buffer, cpu) { | 2722 | for_each_buffer_cpu(buffer, cpu) { |
2686 | cpu_buffer = buffer->buffers[cpu]; | 2723 | cpu_buffer = buffer->buffers[cpu]; |
2687 | entries += (local_read(&cpu_buffer->entries) - | 2724 | entries += rb_num_of_entries(cpu_buffer); |
2688 | local_read(&cpu_buffer->overrun)) - cpu_buffer->read; | ||
2689 | } | 2725 | } |
2690 | 2726 | ||
2691 | return entries; | 2727 | return entries; |
@@ -2896,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
2896 | /* | 2932 | /* |
2897 | * cpu_buffer->pages just needs to point to the buffer, it | 2933 | * cpu_buffer->pages just needs to point to the buffer, it |
2898 | * has no specific buffer page to point to. Lets move it out | 2934 | * has no specific buffer page to point to. Lets move it out |
2899 | * of our way so we don't accidently swap it. | 2935 | * of our way so we don't accidentally swap it. |
2900 | */ | 2936 | */ |
2901 | cpu_buffer->pages = reader->list.prev; | 2937 | cpu_buffer->pages = reader->list.prev; |
2902 | 2938 | ||
@@ -3040,12 +3076,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, | |||
3040 | 3076 | ||
3041 | again: | 3077 | again: |
3042 | /* | 3078 | /* |
3043 | * We repeat when a timestamp is encountered. It is possible | 3079 | * We repeat when a time extend is encountered. |
3044 | * to get multiple timestamps from an interrupt entering just | 3080 | * Since the time extend is always attached to a data event, |
3045 | * as one timestamp is about to be written, or from discarded | 3081 | * we should never loop more than once. |
3046 | * commits. The most that we can have is the number on a single page. | 3082 | * (We never hit the following condition more than twice). |
3047 | */ | 3083 | */ |
3048 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) | 3084 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) |
3049 | return NULL; | 3085 | return NULL; |
3050 | 3086 | ||
3051 | reader = rb_get_reader_page(cpu_buffer); | 3087 | reader = rb_get_reader_page(cpu_buffer); |
@@ -3121,14 +3157,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) | |||
3121 | return NULL; | 3157 | return NULL; |
3122 | 3158 | ||
3123 | /* | 3159 | /* |
3124 | * We repeat when a timestamp is encountered. | 3160 | * We repeat when a time extend is encountered. |
3125 | * We can get multiple timestamps by nested interrupts or also | 3161 | * Since the time extend is always attached to a data event, |
3126 | * if filtering is on (discarding commits). Since discarding | 3162 | * we should never loop more than once. |
3127 | * commits can be frequent we can get a lot of timestamps. | 3163 | * (We never hit the following condition more than twice). |
3128 | * But we limit them by not adding timestamps if they begin | ||
3129 | * at the start of a page. | ||
3130 | */ | 3164 | */ |
3131 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) | 3165 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) |
3132 | return NULL; | 3166 | return NULL; |
3133 | 3167 | ||
3134 | if (rb_per_cpu_empty(cpu_buffer)) | 3168 | if (rb_per_cpu_empty(cpu_buffer)) |
@@ -3826,7 +3860,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3826 | if (len > (commit - read)) | 3860 | if (len > (commit - read)) |
3827 | len = (commit - read); | 3861 | len = (commit - read); |
3828 | 3862 | ||
3829 | size = rb_event_length(event); | 3863 | /* Always keep the time extend and data together */ |
3864 | size = rb_event_ts_length(event); | ||
3830 | 3865 | ||
3831 | if (len < size) | 3866 | if (len < size) |
3832 | goto out_unlock; | 3867 | goto out_unlock; |
@@ -3836,6 +3871,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3836 | 3871 | ||
3837 | /* Need to copy one event at a time */ | 3872 | /* Need to copy one event at a time */ |
3838 | do { | 3873 | do { |
3874 | /* We need the size of one event, because | ||
3875 | * rb_advance_reader only advances by one event, | ||
3876 | * whereas rb_event_ts_length may include the size of | ||
3877 | * one or two events. | ||
3878 | * We have already ensured there's enough space if this | ||
3879 | * is a time extend. */ | ||
3880 | size = rb_event_length(event); | ||
3839 | memcpy(bpage->data + pos, rpage->data + rpos, size); | 3881 | memcpy(bpage->data + pos, rpage->data + rpos, size); |
3840 | 3882 | ||
3841 | len -= size; | 3883 | len -= size; |
@@ -3848,8 +3890,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3848 | break; | 3890 | break; |
3849 | 3891 | ||
3850 | event = rb_reader_event(cpu_buffer); | 3892 | event = rb_reader_event(cpu_buffer); |
3851 | size = rb_event_length(event); | 3893 | /* Always keep the time extend and data together */ |
3852 | } while (len > size); | 3894 | size = rb_event_ts_length(event); |
3895 | } while (len >= size); | ||
3853 | 3896 | ||
3854 | /* update bpage */ | 3897 | /* update bpage */ |
3855 | local_set(&bpage->commit, pos); | 3898 | local_set(&bpage->commit, pos); |
@@ -3965,6 +4008,7 @@ static const struct file_operations rb_simple_fops = { | |||
3965 | .open = tracing_open_generic, | 4008 | .open = tracing_open_generic, |
3966 | .read = rb_simple_read, | 4009 | .read = rb_simple_read, |
3967 | .write = rb_simple_write, | 4010 | .write = rb_simple_write, |
4011 | .llseek = default_llseek, | ||
3968 | }; | 4012 | }; |
3969 | 4013 | ||
3970 | 4014 | ||
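Several ring_buffer.c hunks above revolve around the reworked TIME_EXTEND handling: rb_add_time_stamp() splits an oversized delta across event->time_delta and event->array[0], and rb_update_write_stamp() reassembles it on the commit side. A stand-alone sketch of that split (TS_SHIFT/TS_MASK mirror the ring_buffer.c definitions; the plain struct is a stand-in for the real bitfield header):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27
#define TS_MASK  ((1ULL << TS_SHIFT) - 1)

struct fake_event {
	uint32_t time_delta;          /* 27-bit field in the real event header */
	uint32_t array0;              /* event->array[0] */
};

static void encode_delta(struct fake_event *e, uint64_t delta)
{
	/* The kernel WARNs if a delta ever needs more than 59 bits. */
	e->time_delta = delta & TS_MASK;       /* low 27 bits */
	e->array0     = delta >> TS_SHIFT;     /* remaining high bits */
}

static uint64_t decode_delta(const struct fake_event *e)
{
	uint64_t delta = e->array0;

	delta <<= TS_SHIFT;
	delta += e->time_delta;
	return delta;
}

int main(void)
{
	struct fake_event e;
	uint64_t delta = (1ULL << 40) + 12345;   /* far too big for 27 bits */

	encode_delta(&e, delta);
	assert(decode_delta(&e) == delta);
	printf("round-tripped delta: %llu\n",
	       (unsigned long long)decode_delta(&e));
	return 0;
}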
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ec59f541156..ee9c921d7f21 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/writeback.h> | 17 | #include <linux/writeback.h> |
18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> |
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <linux/smp_lock.h> | ||
21 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> |
22 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> |
23 | #include <linux/debugfs.h> | 22 | #include <linux/debugfs.h> |
@@ -42,8 +41,6 @@ | |||
42 | #include "trace.h" | 41 | #include "trace.h" |
43 | #include "trace_output.h" | 42 | #include "trace_output.h" |
44 | 43 | ||
45 | #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) | ||
46 | |||
47 | /* | 44 | /* |
48 | * On boot up, the ring buffer is set to the minimum size, so that | 45 | * On boot up, the ring buffer is set to the minimum size, so that |
49 | * we do not waste memory on systems that are not using tracing. | 46 | * we do not waste memory on systems that are not using tracing. |
@@ -341,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
341 | /* trace_flags holds trace_options default values */ | 338 | /* trace_flags holds trace_options default values */ |
342 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 339 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
343 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 340 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
344 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; | 341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; |
345 | 342 | ||
346 | static int trace_stop_count; | 343 | static int trace_stop_count; |
347 | static DEFINE_SPINLOCK(tracing_start_lock); | 344 | static DEFINE_SPINLOCK(tracing_start_lock); |
@@ -426,6 +423,7 @@ static const char *trace_options[] = { | |||
426 | "sleep-time", | 423 | "sleep-time", |
427 | "graph-time", | 424 | "graph-time", |
428 | "record-cmd", | 425 | "record-cmd", |
426 | "overwrite", | ||
429 | NULL | 427 | NULL |
430 | }; | 428 | }; |
431 | 429 | ||
@@ -781,6 +779,11 @@ __acquires(kernel_lock) | |||
781 | tracing_reset_online_cpus(tr); | 779 | tracing_reset_online_cpus(tr); |
782 | 780 | ||
783 | current_trace = type; | 781 | current_trace = type; |
782 | |||
783 | /* If we expanded the buffers, make sure the max is expanded too */ | ||
784 | if (ring_buffer_expanded && type->use_max_tr) | ||
785 | ring_buffer_resize(max_tr.buffer, trace_buf_size); | ||
786 | |||
784 | /* the test is responsible for initializing and enabling */ | 787 | /* the test is responsible for initializing and enabling */ |
785 | pr_info("Testing tracer %s: ", type->name); | 788 | pr_info("Testing tracer %s: ", type->name); |
786 | ret = type->selftest(type, tr); | 789 | ret = type->selftest(type, tr); |
@@ -793,6 +796,10 @@ __acquires(kernel_lock) | |||
793 | /* Only reset on passing, to avoid touching corrupted buffers */ | 796 | /* Only reset on passing, to avoid touching corrupted buffers */ |
794 | tracing_reset_online_cpus(tr); | 797 | tracing_reset_online_cpus(tr); |
795 | 798 | ||
799 | /* Shrink the max buffer again */ | ||
800 | if (ring_buffer_expanded && type->use_max_tr) | ||
801 | ring_buffer_resize(max_tr.buffer, 1); | ||
802 | |||
796 | printk(KERN_CONT "PASSED\n"); | 803 | printk(KERN_CONT "PASSED\n"); |
797 | } | 804 | } |
798 | #endif | 805 | #endif |
@@ -1103,7 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, | |||
1103 | 1110 | ||
1104 | entry->preempt_count = pc & 0xff; | 1111 | entry->preempt_count = pc & 0xff; |
1105 | entry->pid = (tsk) ? tsk->pid : 0; | 1112 | entry->pid = (tsk) ? tsk->pid : 0; |
1106 | entry->lock_depth = (tsk) ? tsk->lock_depth : 0; | 1113 | entry->padding = 0; |
1107 | entry->flags = | 1114 | entry->flags = |
1108 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT | 1115 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT |
1109 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | | 1116 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | |
@@ -1284,6 +1291,8 @@ void trace_dump_stack(void) | |||
1284 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); | 1291 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); |
1285 | } | 1292 | } |
1286 | 1293 | ||
1294 | static DEFINE_PER_CPU(int, user_stack_count); | ||
1295 | |||
1287 | void | 1296 | void |
1288 | ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | 1297 | ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) |
1289 | { | 1298 | { |
@@ -1302,10 +1311,20 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1302 | if (unlikely(in_nmi())) | 1311 | if (unlikely(in_nmi())) |
1303 | return; | 1312 | return; |
1304 | 1313 | ||
1314 | /* | ||
1315 | * prevent recursion, since the user stack tracing may | ||
1316 | * trigger other kernel events. | ||
1317 | */ | ||
1318 | preempt_disable(); | ||
1319 | if (__this_cpu_read(user_stack_count)) | ||
1320 | goto out; | ||
1321 | |||
1322 | __this_cpu_inc(user_stack_count); | ||
1323 | |||
1305 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, | 1324 | event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, |
1306 | sizeof(*entry), flags, pc); | 1325 | sizeof(*entry), flags, pc); |
1307 | if (!event) | 1326 | if (!event) |
1308 | return; | 1327 | goto out_drop_count; |
1309 | entry = ring_buffer_event_data(event); | 1328 | entry = ring_buffer_event_data(event); |
1310 | 1329 | ||
1311 | entry->tgid = current->tgid; | 1330 | entry->tgid = current->tgid; |
@@ -1319,6 +1338,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
1319 | save_stack_trace_user(&trace); | 1338 | save_stack_trace_user(&trace); |
1320 | if (!filter_check_discard(call, entry, buffer, event)) | 1339 | if (!filter_check_discard(call, entry, buffer, event)) |
1321 | ring_buffer_unlock_commit(buffer, event); | 1340 | ring_buffer_unlock_commit(buffer, event); |
1341 | |||
1342 | out_drop_count: | ||
1343 | __this_cpu_dec(user_stack_count); | ||
1344 | out: | ||
1345 | preempt_enable(); | ||
1322 | } | 1346 | } |
1323 | 1347 | ||
1324 | #ifdef UNUSED | 1348 | #ifdef UNUSED |
@@ -1733,10 +1757,9 @@ static void print_lat_help_header(struct seq_file *m) | |||
1733 | seq_puts(m, "# | / _----=> need-resched \n"); | 1757 | seq_puts(m, "# | / _----=> need-resched \n"); |
1734 | seq_puts(m, "# || / _---=> hardirq/softirq \n"); | 1758 | seq_puts(m, "# || / _---=> hardirq/softirq \n"); |
1735 | seq_puts(m, "# ||| / _--=> preempt-depth \n"); | 1759 | seq_puts(m, "# ||| / _--=> preempt-depth \n"); |
1736 | seq_puts(m, "# |||| /_--=> lock-depth \n"); | 1760 | seq_puts(m, "# |||| / delay \n"); |
1737 | seq_puts(m, "# |||||/ delay \n"); | 1761 | seq_puts(m, "# cmd pid ||||| time | caller \n"); |
1738 | seq_puts(m, "# cmd pid |||||| time | caller \n"); | 1762 | seq_puts(m, "# \\ / ||||| \\ | / \n"); |
1739 | seq_puts(m, "# \\ / |||||| \\ | / \n"); | ||
1740 | } | 1763 | } |
1741 | 1764 | ||
1742 | static void print_func_help_header(struct seq_file *m) | 1765 | static void print_func_help_header(struct seq_file *m) |
@@ -1991,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) | |||
1991 | { | 2014 | { |
1992 | enum print_line_t ret; | 2015 | enum print_line_t ret; |
1993 | 2016 | ||
1994 | if (iter->lost_events) | 2017 | if (iter->lost_events && |
1995 | trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", | 2018 | !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", |
1996 | iter->cpu, iter->lost_events); | 2019 | iter->cpu, iter->lost_events)) |
2020 | return TRACE_TYPE_PARTIAL_LINE; | ||
1997 | 2021 | ||
1998 | if (iter->trace && iter->trace->print_line) { | 2022 | if (iter->trace && iter->trace->print_line) { |
1999 | ret = iter->trace->print_line(iter); | 2023 | ret = iter->trace->print_line(iter); |
@@ -2196,7 +2220,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) | |||
2196 | 2220 | ||
2197 | static int tracing_release(struct inode *inode, struct file *file) | 2221 | static int tracing_release(struct inode *inode, struct file *file) |
2198 | { | 2222 | { |
2199 | struct seq_file *m = (struct seq_file *)file->private_data; | 2223 | struct seq_file *m = file->private_data; |
2200 | struct trace_iterator *iter; | 2224 | struct trace_iterator *iter; |
2201 | int cpu; | 2225 | int cpu; |
2202 | 2226 | ||
@@ -2320,11 +2344,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, | |||
2320 | return count; | 2344 | return count; |
2321 | } | 2345 | } |
2322 | 2346 | ||
2347 | static loff_t tracing_seek(struct file *file, loff_t offset, int origin) | ||
2348 | { | ||
2349 | if (file->f_mode & FMODE_READ) | ||
2350 | return seq_lseek(file, offset, origin); | ||
2351 | else | ||
2352 | return 0; | ||
2353 | } | ||
2354 | |||
2323 | static const struct file_operations tracing_fops = { | 2355 | static const struct file_operations tracing_fops = { |
2324 | .open = tracing_open, | 2356 | .open = tracing_open, |
2325 | .read = seq_read, | 2357 | .read = seq_read, |
2326 | .write = tracing_write_stub, | 2358 | .write = tracing_write_stub, |
2327 | .llseek = seq_lseek, | 2359 | .llseek = tracing_seek, |
2328 | .release = tracing_release, | 2360 | .release = tracing_release, |
2329 | }; | 2361 | }; |
2330 | 2362 | ||
@@ -2505,6 +2537,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
2505 | 2537 | ||
2506 | if (mask == TRACE_ITER_RECORD_CMD) | 2538 | if (mask == TRACE_ITER_RECORD_CMD) |
2507 | trace_event_enable_cmd_record(enabled); | 2539 | trace_event_enable_cmd_record(enabled); |
2540 | |||
2541 | if (mask == TRACE_ITER_OVERWRITE) | ||
2542 | ring_buffer_change_overwrite(global_trace.buffer, enabled); | ||
2508 | } | 2543 | } |
2509 | 2544 | ||
2510 | static ssize_t | 2545 | static ssize_t |
@@ -2686,6 +2721,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, | |||
2686 | 2721 | ||
2687 | mutex_lock(&trace_types_lock); | 2722 | mutex_lock(&trace_types_lock); |
2688 | if (tracer_enabled ^ val) { | 2723 | if (tracer_enabled ^ val) { |
2724 | |||
2725 | /* Only need to warn if this is used to change the state */ | ||
2726 | WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); | ||
2727 | |||
2689 | if (val) { | 2728 | if (val) { |
2690 | tracer_enabled = 1; | 2729 | tracer_enabled = 1; |
2691 | if (current_trace->start) | 2730 | if (current_trace->start) |
@@ -3192,6 +3231,14 @@ waitagain: | |||
3192 | 3231 | ||
3193 | if (iter->seq.len >= cnt) | 3232 | if (iter->seq.len >= cnt) |
3194 | break; | 3233 | break; |
3234 | |||
3235 | /* | ||
3236 | * Setting the full flag means we reached the trace_seq buffer | ||
3237 | * size and we should leave by partial output condition above. | ||
3238 | * One of the trace_seq_* functions is not used properly. | ||
3239 | */ | ||
3240 | WARN_ONCE(iter->seq.full, "full flag set for trace type %d", | ||
3241 | iter->ent->type); | ||
3195 | } | 3242 | } |
3196 | trace_access_unlock(iter->cpu_file); | 3243 | trace_access_unlock(iter->cpu_file); |
3197 | trace_event_read_unlock(); | 3244 | trace_event_read_unlock(); |
@@ -3202,7 +3249,7 @@ waitagain: | |||
3202 | trace_seq_init(&iter->seq); | 3249 | trace_seq_init(&iter->seq); |
3203 | 3250 | ||
3204 | /* | 3251 | /* |
3205 | * If there was nothing to send to user, inspite of consuming trace | 3252 | * If there was nothing to send to user, in spite of consuming trace |
3206 | * entries, go back to wait for more entries. | 3253 | * entries, go back to wait for more entries. |
3207 | */ | 3254 | */ |
3208 | if (sret == -EBUSY) | 3255 | if (sret == -EBUSY) |
@@ -3996,13 +4043,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
3996 | { | 4043 | { |
3997 | struct dentry *d_percpu = tracing_dentry_percpu(); | 4044 | struct dentry *d_percpu = tracing_dentry_percpu(); |
3998 | struct dentry *d_cpu; | 4045 | struct dentry *d_cpu; |
3999 | /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ | 4046 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
4000 | char cpu_dir[7]; | ||
4001 | 4047 | ||
4002 | if (cpu > 999 || cpu < 0) | 4048 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
4003 | return; | ||
4004 | |||
4005 | sprintf(cpu_dir, "cpu%ld", cpu); | ||
4006 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 4049 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); |
4007 | if (!d_cpu) { | 4050 | if (!d_cpu) { |
4008 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); | 4051 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); |
@@ -4531,9 +4574,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | |||
4531 | __init static int tracer_alloc_buffers(void) | 4574 | __init static int tracer_alloc_buffers(void) |
4532 | { | 4575 | { |
4533 | int ring_buf_size; | 4576 | int ring_buf_size; |
4577 | enum ring_buffer_flags rb_flags; | ||
4534 | int i; | 4578 | int i; |
4535 | int ret = -ENOMEM; | 4579 | int ret = -ENOMEM; |
4536 | 4580 | ||
4581 | |||
4537 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) | 4582 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) |
4538 | goto out; | 4583 | goto out; |
4539 | 4584 | ||
@@ -4546,12 +4591,13 @@ __init static int tracer_alloc_buffers(void) | |||
4546 | else | 4591 | else |
4547 | ring_buf_size = 1; | 4592 | ring_buf_size = 1; |
4548 | 4593 | ||
4594 | rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; | ||
4595 | |||
4549 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); | 4596 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); |
4550 | cpumask_copy(tracing_cpumask, cpu_all_mask); | 4597 | cpumask_copy(tracing_cpumask, cpu_all_mask); |
4551 | 4598 | ||
4552 | /* TODO: make the number of buffers hot pluggable with CPUS */ | 4599 | /* TODO: make the number of buffers hot pluggable with CPUS */ |
4553 | global_trace.buffer = ring_buffer_alloc(ring_buf_size, | 4600 | global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); |
4554 | TRACE_BUFFER_FLAGS); | ||
4555 | if (!global_trace.buffer) { | 4601 | if (!global_trace.buffer) { |
4556 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); | 4602 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); |
4557 | WARN_ON(1); | 4603 | WARN_ON(1); |
@@ -4561,7 +4607,7 @@ __init static int tracer_alloc_buffers(void) | |||
4561 | 4607 | ||
4562 | 4608 | ||
4563 | #ifdef CONFIG_TRACER_MAX_TRACE | 4609 | #ifdef CONFIG_TRACER_MAX_TRACE |
4564 | max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); | 4610 | max_tr.buffer = ring_buffer_alloc(1, rb_flags); |
4565 | if (!max_tr.buffer) { | 4611 | if (!max_tr.buffer) { |
4566 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); | 4612 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); |
4567 | WARN_ON(1); | 4613 | WARN_ON(1); |
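The trace.c changes plumb the new "overwrite" option bit through to RB_FL_OVERWRITE, both at buffer allocation (rb_flags) and at runtime via ring_buffer_change_overwrite(). With this in place the flight-recorder behaviour should be togglable from user space; a hedged sketch, assuming the usual debugfs mount point and the options/overwrite file created from the trace_options[] entry:

#include <stdio.h>

int main(void)
{
	/* "0" clears RB_FL_OVERWRITE: when the buffer fills up, new events
	 * are dropped instead of overwriting the oldest ones. "1" restores
	 * the default flight-recorder behaviour. The path is an assumption
	 * about where debugfs is mounted on the running system. */
	FILE *f = fopen("/sys/kernel/debug/tracing/options/overwrite", "w");

	if (!f) {
		perror("open options/overwrite");
		return 1;
	}
	fputs("0\n", f);
	fclose(f);
	return 0;
}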
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d39b3c5454a5..229f8591f61d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -272,8 +272,8 @@ struct tracer { | |||
272 | /* If you handled the flag setting, return 0 */ | 272 | /* If you handled the flag setting, return 0 */ |
273 | int (*set_flag)(u32 old_flags, u32 bit, int set); | 273 | int (*set_flag)(u32 old_flags, u32 bit, int set); |
274 | struct tracer *next; | 274 | struct tracer *next; |
275 | int print_max; | ||
276 | struct tracer_flags *flags; | 275 | struct tracer_flags *flags; |
276 | int print_max; | ||
277 | int use_max_tr; | 277 | int use_max_tr; |
278 | }; | 278 | }; |
279 | 279 | ||
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr, | |||
343 | unsigned long ip, | 343 | unsigned long ip, |
344 | unsigned long parent_ip, | 344 | unsigned long parent_ip, |
345 | unsigned long flags, int pc); | 345 | unsigned long flags, int pc); |
346 | void trace_graph_function(struct trace_array *tr, | ||
347 | unsigned long ip, | ||
348 | unsigned long parent_ip, | ||
349 | unsigned long flags, int pc); | ||
346 | void trace_default_header(struct seq_file *m); | 350 | void trace_default_header(struct seq_file *m); |
347 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); | 351 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); |
348 | int trace_empty(struct trace_iterator *iter); | 352 | int trace_empty(struct trace_iterator *iter); |
@@ -415,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]); | |||
415 | extern unsigned long ftrace_update_tot_cnt; | 419 | extern unsigned long ftrace_update_tot_cnt; |
416 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func | 420 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func |
417 | extern int DYN_FTRACE_TEST_NAME(void); | 421 | extern int DYN_FTRACE_TEST_NAME(void); |
422 | #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 | ||
423 | extern int DYN_FTRACE_TEST_NAME2(void); | ||
418 | #endif | 424 | #endif |
419 | 425 | ||
420 | extern int ring_buffer_expanded; | 426 | extern int ring_buffer_expanded; |
@@ -602,6 +608,7 @@ enum trace_iterator_flags { | |||
602 | TRACE_ITER_SLEEP_TIME = 0x40000, | 608 | TRACE_ITER_SLEEP_TIME = 0x40000, |
603 | TRACE_ITER_GRAPH_TIME = 0x80000, | 609 | TRACE_ITER_GRAPH_TIME = 0x80000, |
604 | TRACE_ITER_RECORD_CMD = 0x100000, | 610 | TRACE_ITER_RECORD_CMD = 0x100000, |
611 | TRACE_ITER_OVERWRITE = 0x200000, | ||
605 | }; | 612 | }; |
606 | 613 | ||
607 | /* | 614 | /* |
@@ -657,8 +664,10 @@ struct ftrace_event_field { | |||
657 | }; | 664 | }; |
658 | 665 | ||
659 | struct event_filter { | 666 | struct event_filter { |
660 | int n_preds; | 667 | int n_preds; /* Number assigned */ |
661 | struct filter_pred **preds; | 668 | int a_preds; /* allocated */ |
669 | struct filter_pred *preds; | ||
670 | struct filter_pred *root; | ||
662 | char *filter_string; | 671 | char *filter_string; |
663 | }; | 672 | }; |
664 | 673 | ||
@@ -670,11 +679,23 @@ struct event_subsystem { | |||
670 | int nr_events; | 679 | int nr_events; |
671 | }; | 680 | }; |
672 | 681 | ||
682 | #define FILTER_PRED_INVALID ((unsigned short)-1) | ||
683 | #define FILTER_PRED_IS_RIGHT (1 << 15) | ||
684 | #define FILTER_PRED_FOLD (1 << 15) | ||
685 | |||
686 | /* | ||
687 | * The max preds is the size of unsigned short with | ||
688 | * two flags at the MSBs. One bit is used for both the IS_RIGHT | ||
689 | * and FOLD flags. The other is reserved. | ||
690 | * | ||
691 | * 2^14 preds is way more than enough. | ||
692 | */ | ||
693 | #define MAX_FILTER_PRED 16384 | ||
694 | |||
673 | struct filter_pred; | 695 | struct filter_pred; |
674 | struct regex; | 696 | struct regex; |
675 | 697 | ||
676 | typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, | 698 | typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); |
677 | int val1, int val2); | ||
678 | 699 | ||
679 | typedef int (*regex_match_func)(char *str, struct regex *r, int len); | 700 | typedef int (*regex_match_func)(char *str, struct regex *r, int len); |
680 | 701 | ||
@@ -696,11 +717,23 @@ struct filter_pred { | |||
696 | filter_pred_fn_t fn; | 717 | filter_pred_fn_t fn; |
697 | u64 val; | 718 | u64 val; |
698 | struct regex regex; | 719 | struct regex regex; |
699 | char *field_name; | 720 | /* |
721 | * Leaf nodes use field_name, ops is used by AND and OR | ||
722 | * nodes. The field_name is always freed when freeing a pred. | ||
723 | * We can overload field_name for ops and have it freed | ||
724 | * as well. | ||
725 | */ | ||
726 | union { | ||
727 | char *field_name; | ||
728 | unsigned short *ops; | ||
729 | }; | ||
700 | int offset; | 730 | int offset; |
701 | int not; | 731 | int not; |
702 | int op; | 732 | int op; |
703 | int pop_n; | 733 | unsigned short index; |
734 | unsigned short parent; | ||
735 | unsigned short left; | ||
736 | unsigned short right; | ||
704 | }; | 737 | }; |
705 | 738 | ||
706 | extern struct list_head ftrace_common_fields; | 739 | extern struct list_head ftrace_common_fields; |
@@ -751,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[]; | |||
751 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) | 784 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) |
752 | #include "trace_entries.h" | 785 | #include "trace_entries.h" |
753 | 786 | ||
787 | /* Only current can touch trace_recursion */ | ||
788 | #define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) | ||
789 | #define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) | ||
790 | |||
791 | /* Ring buffer has the 10 LSB bits to count */ | ||
792 | #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) | ||
793 | |||
794 | /* for function tracing recursion */ | ||
795 | #define TRACE_INTERNAL_BIT (1<<11) | ||
796 | #define TRACE_GLOBAL_BIT (1<<12) | ||
797 | |||
798 | #define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) | ||
799 | #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) | ||
800 | #define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) | ||
801 | |||
754 | #endif /* _LINUX_KERNEL_TRACE_H */ | 802 | #endif /* _LINUX_KERNEL_TRACE_H */ |
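The trace_recursion_*() macros added at the end of trace.h back the leaner trace_recursive_lock()/unlock() pair in ring_buffer.c: bump a per-task counter on entry, refuse to nest deeper than a fixed limit, drop the counter on exit. A stand-alone sketch of that guard (the file-scope counter stands in for current->trace_recursion, and the failure path re-balances the counter purely to keep the example simple):

#include <stdio.h>

#define RECURSIVE_DEPTH 16

static unsigned long trace_recursion;   /* stand-in for current->trace_recursion */

#define recursion_inc()    do { trace_recursion++; } while (0)
#define recursion_dec()    do { trace_recursion--; } while (0)
#define recursion_buffer() (trace_recursion & 0x3ff)   /* low 10 bits, as in trace.h */

static int recursive_lock(void)
{
	recursion_inc();
	if (recursion_buffer() < RECURSIVE_DEPTH)
		return 0;
	/* Too deep: the kernel would disable tracing and warn here. */
	return -1;
}

static void recursive_unlock(void)
{
	recursion_dec();
}

static void traced_op(int depth)
{
	if (recursive_lock()) {
		recursion_dec();   /* keep this sketch's counter balanced on failure */
		return;
	}
	if (depth)
		traced_op(depth - 1);   /* re-entry, e.g. an event fired from within a tracer */
	recursive_unlock();
}

int main(void)
{
	traced_op(32);                          /* tries to nest deeper than the limit */
	printf("final counter: %lu\n", trace_recursion);   /* back to 0 */
	return 0;
}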
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 685a67d55db0..6302747a1398 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void) | |||
46 | } | 46 | } |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * trace_clock(): 'inbetween' trace clock. Not completely serialized, | 49 | * trace_clock(): 'between' trace clock. Not completely serialized, |
50 | * but not completely incorrect when crossing CPUs either. | 50 | * but not completely incorrect when crossing CPUs either. |
51 | * | 51 | * |
52 | * This is based on cpu_clock(), which will allow at most ~1 jiffy of | 52 | * This is based on cpu_clock(), which will allow at most ~1 jiffy of |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e3dfecaf13e6..e32744c84d94 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -27,7 +27,7 @@ | |||
27 | * in the structure. | 27 | * in the structure. |
28 | * | 28 | * |
29 | * * for structures within structures, the format of the internal | 29 | * * for structures within structures, the format of the internal |
30 | * structure is layed out. This allows the internal structure | 30 | * structure is laid out. This allows the internal structure |
31 | * to be deciphered for the format file. Although these macros | 31 | * to be deciphered for the format file. Although these macros |
32 | * may become out of sync with the internal structure, they | 32 | * may become out of sync with the internal structure, they |
33 | * will create a compile error if it happens. Since the | 33 | * will create a compile error if it happens. Since the |
@@ -53,7 +53,7 @@ | |||
53 | */ | 53 | */ |
54 | 54 | ||
55 | /* | 55 | /* |
56 | * Function trace entry - function address and parent function addres: | 56 | * Function trace entry - function address and parent function address: |
57 | */ | 57 | */ |
58 | FTRACE_ENTRY(function, ftrace_entry, | 58 | FTRACE_ENTRY(function, ftrace_entry, |
59 | 59 | ||
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, | |||
109 | */ | 109 | */ |
110 | #define FTRACE_CTX_FIELDS \ | 110 | #define FTRACE_CTX_FIELDS \ |
111 | __field( unsigned int, prev_pid ) \ | 111 | __field( unsigned int, prev_pid ) \ |
112 | __field( unsigned int, next_pid ) \ | ||
113 | __field( unsigned int, next_cpu ) \ | ||
112 | __field( unsigned char, prev_prio ) \ | 114 | __field( unsigned char, prev_prio ) \ |
113 | __field( unsigned char, prev_state ) \ | 115 | __field( unsigned char, prev_state ) \ |
114 | __field( unsigned int, next_pid ) \ | ||
115 | __field( unsigned char, next_prio ) \ | 116 | __field( unsigned char, next_prio ) \ |
116 | __field( unsigned char, next_state ) \ | 117 | __field( unsigned char, next_state ) |
117 | __field( unsigned int, next_cpu ) | ||
118 | 118 | ||
119 | FTRACE_ENTRY(context_switch, ctx_switch_entry, | 119 | FTRACE_ENTRY(context_switch, ctx_switch_entry, |
120 | 120 | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 31cc4cb0dbf2..19a359d5e6d5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -9,7 +9,7 @@ | |||
9 | #include <linux/kprobes.h> | 9 | #include <linux/kprobes.h> |
10 | #include "trace.h" | 10 | #include "trace.h" |
11 | 11 | ||
12 | static char *perf_trace_buf[4]; | 12 | static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; |
13 | 13 | ||
14 | /* | 14 | /* |
15 | * Force it to be aligned to unsigned long to avoid misaligned accesses | 15 | * Force it to be aligned to unsigned long to avoid misaligned accesses |
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) | |||
21 | /* Count the events in use (per event id, not per instance) */ | 21 | /* Count the events in use (per event id, not per instance) */ |
22 | static int total_ref_count; | 22 | static int total_ref_count; |
23 | 23 | ||
24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | ||
25 | struct perf_event *p_event) | ||
26 | { | ||
27 | /* No tracing, just counting, so no obvious leak */ | ||
28 | if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) | ||
29 | return 0; | ||
30 | |||
31 | /* Some events are ok to be traced by non-root users... */ | ||
32 | if (p_event->attach_state == PERF_ATTACH_TASK) { | ||
33 | if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) | ||
34 | return 0; | ||
35 | } | ||
36 | |||
37 | /* | ||
38 | * ...otherwise raw tracepoint data can be a severe data leak, so | ||
39 | * only allow root to have these. | ||
40 | */ | ||
41 | if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) | ||
42 | return -EPERM; | ||
43 | |||
44 | return 0; | ||
45 | } | ||
46 | |||
24 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, | 47 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, |
25 | struct perf_event *p_event) | 48 | struct perf_event *p_event) |
26 | { | 49 | { |
27 | struct hlist_head *list; | 50 | struct hlist_head __percpu *list; |
28 | int ret = -ENOMEM; | 51 | int ret; |
29 | int cpu; | 52 | int cpu; |
30 | 53 | ||
54 | ret = perf_trace_event_perm(tp_event, p_event); | ||
55 | if (ret) | ||
56 | return ret; | ||
57 | |||
31 | p_event->tp_event = tp_event; | 58 | p_event->tp_event = tp_event; |
32 | if (tp_event->perf_refcount++ > 0) | 59 | if (tp_event->perf_refcount++ > 0) |
33 | return 0; | 60 | return 0; |
34 | 61 | ||
62 | ret = -ENOMEM; | ||
63 | |||
35 | list = alloc_percpu(struct hlist_head); | 64 | list = alloc_percpu(struct hlist_head); |
36 | if (!list) | 65 | if (!list) |
37 | goto fail; | 66 | goto fail; |
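The new perf_trace_event_perm() only gates events that ask for raw tracepoint data: counting-only events pass, per-task events on CAP_ANY-flagged tracepoints pass, and everything else requires CAP_SYS_ADMIN when perf_paranoid_tracepoint_raw() says so. From userspace, the guarded path is a perf_event_open() call with PERF_SAMPLE_RAW on a tracepoint; a hedged sketch follows, where the tracepoint id is a placeholder that real code would read from the event's debugfs "id" file:

/* Hedged userspace sketch of the call that exercises the check above. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int open_raw_tracepoint(unsigned long long tp_id)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.size = sizeof(attr);
	attr.config = tp_id;			/* event id from the "id" file */
	attr.sample_type = PERF_SAMPLE_RAW;	/* the case perf_trace_event_perm() cares about */
	attr.sample_period = 1;

	/* pid = 0, cpu = -1: a per-task event, i.e. the PERF_ATTACH_TASK branch above */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}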
@@ -42,11 +71,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, | |||
42 | tp_event->perf_events = list; | 71 | tp_event->perf_events = list; |
43 | 72 | ||
44 | if (!total_ref_count) { | 73 | if (!total_ref_count) { |
45 | char *buf; | 74 | char __percpu *buf; |
46 | int i; | 75 | int i; |
47 | 76 | ||
48 | for (i = 0; i < 4; i++) { | 77 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
49 | buf = (char *)alloc_percpu(perf_trace_t); | 78 | buf = (char __percpu *)alloc_percpu(perf_trace_t); |
50 | if (!buf) | 79 | if (!buf) |
51 | goto fail; | 80 | goto fail; |
52 | 81 | ||
@@ -65,7 +94,7 @@ fail: | |||
65 | if (!total_ref_count) { | 94 | if (!total_ref_count) { |
66 | int i; | 95 | int i; |
67 | 96 | ||
68 | for (i = 0; i < 4; i++) { | 97 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
69 | free_percpu(perf_trace_buf[i]); | 98 | free_percpu(perf_trace_buf[i]); |
70 | perf_trace_buf[i] = NULL; | 99 | perf_trace_buf[i] = NULL; |
71 | } | 100 | } |
@@ -101,22 +130,26 @@ int perf_trace_init(struct perf_event *p_event) | |||
101 | return ret; | 130 | return ret; |
102 | } | 131 | } |
103 | 132 | ||
104 | int perf_trace_enable(struct perf_event *p_event) | 133 | int perf_trace_add(struct perf_event *p_event, int flags) |
105 | { | 134 | { |
106 | struct ftrace_event_call *tp_event = p_event->tp_event; | 135 | struct ftrace_event_call *tp_event = p_event->tp_event; |
136 | struct hlist_head __percpu *pcpu_list; | ||
107 | struct hlist_head *list; | 137 | struct hlist_head *list; |
108 | 138 | ||
109 | list = tp_event->perf_events; | 139 | pcpu_list = tp_event->perf_events; |
110 | if (WARN_ON_ONCE(!list)) | 140 | if (WARN_ON_ONCE(!pcpu_list)) |
111 | return -EINVAL; | 141 | return -EINVAL; |
112 | 142 | ||
113 | list = this_cpu_ptr(list); | 143 | if (!(flags & PERF_EF_START)) |
144 | p_event->hw.state = PERF_HES_STOPPED; | ||
145 | |||
146 | list = this_cpu_ptr(pcpu_list); | ||
114 | hlist_add_head_rcu(&p_event->hlist_entry, list); | 147 | hlist_add_head_rcu(&p_event->hlist_entry, list); |
115 | 148 | ||
116 | return 0; | 149 | return 0; |
117 | } | 150 | } |
118 | 151 | ||
119 | void perf_trace_disable(struct perf_event *p_event) | 152 | void perf_trace_del(struct perf_event *p_event, int flags) |
120 | { | 153 | { |
121 | hlist_del_rcu(&p_event->hlist_entry); | 154 | hlist_del_rcu(&p_event->hlist_entry); |
122 | } | 155 | } |
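perf_trace_enable()/perf_trace_disable() become perf_trace_add()/perf_trace_del() to match the pmu add/del callbacks: an event added without PERF_EF_START starts out in PERF_HES_STOPPED, and the per-event list is now a properly annotated __percpu hlist resolved through this_cpu_ptr(). The per-CPU hlist idiom on its own, as a hedged sketch with illustrative names (my_item, my_heads); callers are assumed to keep preemption disabled around the local-list access, as perf does:

#include <linux/percpu.h>
#include <linux/rculist.h>

struct my_item {
	struct hlist_node node;
	int data;
};

static struct hlist_head __percpu *my_heads;

static int my_init(void)
{
	/* one zeroed (empty) hlist_head per possible CPU */
	my_heads = alloc_percpu(struct hlist_head);
	return my_heads ? 0 : -ENOMEM;
}

static void my_add_on_this_cpu(struct my_item *item)
{
	struct hlist_head *head = this_cpu_ptr(my_heads);

	/* RCU add, so lockless readers can traverse the list safely */
	hlist_add_head_rcu(&item->node, head);
}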
@@ -142,7 +175,7 @@ void perf_trace_destroy(struct perf_event *p_event) | |||
142 | tp_event->perf_events = NULL; | 175 | tp_event->perf_events = NULL; |
143 | 176 | ||
144 | if (!--total_ref_count) { | 177 | if (!--total_ref_count) { |
145 | for (i = 0; i < 4; i++) { | 178 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
146 | free_percpu(perf_trace_buf[i]); | 179 | free_percpu(perf_trace_buf[i]); |
147 | perf_trace_buf[i] = NULL; | 180 | perf_trace_buf[i] = NULL; |
148 | } | 181 | } |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4c758f146328..686ec399f2a8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -27,6 +27,12 @@ | |||
27 | 27 | ||
28 | DEFINE_MUTEX(event_mutex); | 28 | DEFINE_MUTEX(event_mutex); |
29 | 29 | ||
30 | DEFINE_MUTEX(event_storage_mutex); | ||
31 | EXPORT_SYMBOL_GPL(event_storage_mutex); | ||
32 | |||
33 | char event_storage[EVENT_STORAGE_SIZE]; | ||
34 | EXPORT_SYMBOL_GPL(event_storage); | ||
35 | |||
30 | LIST_HEAD(ftrace_events); | 36 | LIST_HEAD(ftrace_events); |
31 | LIST_HEAD(ftrace_common_fields); | 37 | LIST_HEAD(ftrace_common_fields); |
32 | 38 | ||
@@ -110,7 +116,7 @@ static int trace_define_common_fields(void) | |||
110 | __common_field(unsigned char, flags); | 116 | __common_field(unsigned char, flags); |
111 | __common_field(unsigned char, preempt_count); | 117 | __common_field(unsigned char, preempt_count); |
112 | __common_field(int, pid); | 118 | __common_field(int, pid); |
113 | __common_field(int, lock_depth); | 119 | __common_field(int, padding); |
114 | 120 | ||
115 | return ret; | 121 | return ret; |
116 | } | 122 | } |
@@ -320,6 +326,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) | |||
320 | { | 326 | { |
321 | return __ftrace_set_clr_event(NULL, system, event, set); | 327 | return __ftrace_set_clr_event(NULL, system, event, set); |
322 | } | 328 | } |
329 | EXPORT_SYMBOL_GPL(trace_set_clr_event); | ||
323 | 330 | ||
324 | /* 128 should be much more than enough */ | 331 | /* 128 should be much more than enough */ |
325 | #define EVENT_BUF_SIZE 127 | 332 | #define EVENT_BUF_SIZE 127 |
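With trace_set_clr_event() exported, modules can enable or disable individual trace events using the prototype shown above (system, event, set). A hedged module sketch, assuming the declaration is reachable through linux/ftrace_event.h in this tree; sched:sched_switch is used purely as an example event:

#include <linux/module.h>
#include <linux/ftrace_event.h>

static int __init my_trace_on_init(void)
{
	/* enable sched:sched_switch; pass 0 as the last argument to disable */
	return trace_set_clr_event("sched", "sched_switch", 1);
}

static void __exit my_trace_on_exit(void)
{
	trace_set_clr_event("sched", "sched_switch", 0);
}

module_init(my_trace_on_init);
module_exit(my_trace_on_exit);
MODULE_LICENSE("GPL");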
@@ -600,21 +607,29 @@ out: | |||
600 | 607 | ||
601 | enum { | 608 | enum { |
602 | FORMAT_HEADER = 1, | 609 | FORMAT_HEADER = 1, |
603 | FORMAT_PRINTFMT = 2, | 610 | FORMAT_FIELD_SEPERATOR = 2, |
611 | FORMAT_PRINTFMT = 3, | ||
604 | }; | 612 | }; |
605 | 613 | ||
606 | static void *f_next(struct seq_file *m, void *v, loff_t *pos) | 614 | static void *f_next(struct seq_file *m, void *v, loff_t *pos) |
607 | { | 615 | { |
608 | struct ftrace_event_call *call = m->private; | 616 | struct ftrace_event_call *call = m->private; |
609 | struct ftrace_event_field *field; | 617 | struct ftrace_event_field *field; |
610 | struct list_head *head; | 618 | struct list_head *common_head = &ftrace_common_fields; |
619 | struct list_head *head = trace_get_fields(call); | ||
611 | 620 | ||
612 | (*pos)++; | 621 | (*pos)++; |
613 | 622 | ||
614 | switch ((unsigned long)v) { | 623 | switch ((unsigned long)v) { |
615 | case FORMAT_HEADER: | 624 | case FORMAT_HEADER: |
616 | head = &ftrace_common_fields; | 625 | if (unlikely(list_empty(common_head))) |
626 | return NULL; | ||
617 | 627 | ||
628 | field = list_entry(common_head->prev, | ||
629 | struct ftrace_event_field, link); | ||
630 | return field; | ||
631 | |||
632 | case FORMAT_FIELD_SEPERATOR: | ||
618 | if (unlikely(list_empty(head))) | 633 | if (unlikely(list_empty(head))) |
619 | return NULL; | 634 | return NULL; |
620 | 635 | ||
@@ -626,31 +641,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) | |||
626 | return NULL; | 641 | return NULL; |
627 | } | 642 | } |
628 | 643 | ||
629 | head = trace_get_fields(call); | ||
630 | |||
631 | /* | ||
632 | * To separate common fields from event fields, the | ||
633 | * LSB is set on the first event field. Clear it in case. | ||
634 | */ | ||
635 | v = (void *)((unsigned long)v & ~1L); | ||
636 | |||
637 | field = v; | 644 | field = v; |
638 | /* | 645 | if (field->link.prev == common_head) |
639 | * If this is a common field, and at the end of the list, then | 646 | return (void *)FORMAT_FIELD_SEPERATOR; |
640 | * continue with main list. | 647 | else if (field->link.prev == head) |
641 | */ | ||
642 | if (field->link.prev == &ftrace_common_fields) { | ||
643 | if (unlikely(list_empty(head))) | ||
644 | return NULL; | ||
645 | field = list_entry(head->prev, struct ftrace_event_field, link); | ||
646 | /* Set the LSB to notify f_show to print an extra newline */ | ||
647 | field = (struct ftrace_event_field *) | ||
648 | ((unsigned long)field | 1); | ||
649 | return field; | ||
650 | } | ||
651 | |||
652 | /* If we are done tell f_show to print the format */ | ||
653 | if (field->link.prev == head) | ||
654 | return (void *)FORMAT_PRINTFMT; | 648 | return (void *)FORMAT_PRINTFMT; |
655 | 649 | ||
656 | field = list_entry(field->link.prev, struct ftrace_event_field, link); | 650 | field = list_entry(field->link.prev, struct ftrace_event_field, link); |
@@ -688,22 +682,16 @@ static int f_show(struct seq_file *m, void *v) | |||
688 | seq_printf(m, "format:\n"); | 682 | seq_printf(m, "format:\n"); |
689 | return 0; | 683 | return 0; |
690 | 684 | ||
685 | case FORMAT_FIELD_SEPERATOR: | ||
686 | seq_putc(m, '\n'); | ||
687 | return 0; | ||
688 | |||
691 | case FORMAT_PRINTFMT: | 689 | case FORMAT_PRINTFMT: |
692 | seq_printf(m, "\nprint fmt: %s\n", | 690 | seq_printf(m, "\nprint fmt: %s\n", |
693 | call->print_fmt); | 691 | call->print_fmt); |
694 | return 0; | 692 | return 0; |
695 | } | 693 | } |
696 | 694 | ||
697 | /* | ||
698 | * To separate common fields from event fields, the | ||
699 | * LSB is set on the first event field. Clear it and | ||
700 | * print a newline if it is set. | ||
701 | */ | ||
702 | if ((unsigned long)v & 1) { | ||
703 | seq_putc(m, '\n'); | ||
704 | v = (void *)((unsigned long)v & ~1L); | ||
705 | } | ||
706 | |||
707 | field = v; | 695 | field = v; |
708 | 696 | ||
709 | /* | 697 | /* |
@@ -951,6 +939,7 @@ static const struct file_operations ftrace_enable_fops = { | |||
951 | .open = tracing_open_generic, | 939 | .open = tracing_open_generic, |
952 | .read = event_enable_read, | 940 | .read = event_enable_read, |
953 | .write = event_enable_write, | 941 | .write = event_enable_write, |
942 | .llseek = default_llseek, | ||
954 | }; | 943 | }; |
955 | 944 | ||
956 | static const struct file_operations ftrace_event_format_fops = { | 945 | static const struct file_operations ftrace_event_format_fops = { |
@@ -963,29 +952,34 @@ static const struct file_operations ftrace_event_format_fops = { | |||
963 | static const struct file_operations ftrace_event_id_fops = { | 952 | static const struct file_operations ftrace_event_id_fops = { |
964 | .open = tracing_open_generic, | 953 | .open = tracing_open_generic, |
965 | .read = event_id_read, | 954 | .read = event_id_read, |
955 | .llseek = default_llseek, | ||
966 | }; | 956 | }; |
967 | 957 | ||
968 | static const struct file_operations ftrace_event_filter_fops = { | 958 | static const struct file_operations ftrace_event_filter_fops = { |
969 | .open = tracing_open_generic, | 959 | .open = tracing_open_generic, |
970 | .read = event_filter_read, | 960 | .read = event_filter_read, |
971 | .write = event_filter_write, | 961 | .write = event_filter_write, |
962 | .llseek = default_llseek, | ||
972 | }; | 963 | }; |
973 | 964 | ||
974 | static const struct file_operations ftrace_subsystem_filter_fops = { | 965 | static const struct file_operations ftrace_subsystem_filter_fops = { |
975 | .open = tracing_open_generic, | 966 | .open = tracing_open_generic, |
976 | .read = subsystem_filter_read, | 967 | .read = subsystem_filter_read, |
977 | .write = subsystem_filter_write, | 968 | .write = subsystem_filter_write, |
969 | .llseek = default_llseek, | ||
978 | }; | 970 | }; |
979 | 971 | ||
980 | static const struct file_operations ftrace_system_enable_fops = { | 972 | static const struct file_operations ftrace_system_enable_fops = { |
981 | .open = tracing_open_generic, | 973 | .open = tracing_open_generic, |
982 | .read = system_enable_read, | 974 | .read = system_enable_read, |
983 | .write = system_enable_write, | 975 | .write = system_enable_write, |
976 | .llseek = default_llseek, | ||
984 | }; | 977 | }; |
985 | 978 | ||
986 | static const struct file_operations ftrace_show_header_fops = { | 979 | static const struct file_operations ftrace_show_header_fops = { |
987 | .open = tracing_open_generic, | 980 | .open = tracing_open_generic, |
988 | .read = show_header, | 981 | .read = show_header, |
982 | .llseek = default_llseek, | ||
989 | }; | 983 | }; |
990 | 984 | ||
991 | static struct dentry *event_trace_events_dir(void) | 985 | static struct dentry *event_trace_events_dir(void) |
@@ -1291,7 +1285,7 @@ trace_create_file_ops(struct module *mod) | |||
1291 | static void trace_module_add_events(struct module *mod) | 1285 | static void trace_module_add_events(struct module *mod) |
1292 | { | 1286 | { |
1293 | struct ftrace_module_file_ops *file_ops = NULL; | 1287 | struct ftrace_module_file_ops *file_ops = NULL; |
1294 | struct ftrace_event_call *call, *start, *end; | 1288 | struct ftrace_event_call **call, **start, **end; |
1295 | 1289 | ||
1296 | start = mod->trace_events; | 1290 | start = mod->trace_events; |
1297 | end = mod->trace_events + mod->num_trace_events; | 1291 | end = mod->trace_events + mod->num_trace_events; |
@@ -1304,7 +1298,7 @@ static void trace_module_add_events(struct module *mod) | |||
1304 | return; | 1298 | return; |
1305 | 1299 | ||
1306 | for_each_event(call, start, end) { | 1300 | for_each_event(call, start, end) { |
1307 | __trace_add_event_call(call, mod, | 1301 | __trace_add_event_call(*call, mod, |
1308 | &file_ops->id, &file_ops->enable, | 1302 | &file_ops->id, &file_ops->enable, |
1309 | &file_ops->filter, &file_ops->format); | 1303 | &file_ops->filter, &file_ops->format); |
1310 | } | 1304 | } |
@@ -1374,8 +1368,8 @@ static struct notifier_block trace_module_nb = { | |||
1374 | .priority = 0, | 1368 | .priority = 0, |
1375 | }; | 1369 | }; |
1376 | 1370 | ||
1377 | extern struct ftrace_event_call __start_ftrace_events[]; | 1371 | extern struct ftrace_event_call *__start_ftrace_events[]; |
1378 | extern struct ftrace_event_call __stop_ftrace_events[]; | 1372 | extern struct ftrace_event_call *__stop_ftrace_events[]; |
1379 | 1373 | ||
1380 | static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; | 1374 | static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; |
1381 | 1375 | ||
@@ -1391,7 +1385,7 @@ __setup("trace_event=", setup_trace_event); | |||
1391 | 1385 | ||
1392 | static __init int event_trace_init(void) | 1386 | static __init int event_trace_init(void) |
1393 | { | 1387 | { |
1394 | struct ftrace_event_call *call; | 1388 | struct ftrace_event_call **call; |
1395 | struct dentry *d_tracer; | 1389 | struct dentry *d_tracer; |
1396 | struct dentry *entry; | 1390 | struct dentry *entry; |
1397 | struct dentry *d_events; | 1391 | struct dentry *d_events; |
@@ -1437,7 +1431,7 @@ static __init int event_trace_init(void) | |||
1437 | pr_warning("tracing: Failed to allocate common fields"); | 1431 | pr_warning("tracing: Failed to allocate common fields"); |
1438 | 1432 | ||
1439 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { | 1433 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { |
1440 | __trace_add_event_call(call, NULL, &ftrace_event_id_fops, | 1434 | __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, |
1441 | &ftrace_enable_fops, | 1435 | &ftrace_enable_fops, |
1442 | &ftrace_event_filter_fops, | 1436 | &ftrace_event_filter_fops, |
1443 | &ftrace_event_format_fops); | 1437 | &ftrace_event_format_fops); |
@@ -1663,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata = | |||
1663 | 1657 | ||
1664 | static __init void event_trace_self_test_with_function(void) | 1658 | static __init void event_trace_self_test_with_function(void) |
1665 | { | 1659 | { |
1666 | register_ftrace_function(&trace_ops); | 1660 | int ret; |
1661 | ret = register_ftrace_function(&trace_ops); | ||
1662 | if (WARN_ON(ret < 0)) { | ||
1663 | pr_info("Failed to enable function tracer for event tests\n"); | ||
1664 | return; | ||
1665 | } | ||
1667 | pr_info("Running tests again, along with the function tracer\n"); | 1666 | pr_info("Running tests again, along with the function tracer\n"); |
1668 | event_trace_self_tests(); | 1667 | event_trace_self_tests(); |
1669 | unregister_ftrace_function(&trace_ops); | 1668 | unregister_ftrace_function(&trace_ops); |
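The self-test change above checks register_ftrace_function()'s return value before rerunning the event tests with the function tracer active. The same pattern for a standalone ftrace_ops user, as a hedged sketch; the callback name is illustrative and the two-argument callback signature is the one this kernel generation uses:

#include <linux/ftrace.h>
#include <linux/kernel.h>

static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* keep the callback minimal and non-recursive */
}

static struct ftrace_ops my_ops = {
	.func = my_trace_func,
};

static int my_start_tracing(void)
{
	int ret = register_ftrace_function(&my_ops);

	if (ret < 0) {
		pr_info("failed to register the function tracer callback\n");
		return ret;
	}
	return 0;	/* pair with unregister_ftrace_function(&my_ops) on teardown */
}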
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17f..8008ddcfbf20 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -123,9 +123,13 @@ struct filter_parse_state { | |||
123 | } operand; | 123 | } operand; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | struct pred_stack { | ||
127 | struct filter_pred **preds; | ||
128 | int index; | ||
129 | }; | ||
130 | |||
126 | #define DEFINE_COMPARISON_PRED(type) \ | 131 | #define DEFINE_COMPARISON_PRED(type) \ |
127 | static int filter_pred_##type(struct filter_pred *pred, void *event, \ | 132 | static int filter_pred_##type(struct filter_pred *pred, void *event) \ |
128 | int val1, int val2) \ | ||
129 | { \ | 133 | { \ |
130 | type *addr = (type *)(event + pred->offset); \ | 134 | type *addr = (type *)(event + pred->offset); \ |
131 | type val = (type)pred->val; \ | 135 | type val = (type)pred->val; \ |
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ | |||
152 | } | 156 | } |
153 | 157 | ||
154 | #define DEFINE_EQUALITY_PRED(size) \ | 158 | #define DEFINE_EQUALITY_PRED(size) \ |
155 | static int filter_pred_##size(struct filter_pred *pred, void *event, \ | 159 | static int filter_pred_##size(struct filter_pred *pred, void *event) \ |
156 | int val1, int val2) \ | ||
157 | { \ | 160 | { \ |
158 | u##size *addr = (u##size *)(event + pred->offset); \ | 161 | u##size *addr = (u##size *)(event + pred->offset); \ |
159 | u##size val = (u##size)pred->val; \ | 162 | u##size val = (u##size)pred->val; \ |
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32); | |||
178 | DEFINE_EQUALITY_PRED(16); | 181 | DEFINE_EQUALITY_PRED(16); |
179 | DEFINE_EQUALITY_PRED(8); | 182 | DEFINE_EQUALITY_PRED(8); |
180 | 183 | ||
181 | static int filter_pred_and(struct filter_pred *pred __attribute((unused)), | ||
182 | void *event __attribute((unused)), | ||
183 | int val1, int val2) | ||
184 | { | ||
185 | return val1 && val2; | ||
186 | } | ||
187 | |||
188 | static int filter_pred_or(struct filter_pred *pred __attribute((unused)), | ||
189 | void *event __attribute((unused)), | ||
190 | int val1, int val2) | ||
191 | { | ||
192 | return val1 || val2; | ||
193 | } | ||
194 | |||
195 | /* Filter predicate for fixed sized arrays of characters */ | 184 | /* Filter predicate for fixed sized arrays of characters */ |
196 | static int filter_pred_string(struct filter_pred *pred, void *event, | 185 | static int filter_pred_string(struct filter_pred *pred, void *event) |
197 | int val1, int val2) | ||
198 | { | 186 | { |
199 | char *addr = (char *)(event + pred->offset); | 187 | char *addr = (char *)(event + pred->offset); |
200 | int cmp, match; | 188 | int cmp, match; |
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, | |||
207 | } | 195 | } |
208 | 196 | ||
209 | /* Filter predicate for char * pointers */ | 197 | /* Filter predicate for char * pointers */ |
210 | static int filter_pred_pchar(struct filter_pred *pred, void *event, | 198 | static int filter_pred_pchar(struct filter_pred *pred, void *event) |
211 | int val1, int val2) | ||
212 | { | 199 | { |
213 | char **addr = (char **)(event + pred->offset); | 200 | char **addr = (char **)(event + pred->offset); |
214 | int cmp, match; | 201 | int cmp, match; |
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, | |||
231 | * and add it to the address of the entry, and at last we have | 218 | * and add it to the address of the entry, and at last we have |
232 | * the address of the string. | 219 | * the address of the string. |
233 | */ | 220 | */ |
234 | static int filter_pred_strloc(struct filter_pred *pred, void *event, | 221 | static int filter_pred_strloc(struct filter_pred *pred, void *event) |
235 | int val1, int val2) | ||
236 | { | 222 | { |
237 | u32 str_item = *(u32 *)(event + pred->offset); | 223 | u32 str_item = *(u32 *)(event + pred->offset); |
238 | int str_loc = str_item & 0xffff; | 224 | int str_loc = str_item & 0xffff; |
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, | |||
247 | return match; | 233 | return match; |
248 | } | 234 | } |
249 | 235 | ||
250 | static int filter_pred_none(struct filter_pred *pred, void *event, | 236 | static int filter_pred_none(struct filter_pred *pred, void *event) |
251 | int val1, int val2) | ||
252 | { | 237 | { |
253 | return 0; | 238 | return 0; |
254 | } | 239 | } |
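All filter predicates lose the val1/val2 stack arguments in this series: a predicate now only inspects the record, and the boolean combination of results moves into the tree walk added further down. A hedged sketch of what a predicate looks like in the new two-argument form (this is just the shape the DEFINE_*_PRED macros expand to, not code from this commit):

static int filter_pred_example_u32(struct filter_pred *pred, void *event)
{
	u32 *addr = (u32 *)(event + pred->offset);
	u32 val = (u32)pred->val;
	int match = (*addr == val);

	/* pred->not inverts the result, exactly as in the real predicates */
	return pred->not ? !match : match;
}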
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred) | |||
377 | pred->not ^= not; | 362 | pred->not ^= not; |
378 | } | 363 | } |
379 | 364 | ||
365 | enum move_type { | ||
366 | MOVE_DOWN, | ||
367 | MOVE_UP_FROM_LEFT, | ||
368 | MOVE_UP_FROM_RIGHT | ||
369 | }; | ||
370 | |||
371 | static struct filter_pred * | ||
372 | get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, | ||
373 | int index, enum move_type *move) | ||
374 | { | ||
375 | if (pred->parent & FILTER_PRED_IS_RIGHT) | ||
376 | *move = MOVE_UP_FROM_RIGHT; | ||
377 | else | ||
378 | *move = MOVE_UP_FROM_LEFT; | ||
379 | pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; | ||
380 | |||
381 | return pred; | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * A series of ANDs or ORs were found together. Instead of | ||
386 | * climbing up and down the tree branches, an array of the | ||
387 | * ops was made in the order of the checks. We can just move across | ||
388 | * the array and short circuit if needed. | ||
389 | */ | ||
390 | static int process_ops(struct filter_pred *preds, | ||
391 | struct filter_pred *op, void *rec) | ||
392 | { | ||
393 | struct filter_pred *pred; | ||
394 | int match = 0; | ||
395 | int type; | ||
396 | int i; | ||
397 | |||
398 | /* | ||
399 | * Micro-optimization: We set type to true if op | ||
400 | * is an OR and false otherwise (AND). Then we | ||
401 | * just need to test if the match is equal to | ||
402 | * the type, and if it is, we can short circuit the | ||
403 | * rest of the checks: | ||
404 | * | ||
405 | * if ((match && op->op == OP_OR) || | ||
406 | * (!match && op->op == OP_AND)) | ||
407 | * return match; | ||
408 | */ | ||
409 | type = op->op == OP_OR; | ||
410 | |||
411 | for (i = 0; i < op->val; i++) { | ||
412 | pred = &preds[op->ops[i]]; | ||
413 | match = pred->fn(pred, rec); | ||
414 | if (!!match == type) | ||
415 | return match; | ||
416 | } | ||
417 | return match; | ||
418 | } | ||
419 | |||
380 | /* return 1 if event matches, 0 otherwise (discard) */ | 420 | /* return 1 if event matches, 0 otherwise (discard) */ |
381 | int filter_match_preds(struct event_filter *filter, void *rec) | 421 | int filter_match_preds(struct event_filter *filter, void *rec) |
382 | { | 422 | { |
383 | int match, top = 0, val1 = 0, val2 = 0; | 423 | int match = -1; |
384 | int stack[MAX_FILTER_PRED]; | 424 | enum move_type move = MOVE_DOWN; |
425 | struct filter_pred *preds; | ||
385 | struct filter_pred *pred; | 426 | struct filter_pred *pred; |
386 | int i; | 427 | struct filter_pred *root; |
428 | int n_preds; | ||
429 | int done = 0; | ||
430 | |||
431 | /* no filter is considered a match */ | ||
432 | if (!filter) | ||
433 | return 1; | ||
434 | |||
435 | n_preds = filter->n_preds; | ||
436 | |||
437 | if (!n_preds) | ||
438 | return 1; | ||
439 | |||
440 | /* | ||
441 | * n_preds, root and filter->preds are protected with preemption disabled. | ||
442 | */ | ||
443 | preds = rcu_dereference_sched(filter->preds); | ||
444 | root = rcu_dereference_sched(filter->root); | ||
445 | if (!root) | ||
446 | return 1; | ||
447 | |||
448 | pred = root; | ||
387 | 449 | ||
388 | for (i = 0; i < filter->n_preds; i++) { | 450 | /* match is currently meaningless */ |
389 | pred = filter->preds[i]; | 451 | match = -1; |
390 | if (!pred->pop_n) { | 452 | |
391 | match = pred->fn(pred, rec, val1, val2); | 453 | do { |
392 | stack[top++] = match; | 454 | switch (move) { |
455 | case MOVE_DOWN: | ||
456 | /* only AND and OR have children */ | ||
457 | if (pred->left != FILTER_PRED_INVALID) { | ||
458 | /* If ops is set, then it was folded. */ | ||
459 | if (!pred->ops) { | ||
460 | /* keep going down the left side */ | ||
461 | pred = &preds[pred->left]; | ||
462 | continue; | ||
463 | } | ||
464 | /* We can treat folded ops as a leaf node */ | ||
465 | match = process_ops(preds, pred, rec); | ||
466 | } else | ||
467 | match = pred->fn(pred, rec); | ||
468 | /* If this pred is the only pred */ | ||
469 | if (pred == root) | ||
470 | break; | ||
471 | pred = get_pred_parent(pred, preds, | ||
472 | pred->parent, &move); | ||
473 | continue; | ||
474 | case MOVE_UP_FROM_LEFT: | ||
475 | /* | ||
476 | * Check for short circuits. | ||
477 | * | ||
478 | * Optimization: !!match == (pred->op == OP_OR) | ||
479 | * is the same as: | ||
480 | * if ((match && pred->op == OP_OR) || | ||
481 | * (!match && pred->op == OP_AND)) | ||
482 | */ | ||
483 | if (!!match == (pred->op == OP_OR)) { | ||
484 | if (pred == root) | ||
485 | break; | ||
486 | pred = get_pred_parent(pred, preds, | ||
487 | pred->parent, &move); | ||
488 | continue; | ||
489 | } | ||
490 | /* now go down the right side of the tree. */ | ||
491 | pred = &preds[pred->right]; | ||
492 | move = MOVE_DOWN; | ||
493 | continue; | ||
494 | case MOVE_UP_FROM_RIGHT: | ||
495 | /* We finished this equation. */ | ||
496 | if (pred == root) | ||
497 | break; | ||
498 | pred = get_pred_parent(pred, preds, | ||
499 | pred->parent, &move); | ||
393 | continue; | 500 | continue; |
394 | } | 501 | } |
395 | if (pred->pop_n > top) { | 502 | done = 1; |
396 | WARN_ON_ONCE(1); | 503 | } while (!done); |
397 | return 0; | ||
398 | } | ||
399 | val1 = stack[--top]; | ||
400 | val2 = stack[--top]; | ||
401 | match = pred->fn(pred, rec, val1, val2); | ||
402 | stack[top++] = match; | ||
403 | } | ||
404 | 504 | ||
405 | return stack[--top]; | 505 | return match; |
406 | } | 506 | } |
407 | EXPORT_SYMBOL_GPL(filter_match_preds); | 507 | EXPORT_SYMBOL_GPL(filter_match_preds); |
408 | 508 | ||
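filter_match_preds() now walks a binary predicate tree instead of replaying a postfix stack: MOVE_DOWN follows left links, MOVE_UP_FROM_LEFT either short-circuits or descends into the right child, and MOVE_UP_FROM_RIGHT climbs back toward the root. The short-circuit test "!!match == (pred->op == OP_OR)" collapses the two classic cases into one comparison; a small self-contained demonstration of that identity (plain userspace C, with local OP_* stand-ins rather than the kernel enum):

#include <stdio.h>

enum { OP_AND, OP_OR };

static int short_circuits(int op, int child_match)
{
	/* same as: (child_match && op == OP_OR) || (!child_match && op == OP_AND) */
	return !!child_match == (op == OP_OR);
}

int main(void)
{
	printf("OR,  child=1 -> %d\n", short_circuits(OP_OR, 1));  /* 1: result already known */
	printf("OR,  child=0 -> %d\n", short_circuits(OP_OR, 0));  /* 0: must check other side */
	printf("AND, child=0 -> %d\n", short_circuits(OP_AND, 0)); /* 1: result already known */
	printf("AND, child=1 -> %d\n", short_circuits(OP_AND, 1)); /* 0: must check other side */
	return 0;
}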
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) | |||
414 | 514 | ||
415 | static void remove_filter_string(struct event_filter *filter) | 515 | static void remove_filter_string(struct event_filter *filter) |
416 | { | 516 | { |
517 | if (!filter) | ||
518 | return; | ||
519 | |||
417 | kfree(filter->filter_string); | 520 | kfree(filter->filter_string); |
418 | filter->filter_string = NULL; | 521 | filter->filter_string = NULL; |
419 | } | 522 | } |
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, | |||
473 | 576 | ||
474 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) | 577 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) |
475 | { | 578 | { |
476 | struct event_filter *filter = call->filter; | 579 | struct event_filter *filter; |
477 | 580 | ||
478 | mutex_lock(&event_mutex); | 581 | mutex_lock(&event_mutex); |
582 | filter = call->filter; | ||
479 | if (filter && filter->filter_string) | 583 | if (filter && filter->filter_string) |
480 | trace_seq_printf(s, "%s\n", filter->filter_string); | 584 | trace_seq_printf(s, "%s\n", filter->filter_string); |
481 | else | 585 | else |
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) | |||
486 | void print_subsystem_event_filter(struct event_subsystem *system, | 590 | void print_subsystem_event_filter(struct event_subsystem *system, |
487 | struct trace_seq *s) | 591 | struct trace_seq *s) |
488 | { | 592 | { |
489 | struct event_filter *filter = system->filter; | 593 | struct event_filter *filter; |
490 | 594 | ||
491 | mutex_lock(&event_mutex); | 595 | mutex_lock(&event_mutex); |
596 | filter = system->filter; | ||
492 | if (filter && filter->filter_string) | 597 | if (filter && filter->filter_string) |
493 | trace_seq_printf(s, "%s\n", filter->filter_string); | 598 | trace_seq_printf(s, "%s\n", filter->filter_string); |
494 | else | 599 | else |
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred) | |||
539 | pred->regex.len = 0; | 644 | pred->regex.len = 0; |
540 | } | 645 | } |
541 | 646 | ||
542 | static int filter_set_pred(struct filter_pred *dest, | 647 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) |
648 | { | ||
649 | stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); | ||
650 | if (!stack->preds) | ||
651 | return -ENOMEM; | ||
652 | stack->index = n_preds; | ||
653 | return 0; | ||
654 | } | ||
655 | |||
656 | static void __free_pred_stack(struct pred_stack *stack) | ||
657 | { | ||
658 | kfree(stack->preds); | ||
659 | stack->index = 0; | ||
660 | } | ||
661 | |||
662 | static int __push_pred_stack(struct pred_stack *stack, | ||
663 | struct filter_pred *pred) | ||
664 | { | ||
665 | int index = stack->index; | ||
666 | |||
667 | if (WARN_ON(index == 0)) | ||
668 | return -ENOSPC; | ||
669 | |||
670 | stack->preds[--index] = pred; | ||
671 | stack->index = index; | ||
672 | return 0; | ||
673 | } | ||
674 | |||
675 | static struct filter_pred * | ||
676 | __pop_pred_stack(struct pred_stack *stack) | ||
677 | { | ||
678 | struct filter_pred *pred; | ||
679 | int index = stack->index; | ||
680 | |||
681 | pred = stack->preds[index++]; | ||
682 | if (!pred) | ||
683 | return NULL; | ||
684 | |||
685 | stack->index = index; | ||
686 | return pred; | ||
687 | } | ||
688 | |||
689 | static int filter_set_pred(struct event_filter *filter, | ||
690 | int idx, | ||
691 | struct pred_stack *stack, | ||
543 | struct filter_pred *src, | 692 | struct filter_pred *src, |
544 | filter_pred_fn_t fn) | 693 | filter_pred_fn_t fn) |
545 | { | 694 | { |
695 | struct filter_pred *dest = &filter->preds[idx]; | ||
696 | struct filter_pred *left; | ||
697 | struct filter_pred *right; | ||
698 | |||
546 | *dest = *src; | 699 | *dest = *src; |
547 | if (src->field_name) { | 700 | if (src->field_name) { |
548 | dest->field_name = kstrdup(src->field_name, GFP_KERNEL); | 701 | dest->field_name = kstrdup(src->field_name, GFP_KERNEL); |
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest, | |||
550 | return -ENOMEM; | 703 | return -ENOMEM; |
551 | } | 704 | } |
552 | dest->fn = fn; | 705 | dest->fn = fn; |
706 | dest->index = idx; | ||
553 | 707 | ||
554 | return 0; | 708 | if (dest->op == OP_OR || dest->op == OP_AND) { |
709 | right = __pop_pred_stack(stack); | ||
710 | left = __pop_pred_stack(stack); | ||
711 | if (!left || !right) | ||
712 | return -EINVAL; | ||
713 | /* | ||
714 | * If both children can be folded | ||
715 | * and they are the same op as this op or a leaf, | ||
716 | * then this op can be folded. | ||
717 | */ | ||
718 | if (left->index & FILTER_PRED_FOLD && | ||
719 | (left->op == dest->op || | ||
720 | left->left == FILTER_PRED_INVALID) && | ||
721 | right->index & FILTER_PRED_FOLD && | ||
722 | (right->op == dest->op || | ||
723 | right->left == FILTER_PRED_INVALID)) | ||
724 | dest->index |= FILTER_PRED_FOLD; | ||
725 | |||
726 | dest->left = left->index & ~FILTER_PRED_FOLD; | ||
727 | dest->right = right->index & ~FILTER_PRED_FOLD; | ||
728 | left->parent = dest->index & ~FILTER_PRED_FOLD; | ||
729 | right->parent = dest->index | FILTER_PRED_IS_RIGHT; | ||
730 | } else { | ||
731 | /* | ||
732 | * Make dest->left invalid to be used as a quick | ||
733 | * way to know this is a leaf node. | ||
734 | */ | ||
735 | dest->left = FILTER_PRED_INVALID; | ||
736 | |||
737 | /* All leaf nodes allow folding the parent ops. */ | ||
738 | dest->index |= FILTER_PRED_FOLD; | ||
739 | } | ||
740 | |||
741 | return __push_pred_stack(stack, dest); | ||
555 | } | 742 | } |
556 | 743 | ||
557 | static void filter_disable_preds(struct ftrace_event_call *call) | 744 | static void __free_preds(struct event_filter *filter) |
558 | { | 745 | { |
559 | struct event_filter *filter = call->filter; | ||
560 | int i; | 746 | int i; |
561 | 747 | ||
562 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | 748 | if (filter->preds) { |
749 | for (i = 0; i < filter->a_preds; i++) | ||
750 | kfree(filter->preds[i].field_name); | ||
751 | kfree(filter->preds); | ||
752 | filter->preds = NULL; | ||
753 | } | ||
754 | filter->a_preds = 0; | ||
563 | filter->n_preds = 0; | 755 | filter->n_preds = 0; |
564 | |||
565 | for (i = 0; i < MAX_FILTER_PRED; i++) | ||
566 | filter->preds[i]->fn = filter_pred_none; | ||
567 | } | 756 | } |
568 | 757 | ||
569 | static void __free_preds(struct event_filter *filter) | 758 | static void filter_disable(struct ftrace_event_call *call) |
570 | { | 759 | { |
571 | int i; | 760 | call->flags &= ~TRACE_EVENT_FL_FILTERED; |
761 | } | ||
572 | 762 | ||
763 | static void __free_filter(struct event_filter *filter) | ||
764 | { | ||
573 | if (!filter) | 765 | if (!filter) |
574 | return; | 766 | return; |
575 | 767 | ||
576 | for (i = 0; i < MAX_FILTER_PRED; i++) { | 768 | __free_preds(filter); |
577 | if (filter->preds[i]) | ||
578 | filter_free_pred(filter->preds[i]); | ||
579 | } | ||
580 | kfree(filter->preds); | ||
581 | kfree(filter->filter_string); | 769 | kfree(filter->filter_string); |
582 | kfree(filter); | 770 | kfree(filter); |
583 | } | 771 | } |
584 | 772 | ||
773 | /* | ||
774 | * Called when destroying the ftrace_event_call. | ||
775 | * The call is being freed, so we do not need to worry about | ||
776 | * the call being currently used. This is for module code removing | ||
777 | * the tracepoints from within it. | ||
778 | */ | ||
585 | void destroy_preds(struct ftrace_event_call *call) | 779 | void destroy_preds(struct ftrace_event_call *call) |
586 | { | 780 | { |
587 | __free_preds(call->filter); | 781 | __free_filter(call->filter); |
588 | call->filter = NULL; | 782 | call->filter = NULL; |
589 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | ||
590 | } | 783 | } |
591 | 784 | ||
592 | static struct event_filter *__alloc_preds(void) | 785 | static struct event_filter *__alloc_filter(void) |
593 | { | 786 | { |
594 | struct event_filter *filter; | 787 | struct event_filter *filter; |
788 | |||
789 | filter = kzalloc(sizeof(*filter), GFP_KERNEL); | ||
790 | return filter; | ||
791 | } | ||
792 | |||
793 | static int __alloc_preds(struct event_filter *filter, int n_preds) | ||
794 | { | ||
595 | struct filter_pred *pred; | 795 | struct filter_pred *pred; |
596 | int i; | 796 | int i; |
597 | 797 | ||
598 | filter = kzalloc(sizeof(*filter), GFP_KERNEL); | 798 | if (filter->preds) |
599 | if (!filter) | 799 | __free_preds(filter); |
600 | return ERR_PTR(-ENOMEM); | ||
601 | 800 | ||
602 | filter->n_preds = 0; | 801 | filter->preds = |
802 | kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); | ||
603 | 803 | ||
604 | filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); | ||
605 | if (!filter->preds) | 804 | if (!filter->preds) |
606 | goto oom; | 805 | return -ENOMEM; |
607 | 806 | ||
608 | for (i = 0; i < MAX_FILTER_PRED; i++) { | 807 | filter->a_preds = n_preds; |
609 | pred = kzalloc(sizeof(*pred), GFP_KERNEL); | 808 | filter->n_preds = 0; |
610 | if (!pred) | 809 | |
611 | goto oom; | 810 | for (i = 0; i < n_preds; i++) { |
811 | pred = &filter->preds[i]; | ||
612 | pred->fn = filter_pred_none; | 812 | pred->fn = filter_pred_none; |
613 | filter->preds[i] = pred; | ||
614 | } | 813 | } |
615 | 814 | ||
616 | return filter; | ||
617 | |||
618 | oom: | ||
619 | __free_preds(filter); | ||
620 | return ERR_PTR(-ENOMEM); | ||
621 | } | ||
622 | |||
623 | static int init_preds(struct ftrace_event_call *call) | ||
624 | { | ||
625 | if (call->filter) | ||
626 | return 0; | ||
627 | |||
628 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | ||
629 | call->filter = __alloc_preds(); | ||
630 | if (IS_ERR(call->filter)) | ||
631 | return PTR_ERR(call->filter); | ||
632 | |||
633 | return 0; | 815 | return 0; |
634 | } | 816 | } |
635 | 817 | ||
636 | static int init_subsystem_preds(struct event_subsystem *system) | 818 | static void filter_free_subsystem_preds(struct event_subsystem *system) |
637 | { | 819 | { |
638 | struct ftrace_event_call *call; | 820 | struct ftrace_event_call *call; |
639 | int err; | ||
640 | 821 | ||
641 | list_for_each_entry(call, &ftrace_events, list) { | 822 | list_for_each_entry(call, &ftrace_events, list) { |
642 | if (strcmp(call->class->system, system->name) != 0) | 823 | if (strcmp(call->class->system, system->name) != 0) |
643 | continue; | 824 | continue; |
644 | 825 | ||
645 | err = init_preds(call); | 826 | filter_disable(call); |
646 | if (err) | 827 | remove_filter_string(call->filter); |
647 | return err; | ||
648 | } | 828 | } |
649 | |||
650 | return 0; | ||
651 | } | 829 | } |
652 | 830 | ||
653 | static void filter_free_subsystem_preds(struct event_subsystem *system) | 831 | static void filter_free_subsystem_filters(struct event_subsystem *system) |
654 | { | 832 | { |
655 | struct ftrace_event_call *call; | 833 | struct ftrace_event_call *call; |
656 | 834 | ||
657 | list_for_each_entry(call, &ftrace_events, list) { | 835 | list_for_each_entry(call, &ftrace_events, list) { |
658 | if (strcmp(call->class->system, system->name) != 0) | 836 | if (strcmp(call->class->system, system->name) != 0) |
659 | continue; | 837 | continue; |
660 | 838 | __free_filter(call->filter); | |
661 | filter_disable_preds(call); | 839 | call->filter = NULL; |
662 | remove_filter_string(call->filter); | ||
663 | } | 840 | } |
664 | } | 841 | } |
665 | 842 | ||
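filter_set_pred() now also links the tree, driving the pred_stack helpers above in the usual postfix-to-tree way: every leaf is pushed, every AND/OR pops two children and becomes their parent, and FILTER_PRED_FOLD is kept only while a whole subtree uses the same op. A self-contained toy version of that construction, hedged; it uses pointers and single-letter operands instead of the kernel's index-based links and fold flag, and it deliberately leaks the few nodes it allocates:

#include <stdio.h>
#include <stdlib.h>

struct node {
	char op;			/* '&', '|' or a leaf letter */
	struct node *left, *right;
};

int main(void)
{
	const char *postfix = "ab&c|";	/* (a AND b) OR c */
	struct node *stack[16];
	int top = 0;

	for (const char *p = postfix; *p; p++) {
		struct node *n = calloc(1, sizeof(*n));

		n->op = *p;
		if (*p == '&' || *p == '|') {	/* operator: pop two children */
			n->right = stack[--top];
			n->left = stack[--top];
		}
		stack[top++] = n;		/* push leaf or freshly built subtree */
	}

	/* exactly one node remains: the root, just as replace_preds() verifies */
	printf("root op: %c, left op: %c, right op: %c\n",
	       stack[0]->op, stack[0]->left->op, stack[0]->right->op);
	return 0;
}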
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, | |||
667 | struct ftrace_event_call *call, | 844 | struct ftrace_event_call *call, |
668 | struct event_filter *filter, | 845 | struct event_filter *filter, |
669 | struct filter_pred *pred, | 846 | struct filter_pred *pred, |
847 | struct pred_stack *stack, | ||
670 | filter_pred_fn_t fn) | 848 | filter_pred_fn_t fn) |
671 | { | 849 | { |
672 | int idx, err; | 850 | int idx, err; |
673 | 851 | ||
674 | if (filter->n_preds == MAX_FILTER_PRED) { | 852 | if (WARN_ON(filter->n_preds == filter->a_preds)) { |
675 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 853 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
676 | return -ENOSPC; | 854 | return -ENOSPC; |
677 | } | 855 | } |
678 | 856 | ||
679 | idx = filter->n_preds; | 857 | idx = filter->n_preds; |
680 | filter_clear_pred(filter->preds[idx]); | 858 | filter_clear_pred(&filter->preds[idx]); |
681 | err = filter_set_pred(filter->preds[idx], pred, fn); | 859 | err = filter_set_pred(filter, idx, stack, pred, fn); |
682 | if (err) | 860 | if (err) |
683 | return err; | 861 | return err; |
684 | 862 | ||
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
763 | struct ftrace_event_call *call, | 941 | struct ftrace_event_call *call, |
764 | struct event_filter *filter, | 942 | struct event_filter *filter, |
765 | struct filter_pred *pred, | 943 | struct filter_pred *pred, |
944 | struct pred_stack *stack, | ||
766 | bool dry_run) | 945 | bool dry_run) |
767 | { | 946 | { |
768 | struct ftrace_event_field *field; | 947 | struct ftrace_event_field *field; |
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
770 | unsigned long long val; | 949 | unsigned long long val; |
771 | int ret; | 950 | int ret; |
772 | 951 | ||
773 | pred->fn = filter_pred_none; | 952 | fn = pred->fn = filter_pred_none; |
774 | 953 | ||
775 | if (pred->op == OP_AND) { | 954 | if (pred->op == OP_AND) |
776 | pred->pop_n = 2; | ||
777 | fn = filter_pred_and; | ||
778 | goto add_pred_fn; | 955 | goto add_pred_fn; |
779 | } else if (pred->op == OP_OR) { | 956 | else if (pred->op == OP_OR) |
780 | pred->pop_n = 2; | ||
781 | fn = filter_pred_or; | ||
782 | goto add_pred_fn; | 957 | goto add_pred_fn; |
783 | } | ||
784 | 958 | ||
785 | field = find_event_field(call, pred->field_name); | 959 | field = find_event_field(call, pred->field_name); |
786 | if (!field) { | 960 | if (!field) { |
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
829 | 1003 | ||
830 | add_pred_fn: | 1004 | add_pred_fn: |
831 | if (!dry_run) | 1005 | if (!dry_run) |
832 | return filter_add_pred_fn(ps, call, filter, pred, fn); | 1006 | return filter_add_pred_fn(ps, call, filter, pred, stack, fn); |
833 | return 0; | 1007 | return 0; |
834 | } | 1008 | } |
835 | 1009 | ||
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps) | |||
1187 | return 0; | 1361 | return 0; |
1188 | } | 1362 | } |
1189 | 1363 | ||
1364 | static int count_preds(struct filter_parse_state *ps) | ||
1365 | { | ||
1366 | struct postfix_elt *elt; | ||
1367 | int n_preds = 0; | ||
1368 | |||
1369 | list_for_each_entry(elt, &ps->postfix, list) { | ||
1370 | if (elt->op == OP_NONE) | ||
1371 | continue; | ||
1372 | n_preds++; | ||
1373 | } | ||
1374 | |||
1375 | return n_preds; | ||
1376 | } | ||
1377 | |||
1378 | /* | ||
1379 | * The tree is walked at filtering of an event. If the tree is not correctly | ||
1380 | * built, it may cause an infinite loop. Check here that the tree does | ||
1381 | * indeed terminate. | ||
1382 | */ | ||
1383 | static int check_pred_tree(struct event_filter *filter, | ||
1384 | struct filter_pred *root) | ||
1385 | { | ||
1386 | struct filter_pred *preds; | ||
1387 | struct filter_pred *pred; | ||
1388 | enum move_type move = MOVE_DOWN; | ||
1389 | int count = 0; | ||
1390 | int done = 0; | ||
1391 | int max; | ||
1392 | |||
1393 | /* | ||
1394 | * A node can be hit at most three times: | ||
1395 | * once going down, once coming up from the left, and | ||
1396 | * once coming up from the right. This is more than enough | ||
1397 | * since leaf nodes are only hit a single time. | ||
1398 | */ | ||
1399 | max = 3 * filter->n_preds; | ||
1400 | |||
1401 | preds = filter->preds; | ||
1402 | if (!preds) | ||
1403 | return -EINVAL; | ||
1404 | pred = root; | ||
1405 | |||
1406 | do { | ||
1407 | if (WARN_ON(count++ > max)) | ||
1408 | return -EINVAL; | ||
1409 | |||
1410 | switch (move) { | ||
1411 | case MOVE_DOWN: | ||
1412 | if (pred->left != FILTER_PRED_INVALID) { | ||
1413 | pred = &preds[pred->left]; | ||
1414 | continue; | ||
1415 | } | ||
1416 | /* A leaf at the root is just a leaf in the tree */ | ||
1417 | if (pred == root) | ||
1418 | break; | ||
1419 | pred = get_pred_parent(pred, preds, | ||
1420 | pred->parent, &move); | ||
1421 | continue; | ||
1422 | case MOVE_UP_FROM_LEFT: | ||
1423 | pred = &preds[pred->right]; | ||
1424 | move = MOVE_DOWN; | ||
1425 | continue; | ||
1426 | case MOVE_UP_FROM_RIGHT: | ||
1427 | if (pred == root) | ||
1428 | break; | ||
1429 | pred = get_pred_parent(pred, preds, | ||
1430 | pred->parent, &move); | ||
1431 | continue; | ||
1432 | } | ||
1433 | done = 1; | ||
1434 | } while (!done); | ||
1435 | |||
1436 | /* We are fine. */ | ||
1437 | return 0; | ||
1438 | } | ||
1439 | |||
1440 | static int count_leafs(struct filter_pred *preds, struct filter_pred *root) | ||
1441 | { | ||
1442 | struct filter_pred *pred; | ||
1443 | enum move_type move = MOVE_DOWN; | ||
1444 | int count = 0; | ||
1445 | int done = 0; | ||
1446 | |||
1447 | pred = root; | ||
1448 | |||
1449 | do { | ||
1450 | switch (move) { | ||
1451 | case MOVE_DOWN: | ||
1452 | if (pred->left != FILTER_PRED_INVALID) { | ||
1453 | pred = &preds[pred->left]; | ||
1454 | continue; | ||
1455 | } | ||
1456 | /* A leaf at the root is just a leaf in the tree */ | ||
1457 | if (pred == root) | ||
1458 | return 1; | ||
1459 | count++; | ||
1460 | pred = get_pred_parent(pred, preds, | ||
1461 | pred->parent, &move); | ||
1462 | continue; | ||
1463 | case MOVE_UP_FROM_LEFT: | ||
1464 | pred = &preds[pred->right]; | ||
1465 | move = MOVE_DOWN; | ||
1466 | continue; | ||
1467 | case MOVE_UP_FROM_RIGHT: | ||
1468 | if (pred == root) | ||
1469 | break; | ||
1470 | pred = get_pred_parent(pred, preds, | ||
1471 | pred->parent, &move); | ||
1472 | continue; | ||
1473 | } | ||
1474 | done = 1; | ||
1475 | } while (!done); | ||
1476 | |||
1477 | return count; | ||
1478 | } | ||
1479 | |||
1480 | static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | ||
1481 | { | ||
1482 | struct filter_pred *pred; | ||
1483 | enum move_type move = MOVE_DOWN; | ||
1484 | int count = 0; | ||
1485 | int children; | ||
1486 | int done = 0; | ||
1487 | |||
1488 | /* No need to keep the fold flag */ | ||
1489 | root->index &= ~FILTER_PRED_FOLD; | ||
1490 | |||
1491 | /* If the root is a leaf then do nothing */ | ||
1492 | if (root->left == FILTER_PRED_INVALID) | ||
1493 | return 0; | ||
1494 | |||
1495 | /* count the children */ | ||
1496 | children = count_leafs(preds, &preds[root->left]); | ||
1497 | children += count_leafs(preds, &preds[root->right]); | ||
1498 | |||
1499 | root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); | ||
1500 | if (!root->ops) | ||
1501 | return -ENOMEM; | ||
1502 | |||
1503 | root->val = children; | ||
1504 | |||
1505 | pred = root; | ||
1506 | do { | ||
1507 | switch (move) { | ||
1508 | case MOVE_DOWN: | ||
1509 | if (pred->left != FILTER_PRED_INVALID) { | ||
1510 | pred = &preds[pred->left]; | ||
1511 | continue; | ||
1512 | } | ||
1513 | if (WARN_ON(count == children)) | ||
1514 | return -EINVAL; | ||
1515 | pred->index &= ~FILTER_PRED_FOLD; | ||
1516 | root->ops[count++] = pred->index; | ||
1517 | pred = get_pred_parent(pred, preds, | ||
1518 | pred->parent, &move); | ||
1519 | continue; | ||
1520 | case MOVE_UP_FROM_LEFT: | ||
1521 | pred = &preds[pred->right]; | ||
1522 | move = MOVE_DOWN; | ||
1523 | continue; | ||
1524 | case MOVE_UP_FROM_RIGHT: | ||
1525 | if (pred == root) | ||
1526 | break; | ||
1527 | pred = get_pred_parent(pred, preds, | ||
1528 | pred->parent, &move); | ||
1529 | continue; | ||
1530 | } | ||
1531 | done = 1; | ||
1532 | } while (!done); | ||
1533 | |||
1534 | return 0; | ||
1535 | } | ||
1536 | |||
1537 | /* | ||
1538 | * To optimize the processing of the ops, if we have several "ors" or | ||
1539 | * "ands" together, we can put them in an array and process them all | ||
1540 | * together speeding up the filter logic. | ||
1541 | */ | ||
1542 | static int fold_pred_tree(struct event_filter *filter, | ||
1543 | struct filter_pred *root) | ||
1544 | { | ||
1545 | struct filter_pred *preds; | ||
1546 | struct filter_pred *pred; | ||
1547 | enum move_type move = MOVE_DOWN; | ||
1548 | int done = 0; | ||
1549 | int err; | ||
1550 | |||
1551 | preds = filter->preds; | ||
1552 | if (!preds) | ||
1553 | return -EINVAL; | ||
1554 | pred = root; | ||
1555 | |||
1556 | do { | ||
1557 | switch (move) { | ||
1558 | case MOVE_DOWN: | ||
1559 | if (pred->index & FILTER_PRED_FOLD) { | ||
1560 | err = fold_pred(preds, pred); | ||
1561 | if (err) | ||
1562 | return err; | ||
1563 | /* Folded nodes are treated like leaf nodes */ | ||
1564 | } else if (pred->left != FILTER_PRED_INVALID) { | ||
1565 | pred = &preds[pred->left]; | ||
1566 | continue; | ||
1567 | } | ||
1568 | |||
1569 | /* A leaf at the root is just a leaf in the tree */ | ||
1570 | if (pred == root) | ||
1571 | break; | ||
1572 | pred = get_pred_parent(pred, preds, | ||
1573 | pred->parent, &move); | ||
1574 | continue; | ||
1575 | case MOVE_UP_FROM_LEFT: | ||
1576 | pred = &preds[pred->right]; | ||
1577 | move = MOVE_DOWN; | ||
1578 | continue; | ||
1579 | case MOVE_UP_FROM_RIGHT: | ||
1580 | if (pred == root) | ||
1581 | break; | ||
1582 | pred = get_pred_parent(pred, preds, | ||
1583 | pred->parent, &move); | ||
1584 | continue; | ||
1585 | } | ||
1586 | done = 1; | ||
1587 | } while (!done); | ||
1588 | |||
1589 | return 0; | ||
1590 | } | ||
1591 | |||
1190 | static int replace_preds(struct ftrace_event_call *call, | 1592 | static int replace_preds(struct ftrace_event_call *call, |
1191 | struct event_filter *filter, | 1593 | struct event_filter *filter, |
1192 | struct filter_parse_state *ps, | 1594 | struct filter_parse_state *ps, |
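The helpers added in this hunk validate and optimize the tree: check_pred_tree() bounds the walk at 3 * n_preds node visits so a malformed tree cannot loop forever, count_leafs() sizes the ops[] array, and fold_pred()/fold_pred_tree() flatten runs of same-op nodes so process_ops() can scan their leaves in a straight line. What that folding buys, as a small self-contained sketch of the resulting linear scan (plain userspace C, illustrative names):

#include <stdio.h>

enum { OP_AND, OP_OR };

static int process_folded(int op, const int *child_results, int n)
{
	int type = (op == OP_OR);	/* 1: stop on the first true; 0: stop on the first false */
	int match = 0;

	for (int i = 0; i < n; i++) {
		match = child_results[i];
		if (!!match == type)
			return match;	/* short circuit, as in process_ops() */
	}
	return match;
}

int main(void)
{
	int leaves[] = { 1, 1, 0, 1 };	/* results of the four leaf predicates */

	/* a && b && c && d: stops at the third leaf and returns 0 */
	printf("AND fold: %d\n", process_folded(OP_AND, leaves, 4));
	/* a || b || c || d: stops at the first leaf and returns 1 */
	printf("OR fold:  %d\n", process_folded(OP_OR, leaves, 4));
	return 0;
}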
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1195 | { | 1597 | { |
1196 | char *operand1 = NULL, *operand2 = NULL; | 1598 | char *operand1 = NULL, *operand2 = NULL; |
1197 | struct filter_pred *pred; | 1599 | struct filter_pred *pred; |
1600 | struct filter_pred *root; | ||
1198 | struct postfix_elt *elt; | 1601 | struct postfix_elt *elt; |
1602 | struct pred_stack stack = { }; /* init to NULL */ | ||
1199 | int err; | 1603 | int err; |
1200 | int n_preds = 0; | 1604 | int n_preds = 0; |
1201 | 1605 | ||
1606 | n_preds = count_preds(ps); | ||
1607 | if (n_preds >= MAX_FILTER_PRED) { | ||
1608 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | ||
1609 | return -ENOSPC; | ||
1610 | } | ||
1611 | |||
1202 | err = check_preds(ps); | 1612 | err = check_preds(ps); |
1203 | if (err) | 1613 | if (err) |
1204 | return err; | 1614 | return err; |
1205 | 1615 | ||
1616 | if (!dry_run) { | ||
1617 | err = __alloc_pred_stack(&stack, n_preds); | ||
1618 | if (err) | ||
1619 | return err; | ||
1620 | err = __alloc_preds(filter, n_preds); | ||
1621 | if (err) | ||
1622 | goto fail; | ||
1623 | } | ||
1624 | |||
1625 | n_preds = 0; | ||
1206 | list_for_each_entry(elt, &ps->postfix, list) { | 1626 | list_for_each_entry(elt, &ps->postfix, list) { |
1207 | if (elt->op == OP_NONE) { | 1627 | if (elt->op == OP_NONE) { |
1208 | if (!operand1) | 1628 | if (!operand1) |
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1211 | operand2 = elt->operand; | 1631 | operand2 = elt->operand; |
1212 | else { | 1632 | else { |
1213 | parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); | 1633 | parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); |
1214 | return -EINVAL; | 1634 | err = -EINVAL; |
1635 | goto fail; | ||
1215 | } | 1636 | } |
1216 | continue; | 1637 | continue; |
1217 | } | 1638 | } |
1218 | 1639 | ||
1219 | if (n_preds++ == MAX_FILTER_PRED) { | 1640 | if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { |
1220 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 1641 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
1221 | return -ENOSPC; | 1642 | err = -ENOSPC; |
1643 | goto fail; | ||
1222 | } | 1644 | } |
1223 | 1645 | ||
1224 | if (elt->op == OP_AND || elt->op == OP_OR) { | 1646 | if (elt->op == OP_AND || elt->op == OP_OR) { |
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1228 | 1650 | ||
1229 | if (!operand1 || !operand2) { | 1651 | if (!operand1 || !operand2) { |
1230 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); | 1652 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); |
1231 | return -EINVAL; | 1653 | err = -EINVAL; |
1654 | goto fail; | ||
1232 | } | 1655 | } |
1233 | 1656 | ||
1234 | pred = create_pred(elt->op, operand1, operand2); | 1657 | pred = create_pred(elt->op, operand1, operand2); |
1235 | add_pred: | 1658 | add_pred: |
1236 | if (!pred) | 1659 | if (!pred) { |
1237 | return -ENOMEM; | 1660 | err = -ENOMEM; |
1238 | err = filter_add_pred(ps, call, filter, pred, dry_run); | 1661 | goto fail; |
1662 | } | ||
1663 | err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); | ||
1239 | filter_free_pred(pred); | 1664 | filter_free_pred(pred); |
1240 | if (err) | 1665 | if (err) |
1241 | return err; | 1666 | goto fail; |
1242 | 1667 | ||
1243 | operand1 = operand2 = NULL; | 1668 | operand1 = operand2 = NULL; |
1244 | } | 1669 | } |
1245 | 1670 | ||
1246 | return 0; | 1671 | if (!dry_run) { |
1672 | /* We should have one item left on the stack */ | ||
1673 | pred = __pop_pred_stack(&stack); | ||
1674 | if (!pred) | ||
1675 | return -EINVAL; | ||
1676 | /* This item is where we start from in matching */ | ||
1677 | root = pred; | ||
1678 | /* Make sure the stack is empty */ | ||
1679 | pred = __pop_pred_stack(&stack); | ||
1680 | if (WARN_ON(pred)) { | ||
1681 | err = -EINVAL; | ||
1682 | filter->root = NULL; | ||
1683 | goto fail; | ||
1684 | } | ||
1685 | err = check_pred_tree(filter, root); | ||
1686 | if (err) | ||
1687 | goto fail; | ||
1688 | |||
1689 | /* Optimize the tree */ | ||
1690 | err = fold_pred_tree(filter, root); | ||
1691 | if (err) | ||
1692 | goto fail; | ||
1693 | |||
1694 | /* We don't set root until we know it works */ | ||
1695 | barrier(); | ||
1696 | filter->root = root; | ||
1697 | } | ||
1698 | |||
1699 | err = 0; | ||
1700 | fail: | ||
1701 | __free_pred_stack(&stack); | ||
1702 | return err; | ||
1247 | } | 1703 | } |
1248 | 1704 | ||
1705 | struct filter_list { | ||
1706 | struct list_head list; | ||
1707 | struct event_filter *filter; | ||
1708 | }; | ||
1709 | |||
1249 | static int replace_system_preds(struct event_subsystem *system, | 1710 | static int replace_system_preds(struct event_subsystem *system, |
1250 | struct filter_parse_state *ps, | 1711 | struct filter_parse_state *ps, |
1251 | char *filter_string) | 1712 | char *filter_string) |
1252 | { | 1713 | { |
1253 | struct ftrace_event_call *call; | 1714 | struct ftrace_event_call *call; |
1715 | struct filter_list *filter_item; | ||
1716 | struct filter_list *tmp; | ||
1717 | LIST_HEAD(filter_list); | ||
1254 | bool fail = true; | 1718 | bool fail = true; |
1255 | int err; | 1719 | int err; |
1256 | 1720 | ||
1257 | list_for_each_entry(call, &ftrace_events, list) { | 1721 | list_for_each_entry(call, &ftrace_events, list) { |
1258 | struct event_filter *filter = call->filter; | ||
1259 | 1722 | ||
1260 | if (strcmp(call->class->system, system->name) != 0) | 1723 | if (strcmp(call->class->system, system->name) != 0) |
1261 | continue; | 1724 | continue; |
1262 | 1725 | ||
1263 | /* try to see if the filter can be applied */ | 1726 | /* |
1264 | err = replace_preds(call, filter, ps, filter_string, true); | 1727 | * Try to see if the filter can be applied |
1728 | * (filter arg is ignored on dry_run) | ||
1729 | */ | ||
1730 | err = replace_preds(call, NULL, ps, filter_string, true); | ||
1265 | if (err) | 1731 | if (err) |
1732 | goto fail; | ||
1733 | } | ||
1734 | |||
1735 | list_for_each_entry(call, &ftrace_events, list) { | ||
1736 | struct event_filter *filter; | ||
1737 | |||
1738 | if (strcmp(call->class->system, system->name) != 0) | ||
1266 | continue; | 1739 | continue; |
1267 | 1740 | ||
1268 | /* really apply the filter */ | 1741 | filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); |
1269 | filter_disable_preds(call); | 1742 | if (!filter_item) |
1270 | err = replace_preds(call, filter, ps, filter_string, false); | 1743 | goto fail_mem; |
1744 | |||
1745 | list_add_tail(&filter_item->list, &filter_list); | ||
1746 | |||
1747 | filter_item->filter = __alloc_filter(); | ||
1748 | if (!filter_item->filter) | ||
1749 | goto fail_mem; | ||
1750 | filter = filter_item->filter; | ||
1751 | |||
1752 | /* Can only fail on no memory */ | ||
1753 | err = replace_filter_string(filter, filter_string); | ||
1271 | if (err) | 1754 | if (err) |
1272 | filter_disable_preds(call); | 1755 | goto fail_mem; |
1273 | else { | 1756 | |
1757 | err = replace_preds(call, filter, ps, filter_string, false); | ||
1758 | if (err) { | ||
1759 | filter_disable(call); | ||
1760 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | ||
1761 | append_filter_err(ps, filter); | ||
1762 | } else | ||
1274 | call->flags |= TRACE_EVENT_FL_FILTERED; | 1763 | call->flags |= TRACE_EVENT_FL_FILTERED; |
1275 | replace_filter_string(filter, filter_string); | 1764 | /* |
1276 | } | 1765 | * Regardless of if this returned an error, we still |
1766 | * replace the filter for the call. | ||
1767 | */ | ||
1768 | filter = call->filter; | ||
1769 | call->filter = filter_item->filter; | ||
1770 | filter_item->filter = filter; | ||
1771 | |||
1277 | fail = false; | 1772 | fail = false; |
1278 | } | 1773 | } |
1279 | 1774 | ||
1280 | if (fail) { | 1775 | if (fail) |
1281 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | 1776 | goto fail; |
1282 | return -EINVAL; | 1777 | |
1778 | /* | ||
1779 | * The calls can still be using the old filters. | ||
1780 | * Do a synchronize_sched() to ensure all calls are | ||
1781 | * done with them before we free them. | ||
1782 | */ | ||
1783 | synchronize_sched(); | ||
1784 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
1785 | __free_filter(filter_item->filter); | ||
1786 | list_del(&filter_item->list); | ||
1787 | kfree(filter_item); | ||
1283 | } | 1788 | } |
1284 | return 0; | 1789 | return 0; |
1790 | fail: | ||
1791 | /* No call succeeded */ | ||
1792 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
1793 | list_del(&filter_item->list); | ||
1794 | kfree(filter_item); | ||
1795 | } | ||
1796 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | ||
1797 | return -EINVAL; | ||
1798 | fail_mem: | ||
1799 | /* If any call succeeded, we still need to sync */ | ||
1800 | if (!fail) | ||
1801 | synchronize_sched(); | ||
1802 | list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { | ||
1803 | __free_filter(filter_item->filter); | ||
1804 | list_del(&filter_item->list); | ||
1805 | kfree(filter_item); | ||
1806 | } | ||
1807 | return -ENOMEM; | ||
1285 | } | 1808 | } |
1286 | 1809 | ||
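replace_system_preds() above builds every new filter on a private list first (the dry-run pass catches parse problems before anything is touched), then swaps each call's filter pointer and frees the old filters only after synchronize_sched(), so a tracer that is still walking an old filter with preemption disabled cannot have it freed underneath it. A stripped-down sketch of that publish-then-retire pattern, with hypothetical names, assuming readers access the pointer with preemption disabled:

#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical object guarded by the same scheme as call->filter. */
struct obj {
	int data;
};

static struct obj *current_obj;	/* readers dereference this with preemption off */

static void replace_obj(struct obj *new_obj)
{
	struct obj *old = current_obj;

	current_obj = new_obj;	/* publish the replacement */
	/*
	 * Wait for every CPU to pass through a context switch: any reader
	 * that picked up 'old' with preemption disabled has finished by now.
	 */
	synchronize_sched();
	kfree(old);		/* safe: no reader can still hold 'old' */
}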
1287 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | 1810 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) |
1288 | { | 1811 | { |
1289 | int err; | ||
1290 | struct filter_parse_state *ps; | 1812 | struct filter_parse_state *ps; |
1813 | struct event_filter *filter; | ||
1814 | struct event_filter *tmp; | ||
1815 | int err = 0; | ||
1291 | 1816 | ||
1292 | mutex_lock(&event_mutex); | 1817 | mutex_lock(&event_mutex); |
1293 | 1818 | ||
1294 | err = init_preds(call); | ||
1295 | if (err) | ||
1296 | goto out_unlock; | ||
1297 | |||
1298 | if (!strcmp(strstrip(filter_string), "0")) { | 1819 | if (!strcmp(strstrip(filter_string), "0")) { |
1299 | filter_disable_preds(call); | 1820 | filter_disable(call); |
1300 | remove_filter_string(call->filter); | 1821 | filter = call->filter; |
1822 | if (!filter) | ||
1823 | goto out_unlock; | ||
1824 | call->filter = NULL; | ||
1825 | /* Make sure the filter is not being used */ | ||
1826 | synchronize_sched(); | ||
1827 | __free_filter(filter); | ||
1301 | goto out_unlock; | 1828 | goto out_unlock; |
1302 | } | 1829 | } |
1303 | 1830 | ||
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | |||
1306 | if (!ps) | 1833 | if (!ps) |
1307 | goto out_unlock; | 1834 | goto out_unlock; |
1308 | 1835 | ||
1309 | filter_disable_preds(call); | 1836 | filter = __alloc_filter(); |
1310 | replace_filter_string(call->filter, filter_string); | 1837 | if (!filter) { |
1838 | kfree(ps); | ||
1839 | goto out_unlock; | ||
1840 | } | ||
1841 | |||
1842 | replace_filter_string(filter, filter_string); | ||
1311 | 1843 | ||
1312 | parse_init(ps, filter_ops, filter_string); | 1844 | parse_init(ps, filter_ops, filter_string); |
1313 | err = filter_parse(ps); | 1845 | err = filter_parse(ps); |
1314 | if (err) { | 1846 | if (err) { |
1315 | append_filter_err(ps, call->filter); | 1847 | append_filter_err(ps, filter); |
1316 | goto out; | 1848 | goto out; |
1317 | } | 1849 | } |
1318 | 1850 | ||
1319 | err = replace_preds(call, call->filter, ps, filter_string, false); | 1851 | err = replace_preds(call, filter, ps, filter_string, false); |
1320 | if (err) | 1852 | if (err) { |
1321 | append_filter_err(ps, call->filter); | 1853 | filter_disable(call); |
1322 | else | 1854 | append_filter_err(ps, filter); |
1855 | } else | ||
1323 | call->flags |= TRACE_EVENT_FL_FILTERED; | 1856 | call->flags |= TRACE_EVENT_FL_FILTERED; |
1324 | out: | 1857 | out: |
1858 | /* | ||
1859 | * Always swap the call filter with the new filter | ||
1860 | * even if there was an error. If there was an error | ||
1861 | * in the filter, we disable the filter and show the error | ||
1862 | * string | ||
1863 | */ | ||
1864 | tmp = call->filter; | ||
1865 | call->filter = filter; | ||
1866 | if (tmp) { | ||
1867 | /* Make sure the call is done with the filter */ | ||
1868 | synchronize_sched(); | ||
1869 | __free_filter(tmp); | ||
1870 | } | ||
1325 | filter_opstack_clear(ps); | 1871 | filter_opstack_clear(ps); |
1326 | postfix_clear(ps); | 1872 | postfix_clear(ps); |
1327 | kfree(ps); | 1873 | kfree(ps); |
@@ -1334,18 +1880,21 @@ out_unlock: | |||
1334 | int apply_subsystem_event_filter(struct event_subsystem *system, | 1880 | int apply_subsystem_event_filter(struct event_subsystem *system, |
1335 | char *filter_string) | 1881 | char *filter_string) |
1336 | { | 1882 | { |
1337 | int err; | ||
1338 | struct filter_parse_state *ps; | 1883 | struct filter_parse_state *ps; |
1884 | struct event_filter *filter; | ||
1885 | int err = 0; | ||
1339 | 1886 | ||
1340 | mutex_lock(&event_mutex); | 1887 | mutex_lock(&event_mutex); |
1341 | 1888 | ||
1342 | err = init_subsystem_preds(system); | ||
1343 | if (err) | ||
1344 | goto out_unlock; | ||
1345 | |||
1346 | if (!strcmp(strstrip(filter_string), "0")) { | 1889 | if (!strcmp(strstrip(filter_string), "0")) { |
1347 | filter_free_subsystem_preds(system); | 1890 | filter_free_subsystem_preds(system); |
1348 | remove_filter_string(system->filter); | 1891 | remove_filter_string(system->filter); |
1892 | filter = system->filter; | ||
1893 | system->filter = NULL; | ||
1894 | /* Ensure all filters are no longer used */ | ||
1895 | synchronize_sched(); | ||
1896 | filter_free_subsystem_filters(system); | ||
1897 | __free_filter(filter); | ||
1349 | goto out_unlock; | 1898 | goto out_unlock; |
1350 | } | 1899 | } |
1351 | 1900 | ||
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
1354 | if (!ps) | 1903 | if (!ps) |
1355 | goto out_unlock; | 1904 | goto out_unlock; |
1356 | 1905 | ||
1357 | replace_filter_string(system->filter, filter_string); | 1906 | filter = __alloc_filter(); |
1907 | if (!filter) | ||
1908 | goto out; | ||
1909 | |||
1910 | replace_filter_string(filter, filter_string); | ||
1911 | /* | ||
1912 | * No event actually uses the system filter | ||
1913 | * we can free it without synchronize_sched(). | ||
1914 | */ | ||
1915 | __free_filter(system->filter); | ||
1916 | system->filter = filter; | ||
1358 | 1917 | ||
1359 | parse_init(ps, filter_ops, filter_string); | 1918 | parse_init(ps, filter_ops, filter_string); |
1360 | err = filter_parse(ps); | 1919 | err = filter_parse(ps); |
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event) | |||
1384 | struct event_filter *filter = event->filter; | 1943 | struct event_filter *filter = event->filter; |
1385 | 1944 | ||
1386 | event->filter = NULL; | 1945 | event->filter = NULL; |
1387 | __free_preds(filter); | 1946 | __free_filter(filter); |
1388 | } | 1947 | } |
1389 | 1948 | ||
1390 | int ftrace_profile_set_filter(struct perf_event *event, int event_id, | 1949 | int ftrace_profile_set_filter(struct perf_event *event, int event_id, |
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1410 | if (event->filter) | 1969 | if (event->filter) |
1411 | goto out_unlock; | 1970 | goto out_unlock; |
1412 | 1971 | ||
1413 | filter = __alloc_preds(); | 1972 | filter = __alloc_filter(); |
1414 | if (IS_ERR(filter)) { | 1973 | if (!filter) { |
1415 | err = PTR_ERR(filter); | 1974 | err = PTR_ERR(filter); |
1416 | goto out_unlock; | 1975 | goto out_unlock; |
1417 | } | 1976 | } |
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1419 | err = -ENOMEM; | 1978 | err = -ENOMEM; |
1420 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | 1979 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); |
1421 | if (!ps) | 1980 | if (!ps) |
1422 | goto free_preds; | 1981 | goto free_filter; |
1423 | 1982 | ||
1424 | parse_init(ps, filter_ops, filter_str); | 1983 | parse_init(ps, filter_ops, filter_str); |
1425 | err = filter_parse(ps); | 1984 | err = filter_parse(ps); |
@@ -1435,9 +1994,9 @@ free_ps: | |||
1435 | postfix_clear(ps); | 1994 | postfix_clear(ps); |
1436 | kfree(ps); | 1995 | kfree(ps); |
1437 | 1996 | ||
1438 | free_preds: | 1997 | free_filter: |
1439 | if (err) | 1998 | if (err) |
1440 | __free_preds(filter); | 1999 | __free_filter(filter); |
1441 | 2000 | ||
1442 | out_unlock: | 2001 | out_unlock: |
1443 | mutex_unlock(&event_mutex); | 2002 | mutex_unlock(&event_mutex); |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4ba44deaac25..bbeec31e0ae3 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
83 | 83 | ||
84 | #undef __array | 84 | #undef __array |
85 | #define __array(type, item, len) \ | 85 | #define __array(type, item, len) \ |
86 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ | 86 | do { \ |
87 | ret = trace_define_field(event_call, #type "[" #len "]", #item, \ | 87 | BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ |
88 | mutex_lock(&event_storage_mutex); \ | ||
89 | snprintf(event_storage, sizeof(event_storage), \ | ||
90 | "%s[%d]", #type, len); \ | ||
91 | ret = trace_define_field(event_call, event_storage, #item, \ | ||
88 | offsetof(typeof(field), item), \ | 92 | offsetof(typeof(field), item), \ |
89 | sizeof(field.item), \ | 93 | sizeof(field.item), \ |
90 | is_signed_type(type), FILTER_OTHER); \ | 94 | is_signed_type(type), FILTER_OTHER); \ |
91 | if (ret) \ | 95 | mutex_unlock(&event_storage_mutex); \ |
92 | return ret; | 96 | if (ret) \ |
97 | return ret; \ | ||
98 | } while (0); | ||
93 | 99 | ||
94 | #undef __array_desc | 100 | #undef __array_desc |
95 | #define __array_desc(type, container, item, len) \ | 101 | #define __array_desc(type, container, item, len) \ |
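The __array() change above wraps what used to be a bare statement sequence in a do { ... } while (0) block (and routes the type string through the mutex-protected event_storage buffer). The wrapper matters whenever a macro expands to more than one statement; the following is a generic illustration with made-up names, not the tracing macro itself:

struct point {
	int a, b;
};

/* Broken: expands to two statements, so only the first one is guarded. */
#define BAD_RESET(p)	(p)->a = 0; (p)->b = 0

/* Correct: behaves like a single statement, and a caller's 'else'
 * still binds to the 'if' the programmer expects. */
#define GOOD_RESET(p)			\
	do {				\
		(p)->a = 0;		\
		(p)->b = 0;		\
	} while (0)

static void reset_if_needed(struct point *p, int cond)
{
	if (cond)
		GOOD_RESET(p);	/* with BAD_RESET this would not even compile:
				 * the 'else' below would lose its 'if' */
	else
		p->a = -1;
}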
@@ -155,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \ | |||
155 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ | 161 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ |
156 | }; \ | 162 | }; \ |
157 | \ | 163 | \ |
158 | struct ftrace_event_call __used \ | 164 | struct ftrace_event_call __used event_##call = { \ |
159 | __attribute__((__aligned__(4))) \ | ||
160 | __attribute__((section("_ftrace_events"))) event_##call = { \ | ||
161 | .name = #call, \ | 165 | .name = #call, \ |
162 | .event.type = etype, \ | 166 | .event.type = etype, \ |
163 | .class = &event_class_ftrace_##call, \ | 167 | .class = &event_class_ftrace_##call, \ |
164 | .print_fmt = print, \ | 168 | .print_fmt = print, \ |
165 | }; \ | 169 | }; \ |
170 | struct ftrace_event_call __used \ | ||
171 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | ||
166 | 172 | ||
167 | #include "trace_entries.h" | 173 | #include "trace_entries.h" |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 16aee4d44e8f..8d0e1cc4e974 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
149 | static struct ftrace_ops trace_ops __read_mostly = | 149 | static struct ftrace_ops trace_ops __read_mostly = |
150 | { | 150 | { |
151 | .func = function_trace_call, | 151 | .func = function_trace_call, |
152 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
152 | }; | 153 | }; |
153 | 154 | ||
154 | static struct ftrace_ops trace_stack_ops __read_mostly = | 155 | static struct ftrace_ops trace_stack_ops __read_mostly = |
155 | { | 156 | { |
156 | .func = function_stack_trace_call, | 157 | .func = function_stack_trace_call, |
158 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
157 | }; | 159 | }; |
158 | 160 | ||
159 | /* Our two options */ | 161 | /* Our two options */ |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6f233698518e..962cdb24ed81 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -15,15 +15,19 @@ | |||
15 | #include "trace.h" | 15 | #include "trace.h" |
16 | #include "trace_output.h" | 16 | #include "trace_output.h" |
17 | 17 | ||
18 | /* When set, irq functions will be ignored */ | ||
19 | static int ftrace_graph_skip_irqs; | ||
20 | |||
18 | struct fgraph_cpu_data { | 21 | struct fgraph_cpu_data { |
19 | pid_t last_pid; | 22 | pid_t last_pid; |
20 | int depth; | 23 | int depth; |
24 | int depth_irq; | ||
21 | int ignore; | 25 | int ignore; |
22 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; | 26 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; |
23 | }; | 27 | }; |
24 | 28 | ||
25 | struct fgraph_data { | 29 | struct fgraph_data { |
26 | struct fgraph_cpu_data *cpu_data; | 30 | struct fgraph_cpu_data __percpu *cpu_data; |
27 | 31 | ||
28 | /* Place to preserve last processed entry. */ | 32 | /* Place to preserve last processed entry. */ |
29 | struct ftrace_graph_ent_entry ent; | 33 | struct ftrace_graph_ent_entry ent; |
@@ -41,6 +45,7 @@ struct fgraph_data { | |||
41 | #define TRACE_GRAPH_PRINT_PROC 0x8 | 45 | #define TRACE_GRAPH_PRINT_PROC 0x8 |
42 | #define TRACE_GRAPH_PRINT_DURATION 0x10 | 46 | #define TRACE_GRAPH_PRINT_DURATION 0x10 |
43 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 | 47 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 |
48 | #define TRACE_GRAPH_PRINT_IRQS 0x40 | ||
44 | 49 | ||
45 | static struct tracer_opt trace_opts[] = { | 50 | static struct tracer_opt trace_opts[] = { |
46 | /* Display overruns? (for self-debug purpose) */ | 51 | /* Display overruns? (for self-debug purpose) */ |
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = { | |||
55 | { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, | 60 | { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, |
56 | /* Display absolute time of an entry */ | 61 | /* Display absolute time of an entry */ |
57 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, | 62 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, |
63 | /* Display interrupts */ | ||
64 | { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, | ||
58 | { } /* Empty entry */ | 65 | { } /* Empty entry */ |
59 | }; | 66 | }; |
60 | 67 | ||
61 | static struct tracer_flags tracer_flags = { | 68 | static struct tracer_flags tracer_flags = { |
62 | /* Don't display overruns and proc by default */ | 69 | /* Don't display overruns and proc by default */ |
63 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | | 70 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | |
64 | TRACE_GRAPH_PRINT_DURATION, | 71 | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, |
65 | .opts = trace_opts | 72 | .opts = trace_opts |
66 | }; | 73 | }; |
67 | 74 | ||
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr, | |||
204 | return 1; | 211 | return 1; |
205 | } | 212 | } |
206 | 213 | ||
214 | static inline int ftrace_graph_ignore_irqs(void) | ||
215 | { | ||
216 | if (!ftrace_graph_skip_irqs) | ||
217 | return 0; | ||
218 | |||
219 | return in_irq(); | ||
220 | } | ||
221 | |||
207 | int trace_graph_entry(struct ftrace_graph_ent *trace) | 222 | int trace_graph_entry(struct ftrace_graph_ent *trace) |
208 | { | 223 | { |
209 | struct trace_array *tr = graph_array; | 224 | struct trace_array *tr = graph_array; |
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
218 | return 0; | 233 | return 0; |
219 | 234 | ||
220 | /* trace it when it is-nested-in or is a function enabled. */ | 235 | /* trace it when it is-nested-in or is a function enabled. */ |
221 | if (!(trace->depth || ftrace_graph_addr(trace->func))) | 236 | if (!(trace->depth || ftrace_graph_addr(trace->func)) || |
237 | ftrace_graph_ignore_irqs()) | ||
222 | return 0; | 238 | return 0; |
223 | 239 | ||
224 | local_irq_save(flags); | 240 | local_irq_save(flags); |
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) | |||
246 | return trace_graph_entry(trace); | 262 | return trace_graph_entry(trace); |
247 | } | 263 | } |
248 | 264 | ||
265 | static void | ||
266 | __trace_graph_function(struct trace_array *tr, | ||
267 | unsigned long ip, unsigned long flags, int pc) | ||
268 | { | ||
269 | u64 time = trace_clock_local(); | ||
270 | struct ftrace_graph_ent ent = { | ||
271 | .func = ip, | ||
272 | .depth = 0, | ||
273 | }; | ||
274 | struct ftrace_graph_ret ret = { | ||
275 | .func = ip, | ||
276 | .depth = 0, | ||
277 | .calltime = time, | ||
278 | .rettime = time, | ||
279 | }; | ||
280 | |||
281 | __trace_graph_entry(tr, &ent, flags, pc); | ||
282 | __trace_graph_return(tr, &ret, flags, pc); | ||
283 | } | ||
284 | |||
285 | void | ||
286 | trace_graph_function(struct trace_array *tr, | ||
287 | unsigned long ip, unsigned long parent_ip, | ||
288 | unsigned long flags, int pc) | ||
289 | { | ||
290 | __trace_graph_function(tr, ip, flags, pc); | ||
291 | } | ||
292 | |||
249 | void __trace_graph_return(struct trace_array *tr, | 293 | void __trace_graph_return(struct trace_array *tr, |
250 | struct ftrace_graph_ret *trace, | 294 | struct ftrace_graph_ret *trace, |
251 | unsigned long flags, | 295 | unsigned long flags, |
@@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | |||
649 | 693 | ||
650 | /* Print nsecs (we don't want to exceed 7 numbers) */ | 694 | /* Print nsecs (we don't want to exceed 7 numbers) */ |
651 | if (len < 7) { | 695 | if (len < 7) { |
652 | snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", | 696 | size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); |
653 | nsecs_rem); | 697 | |
698 | snprintf(nsecs_str, slen, "%03lu", nsecs_rem); | ||
654 | ret = trace_seq_printf(s, ".%s", nsecs_str); | 699 | ret = trace_seq_printf(s, ".%s", nsecs_str); |
655 | if (!ret) | 700 | if (!ret) |
656 | return TRACE_TYPE_PARTIAL_LINE; | 701 | return TRACE_TYPE_PARTIAL_LINE; |
@@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | |||
855 | return 0; | 900 | return 0; |
856 | } | 901 | } |
857 | 902 | ||
903 | /* | ||
904 | * Entry check for irq code | ||
905 | * | ||
906 | * returns 1 if | ||
907 | * - we are inside irq code | ||
908 | * - we just entered irq code | ||
909 | * | ||
910 | * retunns 0 if | ||
910 | * returns 0 if | ||
911 | * - funcgraph-interrupts option is set | ||
912 | * - we are not inside irq code | ||
913 | */ | ||
914 | static int | ||
915 | check_irq_entry(struct trace_iterator *iter, u32 flags, | ||
916 | unsigned long addr, int depth) | ||
917 | { | ||
918 | int cpu = iter->cpu; | ||
919 | int *depth_irq; | ||
920 | struct fgraph_data *data = iter->private; | ||
921 | |||
922 | /* | ||
923 | * If we are either displaying irqs, or we got called as | ||
924 | * a graph event and private data does not exist, | ||
925 | * then we bypass the irq check. | ||
926 | */ | ||
927 | if ((flags & TRACE_GRAPH_PRINT_IRQS) || | ||
928 | (!data)) | ||
929 | return 0; | ||
930 | |||
931 | depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
932 | |||
933 | /* | ||
934 | * We are inside the irq code | ||
935 | */ | ||
936 | if (*depth_irq >= 0) | ||
937 | return 1; | ||
938 | |||
939 | if ((addr < (unsigned long)__irqentry_text_start) || | ||
940 | (addr >= (unsigned long)__irqentry_text_end)) | ||
941 | return 0; | ||
942 | |||
943 | /* | ||
944 | * We are entering irq code. | ||
945 | */ | ||
946 | *depth_irq = depth; | ||
947 | return 1; | ||
948 | } | ||
949 | |||
950 | /* | ||
951 | * Return check for irq code | ||
952 | * | ||
953 | * returns 1 if | ||
954 | * - we are inside irq code | ||
955 | * - we just left irq code | ||
956 | * | ||
957 | * returns 0 if | ||
958 | * - funcgraph-interrupts option is set | ||
959 | * - we are not inside irq code | ||
960 | */ | ||
961 | static int | ||
962 | check_irq_return(struct trace_iterator *iter, u32 flags, int depth) | ||
963 | { | ||
964 | int cpu = iter->cpu; | ||
965 | int *depth_irq; | ||
966 | struct fgraph_data *data = iter->private; | ||
967 | |||
968 | /* | ||
969 | * If we are either displaying irqs, or we got called as | ||
970 | * a graph event and private data does not exist, | ||
971 | * then we bypass the irq check. | ||
972 | */ | ||
973 | if ((flags & TRACE_GRAPH_PRINT_IRQS) || | ||
974 | (!data)) | ||
975 | return 0; | ||
976 | |||
977 | depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
978 | |||
979 | /* | ||
980 | * We are not inside the irq code. | ||
981 | */ | ||
982 | if (*depth_irq == -1) | ||
983 | return 0; | ||
984 | |||
985 | /* | ||
986 | * We are inside the irq code, and this is returning entry. | ||
987 | * Let's not trace it and clear the entry depth, since | ||
988 | * we are out of irq code. | ||
989 | * | ||
990 | * This condition ensures that we 'leave the irq code' once | ||
991 | * we are out of the entry depth. Thus protecting us from | ||
992 | * the RETURN entry loss. | ||
993 | */ | ||
994 | if (*depth_irq >= depth) { | ||
995 | *depth_irq = -1; | ||
996 | return 1; | ||
997 | } | ||
998 | |||
999 | /* | ||
1000 | * We are inside the irq code, and this is not the entry. | ||
1001 | */ | ||
1002 | return 1; | ||
1003 | } | ||
1004 | |||
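check_irq_entry()/check_irq_return() above implement a small per-CPU state machine around depth_irq: -1 means we are outside irq code, any other value records the call depth at which __irqentry_text was entered, and every deeper event is hidden until a return at or above that depth clears the state. A compact model of that logic, using hypothetical names rather than the tracer's own types:

/* Sketch of the depth_irq bookkeeping behind the funcgraph-irqs option. */
struct irq_hide {
	int depth_irq;	/* -1: outside irq code; >= 0: depth where irq code began */
};

/* Returns non-zero when the entry event should be hidden. */
static int hide_entry(struct irq_hide *h, int depth, int addr_in_irqentry_text)
{
	if (h->depth_irq >= 0)
		return 1;		/* already inside irq code */
	if (!addr_in_irqentry_text)
		return 0;		/* ordinary function: print it */
	h->depth_irq = depth;		/* entering irq code: remember the depth */
	return 1;
}

/* Returns non-zero when the return event should be hidden. */
static int hide_return(struct irq_hide *h, int depth)
{
	if (h->depth_irq == -1)
		return 0;		/* not in irq code */
	if (h->depth_irq >= depth)
		h->depth_irq = -1;	/* returning at or above the entry depth */
	return 1;
}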
858 | static enum print_line_t | 1005 | static enum print_line_t |
859 | print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, | 1006 | print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, |
860 | struct trace_iterator *iter, u32 flags) | 1007 | struct trace_iterator *iter, u32 flags) |
@@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, | |||
865 | static enum print_line_t ret; | 1012 | static enum print_line_t ret; |
866 | int cpu = iter->cpu; | 1013 | int cpu = iter->cpu; |
867 | 1014 | ||
1015 | if (check_irq_entry(iter, flags, call->func, call->depth)) | ||
1016 | return TRACE_TYPE_HANDLED; | ||
1017 | |||
868 | if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) | 1018 | if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) |
869 | return TRACE_TYPE_PARTIAL_LINE; | 1019 | return TRACE_TYPE_PARTIAL_LINE; |
870 | 1020 | ||
@@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
902 | int ret; | 1052 | int ret; |
903 | int i; | 1053 | int i; |
904 | 1054 | ||
1055 | if (check_irq_return(iter, flags, trace->depth)) | ||
1056 | return TRACE_TYPE_HANDLED; | ||
1057 | |||
905 | if (data) { | 1058 | if (data) { |
906 | struct fgraph_cpu_data *cpu_data; | 1059 | struct fgraph_cpu_data *cpu_data; |
907 | int cpu = iter->cpu; | 1060 | int cpu = iter->cpu; |
@@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
1054 | 1207 | ||
1055 | 1208 | ||
1056 | enum print_line_t | 1209 | enum print_line_t |
1057 | print_graph_function_flags(struct trace_iterator *iter, u32 flags) | 1210 | __print_graph_function_flags(struct trace_iterator *iter, u32 flags) |
1058 | { | 1211 | { |
1059 | struct ftrace_graph_ent_entry *field; | 1212 | struct ftrace_graph_ent_entry *field; |
1060 | struct fgraph_data *data = iter->private; | 1213 | struct fgraph_data *data = iter->private; |
@@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) | |||
1117 | static enum print_line_t | 1270 | static enum print_line_t |
1118 | print_graph_function(struct trace_iterator *iter) | 1271 | print_graph_function(struct trace_iterator *iter) |
1119 | { | 1272 | { |
1120 | return print_graph_function_flags(iter, tracer_flags.val); | 1273 | return __print_graph_function_flags(iter, tracer_flags.val); |
1274 | } | ||
1275 | |||
1276 | enum print_line_t print_graph_function_flags(struct trace_iterator *iter, | ||
1277 | u32 flags) | ||
1278 | { | ||
1279 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
1280 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
1281 | else | ||
1282 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
1283 | |||
1284 | return __print_graph_function_flags(iter, flags); | ||
1121 | } | 1285 | } |
1122 | 1286 | ||
1123 | static enum print_line_t | 1287 | static enum print_line_t |
@@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) | |||
1149 | seq_printf(s, "#%.*s|||| / \n", size, spaces); | 1313 | seq_printf(s, "#%.*s|||| / \n", size, spaces); |
1150 | } | 1314 | } |
1151 | 1315 | ||
1152 | void print_graph_headers_flags(struct seq_file *s, u32 flags) | 1316 | static void __print_graph_headers_flags(struct seq_file *s, u32 flags) |
1153 | { | 1317 | { |
1154 | int lat = trace_flags & TRACE_ITER_LATENCY_FMT; | 1318 | int lat = trace_flags & TRACE_ITER_LATENCY_FMT; |
1155 | 1319 | ||
@@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s) | |||
1190 | print_graph_headers_flags(s, tracer_flags.val); | 1354 | print_graph_headers_flags(s, tracer_flags.val); |
1191 | } | 1355 | } |
1192 | 1356 | ||
1357 | void print_graph_headers_flags(struct seq_file *s, u32 flags) | ||
1358 | { | ||
1359 | struct trace_iterator *iter = s->private; | ||
1360 | |||
1361 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | ||
1362 | /* print nothing if the buffers are empty */ | ||
1363 | if (trace_empty(iter)) | ||
1364 | return; | ||
1365 | |||
1366 | print_trace_header(s, iter); | ||
1367 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
1368 | } else | ||
1369 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
1370 | |||
1371 | __print_graph_headers_flags(s, flags); | ||
1372 | } | ||
1373 | |||
1193 | void graph_trace_open(struct trace_iterator *iter) | 1374 | void graph_trace_open(struct trace_iterator *iter) |
1194 | { | 1375 | { |
1195 | /* pid and depth on the last trace processed */ | 1376 | /* pid and depth on the last trace processed */ |
@@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter) | |||
1210 | pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); | 1391 | pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); |
1211 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | 1392 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); |
1212 | int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); | 1393 | int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); |
1394 | int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
1395 | |||
1213 | *pid = -1; | 1396 | *pid = -1; |
1214 | *depth = 0; | 1397 | *depth = 0; |
1215 | *ignore = 0; | 1398 | *ignore = 0; |
1399 | *depth_irq = -1; | ||
1216 | } | 1400 | } |
1217 | 1401 | ||
1218 | iter->private = data; | 1402 | iter->private = data; |
@@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter) | |||
1235 | } | 1419 | } |
1236 | } | 1420 | } |
1237 | 1421 | ||
1422 | static int func_graph_set_flag(u32 old_flags, u32 bit, int set) | ||
1423 | { | ||
1424 | if (bit == TRACE_GRAPH_PRINT_IRQS) | ||
1425 | ftrace_graph_skip_irqs = !set; | ||
1426 | |||
1427 | return 0; | ||
1428 | } | ||
1429 | |||
1238 | static struct trace_event_functions graph_functions = { | 1430 | static struct trace_event_functions graph_functions = { |
1239 | .trace = print_graph_function_event, | 1431 | .trace = print_graph_function_event, |
1240 | }; | 1432 | }; |
@@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = { | |||
1261 | .print_line = print_graph_function, | 1453 | .print_line = print_graph_function, |
1262 | .print_header = print_graph_headers, | 1454 | .print_header = print_graph_headers, |
1263 | .flags = &tracer_flags, | 1455 | .flags = &tracer_flags, |
1456 | .set_flag = func_graph_set_flag, | ||
1264 | #ifdef CONFIG_FTRACE_SELFTEST | 1457 | #ifdef CONFIG_FTRACE_SELFTEST |
1265 | .selftest = trace_selftest_startup_function_graph, | 1458 | .selftest = trace_selftest_startup_function_graph, |
1266 | #endif | 1459 | #endif |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 73a6b0601f2e..c77424be284d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -80,21 +80,29 @@ static struct tracer_flags tracer_flags = { | |||
80 | * skip the latency if the sequence has changed - some other section | 80 | * skip the latency if the sequence has changed - some other section |
81 | * did a maximum and could disturb our measurement with serial console | 81 | * did a maximum and could disturb our measurement with serial console |
82 | * printouts, etc. Truly coinciding maximum latencies should be rare | 82 | * printouts, etc. Truly coinciding maximum latencies should be rare |
83 | * and what happens together happens separately as well, so this doesnt | 83 | * and what happens together happens separately as well, so this doesn't |
84 | * decrease the validity of the maximum found: | 84 | * decrease the validity of the maximum found: |
85 | */ | 85 | */ |
86 | static __cacheline_aligned_in_smp unsigned long max_sequence; | 86 | static __cacheline_aligned_in_smp unsigned long max_sequence; |
87 | 87 | ||
88 | #ifdef CONFIG_FUNCTION_TRACER | 88 | #ifdef CONFIG_FUNCTION_TRACER |
89 | /* | 89 | /* |
90 | * irqsoff uses its own tracer function to keep the overhead down: | 90 | * Prologue for the preempt and irqs off function tracers. |
91 | * | ||
92 | * Returns 1 if it is OK to continue, and data->disabled is | ||
93 | * incremented. | ||
94 | * 0 if the trace is to be ignored, and data->disabled | ||
95 | * is kept the same. | ||
96 | * | ||
97 | * Note, this function is also used outside this ifdef but | ||
98 | * inside the #ifdef of the function graph tracer below. | ||
99 | * This is OK, since the function graph tracer is | ||
100 | * dependent on the function tracer. | ||
91 | */ | 101 | */ |
92 | static void | 102 | static int func_prolog_dec(struct trace_array *tr, |
93 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | 103 | struct trace_array_cpu **data, |
104 | unsigned long *flags) | ||
94 | { | 105 | { |
95 | struct trace_array *tr = irqsoff_trace; | ||
96 | struct trace_array_cpu *data; | ||
97 | unsigned long flags; | ||
98 | long disabled; | 106 | long disabled; |
99 | int cpu; | 107 | int cpu; |
100 | 108 | ||
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
106 | */ | 114 | */ |
107 | cpu = raw_smp_processor_id(); | 115 | cpu = raw_smp_processor_id(); |
108 | if (likely(!per_cpu(tracing_cpu, cpu))) | 116 | if (likely(!per_cpu(tracing_cpu, cpu))) |
109 | return; | 117 | return 0; |
110 | 118 | ||
111 | local_save_flags(flags); | 119 | local_save_flags(*flags); |
112 | /* slight chance to get a false positive on tracing_cpu */ | 120 | /* slight chance to get a false positive on tracing_cpu */ |
113 | if (!irqs_disabled_flags(flags)) | 121 | if (!irqs_disabled_flags(*flags)) |
114 | return; | 122 | return 0; |
115 | 123 | ||
116 | data = tr->data[cpu]; | 124 | *data = tr->data[cpu]; |
117 | disabled = atomic_inc_return(&data->disabled); | 125 | disabled = atomic_inc_return(&(*data)->disabled); |
118 | 126 | ||
119 | if (likely(disabled == 1)) | 127 | if (likely(disabled == 1)) |
120 | trace_function(tr, ip, parent_ip, flags, preempt_count()); | 128 | return 1; |
129 | |||
130 | atomic_dec(&(*data)->disabled); | ||
131 | |||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * irqsoff uses its own tracer function to keep the overhead down: | ||
137 | */ | ||
138 | static void | ||
139 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | ||
140 | { | ||
141 | struct trace_array *tr = irqsoff_trace; | ||
142 | struct trace_array_cpu *data; | ||
143 | unsigned long flags; | ||
144 | |||
145 | if (!func_prolog_dec(tr, &data, &flags)) | ||
146 | return; | ||
147 | |||
148 | trace_function(tr, ip, parent_ip, flags, preempt_count()); | ||
121 | 149 | ||
122 | atomic_dec(&data->disabled); | 150 | atomic_dec(&data->disabled); |
123 | } | 151 | } |
@@ -125,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
125 | static struct ftrace_ops trace_ops __read_mostly = | 153 | static struct ftrace_ops trace_ops __read_mostly = |
126 | { | 154 | { |
127 | .func = irqsoff_tracer_call, | 155 | .func = irqsoff_tracer_call, |
156 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
128 | }; | 157 | }; |
129 | #endif /* CONFIG_FUNCTION_TRACER */ | 158 | #endif /* CONFIG_FUNCTION_TRACER */ |
130 | 159 | ||
@@ -155,30 +184,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) | |||
155 | struct trace_array *tr = irqsoff_trace; | 184 | struct trace_array *tr = irqsoff_trace; |
156 | struct trace_array_cpu *data; | 185 | struct trace_array_cpu *data; |
157 | unsigned long flags; | 186 | unsigned long flags; |
158 | long disabled; | ||
159 | int ret; | 187 | int ret; |
160 | int cpu; | ||
161 | int pc; | 188 | int pc; |
162 | 189 | ||
163 | cpu = raw_smp_processor_id(); | 190 | if (!func_prolog_dec(tr, &data, &flags)) |
164 | if (likely(!per_cpu(tracing_cpu, cpu))) | ||
165 | return 0; | ||
166 | |||
167 | local_save_flags(flags); | ||
168 | /* slight chance to get a false positive on tracing_cpu */ | ||
169 | if (!irqs_disabled_flags(flags)) | ||
170 | return 0; | 191 | return 0; |
171 | 192 | ||
172 | data = tr->data[cpu]; | 193 | pc = preempt_count(); |
173 | disabled = atomic_inc_return(&data->disabled); | 194 | ret = __trace_graph_entry(tr, trace, flags, pc); |
174 | |||
175 | if (likely(disabled == 1)) { | ||
176 | pc = preempt_count(); | ||
177 | ret = __trace_graph_entry(tr, trace, flags, pc); | ||
178 | } else | ||
179 | ret = 0; | ||
180 | |||
181 | atomic_dec(&data->disabled); | 195 | atomic_dec(&data->disabled); |
196 | |||
182 | return ret; | 197 | return ret; |
183 | } | 198 | } |
184 | 199 | ||
@@ -187,27 +202,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) | |||
187 | struct trace_array *tr = irqsoff_trace; | 202 | struct trace_array *tr = irqsoff_trace; |
188 | struct trace_array_cpu *data; | 203 | struct trace_array_cpu *data; |
189 | unsigned long flags; | 204 | unsigned long flags; |
190 | long disabled; | ||
191 | int cpu; | ||
192 | int pc; | 205 | int pc; |
193 | 206 | ||
194 | cpu = raw_smp_processor_id(); | 207 | if (!func_prolog_dec(tr, &data, &flags)) |
195 | if (likely(!per_cpu(tracing_cpu, cpu))) | ||
196 | return; | ||
197 | |||
198 | local_save_flags(flags); | ||
199 | /* slight chance to get a false positive on tracing_cpu */ | ||
200 | if (!irqs_disabled_flags(flags)) | ||
201 | return; | 208 | return; |
202 | 209 | ||
203 | data = tr->data[cpu]; | 210 | pc = preempt_count(); |
204 | disabled = atomic_inc_return(&data->disabled); | 211 | __trace_graph_return(tr, trace, flags, pc); |
205 | |||
206 | if (likely(disabled == 1)) { | ||
207 | pc = preempt_count(); | ||
208 | __trace_graph_return(tr, trace, flags, pc); | ||
209 | } | ||
210 | |||
211 | atomic_dec(&data->disabled); | 212 | atomic_dec(&data->disabled); |
212 | } | 213 | } |
213 | 214 | ||
@@ -229,75 +230,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) | |||
229 | 230 | ||
230 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) | 231 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) |
231 | { | 232 | { |
232 | u32 flags = GRAPH_TRACER_FLAGS; | ||
233 | |||
234 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
235 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
236 | else | ||
237 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
238 | |||
239 | /* | 233 | /* |
240 | * In graph mode call the graph tracer output function, | 234 | * In graph mode call the graph tracer output function, |
241 | * otherwise go with the TRACE_FN event handler | 235 | * otherwise go with the TRACE_FN event handler |
242 | */ | 236 | */ |
243 | if (is_graph()) | 237 | if (is_graph()) |
244 | return print_graph_function_flags(iter, flags); | 238 | return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); |
245 | 239 | ||
246 | return TRACE_TYPE_UNHANDLED; | 240 | return TRACE_TYPE_UNHANDLED; |
247 | } | 241 | } |
248 | 242 | ||
249 | static void irqsoff_print_header(struct seq_file *s) | 243 | static void irqsoff_print_header(struct seq_file *s) |
250 | { | 244 | { |
251 | if (is_graph()) { | 245 | if (is_graph()) |
252 | struct trace_iterator *iter = s->private; | 246 | print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); |
253 | u32 flags = GRAPH_TRACER_FLAGS; | 247 | else |
254 | |||
255 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | ||
256 | /* print nothing if the buffers are empty */ | ||
257 | if (trace_empty(iter)) | ||
258 | return; | ||
259 | |||
260 | print_trace_header(s, iter); | ||
261 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
262 | } else | ||
263 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
264 | |||
265 | print_graph_headers_flags(s, flags); | ||
266 | } else | ||
267 | trace_default_header(s); | 248 | trace_default_header(s); |
268 | } | 249 | } |
269 | 250 | ||
270 | static void | 251 | static void |
271 | trace_graph_function(struct trace_array *tr, | ||
272 | unsigned long ip, unsigned long flags, int pc) | ||
273 | { | ||
274 | u64 time = trace_clock_local(); | ||
275 | struct ftrace_graph_ent ent = { | ||
276 | .func = ip, | ||
277 | .depth = 0, | ||
278 | }; | ||
279 | struct ftrace_graph_ret ret = { | ||
280 | .func = ip, | ||
281 | .depth = 0, | ||
282 | .calltime = time, | ||
283 | .rettime = time, | ||
284 | }; | ||
285 | |||
286 | __trace_graph_entry(tr, &ent, flags, pc); | ||
287 | __trace_graph_return(tr, &ret, flags, pc); | ||
288 | } | ||
289 | |||
290 | static void | ||
291 | __trace_function(struct trace_array *tr, | 252 | __trace_function(struct trace_array *tr, |
292 | unsigned long ip, unsigned long parent_ip, | 253 | unsigned long ip, unsigned long parent_ip, |
293 | unsigned long flags, int pc) | 254 | unsigned long flags, int pc) |
294 | { | 255 | { |
295 | if (!is_graph()) | 256 | if (is_graph()) |
257 | trace_graph_function(tr, ip, parent_ip, flags, pc); | ||
258 | else | ||
296 | trace_function(tr, ip, parent_ip, flags, pc); | 259 | trace_function(tr, ip, parent_ip, flags, pc); |
297 | else { | ||
298 | trace_graph_function(tr, parent_ip, flags, pc); | ||
299 | trace_graph_function(tr, ip, flags, pc); | ||
300 | } | ||
301 | } | 260 | } |
302 | 261 | ||
303 | #else | 262 | #else |
@@ -495,14 +454,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) | |||
495 | * Stubs: | 454 | * Stubs: |
496 | */ | 455 | */ |
497 | 456 | ||
498 | void early_boot_irqs_off(void) | ||
499 | { | ||
500 | } | ||
501 | |||
502 | void early_boot_irqs_on(void) | ||
503 | { | ||
504 | } | ||
505 | |||
506 | void trace_softirqs_on(unsigned long ip) | 457 | void trace_softirqs_on(unsigned long ip) |
507 | { | 458 | { |
508 | } | 459 | } |
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 7b8ecd751d93..3c5c5dfea0b3 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c | |||
@@ -13,7 +13,6 @@ | |||
13 | #include <linux/kdb.h> | 13 | #include <linux/kdb.h> |
14 | #include <linux/ftrace.h> | 14 | #include <linux/ftrace.h> |
15 | 15 | ||
16 | #include "../debug/kdb/kdb_private.h" | ||
17 | #include "trace.h" | 16 | #include "trace.h" |
18 | #include "trace_output.h" | 17 | #include "trace_output.h" |
19 | 18 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 544301d29dee..27d13b36b8be 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -31,7 +31,6 @@ | |||
31 | #include <linux/perf_event.h> | 31 | #include <linux/perf_event.h> |
32 | #include <linux/stringify.h> | 32 | #include <linux/stringify.h> |
33 | #include <linux/limits.h> | 33 | #include <linux/limits.h> |
34 | #include <linux/uaccess.h> | ||
35 | #include <asm/bitsperlong.h> | 34 | #include <asm/bitsperlong.h> |
36 | 35 | ||
37 | #include "trace.h" | 36 | #include "trace.h" |
@@ -54,7 +53,6 @@ const char *reserved_field_names[] = { | |||
54 | "common_preempt_count", | 53 | "common_preempt_count", |
55 | "common_pid", | 54 | "common_pid", |
56 | "common_tgid", | 55 | "common_tgid", |
57 | "common_lock_depth", | ||
58 | FIELD_STRING_IP, | 56 | FIELD_STRING_IP, |
59 | FIELD_STRING_RETIP, | 57 | FIELD_STRING_RETIP, |
60 | FIELD_STRING_FUNC, | 58 | FIELD_STRING_FUNC, |
@@ -354,6 +352,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | |||
354 | kfree(data); | 352 | kfree(data); |
355 | } | 353 | } |
356 | 354 | ||
355 | /* Bitfield fetch function */ | ||
356 | struct bitfield_fetch_param { | ||
357 | struct fetch_param orig; | ||
358 | unsigned char hi_shift; | ||
359 | unsigned char low_shift; | ||
360 | }; | ||
361 | |||
362 | #define DEFINE_FETCH_bitfield(type) \ | ||
363 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | ||
364 | void *data, void *dest) \ | ||
365 | { \ | ||
366 | struct bitfield_fetch_param *bprm = data; \ | ||
367 | type buf = 0; \ | ||
368 | call_fetch(&bprm->orig, regs, &buf); \ | ||
369 | if (buf) { \ | ||
370 | buf <<= bprm->hi_shift; \ | ||
371 | buf >>= bprm->low_shift; \ | ||
372 | } \ | ||
373 | *(type *)dest = buf; \ | ||
374 | } | ||
375 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | ||
376 | #define fetch_bitfield_string NULL | ||
377 | #define fetch_bitfield_string_size NULL | ||
378 | |||
379 | static __kprobes void | ||
380 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
381 | { | ||
382 | /* | ||
383 | * Don't check the bitfield itself, because this must be the | ||
384 | * last fetch function. | ||
385 | */ | ||
386 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
387 | free_deref_fetch_param(data->orig.data); | ||
388 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
389 | free_symbol_cache(data->orig.data); | ||
390 | kfree(data); | ||
391 | } | ||
357 | /* Default (unsigned long) fetch type */ | 392 | /* Default (unsigned long) fetch type */ |
358 | #define __DEFAULT_FETCH_TYPE(t) u##t | 393 | #define __DEFAULT_FETCH_TYPE(t) u##t |
359 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | 394 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) |
@@ -368,6 +403,7 @@ enum { | |||
368 | FETCH_MTD_memory, | 403 | FETCH_MTD_memory, |
369 | FETCH_MTD_symbol, | 404 | FETCH_MTD_symbol, |
370 | FETCH_MTD_deref, | 405 | FETCH_MTD_deref, |
406 | FETCH_MTD_bitfield, | ||
371 | FETCH_MTD_END, | 407 | FETCH_MTD_END, |
372 | }; | 408 | }; |
373 | 409 | ||
@@ -388,6 +424,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \ | |||
388 | ASSIGN_FETCH_FUNC(memory, ftype), \ | 424 | ASSIGN_FETCH_FUNC(memory, ftype), \ |
389 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | 425 | ASSIGN_FETCH_FUNC(symbol, ftype), \ |
390 | ASSIGN_FETCH_FUNC(deref, ftype), \ | 426 | ASSIGN_FETCH_FUNC(deref, ftype), \ |
427 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
391 | } \ | 428 | } \ |
392 | } | 429 | } |
393 | 430 | ||
@@ -431,9 +468,33 @@ static const struct fetch_type *find_fetch_type(const char *type) | |||
431 | if (!type) | 468 | if (!type) |
432 | type = DEFAULT_FETCH_TYPE_STR; | 469 | type = DEFAULT_FETCH_TYPE_STR; |
433 | 470 | ||
471 | /* Special case: bitfield */ | ||
472 | if (*type == 'b') { | ||
473 | unsigned long bs; | ||
474 | type = strchr(type, '/'); | ||
475 | if (!type) | ||
476 | goto fail; | ||
477 | type++; | ||
478 | if (strict_strtoul(type, 0, &bs)) | ||
479 | goto fail; | ||
480 | switch (bs) { | ||
481 | case 8: | ||
482 | return find_fetch_type("u8"); | ||
483 | case 16: | ||
484 | return find_fetch_type("u16"); | ||
485 | case 32: | ||
486 | return find_fetch_type("u32"); | ||
487 | case 64: | ||
488 | return find_fetch_type("u64"); | ||
489 | default: | ||
490 | goto fail; | ||
491 | } | ||
492 | } | ||
493 | |||
434 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | 494 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) |
435 | if (strcmp(type, fetch_type_table[i].name) == 0) | 495 | if (strcmp(type, fetch_type_table[i].name) == 0) |
436 | return &fetch_type_table[i]; | 496 | return &fetch_type_table[i]; |
497 | fail: | ||
437 | return NULL; | 498 | return NULL; |
438 | } | 499 | } |
439 | 500 | ||
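The special case above resolves a bitfield type spec by its container size alone; the width and offset are handled later by __parse_bitfield_probe_arg(). A few example inputs and the lookup each reduces to, using the spec form b<bit-width>@<bit-offset>/<container-size> implied by the parsing code:

/*
 * "b1@0/8"    -> find_fetch_type("u8")
 * "b4@12/32"  -> find_fetch_type("u32")
 * "b7@3/64"   -> find_fetch_type("u64")
 * "b4@12/24"  -> NULL  (24 is not an accepted container size)
 * "b4@12"     -> NULL  (no '/', so the container size is missing)
 */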
@@ -587,7 +648,9 @@ error: | |||
587 | 648 | ||
588 | static void free_probe_arg(struct probe_arg *arg) | 649 | static void free_probe_arg(struct probe_arg *arg) |
589 | { | 650 | { |
590 | if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | 651 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) |
652 | free_bitfield_fetch_param(arg->fetch.data); | ||
653 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
591 | free_deref_fetch_param(arg->fetch.data); | 654 | free_deref_fetch_param(arg->fetch.data); |
592 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | 655 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) |
593 | free_symbol_cache(arg->fetch.data); | 656 | free_symbol_cache(arg->fetch.data); |
@@ -648,7 +711,7 @@ static int register_trace_probe(struct trace_probe *tp) | |||
648 | } | 711 | } |
649 | ret = register_probe_event(tp); | 712 | ret = register_probe_event(tp); |
650 | if (ret) { | 713 | if (ret) { |
651 | pr_warning("Faild to register probe event(%d)\n", ret); | 714 | pr_warning("Failed to register probe event(%d)\n", ret); |
652 | goto end; | 715 | goto end; |
653 | } | 716 | } |
654 | 717 | ||
@@ -768,16 +831,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
768 | } | 831 | } |
769 | break; | 832 | break; |
770 | case '+': /* deref memory */ | 833 | case '+': /* deref memory */ |
834 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | ||
771 | case '-': | 835 | case '-': |
772 | tmp = strchr(arg, '('); | 836 | tmp = strchr(arg, '('); |
773 | if (!tmp) | 837 | if (!tmp) |
774 | break; | 838 | break; |
775 | *tmp = '\0'; | 839 | *tmp = '\0'; |
776 | ret = strict_strtol(arg + 1, 0, &offset); | 840 | ret = strict_strtol(arg, 0, &offset); |
777 | if (ret) | 841 | if (ret) |
778 | break; | 842 | break; |
779 | if (arg[0] == '-') | ||
780 | offset = -offset; | ||
781 | arg = tmp + 1; | 843 | arg = tmp + 1; |
782 | tmp = strrchr(arg, ')'); | 844 | tmp = strrchr(arg, ')'); |
783 | if (tmp) { | 845 | if (tmp) { |
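The '+' case in the hunk above now just skips the sign and falls through to the '-' case, so strict_strtol() sees either "8(...)" or "-8(...)" directly instead of the old parse-then-negate dance (the added comment notes that strict_strtol() rejects a leading '+'). A quick userspace check of the same idea, with strtol() standing in for the kernel helper:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *plus = "+8(%sp)", *minus = "-8(%sp)";
	char *end;
	long a, b;

	a = strtol(plus + 1, &end, 0);	/* skip '+': parses 8, end points at "(%sp)" */
	b = strtol(minus, &end, 0);	/* a leading '-' is handled directly: -8 */
	printf("%ld %ld\n", a, b);	/* prints "8 -8" */
	return 0;
}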
@@ -808,6 +870,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
808 | return ret; | 870 | return ret; |
809 | } | 871 | } |
810 | 872 | ||
873 | #define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) | ||
874 | |||
875 | /* Bitfield type needs to be parsed into a fetch function */ | ||
876 | static int __parse_bitfield_probe_arg(const char *bf, | ||
877 | const struct fetch_type *t, | ||
878 | struct fetch_param *f) | ||
879 | { | ||
880 | struct bitfield_fetch_param *bprm; | ||
881 | unsigned long bw, bo; | ||
882 | char *tail; | ||
883 | |||
884 | if (*bf != 'b') | ||
885 | return 0; | ||
886 | |||
887 | bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); | ||
888 | if (!bprm) | ||
889 | return -ENOMEM; | ||
890 | bprm->orig = *f; | ||
891 | f->fn = t->fetch[FETCH_MTD_bitfield]; | ||
892 | f->data = (void *)bprm; | ||
893 | |||
894 | bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ | ||
895 | if (bw == 0 || *tail != '@') | ||
896 | return -EINVAL; | ||
897 | |||
898 | bf = tail + 1; | ||
899 | bo = simple_strtoul(bf, &tail, 0); | ||
900 | if (tail == bf || *tail != '/') | ||
901 | return -EINVAL; | ||
902 | |||
903 | bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); | ||
904 | bprm->low_shift = bprm->hi_shift + bo; | ||
905 | return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; | ||
906 | } | ||
907 | |||
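The shift arithmetic above is easiest to see with numbers. For a hypothetical "b4@12/32" argument (bit width bw = 4, bit offset bo = 12, 32-bit container), BYTES_TO_BITS(4) works out to 32, so hi_shift = 32 - (4 + 12) = 16 and low_shift = 16 + 12 = 28; the fetch function's two shifts then isolate bits 12..15 of the container, zero-extended down to bit 0:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t buf = 0x0000f000;	/* bits 12..15 set in the 32-bit container */

	buf <<= 16;			/* hi_shift:  0xf0000000, field now at the top */
	buf >>= 28;			/* low_shift: 0x0000000f, field down at bit 0 */
	printf("%#x\n", buf);		/* prints 0xf */
	return 0;
}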
811 | /* String length checking wrapper */ | 908 | /* String length checking wrapper */ |
812 | static int parse_probe_arg(char *arg, struct trace_probe *tp, | 909 | static int parse_probe_arg(char *arg, struct trace_probe *tp, |
813 | struct probe_arg *parg, int is_return) | 910 | struct probe_arg *parg, int is_return) |
@@ -837,6 +934,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, | |||
837 | parg->offset = tp->size; | 934 | parg->offset = tp->size; |
838 | tp->size += parg->type->size; | 935 | tp->size += parg->type->size; |
839 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); | 936 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); |
937 | if (ret >= 0 && t != NULL) | ||
938 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | ||
840 | if (ret >= 0) { | 939 | if (ret >= 0) { |
841 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | 940 | parg->fetch_size.fn = get_fetch_size_function(parg->type, |
842 | parg->fetch.fn); | 941 | parg->fetch.fn); |
@@ -1131,7 +1230,7 @@ static int command_trace_probe(const char *buf) | |||
1131 | return ret; | 1230 | return ret; |
1132 | } | 1231 | } |
1133 | 1232 | ||
1134 | #define WRITE_BUFSIZE 128 | 1233 | #define WRITE_BUFSIZE 4096 |
1135 | 1234 | ||
1136 | static ssize_t probes_write(struct file *file, const char __user *buffer, | 1235 | static ssize_t probes_write(struct file *file, const char __user *buffer, |
1137 | size_t count, loff_t *ppos) | 1236 | size_t count, loff_t *ppos) |
@@ -1739,7 +1838,7 @@ static void unregister_probe_event(struct trace_probe *tp) | |||
1739 | kfree(tp->call.print_fmt); | 1838 | kfree(tp->call.print_fmt); |
1740 | } | 1839 | } |
1741 | 1840 | ||
1742 | /* Make a debugfs interface for controling probe points */ | 1841 | /* Make a debugfs interface for controlling probe points */ |
1743 | static __init int init_kprobe_trace(void) | 1842 | static __init int init_kprobe_trace(void) |
1744 | { | 1843 | { |
1745 | struct dentry *d_tracer; | 1844 | struct dentry *d_tracer; |
@@ -1771,8 +1870,12 @@ fs_initcall(init_kprobe_trace); | |||
1771 | 1870 | ||
1772 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 1871 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
1773 | 1872 | ||
1774 | static int kprobe_trace_selftest_target(int a1, int a2, int a3, | 1873 | /* |
1775 | int a4, int a5, int a6) | 1874 | * The "__used" keeps gcc from removing the function symbol |
1875 | * from the kallsyms table. | ||
1876 | */ | ||
1877 | static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, | ||
1878 | int a4, int a5, int a6) | ||
1776 | { | 1879 | { |
1777 | return a1 + a2 + a3 + a4 + a5 + a6; | 1880 | return a1 + a2 + a3 + a4 + a5 + a6; |
1778 | } | 1881 | } |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272baa2206..e37de492a9e1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, | |||
353 | } | 353 | } |
354 | EXPORT_SYMBOL(ftrace_print_symbols_seq); | 354 | EXPORT_SYMBOL(ftrace_print_symbols_seq); |
355 | 355 | ||
356 | #if BITS_PER_LONG == 32 | ||
357 | const char * | ||
358 | ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, | ||
359 | const struct trace_print_flags_u64 *symbol_array) | ||
360 | { | ||
361 | int i; | ||
362 | const char *ret = p->buffer + p->len; | ||
363 | |||
364 | for (i = 0; symbol_array[i].name; i++) { | ||
365 | |||
366 | if (val != symbol_array[i].mask) | ||
367 | continue; | ||
368 | |||
369 | trace_seq_puts(p, symbol_array[i].name); | ||
370 | break; | ||
371 | } | ||
372 | |||
373 | if (!p->len) | ||
374 | trace_seq_printf(p, "0x%llx", val); | ||
375 | |||
376 | trace_seq_putc(p, 0); | ||
377 | |||
378 | return ret; | ||
379 | } | ||
380 | EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); | ||
381 | #endif | ||
382 | |||
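ftrace_print_symbols_seq_u64() above gives 32-bit kernels a way to map a full 64-bit value to a symbolic name. A hedged sketch of the table it expects, inferred from the loop above (it compares .mask and stops at a NULL .name); the table and symbol names here are hypothetical:

/* Hypothetical symbol table for ftrace_print_symbols_seq_u64(). */
static const struct trace_print_flags_u64 example_symbols[] = {
	{ .mask = 0x0000000000000001ULL, .name = "EXIT_A" },
	{ .mask = 0x0000000100000000ULL, .name = "EXIT_B" },	/* needs the _u64 variant on 32-bit */
	{ .name = NULL }					/* terminator */
};

/* usage (sketch): ftrace_print_symbols_seq_u64(p, val, example_symbols); */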
356 | const char * | 383 | const char * |
357 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) | 384 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) |
358 | { | 385 | { |
@@ -529,24 +556,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) | |||
529 | * @entry: The trace entry field from the ring buffer | 556 | * @entry: The trace entry field from the ring buffer |
530 | * | 557 | * |
531 | * Prints the generic fields of irqs off, in hard or softirq, preempt | 558 | * Prints the generic fields of irqs off, in hard or softirq, preempt |
532 | * count and lock depth. | 559 | * count. |
533 | */ | 560 | */ |
534 | int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | 561 | int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) |
535 | { | 562 | { |
536 | int hardirq, softirq; | 563 | char hardsoft_irq; |
564 | char need_resched; | ||
565 | char irqs_off; | ||
566 | int hardirq; | ||
567 | int softirq; | ||
537 | int ret; | 568 | int ret; |
538 | 569 | ||
539 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; | 570 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; |
540 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; | 571 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; |
541 | 572 | ||
573 | irqs_off = | ||
574 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | ||
575 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : | ||
576 | '.'; | ||
577 | need_resched = | ||
578 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; | ||
579 | hardsoft_irq = | ||
580 | (hardirq && softirq) ? 'H' : | ||
581 | hardirq ? 'h' : | ||
582 | softirq ? 's' : | ||
583 | '.'; | ||
584 | |||
542 | if (!trace_seq_printf(s, "%c%c%c", | 585 | if (!trace_seq_printf(s, "%c%c%c", |
543 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | 586 | irqs_off, need_resched, hardsoft_irq)) |
544 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? | ||
545 | 'X' : '.', | ||
546 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? | ||
547 | 'N' : '.', | ||
548 | (hardirq && softirq) ? 'H' : | ||
549 | hardirq ? 'h' : softirq ? 's' : '.')) | ||
550 | return 0; | 587 | return 0; |
551 | 588 | ||
552 | if (entry->preempt_count) | 589 | if (entry->preempt_count) |
@@ -554,13 +591,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
554 | else | 591 | else |
555 | ret = trace_seq_putc(s, '.'); | 592 | ret = trace_seq_putc(s, '.'); |
556 | 593 | ||
557 | if (!ret) | 594 | return ret; |
558 | return 0; | ||
559 | |||
560 | if (entry->lock_depth < 0) | ||
561 | return trace_seq_putc(s, '.'); | ||
562 | |||
563 | return trace_seq_printf(s, "%d", entry->lock_depth); | ||
564 | } | 595 | } |
565 | 596 | ||
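The rewrite above only restructures how the three flag characters are computed (and drops the lock-depth column); together with the preempt-count character that follows, the latency field still reads the same way. An annotated example, assuming a non-zero preempt count of 1:

/*
 * Example latency field (one trace-line prefix), read column by column:
 *
 *   d N h 1
 *   | | | +-- preempt_count ('.' when zero)
 *   | | +---- 'h' hardirq, 's' softirq, 'H' both, '.' neither
 *   | +------ 'N' need-resched set, '.' otherwise
 *   +-------- 'd' irqs off, 'X' irq state not supported, '.' irqs on
 */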
566 | static int | 597 | static int |
@@ -826,6 +857,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); | |||
826 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, | 857 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, |
827 | struct trace_event *event) | 858 | struct trace_event *event) |
828 | { | 859 | { |
860 | if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) | ||
861 | return TRACE_TYPE_PARTIAL_LINE; | ||
862 | |||
829 | return TRACE_TYPE_HANDLED; | 863 | return TRACE_TYPE_HANDLED; |
830 | } | 864 | } |
831 | 865 | ||
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2547d8813cf0..1f06468a10d7 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex); | |||
32 | 32 | ||
33 | struct trace_bprintk_fmt { | 33 | struct trace_bprintk_fmt { |
34 | struct list_head list; | 34 | struct list_head list; |
35 | char fmt[0]; | 35 | const char *fmt; |
36 | }; | 36 | }; |
37 | 37 | ||
38 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) | 38 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) |
@@ -49,6 +49,7 @@ static | |||
49 | void hold_module_trace_bprintk_format(const char **start, const char **end) | 49 | void hold_module_trace_bprintk_format(const char **start, const char **end) |
50 | { | 50 | { |
51 | const char **iter; | 51 | const char **iter; |
52 | char *fmt; | ||
52 | 53 | ||
53 | mutex_lock(&btrace_mutex); | 54 | mutex_lock(&btrace_mutex); |
54 | for (iter = start; iter < end; iter++) { | 55 | for (iter = start; iter < end; iter++) { |
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
58 | continue; | 59 | continue; |
59 | } | 60 | } |
60 | 61 | ||
61 | tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) | 62 | tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); |
62 | + strlen(*iter) + 1, GFP_KERNEL); | 63 | if (tb_fmt) |
63 | if (tb_fmt) { | 64 | fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); |
65 | if (tb_fmt && fmt) { | ||
64 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); | 66 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); |
65 | strcpy(tb_fmt->fmt, *iter); | 67 | strcpy(fmt, *iter); |
68 | tb_fmt->fmt = fmt; | ||
66 | *iter = tb_fmt->fmt; | 69 | *iter = tb_fmt->fmt; |
67 | } else | 70 | } else { |
71 | kfree(tb_fmt); | ||
68 | *iter = NULL; | 72 | *iter = NULL; |
73 | } | ||
69 | } | 74 | } |
70 | mutex_unlock(&btrace_mutex); | 75 | mutex_unlock(&btrace_mutex); |
71 | } | 76 | } |
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, | |||
84 | return 0; | 89 | return 0; |
85 | } | 90 | } |
86 | 91 | ||
92 | /* | ||
93 | * The debugfs/tracing/printk_formats file maps the addresses with | ||
94 | * the ASCII formats that are used in the bprintk events in the | ||
95 | * buffer. For userspace tools to be able to decode the events from | ||
96 | * the buffer, they need to be able to map the address with the format. | ||
97 | * | ||
98 | * The addresses of the bprintk formats are in their own section | ||
99 | * __trace_printk_fmt. But for modules we copy them into a linked list. | ||
100 | * The code to print the formats and their addresses passes around the | ||
101 | * address of the fmt string. If the fmt address passed into the seq | ||
102 | * functions is within the kernel core __trace_printk_fmt section, then | ||
103 | * it simply uses the next pointer in the list. | ||
104 | * | ||
105 | * When the fmt pointer is outside the kernel core __trace_printk_fmt | ||
106 | * section, then we need to read the linked list pointers. The trick is | ||
107 | * we pass the address of the string to the seq function just like | ||
108 | * we do for the kernel core formats. To get back the structure that | ||
109 | * holds the format, we simply use container_of() and then go to the | ||
110 | * next format in the list. | ||
111 | */ | ||
112 | static const char ** | ||
113 | find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) | ||
114 | { | ||
115 | struct trace_bprintk_fmt *mod_fmt; | ||
116 | |||
117 | if (list_empty(&trace_bprintk_fmt_list)) | ||
118 | return NULL; | ||
119 | |||
120 | /* | ||
121 | * v will point to the address of the fmt record from t_next | ||
122 | * v will be NULL from t_start. | ||
123 | * If this is the first pointer or called from start | ||
124 | * then we need to walk the list. | ||
125 | */ | ||
126 | if (!v || start_index == *pos) { | ||
127 | struct trace_bprintk_fmt *p; | ||
128 | |||
129 | /* search the module list */ | ||
130 | list_for_each_entry(p, &trace_bprintk_fmt_list, list) { | ||
131 | if (start_index == *pos) | ||
132 | return &p->fmt; | ||
133 | start_index++; | ||
134 | } | ||
135 | /* pos > index */ | ||
136 | return NULL; | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * v points to the address of the fmt field in the mod list | ||
141 | * structure that holds the module print format. | ||
142 | */ | ||
143 | mod_fmt = container_of(v, typeof(*mod_fmt), fmt); | ||
144 | if (mod_fmt->list.next == &trace_bprintk_fmt_list) | ||
145 | return NULL; | ||
146 | |||
147 | mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); | ||
148 | |||
149 | return &mod_fmt->fmt; | ||
150 | } | ||
151 | |||
152 | static void format_mod_start(void) | ||
153 | { | ||
154 | mutex_lock(&btrace_mutex); | ||
155 | } | ||
156 | |||
157 | static void format_mod_stop(void) | ||
158 | { | ||
159 | mutex_unlock(&btrace_mutex); | ||
160 | } | ||
161 | |||
87 | #else /* !CONFIG_MODULES */ | 162 | #else /* !CONFIG_MODULES */ |
88 | __init static int | 163 | __init static int |
89 | module_trace_bprintk_format_notify(struct notifier_block *self, | 164 | module_trace_bprintk_format_notify(struct notifier_block *self, |
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self, | |||
91 | { | 166 | { |
92 | return 0; | 167 | return 0; |
93 | } | 168 | } |
169 | static inline const char ** | ||
170 | find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) | ||
171 | { | ||
172 | return NULL; | ||
173 | } | ||
174 | static inline void format_mod_start(void) { } | ||
175 | static inline void format_mod_stop(void) { } | ||
94 | #endif /* CONFIG_MODULES */ | 176 | #endif /* CONFIG_MODULES */ |
95 | 177 | ||
96 | 178 | ||
@@ -153,20 +235,30 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) | |||
153 | } | 235 | } |
154 | EXPORT_SYMBOL_GPL(__ftrace_vprintk); | 236 | EXPORT_SYMBOL_GPL(__ftrace_vprintk); |
155 | 237 | ||
238 | static const char **find_next(void *v, loff_t *pos) | ||
239 | { | ||
240 | const char **fmt = v; | ||
241 | int start_index; | ||
242 | |||
243 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; | ||
244 | |||
245 | if (*pos < start_index) | ||
246 | return __start___trace_bprintk_fmt + *pos; | ||
247 | |||
248 | return find_next_mod_format(start_index, v, fmt, pos); | ||
249 | } | ||
250 | |||
156 | static void * | 251 | static void * |
157 | t_start(struct seq_file *m, loff_t *pos) | 252 | t_start(struct seq_file *m, loff_t *pos) |
158 | { | 253 | { |
159 | const char **fmt = __start___trace_bprintk_fmt + *pos; | 254 | format_mod_start(); |
160 | 255 | return find_next(NULL, pos); | |
161 | if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) | ||
162 | return NULL; | ||
163 | return fmt; | ||
164 | } | 256 | } |
165 | 257 | ||
166 | static void *t_next(struct seq_file *m, void * v, loff_t *pos) | 258 | static void *t_next(struct seq_file *m, void * v, loff_t *pos) |
167 | { | 259 | { |
168 | (*pos)++; | 260 | (*pos)++; |
169 | return t_start(m, pos); | 261 | return find_next(v, pos); |
170 | } | 262 | } |
171 | 263 | ||
172 | static int t_show(struct seq_file *m, void *v) | 264 | static int t_show(struct seq_file *m, void *v) |
@@ -205,6 +297,7 @@ static int t_show(struct seq_file *m, void *v) | |||
205 | 297 | ||
206 | static void t_stop(struct seq_file *m, void *p) | 298 | static void t_stop(struct seq_file *m, void *p) |
207 | { | 299 | { |
300 | format_mod_stop(); | ||
208 | } | 301 | } |
209 | 302 | ||
210 | static const struct seq_operations show_format_seq_ops = { | 303 | static const struct seq_operations show_format_seq_ops = { |
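The comment block added to trace_printk.c hinges on one trick: the seq iterator always hands out the address of a fmt string pointer, and for module formats the owning structure, and from it the next list entry, is recovered with container_of(). A small userspace sketch of that pattern, with the kernel's list_head replaced by a plain next pointer and every name illustrative:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fmt_entry {
        struct fmt_entry *next;
        const char *fmt;
};

/* Given the address of one entry's fmt field, step to the next entry's fmt field. */
static const char **next_fmt(const char **v)
{
        struct fmt_entry *e = container_of(v, struct fmt_entry, fmt);

        return e->next ? &e->next->fmt : NULL;
}

int main(void)
{
        struct fmt_entry b = { NULL, "format B" };
        struct fmt_entry a = { &b, "format A" };
        const char **it;

        for (it = &a.fmt; it; it = next_fmt(it))
                printf("%p : %s\n", (void *)it, *it);
        return 0;
}

Returning &entry->fmt rather than the entry itself is what lets core-kernel formats (plain section pointers) and module formats (list members) flow through the same t_show() path.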
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d070c43..7e62c0a18456 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) | |||
247 | ctx_trace = tr; | 247 | ctx_trace = tr; |
248 | } | 248 | } |
249 | 249 | ||
250 | static void stop_sched_trace(struct trace_array *tr) | ||
251 | { | ||
252 | tracing_stop_sched_switch_record(); | ||
253 | } | ||
254 | |||
255 | static int sched_switch_trace_init(struct trace_array *tr) | ||
256 | { | ||
257 | ctx_trace = tr; | ||
258 | tracing_reset_online_cpus(tr); | ||
259 | tracing_start_sched_switch_record(); | ||
260 | return 0; | ||
261 | } | ||
262 | |||
263 | static void sched_switch_trace_reset(struct trace_array *tr) | ||
264 | { | ||
265 | if (sched_ref) | ||
266 | stop_sched_trace(tr); | ||
267 | } | ||
268 | |||
269 | static void sched_switch_trace_start(struct trace_array *tr) | ||
270 | { | ||
271 | sched_stopped = 0; | ||
272 | } | ||
273 | |||
274 | static void sched_switch_trace_stop(struct trace_array *tr) | ||
275 | { | ||
276 | sched_stopped = 1; | ||
277 | } | ||
278 | |||
279 | static struct tracer sched_switch_trace __read_mostly = | ||
280 | { | ||
281 | .name = "sched_switch", | ||
282 | .init = sched_switch_trace_init, | ||
283 | .reset = sched_switch_trace_reset, | ||
284 | .start = sched_switch_trace_start, | ||
285 | .stop = sched_switch_trace_stop, | ||
286 | .wait_pipe = poll_wait_pipe, | ||
287 | #ifdef CONFIG_FTRACE_SELFTEST | ||
288 | .selftest = trace_selftest_startup_sched_switch, | ||
289 | #endif | ||
290 | }; | ||
291 | |||
292 | __init static int init_sched_switch_trace(void) | ||
293 | { | ||
294 | return register_tracer(&sched_switch_trace); | ||
295 | } | ||
296 | device_initcall(init_sched_switch_trace); | ||
297 | |||
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4086eae6e81b..f029dd4fd2ca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -31,57 +31,258 @@ static int wakeup_rt; | |||
31 | static arch_spinlock_t wakeup_lock = | 31 | static arch_spinlock_t wakeup_lock = |
32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
33 | 33 | ||
34 | static void wakeup_reset(struct trace_array *tr); | ||
34 | static void __wakeup_reset(struct trace_array *tr); | 35 | static void __wakeup_reset(struct trace_array *tr); |
36 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace); | ||
37 | static void wakeup_graph_return(struct ftrace_graph_ret *trace); | ||
35 | 38 | ||
36 | static int save_lat_flag; | 39 | static int save_lat_flag; |
37 | 40 | ||
41 | #define TRACE_DISPLAY_GRAPH 1 | ||
42 | |||
43 | static struct tracer_opt trace_opts[] = { | ||
44 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
45 | /* display latency trace as call graph */ | ||
46 | { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, | ||
47 | #endif | ||
48 | { } /* Empty entry */ | ||
49 | }; | ||
50 | |||
51 | static struct tracer_flags tracer_flags = { | ||
52 | .val = 0, | ||
53 | .opts = trace_opts, | ||
54 | }; | ||
55 | |||
56 | #define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) | ||
57 | |||
38 | #ifdef CONFIG_FUNCTION_TRACER | 58 | #ifdef CONFIG_FUNCTION_TRACER |
59 | |||
39 | /* | 60 | /* |
40 | * irqsoff uses its own tracer function to keep the overhead down: | 61 | * Prologue for the wakeup function tracers. |
62 | * | ||
63 | * Returns 1 if it is OK to continue, and preemption | ||
64 | * is disabled and data->disabled is incremented. | ||
65 | * 0 if the trace is to be ignored, and preemption | ||
66 | * is not disabled and data->disabled is | ||
67 | * kept the same. | ||
68 | * | ||
69 | * Note, this function is also used outside this ifdef but | ||
70 | * inside the #ifdef of the function graph tracer below. | ||
71 | * This is OK, since the function graph tracer is | ||
72 | * dependent on the function tracer. | ||
41 | */ | 73 | */ |
42 | static void | 74 | static int |
43 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | 75 | func_prolog_preempt_disable(struct trace_array *tr, |
76 | struct trace_array_cpu **data, | ||
77 | int *pc) | ||
44 | { | 78 | { |
45 | struct trace_array *tr = wakeup_trace; | ||
46 | struct trace_array_cpu *data; | ||
47 | unsigned long flags; | ||
48 | long disabled; | 79 | long disabled; |
49 | int cpu; | 80 | int cpu; |
50 | int pc; | ||
51 | 81 | ||
52 | if (likely(!wakeup_task)) | 82 | if (likely(!wakeup_task)) |
53 | return; | 83 | return 0; |
54 | 84 | ||
55 | pc = preempt_count(); | 85 | *pc = preempt_count(); |
56 | preempt_disable_notrace(); | 86 | preempt_disable_notrace(); |
57 | 87 | ||
58 | cpu = raw_smp_processor_id(); | 88 | cpu = raw_smp_processor_id(); |
59 | if (cpu != wakeup_current_cpu) | 89 | if (cpu != wakeup_current_cpu) |
60 | goto out_enable; | 90 | goto out_enable; |
61 | 91 | ||
62 | data = tr->data[cpu]; | 92 | *data = tr->data[cpu]; |
63 | disabled = atomic_inc_return(&data->disabled); | 93 | disabled = atomic_inc_return(&(*data)->disabled); |
64 | if (unlikely(disabled != 1)) | 94 | if (unlikely(disabled != 1)) |
65 | goto out; | 95 | goto out; |
66 | 96 | ||
67 | local_irq_save(flags); | 97 | return 1; |
68 | 98 | ||
69 | trace_function(tr, ip, parent_ip, flags, pc); | 99 | out: |
100 | atomic_dec(&(*data)->disabled); | ||
101 | |||
102 | out_enable: | ||
103 | preempt_enable_notrace(); | ||
104 | return 0; | ||
105 | } | ||
70 | 106 | ||
107 | /* | ||
108 | * wakeup uses its own tracer function to keep the overhead down: | ||
109 | */ | ||
110 | static void | ||
111 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | ||
112 | { | ||
113 | struct trace_array *tr = wakeup_trace; | ||
114 | struct trace_array_cpu *data; | ||
115 | unsigned long flags; | ||
116 | int pc; | ||
117 | |||
118 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
119 | return; | ||
120 | |||
121 | local_irq_save(flags); | ||
122 | trace_function(tr, ip, parent_ip, flags, pc); | ||
71 | local_irq_restore(flags); | 123 | local_irq_restore(flags); |
72 | 124 | ||
73 | out: | ||
74 | atomic_dec(&data->disabled); | 125 | atomic_dec(&data->disabled); |
75 | out_enable: | ||
76 | preempt_enable_notrace(); | 126 | preempt_enable_notrace(); |
77 | } | 127 | } |
78 | 128 | ||
79 | static struct ftrace_ops trace_ops __read_mostly = | 129 | static struct ftrace_ops trace_ops __read_mostly = |
80 | { | 130 | { |
81 | .func = wakeup_tracer_call, | 131 | .func = wakeup_tracer_call, |
132 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
82 | }; | 133 | }; |
83 | #endif /* CONFIG_FUNCTION_TRACER */ | 134 | #endif /* CONFIG_FUNCTION_TRACER */ |
84 | 135 | ||
136 | static int start_func_tracer(int graph) | ||
137 | { | ||
138 | int ret; | ||
139 | |||
140 | if (!graph) | ||
141 | ret = register_ftrace_function(&trace_ops); | ||
142 | else | ||
143 | ret = register_ftrace_graph(&wakeup_graph_return, | ||
144 | &wakeup_graph_entry); | ||
145 | |||
146 | if (!ret && tracing_is_enabled()) | ||
147 | tracer_enabled = 1; | ||
148 | else | ||
149 | tracer_enabled = 0; | ||
150 | |||
151 | return ret; | ||
152 | } | ||
153 | |||
154 | static void stop_func_tracer(int graph) | ||
155 | { | ||
156 | tracer_enabled = 0; | ||
157 | |||
158 | if (!graph) | ||
159 | unregister_ftrace_function(&trace_ops); | ||
160 | else | ||
161 | unregister_ftrace_graph(); | ||
162 | } | ||
163 | |||
164 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
165 | static int wakeup_set_flag(u32 old_flags, u32 bit, int set) | ||
166 | { | ||
167 | |||
168 | if (!(bit & TRACE_DISPLAY_GRAPH)) | ||
169 | return -EINVAL; | ||
170 | |||
171 | if (!(is_graph() ^ set)) | ||
172 | return 0; | ||
173 | |||
174 | stop_func_tracer(!set); | ||
175 | |||
176 | wakeup_reset(wakeup_trace); | ||
177 | tracing_max_latency = 0; | ||
178 | |||
179 | return start_func_tracer(set); | ||
180 | } | ||
181 | |||
182 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) | ||
183 | { | ||
184 | struct trace_array *tr = wakeup_trace; | ||
185 | struct trace_array_cpu *data; | ||
186 | unsigned long flags; | ||
187 | int pc, ret = 0; | ||
188 | |||
189 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
190 | return 0; | ||
191 | |||
192 | local_save_flags(flags); | ||
193 | ret = __trace_graph_entry(tr, trace, flags, pc); | ||
194 | atomic_dec(&data->disabled); | ||
195 | preempt_enable_notrace(); | ||
196 | |||
197 | return ret; | ||
198 | } | ||
199 | |||
200 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) | ||
201 | { | ||
202 | struct trace_array *tr = wakeup_trace; | ||
203 | struct trace_array_cpu *data; | ||
204 | unsigned long flags; | ||
205 | int pc; | ||
206 | |||
207 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
208 | return; | ||
209 | |||
210 | local_save_flags(flags); | ||
211 | __trace_graph_return(tr, trace, flags, pc); | ||
212 | atomic_dec(&data->disabled); | ||
213 | |||
214 | preempt_enable_notrace(); | ||
215 | return; | ||
216 | } | ||
217 | |||
218 | static void wakeup_trace_open(struct trace_iterator *iter) | ||
219 | { | ||
220 | if (is_graph()) | ||
221 | graph_trace_open(iter); | ||
222 | } | ||
223 | |||
224 | static void wakeup_trace_close(struct trace_iterator *iter) | ||
225 | { | ||
226 | if (iter->private) | ||
227 | graph_trace_close(iter); | ||
228 | } | ||
229 | |||
230 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) | ||
231 | |||
232 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | ||
233 | { | ||
234 | /* | ||
235 | * In graph mode call the graph tracer output function, | ||
236 | * otherwise go with the TRACE_FN event handler | ||
237 | */ | ||
238 | if (is_graph()) | ||
239 | return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); | ||
240 | |||
241 | return TRACE_TYPE_UNHANDLED; | ||
242 | } | ||
243 | |||
244 | static void wakeup_print_header(struct seq_file *s) | ||
245 | { | ||
246 | if (is_graph()) | ||
247 | print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); | ||
248 | else | ||
249 | trace_default_header(s); | ||
250 | } | ||
251 | |||
252 | static void | ||
253 | __trace_function(struct trace_array *tr, | ||
254 | unsigned long ip, unsigned long parent_ip, | ||
255 | unsigned long flags, int pc) | ||
256 | { | ||
257 | if (is_graph()) | ||
258 | trace_graph_function(tr, ip, parent_ip, flags, pc); | ||
259 | else | ||
260 | trace_function(tr, ip, parent_ip, flags, pc); | ||
261 | } | ||
262 | #else | ||
263 | #define __trace_function trace_function | ||
264 | |||
265 | static int wakeup_set_flag(u32 old_flags, u32 bit, int set) | ||
266 | { | ||
267 | return -EINVAL; | ||
268 | } | ||
269 | |||
270 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) | ||
271 | { | ||
272 | return -1; | ||
273 | } | ||
274 | |||
275 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | ||
276 | { | ||
277 | return TRACE_TYPE_UNHANDLED; | ||
278 | } | ||
279 | |||
280 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } | ||
281 | static void wakeup_print_header(struct seq_file *s) { } | ||
282 | static void wakeup_trace_open(struct trace_iterator *iter) { } | ||
283 | static void wakeup_trace_close(struct trace_iterator *iter) { } | ||
284 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | ||
285 | |||
85 | /* | 286 | /* |
86 | * Should this new latency be reported/recorded? | 287 | * Should this new latency be reported/recorded? |
87 | */ | 288 | */ |
@@ -152,7 +353,7 @@ probe_wakeup_sched_switch(void *ignore, | |||
152 | /* The task we are waiting for is waking up */ | 353 | /* The task we are waiting for is waking up */ |
153 | data = wakeup_trace->data[wakeup_cpu]; | 354 | data = wakeup_trace->data[wakeup_cpu]; |
154 | 355 | ||
155 | trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); | 356 | __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); |
156 | tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); | 357 | tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); |
157 | 358 | ||
158 | T0 = data->preempt_timestamp; | 359 | T0 = data->preempt_timestamp; |
@@ -252,7 +453,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
252 | * is not called by an assembly function (whereas schedule is) | 453 | * is not called by an assembly function (whereas schedule is) |
253 | * it should be safe to use it here. | 454 | * it should be safe to use it here. |
254 | */ | 455 | */ |
255 | trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); | 456 | __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); |
256 | 457 | ||
257 | out_locked: | 458 | out_locked: |
258 | arch_spin_unlock(&wakeup_lock); | 459 | arch_spin_unlock(&wakeup_lock); |
@@ -303,12 +504,8 @@ static void start_wakeup_tracer(struct trace_array *tr) | |||
303 | */ | 504 | */ |
304 | smp_wmb(); | 505 | smp_wmb(); |
305 | 506 | ||
306 | register_ftrace_function(&trace_ops); | 507 | if (start_func_tracer(is_graph())) |
307 | 508 | printk(KERN_ERR "failed to start wakeup tracer\n"); | |
308 | if (tracing_is_enabled()) | ||
309 | tracer_enabled = 1; | ||
310 | else | ||
311 | tracer_enabled = 0; | ||
312 | 509 | ||
313 | return; | 510 | return; |
314 | fail_deprobe_wake_new: | 511 | fail_deprobe_wake_new: |
@@ -320,7 +517,7 @@ fail_deprobe: | |||
320 | static void stop_wakeup_tracer(struct trace_array *tr) | 517 | static void stop_wakeup_tracer(struct trace_array *tr) |
321 | { | 518 | { |
322 | tracer_enabled = 0; | 519 | tracer_enabled = 0; |
323 | unregister_ftrace_function(&trace_ops); | 520 | stop_func_tracer(is_graph()); |
324 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); | 521 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); |
325 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); | 522 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); |
326 | unregister_trace_sched_wakeup(probe_wakeup, NULL); | 523 | unregister_trace_sched_wakeup(probe_wakeup, NULL); |
@@ -379,9 +576,15 @@ static struct tracer wakeup_tracer __read_mostly = | |||
379 | .start = wakeup_tracer_start, | 576 | .start = wakeup_tracer_start, |
380 | .stop = wakeup_tracer_stop, | 577 | .stop = wakeup_tracer_stop, |
381 | .print_max = 1, | 578 | .print_max = 1, |
579 | .print_header = wakeup_print_header, | ||
580 | .print_line = wakeup_print_line, | ||
581 | .flags = &tracer_flags, | ||
582 | .set_flag = wakeup_set_flag, | ||
382 | #ifdef CONFIG_FTRACE_SELFTEST | 583 | #ifdef CONFIG_FTRACE_SELFTEST |
383 | .selftest = trace_selftest_startup_wakeup, | 584 | .selftest = trace_selftest_startup_wakeup, |
384 | #endif | 585 | #endif |
586 | .open = wakeup_trace_open, | ||
587 | .close = wakeup_trace_close, | ||
385 | .use_max_tr = 1, | 588 | .use_max_tr = 1, |
386 | }; | 589 | }; |
387 | 590 | ||
@@ -394,9 +597,15 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
394 | .stop = wakeup_tracer_stop, | 597 | .stop = wakeup_tracer_stop, |
395 | .wait_pipe = poll_wait_pipe, | 598 | .wait_pipe = poll_wait_pipe, |
396 | .print_max = 1, | 599 | .print_max = 1, |
600 | .print_header = wakeup_print_header, | ||
601 | .print_line = wakeup_print_line, | ||
602 | .flags = &tracer_flags, | ||
603 | .set_flag = wakeup_set_flag, | ||
397 | #ifdef CONFIG_FTRACE_SELFTEST | 604 | #ifdef CONFIG_FTRACE_SELFTEST |
398 | .selftest = trace_selftest_startup_wakeup, | 605 | .selftest = trace_selftest_startup_wakeup, |
399 | #endif | 606 | #endif |
607 | .open = wakeup_trace_open, | ||
608 | .close = wakeup_trace_close, | ||
400 | .use_max_tr = 1, | 609 | .use_max_tr = 1, |
401 | }; | 610 | }; |
402 | 611 | ||
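func_prolog_preempt_disable() gives the wakeup function and graph callbacks a single entry check: when it returns 1, preemption is disabled and the per-CPU disabled counter has been raised, and the caller must undo both once it has written its trace entry. A hedged userspace sketch of that caller contract, where plain counters stand in for preempt_disable_notrace() and atomic_inc(); none of these names are kernel API:

#include <stdio.h>

static int disabled_count;      /* stands in for data->disabled */
static int preempt_depth;       /* stands in for the preempt count */

/* Returns 1 with "preemption" off and the disabled counter raised; 0 otherwise. */
static int prolog(int active)
{
        if (!active)                            /* the likely(!wakeup_task) case */
                return 0;

        preempt_depth++;                        /* preempt_disable_notrace() */
        if (++disabled_count != 1) {            /* nesting/recursion guard */
                disabled_count--;
                preempt_depth--;
                return 0;
        }
        return 1;
}

static void trace_event(int active, const char *what)
{
        if (!prolog(active))
                return;

        printf("traced: %s\n", what);           /* the trace_function() work */

        disabled_count--;                       /* caller undoes the prolog */
        preempt_depth--;
}

int main(void)
{
        trace_event(1, "wakeup");
        trace_event(0, "ignored");
        return 0;
}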
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415b3209..288541f977fb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) | |||
101 | 101 | ||
102 | #ifdef CONFIG_DYNAMIC_FTRACE | 102 | #ifdef CONFIG_DYNAMIC_FTRACE |
103 | 103 | ||
104 | static int trace_selftest_test_probe1_cnt; | ||
105 | static void trace_selftest_test_probe1_func(unsigned long ip, | ||
106 | unsigned long pip) | ||
107 | { | ||
108 | trace_selftest_test_probe1_cnt++; | ||
109 | } | ||
110 | |||
111 | static int trace_selftest_test_probe2_cnt; | ||
112 | static void trace_selftest_test_probe2_func(unsigned long ip, | ||
113 | unsigned long pip) | ||
114 | { | ||
115 | trace_selftest_test_probe2_cnt++; | ||
116 | } | ||
117 | |||
118 | static int trace_selftest_test_probe3_cnt; | ||
119 | static void trace_selftest_test_probe3_func(unsigned long ip, | ||
120 | unsigned long pip) | ||
121 | { | ||
122 | trace_selftest_test_probe3_cnt++; | ||
123 | } | ||
124 | |||
125 | static int trace_selftest_test_global_cnt; | ||
126 | static void trace_selftest_test_global_func(unsigned long ip, | ||
127 | unsigned long pip) | ||
128 | { | ||
129 | trace_selftest_test_global_cnt++; | ||
130 | } | ||
131 | |||
132 | static int trace_selftest_test_dyn_cnt; | ||
133 | static void trace_selftest_test_dyn_func(unsigned long ip, | ||
134 | unsigned long pip) | ||
135 | { | ||
136 | trace_selftest_test_dyn_cnt++; | ||
137 | } | ||
138 | |||
139 | static struct ftrace_ops test_probe1 = { | ||
140 | .func = trace_selftest_test_probe1_func, | ||
141 | }; | ||
142 | |||
143 | static struct ftrace_ops test_probe2 = { | ||
144 | .func = trace_selftest_test_probe2_func, | ||
145 | }; | ||
146 | |||
147 | static struct ftrace_ops test_probe3 = { | ||
148 | .func = trace_selftest_test_probe3_func, | ||
149 | }; | ||
150 | |||
151 | static struct ftrace_ops test_global = { | ||
152 | .func = trace_selftest_test_global_func, | ||
153 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
154 | }; | ||
155 | |||
156 | static void print_counts(void) | ||
157 | { | ||
158 | printk("(%d %d %d %d %d) ", | ||
159 | trace_selftest_test_probe1_cnt, | ||
160 | trace_selftest_test_probe2_cnt, | ||
161 | trace_selftest_test_probe3_cnt, | ||
162 | trace_selftest_test_global_cnt, | ||
163 | trace_selftest_test_dyn_cnt); | ||
164 | } | ||
165 | |||
166 | static void reset_counts(void) | ||
167 | { | ||
168 | trace_selftest_test_probe1_cnt = 0; | ||
169 | trace_selftest_test_probe2_cnt = 0; | ||
170 | trace_selftest_test_probe3_cnt = 0; | ||
171 | trace_selftest_test_global_cnt = 0; | ||
172 | trace_selftest_test_dyn_cnt = 0; | ||
173 | } | ||
174 | |||
175 | static int trace_selftest_ops(int cnt) | ||
176 | { | ||
177 | int save_ftrace_enabled = ftrace_enabled; | ||
178 | struct ftrace_ops *dyn_ops; | ||
179 | char *func1_name; | ||
180 | char *func2_name; | ||
181 | int len1; | ||
182 | int len2; | ||
183 | int ret = -1; | ||
184 | |||
185 | printk(KERN_CONT "PASSED\n"); | ||
186 | pr_info("Testing dynamic ftrace ops #%d: ", cnt); | ||
187 | |||
188 | ftrace_enabled = 1; | ||
189 | reset_counts(); | ||
190 | |||
191 | /* Handle PPC64 '.' name */ | ||
192 | func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | ||
193 | func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); | ||
194 | len1 = strlen(func1_name); | ||
195 | len2 = strlen(func2_name); | ||
196 | |||
197 | /* | ||
198 | * Probe 1 will trace function 1. | ||
199 | * Probe 2 will trace function 2. | ||
200 | * Probe 3 will trace functions 1 and 2. | ||
201 | */ | ||
202 | ftrace_set_filter(&test_probe1, func1_name, len1, 1); | ||
203 | ftrace_set_filter(&test_probe2, func2_name, len2, 1); | ||
204 | ftrace_set_filter(&test_probe3, func1_name, len1, 1); | ||
205 | ftrace_set_filter(&test_probe3, func2_name, len2, 0); | ||
206 | |||
207 | register_ftrace_function(&test_probe1); | ||
208 | register_ftrace_function(&test_probe2); | ||
209 | register_ftrace_function(&test_probe3); | ||
210 | register_ftrace_function(&test_global); | ||
211 | |||
212 | DYN_FTRACE_TEST_NAME(); | ||
213 | |||
214 | print_counts(); | ||
215 | |||
216 | if (trace_selftest_test_probe1_cnt != 1) | ||
217 | goto out; | ||
218 | if (trace_selftest_test_probe2_cnt != 0) | ||
219 | goto out; | ||
220 | if (trace_selftest_test_probe3_cnt != 1) | ||
221 | goto out; | ||
222 | if (trace_selftest_test_global_cnt == 0) | ||
223 | goto out; | ||
224 | |||
225 | DYN_FTRACE_TEST_NAME2(); | ||
226 | |||
227 | print_counts(); | ||
228 | |||
229 | if (trace_selftest_test_probe1_cnt != 1) | ||
230 | goto out; | ||
231 | if (trace_selftest_test_probe2_cnt != 1) | ||
232 | goto out; | ||
233 | if (trace_selftest_test_probe3_cnt != 2) | ||
234 | goto out; | ||
235 | |||
236 | /* Add a dynamic probe */ | ||
237 | dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); | ||
238 | if (!dyn_ops) { | ||
239 | printk("MEMORY ERROR "); | ||
240 | goto out; | ||
241 | } | ||
242 | |||
243 | dyn_ops->func = trace_selftest_test_dyn_func; | ||
244 | |||
245 | register_ftrace_function(dyn_ops); | ||
246 | |||
247 | trace_selftest_test_global_cnt = 0; | ||
248 | |||
249 | DYN_FTRACE_TEST_NAME(); | ||
250 | |||
251 | print_counts(); | ||
252 | |||
253 | if (trace_selftest_test_probe1_cnt != 2) | ||
254 | goto out_free; | ||
255 | if (trace_selftest_test_probe2_cnt != 1) | ||
256 | goto out_free; | ||
257 | if (trace_selftest_test_probe3_cnt != 3) | ||
258 | goto out_free; | ||
259 | if (trace_selftest_test_global_cnt == 0) | ||
260 | goto out; | ||
261 | if (trace_selftest_test_dyn_cnt == 0) | ||
262 | goto out_free; | ||
263 | |||
264 | DYN_FTRACE_TEST_NAME2(); | ||
265 | |||
266 | print_counts(); | ||
267 | |||
268 | if (trace_selftest_test_probe1_cnt != 2) | ||
269 | goto out_free; | ||
270 | if (trace_selftest_test_probe2_cnt != 2) | ||
271 | goto out_free; | ||
272 | if (trace_selftest_test_probe3_cnt != 4) | ||
273 | goto out_free; | ||
274 | |||
275 | ret = 0; | ||
276 | out_free: | ||
277 | unregister_ftrace_function(dyn_ops); | ||
278 | kfree(dyn_ops); | ||
279 | |||
280 | out: | ||
281 | /* Purposely unregister in the same order */ | ||
282 | unregister_ftrace_function(&test_probe1); | ||
283 | unregister_ftrace_function(&test_probe2); | ||
284 | unregister_ftrace_function(&test_probe3); | ||
285 | unregister_ftrace_function(&test_global); | ||
286 | |||
287 | /* Make sure everything is off */ | ||
288 | reset_counts(); | ||
289 | DYN_FTRACE_TEST_NAME(); | ||
290 | DYN_FTRACE_TEST_NAME(); | ||
291 | |||
292 | if (trace_selftest_test_probe1_cnt || | ||
293 | trace_selftest_test_probe2_cnt || | ||
294 | trace_selftest_test_probe3_cnt || | ||
295 | trace_selftest_test_global_cnt || | ||
296 | trace_selftest_test_dyn_cnt) | ||
297 | ret = -1; | ||
298 | |||
299 | ftrace_enabled = save_ftrace_enabled; | ||
300 | |||
301 | return ret; | ||
302 | } | ||
303 | |||
104 | /* Test dynamic code modification and ftrace filters */ | 304 | /* Test dynamic code modification and ftrace filters */ |
105 | int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | 305 | int trace_selftest_startup_dynamic_tracing(struct tracer *trace, |
106 | struct trace_array *tr, | 306 | struct trace_array *tr, |
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
131 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | 331 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); |
132 | 332 | ||
133 | /* filter only on our function */ | 333 | /* filter only on our function */ |
134 | ftrace_set_filter(func_name, strlen(func_name), 1); | 334 | ftrace_set_global_filter(func_name, strlen(func_name), 1); |
135 | 335 | ||
136 | /* enable tracing */ | 336 | /* enable tracing */ |
137 | ret = tracer_init(trace, tr); | 337 | ret = tracer_init(trace, tr); |
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
166 | 366 | ||
167 | /* check the trace buffer */ | 367 | /* check the trace buffer */ |
168 | ret = trace_test_buffer(tr, &count); | 368 | ret = trace_test_buffer(tr, &count); |
169 | trace->reset(tr); | ||
170 | tracing_start(); | 369 | tracing_start(); |
171 | 370 | ||
172 | /* we should only have one item */ | 371 | /* we should only have one item */ |
173 | if (!ret && count != 1) { | 372 | if (!ret && count != 1) { |
373 | trace->reset(tr); | ||
174 | printk(KERN_CONT ".. filter failed count=%ld ..", count); | 374 | printk(KERN_CONT ".. filter failed count=%ld ..", count); |
175 | ret = -1; | 375 | ret = -1; |
176 | goto out; | 376 | goto out; |
177 | } | 377 | } |
178 | 378 | ||
379 | /* Test the ops with global tracing running */ | ||
380 | ret = trace_selftest_ops(1); | ||
381 | trace->reset(tr); | ||
382 | |||
179 | out: | 383 | out: |
180 | ftrace_enabled = save_ftrace_enabled; | 384 | ftrace_enabled = save_ftrace_enabled; |
181 | tracer_enabled = save_tracer_enabled; | 385 | tracer_enabled = save_tracer_enabled; |
182 | 386 | ||
183 | /* Enable tracing on all functions again */ | 387 | /* Enable tracing on all functions again */ |
184 | ftrace_set_filter(NULL, 0, 1); | 388 | ftrace_set_global_filter(NULL, 0, 1); |
389 | |||
390 | /* Test the ops with global tracing off */ | ||
391 | if (!ret) | ||
392 | ret = trace_selftest_ops(2); | ||
185 | 393 | ||
186 | return ret; | 394 | return ret; |
187 | } | 395 | } |
@@ -558,7 +766,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) | |||
558 | static int trace_wakeup_test_thread(void *data) | 766 | static int trace_wakeup_test_thread(void *data) |
559 | { | 767 | { |
560 | /* Make this a RT thread, doesn't need to be too high */ | 768 | /* Make this a RT thread, doesn't need to be too high */ |
561 | struct sched_param param = { .sched_priority = 5 }; | 769 | static const struct sched_param param = { .sched_priority = 5 }; |
562 | struct completion *x = data; | 770 | struct completion *x = data; |
563 | 771 | ||
564 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 772 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
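trace_selftest_ops() exercises the new per-ops filtering: each ftrace_ops carries its own filter, so probe1 fires only for function 1 and probe3 for both, while an ops marked FTRACE_OPS_FL_GLOBAL still follows the global filter. A hedged module-style sketch of the same per-ops pattern, using only the calls visible in the hunk above; the boilerplate and the "schedule" target are illustrative, and whether these helpers are exported to out-of-tree modules at this point in the tree is not verified here:

#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/string.h>

static unsigned long my_hits;           /* racy counter, fine for a sketch */

static void my_probe(unsigned long ip, unsigned long parent_ip)
{
        my_hits++;
}

static struct ftrace_ops my_ops = {
        .func = my_probe,               /* no FTRACE_OPS_FL_GLOBAL: private filter */
};

static int __init my_probe_init(void)
{
        char name[] = "schedule";       /* illustrative filter target */

        /* Trace only the named function, as the selftest does for probe1..3. */
        ftrace_set_filter(&my_ops, name, strlen(name), 1);
        return register_ftrace_function(&my_ops);
}

static void __exit my_probe_exit(void)
{
        unregister_ftrace_function(&my_ops);
        pr_info("my_probe fired %lu times\n", my_hits);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");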
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 54dd77cce5bf..b4c475a0a48b 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c | |||
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void) | |||
5 | /* used to call mcount */ | 5 | /* used to call mcount */ |
6 | return 0; | 6 | return 0; |
7 | } | 7 | } |
8 | |||
9 | int DYN_FTRACE_TEST_NAME2(void) | ||
10 | { | ||
11 | /* used to call mcount */ | ||
12 | return 0; | ||
13 | } | ||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a6b7e0e0f3eb..b0b53b8e4c25 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
133 | static struct ftrace_ops trace_ops __read_mostly = | 133 | static struct ftrace_ops trace_ops __read_mostly = |
134 | { | 134 | { |
135 | .func = stack_trace_call, | 135 | .func = stack_trace_call, |
136 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
136 | }; | 137 | }; |
137 | 138 | ||
138 | static ssize_t | 139 | static ssize_t |
@@ -195,6 +196,7 @@ static const struct file_operations stack_max_size_fops = { | |||
195 | .open = tracing_open_generic, | 196 | .open = tracing_open_generic, |
196 | .read = stack_max_size_read, | 197 | .read = stack_max_size_read, |
197 | .write = stack_max_size_write, | 198 | .write = stack_max_size_write, |
199 | .llseek = default_llseek, | ||
198 | }; | 200 | }; |
199 | 201 | ||
200 | static void * | 202 | static void * |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index bac752f0cfb5..ee7b5a0bb9f8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event, | |||
23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | 23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); |
24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | 24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); |
25 | 25 | ||
26 | /* All syscall exit events have the same fields */ | ||
27 | static LIST_HEAD(syscall_exit_fields); | ||
28 | |||
29 | static struct list_head * | 26 | static struct list_head * |
30 | syscall_get_enter_fields(struct ftrace_event_call *call) | 27 | syscall_get_enter_fields(struct ftrace_event_call *call) |
31 | { | 28 | { |
@@ -34,61 +31,66 @@ syscall_get_enter_fields(struct ftrace_event_call *call) | |||
34 | return &entry->enter_fields; | 31 | return &entry->enter_fields; |
35 | } | 32 | } |
36 | 33 | ||
37 | static struct list_head * | ||
38 | syscall_get_exit_fields(struct ftrace_event_call *call) | ||
39 | { | ||
40 | return &syscall_exit_fields; | ||
41 | } | ||
42 | |||
43 | struct trace_event_functions enter_syscall_print_funcs = { | 34 | struct trace_event_functions enter_syscall_print_funcs = { |
44 | .trace = print_syscall_enter, | 35 | .trace = print_syscall_enter, |
45 | }; | 36 | }; |
46 | 37 | ||
47 | struct trace_event_functions exit_syscall_print_funcs = { | 38 | struct trace_event_functions exit_syscall_print_funcs = { |
48 | .trace = print_syscall_exit, | 39 | .trace = print_syscall_exit, |
49 | }; | 40 | }; |
50 | 41 | ||
51 | struct ftrace_event_class event_class_syscall_enter = { | 42 | struct ftrace_event_class event_class_syscall_enter = { |
52 | .system = "syscalls", | 43 | .system = "syscalls", |
53 | .reg = syscall_enter_register, | 44 | .reg = syscall_enter_register, |
54 | .define_fields = syscall_enter_define_fields, | 45 | .define_fields = syscall_enter_define_fields, |
55 | .get_fields = syscall_get_enter_fields, | 46 | .get_fields = syscall_get_enter_fields, |
56 | .raw_init = init_syscall_trace, | 47 | .raw_init = init_syscall_trace, |
57 | }; | 48 | }; |
58 | 49 | ||
59 | struct ftrace_event_class event_class_syscall_exit = { | 50 | struct ftrace_event_class event_class_syscall_exit = { |
60 | .system = "syscalls", | 51 | .system = "syscalls", |
61 | .reg = syscall_exit_register, | 52 | .reg = syscall_exit_register, |
62 | .define_fields = syscall_exit_define_fields, | 53 | .define_fields = syscall_exit_define_fields, |
63 | .get_fields = syscall_get_exit_fields, | 54 | .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), |
64 | .raw_init = init_syscall_trace, | 55 | .raw_init = init_syscall_trace, |
65 | }; | 56 | }; |
66 | 57 | ||
67 | extern unsigned long __start_syscalls_metadata[]; | 58 | extern struct syscall_metadata *__start_syscalls_metadata[]; |
68 | extern unsigned long __stop_syscalls_metadata[]; | 59 | extern struct syscall_metadata *__stop_syscalls_metadata[]; |
69 | 60 | ||
70 | static struct syscall_metadata **syscalls_metadata; | 61 | static struct syscall_metadata **syscalls_metadata; |
71 | 62 | ||
72 | static struct syscall_metadata *find_syscall_meta(unsigned long syscall) | 63 | #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME |
64 | static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) | ||
65 | { | ||
66 | /* | ||
67 | * Only compare after the "sys" prefix. Archs that use | ||
68 | * syscall wrappers may have syscalls symbols aliases prefixed | ||
69 | * with "SyS" instead of "sys", leading to an unwanted | ||
70 | * mismatch. | ||
71 | */ | ||
72 | return !strcmp(sym + 3, name + 3); | ||
73 | } | ||
74 | #endif | ||
75 | |||
76 | static __init struct syscall_metadata * | ||
77 | find_syscall_meta(unsigned long syscall) | ||
73 | { | 78 | { |
74 | struct syscall_metadata *start; | 79 | struct syscall_metadata **start; |
75 | struct syscall_metadata *stop; | 80 | struct syscall_metadata **stop; |
76 | char str[KSYM_SYMBOL_LEN]; | 81 | char str[KSYM_SYMBOL_LEN]; |
77 | 82 | ||
78 | 83 | ||
79 | start = (struct syscall_metadata *)__start_syscalls_metadata; | 84 | start = __start_syscalls_metadata; |
80 | stop = (struct syscall_metadata *)__stop_syscalls_metadata; | 85 | stop = __stop_syscalls_metadata; |
81 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); | 86 | kallsyms_lookup(syscall, NULL, NULL, NULL, str); |
82 | 87 | ||
88 | if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) | ||
89 | return NULL; | ||
90 | |||
83 | for ( ; start < stop; start++) { | 91 | for ( ; start < stop; start++) { |
84 | /* | 92 | if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) |
85 | * Only compare after the "sys" prefix. Archs that use | 93 | return *start; |
86 | * syscall wrappers may have syscalls symbols aliases prefixed | ||
87 | * with "SyS" instead of "sys", leading to an unwanted | ||
88 | * mismatch. | ||
89 | */ | ||
90 | if (start->name && !strcmp(start->name + 3, str + 3)) | ||
91 | return start; | ||
92 | } | 94 | } |
93 | return NULL; | 95 | return NULL; |
94 | } | 96 | } |
@@ -367,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) | |||
367 | int num; | 369 | int num; |
368 | 370 | ||
369 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 371 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
370 | if (num < 0 || num >= NR_syscalls) | 372 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
371 | return -ENOSYS; | 373 | return -ENOSYS; |
372 | mutex_lock(&syscall_trace_lock); | 374 | mutex_lock(&syscall_trace_lock); |
373 | if (!sys_refcount_enter) | 375 | if (!sys_refcount_enter) |
@@ -385,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) | |||
385 | int num; | 387 | int num; |
386 | 388 | ||
387 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 389 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
388 | if (num < 0 || num >= NR_syscalls) | 390 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
389 | return; | 391 | return; |
390 | mutex_lock(&syscall_trace_lock); | 392 | mutex_lock(&syscall_trace_lock); |
391 | sys_refcount_enter--; | 393 | sys_refcount_enter--; |
@@ -401,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) | |||
401 | int num; | 403 | int num; |
402 | 404 | ||
403 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 405 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
404 | if (num < 0 || num >= NR_syscalls) | 406 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
405 | return -ENOSYS; | 407 | return -ENOSYS; |
406 | mutex_lock(&syscall_trace_lock); | 408 | mutex_lock(&syscall_trace_lock); |
407 | if (!sys_refcount_exit) | 409 | if (!sys_refcount_exit) |
@@ -419,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
419 | int num; | 421 | int num; |
420 | 422 | ||
421 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | 423 | num = ((struct syscall_metadata *)call->data)->syscall_nr; |
422 | if (num < 0 || num >= NR_syscalls) | 424 | if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) |
423 | return; | 425 | return; |
424 | mutex_lock(&syscall_trace_lock); | 426 | mutex_lock(&syscall_trace_lock); |
425 | sys_refcount_exit--; | 427 | sys_refcount_exit--; |
@@ -432,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) | |||
432 | int init_syscall_trace(struct ftrace_event_call *call) | 434 | int init_syscall_trace(struct ftrace_event_call *call) |
433 | { | 435 | { |
434 | int id; | 436 | int id; |
437 | int num; | ||
438 | |||
439 | num = ((struct syscall_metadata *)call->data)->syscall_nr; | ||
440 | if (num < 0 || num >= NR_syscalls) { | ||
441 | pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", | ||
442 | ((struct syscall_metadata *)call->data)->name); | ||
443 | return -ENOSYS; | ||
444 | } | ||
435 | 445 | ||
436 | if (set_syscall_print_fmt(call) < 0) | 446 | if (set_syscall_print_fmt(call) < 0) |
437 | return -ENOMEM; | 447 | return -ENOMEM; |
@@ -446,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call) | |||
446 | return id; | 456 | return id; |
447 | } | 457 | } |
448 | 458 | ||
449 | unsigned long __init arch_syscall_addr(int nr) | 459 | unsigned long __init __weak arch_syscall_addr(int nr) |
450 | { | 460 | { |
451 | return (unsigned long)sys_call_table[nr]; | 461 | return (unsigned long)sys_call_table[nr]; |
452 | } | 462 | } |
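The default arch_syscall_match_sym_name() above simply compares past the first three characters, so a "SyS_"-prefixed syscall-wrapper alias still matches its "sys_" metadata name. A tiny standalone illustration:

#include <stdio.h>
#include <string.h>

/* Skip the "sys"/"SyS" prefix before comparing, as the kernel helper does. */
static int match_sym_name(const char *sym, const char *name)
{
        return !strcmp(sym + 3, name + 3);
}

int main(void)
{
        printf("%d\n", match_sym_name("SyS_read", "sys_read"));        /* 1 */
        printf("%d\n", match_sym_name("sys_read", "sys_write"));       /* 0 */
        return 0;
}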
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index a7cc3793baf6..209b379a4721 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c | |||
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void) | |||
263 | { | 263 | { |
264 | int ret, cpu; | 264 | int ret, cpu; |
265 | 265 | ||
266 | for_each_possible_cpu(cpu) { | ||
267 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
268 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
269 | } | ||
270 | |||
266 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | 271 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); |
267 | if (ret) | 272 | if (ret) |
268 | goto out; | 273 | goto out; |
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void) | |||
279 | if (ret) | 284 | if (ret) |
280 | goto no_creation; | 285 | goto no_creation; |
281 | 286 | ||
282 | for_each_possible_cpu(cpu) { | ||
283 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
284 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
285 | } | ||
286 | |||
287 | return 0; | 287 | return 0; |
288 | 288 | ||
289 | no_creation: | 289 | no_creation: |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index c77f3eceea25..b219f1449c54 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -25,9 +25,10 @@ | |||
25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/jump_label.h> | ||
28 | 29 | ||
29 | extern struct tracepoint __start___tracepoints[]; | 30 | extern struct tracepoint * const __start___tracepoints_ptrs[]; |
30 | extern struct tracepoint __stop___tracepoints[]; | 31 | extern struct tracepoint * const __stop___tracepoints_ptrs[]; |
31 | 32 | ||
32 | /* Set to 1 to enable tracepoint debug output */ | 33 | /* Set to 1 to enable tracepoint debug output */ |
33 | static const int tracepoint_debug; | 34 | static const int tracepoint_debug; |
@@ -250,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
250 | { | 251 | { |
251 | WARN_ON(strcmp((*entry)->name, elem->name) != 0); | 252 | WARN_ON(strcmp((*entry)->name, elem->name) != 0); |
252 | 253 | ||
253 | if (elem->regfunc && !elem->state && active) | 254 | if (elem->regfunc && !jump_label_enabled(&elem->key) && active) |
254 | elem->regfunc(); | 255 | elem->regfunc(); |
255 | else if (elem->unregfunc && elem->state && !active) | 256 | else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) |
256 | elem->unregfunc(); | 257 | elem->unregfunc(); |
257 | 258 | ||
258 | /* | 259 | /* |
@@ -263,7 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
263 | * is used. | 264 | * is used. |
264 | */ | 265 | */ |
265 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); | 266 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); |
266 | elem->state = active; | 267 | if (active && !jump_label_enabled(&elem->key)) |
268 | jump_label_inc(&elem->key); | ||
269 | else if (!active && jump_label_enabled(&elem->key)) | ||
270 | jump_label_dec(&elem->key); | ||
267 | } | 271 | } |
268 | 272 | ||
269 | /* | 273 | /* |
@@ -274,10 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
274 | */ | 278 | */ |
275 | static void disable_tracepoint(struct tracepoint *elem) | 279 | static void disable_tracepoint(struct tracepoint *elem) |
276 | { | 280 | { |
277 | if (elem->unregfunc && elem->state) | 281 | if (elem->unregfunc && jump_label_enabled(&elem->key)) |
278 | elem->unregfunc(); | 282 | elem->unregfunc(); |
279 | 283 | ||
280 | elem->state = 0; | 284 | if (jump_label_enabled(&elem->key)) |
285 | jump_label_dec(&elem->key); | ||
281 | rcu_assign_pointer(elem->funcs, NULL); | 286 | rcu_assign_pointer(elem->funcs, NULL); |
282 | } | 287 | } |
283 | 288 | ||
@@ -288,10 +293,10 @@ static void disable_tracepoint(struct tracepoint *elem) | |||
288 | * | 293 | * |
289 | * Updates the probe callback corresponding to a range of tracepoints. | 294 | * Updates the probe callback corresponding to a range of tracepoints. |
290 | */ | 295 | */ |
291 | void | 296 | void tracepoint_update_probe_range(struct tracepoint * const *begin, |
292 | tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | 297 | struct tracepoint * const *end) |
293 | { | 298 | { |
294 | struct tracepoint *iter; | 299 | struct tracepoint * const *iter; |
295 | struct tracepoint_entry *mark_entry; | 300 | struct tracepoint_entry *mark_entry; |
296 | 301 | ||
297 | if (!begin) | 302 | if (!begin) |
@@ -299,12 +304,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | |||
299 | 304 | ||
300 | mutex_lock(&tracepoints_mutex); | 305 | mutex_lock(&tracepoints_mutex); |
301 | for (iter = begin; iter < end; iter++) { | 306 | for (iter = begin; iter < end; iter++) { |
302 | mark_entry = get_tracepoint(iter->name); | 307 | mark_entry = get_tracepoint((*iter)->name); |
303 | if (mark_entry) { | 308 | if (mark_entry) { |
304 | set_tracepoint(&mark_entry, iter, | 309 | set_tracepoint(&mark_entry, *iter, |
305 | !!mark_entry->refcount); | 310 | !!mark_entry->refcount); |
306 | } else { | 311 | } else { |
307 | disable_tracepoint(iter); | 312 | disable_tracepoint(*iter); |
308 | } | 313 | } |
309 | } | 314 | } |
310 | mutex_unlock(&tracepoints_mutex); | 315 | mutex_unlock(&tracepoints_mutex); |
@@ -316,8 +321,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) | |||
316 | static void tracepoint_update_probes(void) | 321 | static void tracepoint_update_probes(void) |
317 | { | 322 | { |
318 | /* Core kernel tracepoints */ | 323 | /* Core kernel tracepoints */ |
319 | tracepoint_update_probe_range(__start___tracepoints, | 324 | tracepoint_update_probe_range(__start___tracepoints_ptrs, |
320 | __stop___tracepoints); | 325 | __stop___tracepoints_ptrs); |
321 | /* tracepoints in modules. */ | 326 | /* tracepoints in modules. */ |
322 | module_update_tracepoints(); | 327 | module_update_tracepoints(); |
323 | } | 328 | } |
@@ -504,8 +509,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); | |||
504 | * Will return the first tracepoint in the range if the input tracepoint is | 509 | * Will return the first tracepoint in the range if the input tracepoint is |
505 | * NULL. | 510 | * NULL. |
506 | */ | 511 | */ |
507 | int tracepoint_get_iter_range(struct tracepoint **tracepoint, | 512 | int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, |
508 | struct tracepoint *begin, struct tracepoint *end) | 513 | struct tracepoint * const *begin, struct tracepoint * const *end) |
509 | { | 514 | { |
510 | if (!*tracepoint && begin != end) { | 515 | if (!*tracepoint && begin != end) { |
511 | *tracepoint = begin; | 516 | *tracepoint = begin; |
@@ -524,7 +529,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) | |||
524 | /* Core kernel tracepoints */ | 529 | /* Core kernel tracepoints */ |
525 | if (!iter->module) { | 530 | if (!iter->module) { |
526 | found = tracepoint_get_iter_range(&iter->tracepoint, | 531 | found = tracepoint_get_iter_range(&iter->tracepoint, |
527 | __start___tracepoints, __stop___tracepoints); | 532 | __start___tracepoints_ptrs, |
533 | __stop___tracepoints_ptrs); | ||
528 | if (found) | 534 | if (found) |
529 | goto end; | 535 | goto end; |
530 | } | 536 | } |
@@ -575,8 +581,8 @@ int tracepoint_module_notify(struct notifier_block *self, | |||
575 | switch (val) { | 581 | switch (val) { |
576 | case MODULE_STATE_COMING: | 582 | case MODULE_STATE_COMING: |
577 | case MODULE_STATE_GOING: | 583 | case MODULE_STATE_GOING: |
578 | tracepoint_update_probe_range(mod->tracepoints, | 584 | tracepoint_update_probe_range(mod->tracepoints_ptrs, |
579 | mod->tracepoints + mod->num_tracepoints); | 585 | mod->tracepoints_ptrs + mod->num_tracepoints); |
580 | break; | 586 | break; |
581 | } | 587 | } |
582 | return 0; | 588 | return 0; |
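In tracepoint.c the boolean elem->state is replaced by a jump-label key: set_tracepoint() bumps or drops the key only on a real enable/disable transition, and jump_label_enabled() is read wherever state used to be tested. A userspace stand-in for just that transition logic, where a plain counter plays the role of the key (the real jump_label_inc()/jump_label_dec() additionally patch the tracepoint call sites):

#include <stdio.h>

struct key { int enabled; };

static int key_enabled(struct key *k) { return k->enabled > 0; }
static void key_inc(struct key *k)    { k->enabled++; }
static void key_dec(struct key *k)    { k->enabled--; }

/* Mirror of the enable/disable edges in set_tracepoint()/disable_tracepoint(). */
static void set_active(struct key *k, int active)
{
        if (active && !key_enabled(k))
                key_inc(k);
        else if (!active && key_enabled(k))
                key_dec(k);
}

int main(void)
{
        struct key k = { 0 };

        set_active(&k, 1);
        set_active(&k, 1);                              /* no double increment */
        printf("enabled=%d\n", key_enabled(&k));        /* 1 */
        set_active(&k, 0);
        printf("enabled=%d\n", key_enabled(&k));        /* 0 */
        return 0;
}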
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 0a67e041edf8..24dc60d9fa1f 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
63 | stats->ac_ppid = pid_alive(tsk) ? | 63 | stats->ac_ppid = pid_alive(tsk) ? |
64 | rcu_dereference(tsk->real_parent)->tgid : 0; | 64 | rcu_dereference(tsk->real_parent)->tgid : 0; |
65 | rcu_read_unlock(); | 65 | rcu_read_unlock(); |
66 | stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; | 66 | stats->ac_utime = cputime_to_usecs(tsk->utime); |
67 | stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; | 67 | stats->ac_stime = cputime_to_usecs(tsk->stime); |
68 | stats->ac_utimescaled = | 68 | stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); |
69 | cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; | 69 | stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); |
70 | stats->ac_stimescaled = | ||
71 | cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC; | ||
72 | stats->ac_minflt = tsk->min_flt; | 70 | stats->ac_minflt = tsk->min_flt; |
73 | stats->ac_majflt = tsk->maj_flt; | 71 | stats->ac_majflt = tsk->maj_flt; |
74 | 72 | ||
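The tsacct hunk swaps cputime_to_msecs() * USEC_PER_MSEC for cputime_to_usecs(), so CPU time is no longer rounded to whole milliseconds before being reported in microseconds. A quick illustration, pretending for the sake of the numbers that cputime is tick-based with HZ = 1024 (the real unit is configuration dependent):

#include <stdio.h>

int main(void)
{
        long hz = 1024;                 /* illustrative tick rate */
        long ticks = 3;                 /* ~2929.7 us of CPU time */

        long old_us = (ticks * 1000 / hz) * 1000;       /* via msecs: 2000 us */
        long new_us = ticks * 1000000 / hz;             /* direct:    2929 us */

        printf("old=%ld us  new=%ld us\n", old_us, new_us);
        return 0;
}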
diff --git a/kernel/uid16.c b/kernel/uid16.c index 419209893d87..51c6e89e8619 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
189 | struct group_info *group_info; | 189 | struct group_info *group_info; |
190 | int retval; | 190 | int retval; |
191 | 191 | ||
192 | if (!capable(CAP_SETGID)) | 192 | if (!nsown_capable(CAP_SETGID)) |
193 | return -EPERM; | 193 | return -EPERM; |
194 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 194 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
195 | return -EINVAL; | 195 | return -EINVAL; |
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index eb27fd3430a2..92cb706c7fc8 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c | |||
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); | |||
20 | 20 | ||
21 | /* | 21 | /* |
22 | * Removes a registered user return notifier. Must be called from atomic | 22 | * Removes a registered user return notifier. Must be called from atomic |
23 | * context, and from the same cpu registration occured in. | 23 | * context, and from the same cpu registration occurred in. |
24 | */ | 24 | */ |
25 | void user_return_notifier_unregister(struct user_return_notifier *urn) | 25 | void user_return_notifier_unregister(struct user_return_notifier *urn) |
26 | { | 26 | { |
diff --git a/kernel/user.c b/kernel/user.c index 7e72614b736d..9e03e9c1df8d 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -17,9 +17,13 @@ | |||
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/user_namespace.h> | 18 | #include <linux/user_namespace.h> |
19 | 19 | ||
20 | /* | ||
21 | * userns count is 1 for root user, 1 for init_uts_ns, | ||
22 | * and 1 for... ? | ||
23 | */ | ||
20 | struct user_namespace init_user_ns = { | 24 | struct user_namespace init_user_ns = { |
21 | .kref = { | 25 | .kref = { |
22 | .refcount = ATOMIC_INIT(2), | 26 | .refcount = ATOMIC_INIT(3), |
23 | }, | 27 | }, |
24 | .creator = &root_user, | 28 | .creator = &root_user, |
25 | }; | 29 | }; |
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep; | |||
47 | */ | 51 | */ |
48 | static DEFINE_SPINLOCK(uidhash_lock); | 52 | static DEFINE_SPINLOCK(uidhash_lock); |
49 | 53 | ||
50 | /* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ | 54 | /* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ |
51 | struct user_struct root_user = { | 55 | struct user_struct root_user = { |
52 | .__count = ATOMIC_INIT(2), | 56 | .__count = ATOMIC_INIT(2), |
53 | .processes = ATOMIC_INIT(1), | 57 | .processes = ATOMIC_INIT(1), |
@@ -91,6 +95,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | |||
91 | * upon function exit. | 95 | * upon function exit. |
92 | */ | 96 | */ |
93 | static void free_user(struct user_struct *up, unsigned long flags) | 97 | static void free_user(struct user_struct *up, unsigned long flags) |
98 | __releases(&uidhash_lock) | ||
94 | { | 99 | { |
95 | uid_hash_remove(up); | 100 | uid_hash_remove(up); |
96 | spin_unlock_irqrestore(&uidhash_lock, flags); | 101 | spin_unlock_irqrestore(&uidhash_lock, flags); |
@@ -157,6 +162,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
157 | spin_lock_irq(&uidhash_lock); | 162 | spin_lock_irq(&uidhash_lock); |
158 | up = uid_hash_find(uid, hashent); | 163 | up = uid_hash_find(uid, hashent); |
159 | if (up) { | 164 | if (up) { |
165 | put_user_ns(ns); | ||
160 | key_put(new->uid_keyring); | 166 | key_put(new->uid_keyring); |
161 | key_put(new->session_keyring); | 167 | key_put(new->session_keyring); |
162 | kmem_cache_free(uid_cachep, new); | 168 | kmem_cache_free(uid_cachep, new); |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 25915832291a..9da289c34f22 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -12,6 +12,8 @@ | |||
12 | #include <linux/highuid.h> | 12 | #include <linux/highuid.h> |
13 | #include <linux/cred.h> | 13 | #include <linux/cred.h> |
14 | 14 | ||
15 | static struct kmem_cache *user_ns_cachep __read_mostly; | ||
16 | |||
15 | /* | 17 | /* |
16 | * Create a new user namespace, deriving the creator from the user in the | 18 | * Create a new user namespace, deriving the creator from the user in the |
17 | * passed credentials, and replacing that user with the new root user for the | 19 | * passed credentials, and replacing that user with the new root user for the |
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new) | |||
26 | struct user_struct *root_user; | 28 | struct user_struct *root_user; |
27 | int n; | 29 | int n; |
28 | 30 | ||
29 | ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); | 31 | ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); |
30 | if (!ns) | 32 | if (!ns) |
31 | return -ENOMEM; | 33 | return -ENOMEM; |
32 | 34 | ||
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new) | |||
38 | /* Alloc new root user. */ | 40 | /* Alloc new root user. */ |
39 | root_user = alloc_uid(ns, 0); | 41 | root_user = alloc_uid(ns, 0); |
40 | if (!root_user) { | 42 | if (!root_user) { |
41 | kfree(ns); | 43 | kmem_cache_free(user_ns_cachep, ns); |
42 | return -ENOMEM; | 44 | return -ENOMEM; |
43 | } | 45 | } |
44 | 46 | ||
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work) | |||
71 | struct user_namespace *ns = | 73 | struct user_namespace *ns = |
72 | container_of(work, struct user_namespace, destroyer); | 74 | container_of(work, struct user_namespace, destroyer); |
73 | free_uid(ns->creator); | 75 | free_uid(ns->creator); |
74 | kfree(ns); | 76 | kmem_cache_free(user_ns_cachep, ns); |
75 | } | 77 | } |
76 | 78 | ||
77 | void free_user_ns(struct kref *kref) | 79 | void free_user_ns(struct kref *kref) |
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t | |||
126 | /* No useful relationship so no mapping */ | 128 | /* No useful relationship so no mapping */ |
127 | return overflowgid; | 129 | return overflowgid; |
128 | } | 130 | } |
131 | |||
132 | static __init int user_namespaces_init(void) | ||
133 | { | ||
134 | user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); | ||
135 | return 0; | ||
136 | } | ||
137 | module_init(user_namespaces_init); | ||
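create_user_ns() and free_user_ns_work() now allocate from a dedicated slab cache instead of plain kmalloc()/kfree(), with the cache created at boot through KMEM_CACHE(user_namespace, SLAB_PANIC) in an initcall. A minimal sketch of the same pattern for a made-up object type; struct foo_ctx and the foo_* names are illustrative, not part of the patch.

    #include <linux/slab.h>
    #include <linux/module.h>

    struct foo_ctx {
        int id;
    };

    static struct kmem_cache *foo_cachep __read_mostly;

    static struct foo_ctx *foo_alloc(void)
    {
        /* same call shape as create_user_ns() after the conversion
         * (the patch uses kmem_cache_alloc; zalloc also zeroes) */
        return kmem_cache_zalloc(foo_cachep, GFP_KERNEL);
    }

    static void foo_free(struct foo_ctx *ctx)
    {
        kmem_cache_free(foo_cachep, ctx);
    }

    static int __init foo_cache_init(void)
    {
        /* KMEM_CACHE() derives name, size and alignment from the type;
         * SLAB_PANIC makes boot fail loudly if the cache can't be created */
        foo_cachep = KMEM_CACHE(foo_ctx, SLAB_PANIC);
        return 0;
    }
    module_init(foo_cache_init);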
diff --git a/kernel/utsname.c b/kernel/utsname.c index 8a82b4b8ea52..bff131b9510a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -14,6 +14,8 @@ | |||
14 | #include <linux/utsname.h> | 14 | #include <linux/utsname.h> |
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/user_namespace.h> | ||
18 | #include <linux/proc_fs.h> | ||
17 | 19 | ||
18 | static struct uts_namespace *create_uts_ns(void) | 20 | static struct uts_namespace *create_uts_ns(void) |
19 | { | 21 | { |
@@ -30,7 +32,8 @@ static struct uts_namespace *create_uts_ns(void) | |||
30 | * @old_ns: namespace to clone | 32 | * @old_ns: namespace to clone |
31 | * Return NULL on error (failure to kmalloc), new ns otherwise | 33 | * Return NULL on error (failure to kmalloc), new ns otherwise |
32 | */ | 34 | */ |
33 | static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | 35 | static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, |
36 | struct uts_namespace *old_ns) | ||
34 | { | 37 | { |
35 | struct uts_namespace *ns; | 38 | struct uts_namespace *ns; |
36 | 39 | ||
@@ -40,6 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | |||
40 | 43 | ||
41 | down_read(&uts_sem); | 44 | down_read(&uts_sem); |
42 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); | ||
43 | up_read(&uts_sem); | 47 | up_read(&uts_sem); |
44 | return ns; | 48 | return ns; |
45 | } | 49 | } |
@@ -50,8 +54,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | |||
50 | * utsname of this process won't be seen by parent, and vice | 54 | * utsname of this process won't be seen by parent, and vice |
51 | * versa. | 55 | * versa. |
52 | */ | 56 | */ |
53 | struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) | 57 | struct uts_namespace *copy_utsname(unsigned long flags, |
58 | struct task_struct *tsk) | ||
54 | { | 59 | { |
60 | struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; | ||
55 | struct uts_namespace *new_ns; | 61 | struct uts_namespace *new_ns; |
56 | 62 | ||
57 | BUG_ON(!old_ns); | 63 | BUG_ON(!old_ns); |
@@ -60,7 +66,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol | |||
60 | if (!(flags & CLONE_NEWUTS)) | 66 | if (!(flags & CLONE_NEWUTS)) |
61 | return old_ns; | 67 | return old_ns; |
62 | 68 | ||
63 | new_ns = clone_uts_ns(old_ns); | 69 | new_ns = clone_uts_ns(tsk, old_ns); |
64 | 70 | ||
65 | put_uts_ns(old_ns); | 71 | put_uts_ns(old_ns); |
66 | return new_ns; | 72 | return new_ns; |
@@ -71,5 +77,44 @@ void free_uts_ns(struct kref *kref) | |||
71 | struct uts_namespace *ns; | 77 | struct uts_namespace *ns; |
72 | 78 | ||
73 | ns = container_of(kref, struct uts_namespace, kref); | 79 | ns = container_of(kref, struct uts_namespace, kref); |
80 | put_user_ns(ns->user_ns); | ||
74 | kfree(ns); | 81 | kfree(ns); |
75 | } | 82 | } |
83 | |||
84 | static void *utsns_get(struct task_struct *task) | ||
85 | { | ||
86 | struct uts_namespace *ns = NULL; | ||
87 | struct nsproxy *nsproxy; | ||
88 | |||
89 | rcu_read_lock(); | ||
90 | nsproxy = task_nsproxy(task); | ||
91 | if (nsproxy) { | ||
92 | ns = nsproxy->uts_ns; | ||
93 | get_uts_ns(ns); | ||
94 | } | ||
95 | rcu_read_unlock(); | ||
96 | |||
97 | return ns; | ||
98 | } | ||
99 | |||
100 | static void utsns_put(void *ns) | ||
101 | { | ||
102 | put_uts_ns(ns); | ||
103 | } | ||
104 | |||
105 | static int utsns_install(struct nsproxy *nsproxy, void *ns) | ||
106 | { | ||
107 | get_uts_ns(ns); | ||
108 | put_uts_ns(nsproxy->uts_ns); | ||
109 | nsproxy->uts_ns = ns; | ||
110 | return 0; | ||
111 | } | ||
112 | |||
113 | const struct proc_ns_operations utsns_operations = { | ||
114 | .name = "uts", | ||
115 | .type = CLONE_NEWUTS, | ||
116 | .get = utsns_get, | ||
117 | .put = utsns_put, | ||
118 | .install = utsns_install, | ||
119 | }; | ||
120 | |||
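The copy_utsname()/clone_uts_ns() path above is what a CLONE_NEWUTS clone or unshare ends up in, and the new utsns_operations are the hooks the /proc/<pid>/ns infrastructure uses to expose the result. A small userspace illustration of the effect (plain C, needs CAP_SYS_ADMIN; nothing here comes from the patch beyond the CLONE_NEWUTS flag):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        char name[64];

        /* exercises the CLONE_NEWUTS path via unshare(2) */
        if (unshare(CLONE_NEWUTS) != 0) {
            perror("unshare(CLONE_NEWUTS)");
            return 1;
        }
        /* the hostname change is private to the new UTS namespace */
        if (sethostname("sandbox", strlen("sandbox")) != 0) {
            perror("sethostname");
            return 1;
        }
        gethostname(name, sizeof(name));
        printf("hostname inside the new UTS namespace: %s\n", name);
        return 0;
    }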
diff --git a/kernel/wait.c b/kernel/wait.c index c4bd3d825f35..f45ea8d2a1ce 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
92 | } | 92 | } |
93 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 93 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
94 | 94 | ||
95 | /* | 95 | /** |
96 | * finish_wait - clean up after waiting in a queue | 96 | * finish_wait - clean up after waiting in a queue |
97 | * @q: waitqueue waited on | 97 | * @q: waitqueue waited on |
98 | * @wait: wait descriptor | 98 | * @wait: wait descriptor |
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) | |||
127 | } | 127 | } |
128 | EXPORT_SYMBOL(finish_wait); | 128 | EXPORT_SYMBOL(finish_wait); |
129 | 129 | ||
130 | /* | 130 | /** |
131 | * abort_exclusive_wait - abort exclusive waiting in a queue | 131 | * abort_exclusive_wait - abort exclusive waiting in a queue |
132 | * @q: waitqueue waited on | 132 | * @q: waitqueue waited on |
133 | * @wait: wait descriptor | 133 | * @wait: wait descriptor |
134 | * @state: runstate of the waiter to be woken | 134 | * @mode: runstate of the waiter to be woken |
135 | * @key: key to identify a wait bit queue or %NULL | 135 | * @key: key to identify a wait bit queue or %NULL |
136 | * | 136 | * |
137 | * Sets current thread back to running state and removes | 137 | * Sets current thread back to running state and removes |
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait); | |||
142 | * woken up through the queue. | 142 | * woken up through the queue. |
143 | * | 143 | * |
144 | * This prevents waiter starvation where an exclusive waiter | 144 | * This prevents waiter starvation where an exclusive waiter |
145 | * aborts and is woken up concurrently and noone wakes up | 145 | * aborts and is woken up concurrently and no one wakes up |
146 | * the next waiter. | 146 | * the next waiter. |
147 | */ | 147 | */ |
148 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | 148 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f9c3c52ecc1..3d0c56ad4792 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -27,8 +27,8 @@ | |||
27 | #include <asm/irq_regs.h> | 27 | #include <asm/irq_regs.h> |
28 | #include <linux/perf_event.h> | 28 | #include <linux/perf_event.h> |
29 | 29 | ||
30 | int watchdog_enabled; | 30 | int watchdog_enabled = 1; |
31 | int __read_mostly softlockup_thresh = 60; | 31 | int __read_mostly watchdog_thresh = 10; |
32 | 32 | ||
33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | 34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); |
@@ -43,21 +43,22 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | |||
43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
44 | #endif | 44 | #endif |
45 | 45 | ||
46 | static int __read_mostly did_panic; | ||
47 | static int __initdata no_watchdog; | ||
48 | |||
49 | |||
50 | /* boot commands */ | 46 | /* boot commands */ |
51 | /* | 47 | /* |
52 | * Should we panic when a soft-lockup or hard-lockup occurs: | 48 | * Should we panic when a soft-lockup or hard-lockup occurs: |
53 | */ | 49 | */ |
54 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 50 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
55 | static int hardlockup_panic; | 51 | static int hardlockup_panic = |
52 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | ||
56 | 53 | ||
57 | static int __init hardlockup_panic_setup(char *str) | 54 | static int __init hardlockup_panic_setup(char *str) |
58 | { | 55 | { |
59 | if (!strncmp(str, "panic", 5)) | 56 | if (!strncmp(str, "panic", 5)) |
60 | hardlockup_panic = 1; | 57 | hardlockup_panic = 1; |
58 | else if (!strncmp(str, "nopanic", 7)) | ||
59 | hardlockup_panic = 0; | ||
60 | else if (!strncmp(str, "0", 1)) | ||
61 | watchdog_enabled = 0; | ||
61 | return 1; | 62 | return 1; |
62 | } | 63 | } |
63 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 64 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
@@ -76,7 +77,7 @@ __setup("softlockup_panic=", softlockup_panic_setup); | |||
76 | 77 | ||
77 | static int __init nowatchdog_setup(char *str) | 78 | static int __init nowatchdog_setup(char *str) |
78 | { | 79 | { |
79 | no_watchdog = 1; | 80 | watchdog_enabled = 0; |
80 | return 1; | 81 | return 1; |
81 | } | 82 | } |
82 | __setup("nowatchdog", nowatchdog_setup); | 83 | __setup("nowatchdog", nowatchdog_setup); |
@@ -84,12 +85,23 @@ __setup("nowatchdog", nowatchdog_setup); | |||
84 | /* deprecated */ | 85 | /* deprecated */ |
85 | static int __init nosoftlockup_setup(char *str) | 86 | static int __init nosoftlockup_setup(char *str) |
86 | { | 87 | { |
87 | no_watchdog = 1; | 88 | watchdog_enabled = 0; |
88 | return 1; | 89 | return 1; |
89 | } | 90 | } |
90 | __setup("nosoftlockup", nosoftlockup_setup); | 91 | __setup("nosoftlockup", nosoftlockup_setup); |
91 | /* */ | 92 | /* */ |
92 | 93 | ||
94 | /* | ||
95 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- | ||
96 | * lockups can have false positives under extreme conditions. So we generally | ||
97 | * want a higher threshold for soft lockups than for hard lockups. So we couple | ||
98 | * the thresholds with a factor: we make the soft threshold twice the amount of | ||
99 | * time the hard threshold is. | ||
100 | */ | ||
101 | static int get_softlockup_thresh(void) | ||
102 | { | ||
103 | return watchdog_thresh * 2; | ||
104 | } | ||
93 | 105 | ||
94 | /* | 106 | /* |
95 | * Returns seconds, approximately. We don't need nanosecond | 107 | * Returns seconds, approximately. We don't need nanosecond |
@@ -104,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu) | |||
104 | static unsigned long get_sample_period(void) | 116 | static unsigned long get_sample_period(void) |
105 | { | 117 | { |
106 | /* | 118 | /* |
107 | * convert softlockup_thresh from seconds to ns | 119 | * convert watchdog_thresh from seconds to ns |
108 | * the divide by 5 is to give hrtimer 5 chances to | 120 | * the divide by 5 is to give hrtimer 5 chances to |
109 | * increment before the hardlockup detector generates | 121 | * increment before the hardlockup detector generates |
110 | * a warning | 122 | * a warning |
111 | */ | 123 | */ |
112 | return softlockup_thresh / 5 * NSEC_PER_SEC; | 124 | return get_softlockup_thresh() * (NSEC_PER_SEC / 5); |
113 | } | 125 | } |
114 | 126 | ||
115 | /* Commands for resetting the watchdog */ | 127 | /* Commands for resetting the watchdog */ |
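With the default shown above (watchdog_thresh = 10), get_softlockup_thresh() yields a 20-second soft-lockup window and get_sample_period() returns 20 * (NSEC_PER_SEC / 5) = 4 seconds, i.e. five hrtimer samples per window, as the comment intends. A standalone check of that arithmetic in plain userspace C (NSEC_PER_SEC written out since no kernel header is used):

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
        unsigned long long watchdog_thresh = 10;               /* default in the patch */
        unsigned long long soft_thresh = watchdog_thresh * 2;  /* get_softlockup_thresh() */
        unsigned long long period_ns = soft_thresh * (NSEC_PER_SEC / 5);

        printf("soft-lockup threshold: %llu s\n", soft_thresh);    /* 20 */
        printf("sample period: %llu ns = %llu s\n",
               period_ns, period_ns / NSEC_PER_SEC);               /* 4000000000 ns = 4 s */
        return 0;
    }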
@@ -117,12 +129,12 @@ static void __touch_watchdog(void) | |||
117 | { | 129 | { |
118 | int this_cpu = smp_processor_id(); | 130 | int this_cpu = smp_processor_id(); |
119 | 131 | ||
120 | __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); | 132 | __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); |
121 | } | 133 | } |
122 | 134 | ||
123 | void touch_softlockup_watchdog(void) | 135 | void touch_softlockup_watchdog(void) |
124 | { | 136 | { |
125 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | 137 | __this_cpu_write(watchdog_touch_ts, 0); |
126 | } | 138 | } |
127 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 139 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
128 | 140 | ||
@@ -166,12 +178,12 @@ void touch_softlockup_watchdog_sync(void) | |||
166 | /* watchdog detector functions */ | 178 | /* watchdog detector functions */ |
167 | static int is_hardlockup(void) | 179 | static int is_hardlockup(void) |
168 | { | 180 | { |
169 | unsigned long hrint = __get_cpu_var(hrtimer_interrupts); | 181 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); |
170 | 182 | ||
171 | if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) | 183 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) |
172 | return 1; | 184 | return 1; |
173 | 185 | ||
174 | __get_cpu_var(hrtimer_interrupts_saved) = hrint; | 186 | __this_cpu_write(hrtimer_interrupts_saved, hrint); |
175 | return 0; | 187 | return 0; |
176 | } | 188 | } |
177 | #endif | 189 | #endif |
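The conversions above replace __get_cpu_var() lvalue accesses with __this_cpu_read()/__this_cpu_write()/__this_cpu_inc(), which lets an architecture use a single per-cpu instruction instead of first materialising the per-cpu address. A minimal sketch of the idiom with a throwaway counter; the names are illustrative, and the callers are assumed to run with preemption already disabled (hrtimer or NMI context), as the watchdog code does.

    #include <linux/percpu.h>

    static DEFINE_PER_CPU(unsigned long, sample_count);

    /* called from hrtimer/NMI context, so preemption is already off */
    static void bump_sample_count(void)
    {
        __this_cpu_inc(sample_count);
    }

    static unsigned long read_and_reset_sample_count(void)
    {
        unsigned long val = __this_cpu_read(sample_count);

        __this_cpu_write(sample_count, 0);
        return val;
    }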
@@ -181,24 +193,12 @@ static int is_softlockup(unsigned long touch_ts) | |||
181 | unsigned long now = get_timestamp(smp_processor_id()); | 193 | unsigned long now = get_timestamp(smp_processor_id()); |
182 | 194 | ||
183 | /* Warn about unreasonable delays: */ | 195 | /* Warn about unreasonable delays: */ |
184 | if (time_after(now, touch_ts + softlockup_thresh)) | 196 | if (time_after(now, touch_ts + get_softlockup_thresh())) |
185 | return now - touch_ts; | 197 | return now - touch_ts; |
186 | 198 | ||
187 | return 0; | 199 | return 0; |
188 | } | 200 | } |
189 | 201 | ||
190 | static int | ||
191 | watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) | ||
192 | { | ||
193 | did_panic = 1; | ||
194 | |||
195 | return NOTIFY_DONE; | ||
196 | } | ||
197 | |||
198 | static struct notifier_block panic_block = { | ||
199 | .notifier_call = watchdog_panic, | ||
200 | }; | ||
201 | |||
202 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 202 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
203 | static struct perf_event_attr wd_hw_attr = { | 203 | static struct perf_event_attr wd_hw_attr = { |
204 | .type = PERF_TYPE_HARDWARE, | 204 | .type = PERF_TYPE_HARDWARE, |
@@ -209,15 +209,15 @@ static struct perf_event_attr wd_hw_attr = { | |||
209 | }; | 209 | }; |
210 | 210 | ||
211 | /* Callback function for perf event subsystem */ | 211 | /* Callback function for perf event subsystem */ |
212 | void watchdog_overflow_callback(struct perf_event *event, int nmi, | 212 | static void watchdog_overflow_callback(struct perf_event *event, int nmi, |
213 | struct perf_sample_data *data, | 213 | struct perf_sample_data *data, |
214 | struct pt_regs *regs) | 214 | struct pt_regs *regs) |
215 | { | 215 | { |
216 | /* Ensure the watchdog never gets throttled */ | 216 | /* Ensure the watchdog never gets throttled */ |
217 | event->hw.interrupts = 0; | 217 | event->hw.interrupts = 0; |
218 | 218 | ||
219 | if (__get_cpu_var(watchdog_nmi_touch) == true) { | 219 | if (__this_cpu_read(watchdog_nmi_touch) == true) { |
220 | __get_cpu_var(watchdog_nmi_touch) = false; | 220 | __this_cpu_write(watchdog_nmi_touch, false); |
221 | return; | 221 | return; |
222 | } | 222 | } |
223 | 223 | ||
@@ -231,7 +231,7 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
231 | int this_cpu = smp_processor_id(); | 231 | int this_cpu = smp_processor_id(); |
232 | 232 | ||
233 | /* only print hardlockups once */ | 233 | /* only print hardlockups once */ |
234 | if (__get_cpu_var(hard_watchdog_warn) == true) | 234 | if (__this_cpu_read(hard_watchdog_warn) == true) |
235 | return; | 235 | return; |
236 | 236 | ||
237 | if (hardlockup_panic) | 237 | if (hardlockup_panic) |
@@ -239,16 +239,16 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi, | |||
239 | else | 239 | else |
240 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); | 240 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); |
241 | 241 | ||
242 | __get_cpu_var(hard_watchdog_warn) = true; | 242 | __this_cpu_write(hard_watchdog_warn, true); |
243 | return; | 243 | return; |
244 | } | 244 | } |
245 | 245 | ||
246 | __get_cpu_var(hard_watchdog_warn) = false; | 246 | __this_cpu_write(hard_watchdog_warn, false); |
247 | return; | 247 | return; |
248 | } | 248 | } |
249 | static void watchdog_interrupt_count(void) | 249 | static void watchdog_interrupt_count(void) |
250 | { | 250 | { |
251 | __get_cpu_var(hrtimer_interrupts)++; | 251 | __this_cpu_inc(hrtimer_interrupts); |
252 | } | 252 | } |
253 | #else | 253 | #else |
254 | static inline void watchdog_interrupt_count(void) { return; } | 254 | static inline void watchdog_interrupt_count(void) { return; } |
@@ -257,7 +257,7 @@ static inline void watchdog_interrupt_count(void) { return; } | |||
257 | /* watchdog kicker functions */ | 257 | /* watchdog kicker functions */ |
258 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | 258 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) |
259 | { | 259 | { |
260 | unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); | 260 | unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); |
261 | struct pt_regs *regs = get_irq_regs(); | 261 | struct pt_regs *regs = get_irq_regs(); |
262 | int duration; | 262 | int duration; |
263 | 263 | ||
@@ -265,18 +265,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
265 | watchdog_interrupt_count(); | 265 | watchdog_interrupt_count(); |
266 | 266 | ||
267 | /* kick the softlockup detector */ | 267 | /* kick the softlockup detector */ |
268 | wake_up_process(__get_cpu_var(softlockup_watchdog)); | 268 | wake_up_process(__this_cpu_read(softlockup_watchdog)); |
269 | 269 | ||
270 | /* .. and repeat */ | 270 | /* .. and repeat */ |
271 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); | 271 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); |
272 | 272 | ||
273 | if (touch_ts == 0) { | 273 | if (touch_ts == 0) { |
274 | if (unlikely(__get_cpu_var(softlockup_touch_sync))) { | 274 | if (unlikely(__this_cpu_read(softlockup_touch_sync))) { |
275 | /* | 275 | /* |
276 | * If the time stamp was touched atomically | 276 | * If the time stamp was touched atomically |
277 | * make sure the scheduler tick is up to date. | 277 | * make sure the scheduler tick is up to date. |
278 | */ | 278 | */ |
279 | __get_cpu_var(softlockup_touch_sync) = false; | 279 | __this_cpu_write(softlockup_touch_sync, false); |
280 | sched_clock_tick(); | 280 | sched_clock_tick(); |
281 | } | 281 | } |
282 | __touch_watchdog(); | 282 | __touch_watchdog(); |
@@ -292,7 +292,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
292 | duration = is_softlockup(touch_ts); | 292 | duration = is_softlockup(touch_ts); |
293 | if (unlikely(duration)) { | 293 | if (unlikely(duration)) { |
294 | /* only warn once */ | 294 | /* only warn once */ |
295 | if (__get_cpu_var(soft_watchdog_warn) == true) | 295 | if (__this_cpu_read(soft_watchdog_warn) == true) |
296 | return HRTIMER_RESTART; | 296 | return HRTIMER_RESTART; |
297 | 297 | ||
298 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 298 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
@@ -307,9 +307,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
307 | 307 | ||
308 | if (softlockup_panic) | 308 | if (softlockup_panic) |
309 | panic("softlockup: hung tasks"); | 309 | panic("softlockup: hung tasks"); |
310 | __get_cpu_var(soft_watchdog_warn) = true; | 310 | __this_cpu_write(soft_watchdog_warn, true); |
311 | } else | 311 | } else |
312 | __get_cpu_var(soft_watchdog_warn) = false; | 312 | __this_cpu_write(soft_watchdog_warn, false); |
313 | 313 | ||
314 | return HRTIMER_RESTART; | 314 | return HRTIMER_RESTART; |
315 | } | 315 | } |
@@ -320,7 +320,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
320 | */ | 320 | */ |
321 | static int watchdog(void *unused) | 321 | static int watchdog(void *unused) |
322 | { | 322 | { |
323 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 323 | static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
324 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 324 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
325 | 325 | ||
326 | sched_setscheduler(current, SCHED_FIFO, &param); | 326 | sched_setscheduler(current, SCHED_FIFO, &param); |
@@ -370,15 +370,22 @@ static int watchdog_nmi_enable(int cpu) | |||
370 | 370 | ||
371 | /* Try to register using hardware perf events */ | 371 | /* Try to register using hardware perf events */ |
372 | wd_attr = &wd_hw_attr; | 372 | wd_attr = &wd_hw_attr; |
373 | wd_attr->sample_period = hw_nmi_get_sample_period(); | 373 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); |
374 | event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); | 374 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); |
375 | if (!IS_ERR(event)) { | 375 | if (!IS_ERR(event)) { |
376 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | 376 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); |
377 | goto out_save; | 377 | goto out_save; |
378 | } | 378 | } |
379 | 379 | ||
380 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); | 380 | |
381 | return -1; | 381 | /* vary the KERN level based on the returned errno */ |
382 | if (PTR_ERR(event) == -EOPNOTSUPP) | ||
383 | printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | ||
384 | else if (PTR_ERR(event) == -ENOENT) | ||
385 | printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); | ||
386 | else | ||
387 | printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); | ||
388 | return PTR_ERR(event); | ||
382 | 389 | ||
383 | /* success path */ | 390 | /* success path */ |
384 | out_save: | 391 | out_save: |
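The failure path above now returns PTR_ERR(event) instead of a flat -1 and chooses the printk severity from the specific errno. The IS_ERR()/PTR_ERR() convention it leans on is the usual one for pointer-returning kernel APIs; a minimal sketch with a hypothetical allocator (struct foo_dev and grab_foo() are stand-ins):

    #include <linux/err.h>
    #include <linux/errno.h>
    #include <linux/printk.h>

    struct foo_dev;
    static struct foo_dev *grab_foo(int id);   /* returns a pointer or ERR_PTR(-E...) */

    static int setup_foo(int id)
    {
        struct foo_dev *dev = grab_foo(id);

        if (IS_ERR(dev)) {
            /* distinguish "not supported" from genuine failures */
            if (PTR_ERR(dev) == -EOPNOTSUPP)
                pr_info("foo %d not supported, continuing without it\n", id);
            else
                pr_err("foo %d failed: %ld\n", id, PTR_ERR(dev));
            return PTR_ERR(dev);
        }
        /* ... use dev ... */
        return 0;
    }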
@@ -408,31 +415,37 @@ static void watchdog_nmi_disable(int cpu) { return; } | |||
408 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 415 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ |
409 | 416 | ||
410 | /* prepare/enable/disable routines */ | 417 | /* prepare/enable/disable routines */ |
411 | static int watchdog_prepare_cpu(int cpu) | 418 | static void watchdog_prepare_cpu(int cpu) |
412 | { | 419 | { |
413 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | 420 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); |
414 | 421 | ||
415 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); | 422 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); |
416 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 423 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
417 | hrtimer->function = watchdog_timer_fn; | 424 | hrtimer->function = watchdog_timer_fn; |
418 | |||
419 | return 0; | ||
420 | } | 425 | } |
421 | 426 | ||
422 | static int watchdog_enable(int cpu) | 427 | static int watchdog_enable(int cpu) |
423 | { | 428 | { |
424 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | 429 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); |
430 | int err = 0; | ||
425 | 431 | ||
426 | /* enable the perf event */ | 432 | /* enable the perf event */ |
427 | if (watchdog_nmi_enable(cpu) != 0) | 433 | err = watchdog_nmi_enable(cpu); |
428 | return -1; | 434 | |
435 | /* Regardless of err above, fall through and start softlockup */ | ||
429 | 436 | ||
430 | /* create the watchdog thread */ | 437 | /* create the watchdog thread */ |
431 | if (!p) { | 438 | if (!p) { |
432 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); | 439 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); |
433 | if (IS_ERR(p)) { | 440 | if (IS_ERR(p)) { |
434 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | 441 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); |
435 | return -1; | 442 | if (!err) { |
443 | /* if hardlockup hasn't already set this */ | ||
444 | err = PTR_ERR(p); | ||
445 | /* and disable the perf event */ | ||
446 | watchdog_nmi_disable(cpu); | ||
447 | } | ||
448 | goto out; | ||
436 | } | 449 | } |
437 | kthread_bind(p, cpu); | 450 | kthread_bind(p, cpu); |
438 | per_cpu(watchdog_touch_ts, cpu) = 0; | 451 | per_cpu(watchdog_touch_ts, cpu) = 0; |
@@ -440,10 +453,8 @@ static int watchdog_enable(int cpu) | |||
440 | wake_up_process(p); | 453 | wake_up_process(p); |
441 | } | 454 | } |
442 | 455 | ||
443 | /* if any cpu succeeds, watchdog is considered enabled for the system */ | 456 | out: |
444 | watchdog_enabled = 1; | 457 | return err; |
445 | |||
446 | return 0; | ||
447 | } | 458 | } |
448 | 459 | ||
449 | static void watchdog_disable(int cpu) | 460 | static void watchdog_disable(int cpu) |
@@ -470,12 +481,16 @@ static void watchdog_disable(int cpu) | |||
470 | static void watchdog_enable_all_cpus(void) | 481 | static void watchdog_enable_all_cpus(void) |
471 | { | 482 | { |
472 | int cpu; | 483 | int cpu; |
473 | int result = 0; | 484 | |
485 | watchdog_enabled = 0; | ||
474 | 486 | ||
475 | for_each_online_cpu(cpu) | 487 | for_each_online_cpu(cpu) |
476 | result += watchdog_enable(cpu); | 488 | if (!watchdog_enable(cpu)) |
489 | /* if any cpu succeeds, watchdog is considered | ||
490 | enabled for the system */ | ||
491 | watchdog_enabled = 1; | ||
477 | 492 | ||
478 | if (result) | 493 | if (!watchdog_enabled) |
479 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); | 494 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); |
480 | 495 | ||
481 | } | 496 | } |
@@ -495,26 +510,25 @@ static void watchdog_disable_all_cpus(void) | |||
495 | /* sysctl functions */ | 510 | /* sysctl functions */ |
496 | #ifdef CONFIG_SYSCTL | 511 | #ifdef CONFIG_SYSCTL |
497 | /* | 512 | /* |
498 | * proc handler for /proc/sys/kernel/nmi_watchdog | 513 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh |
499 | */ | 514 | */ |
500 | 515 | ||
501 | int proc_dowatchdog_enabled(struct ctl_table *table, int write, | 516 | int proc_dowatchdog(struct ctl_table *table, int write, |
502 | void __user *buffer, size_t *length, loff_t *ppos) | 517 | void __user *buffer, size_t *lenp, loff_t *ppos) |
503 | { | 518 | { |
504 | proc_dointvec(table, write, buffer, length, ppos); | 519 | int ret; |
520 | |||
521 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
522 | if (ret || !write) | ||
523 | goto out; | ||
505 | 524 | ||
506 | if (watchdog_enabled) | 525 | if (watchdog_enabled && watchdog_thresh) |
507 | watchdog_enable_all_cpus(); | 526 | watchdog_enable_all_cpus(); |
508 | else | 527 | else |
509 | watchdog_disable_all_cpus(); | 528 | watchdog_disable_all_cpus(); |
510 | return 0; | ||
511 | } | ||
512 | 529 | ||
513 | int proc_dowatchdog_thresh(struct ctl_table *table, int write, | 530 | out: |
514 | void __user *buffer, | 531 | return ret; |
515 | size_t *lenp, loff_t *ppos) | ||
516 | { | ||
517 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
518 | } | 532 | } |
519 | #endif /* CONFIG_SYSCTL */ | 533 | #endif /* CONFIG_SYSCTL */ |
520 | 534 | ||
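proc_dowatchdog() above folds the two old handlers into one: parse with proc_dointvec_minmax(), bail out on error or on a plain read, and only then apply the new state. The same shape for a hypothetical knob; my_feature_enabled and my_feature_start()/my_feature_stop() are stand-ins.

    #include <linux/sysctl.h>

    static int my_feature_enabled;

    int proc_do_my_feature(struct ctl_table *table, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
    {
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (ret || !write)
            return ret;             /* parse error or plain read: nothing to apply */

        if (my_feature_enabled)
            my_feature_start();     /* hypothetical */
        else
            my_feature_stop();      /* hypothetical */
        return 0;
    }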
@@ -530,13 +544,12 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
530 | switch (action) { | 544 | switch (action) { |
531 | case CPU_UP_PREPARE: | 545 | case CPU_UP_PREPARE: |
532 | case CPU_UP_PREPARE_FROZEN: | 546 | case CPU_UP_PREPARE_FROZEN: |
533 | if (watchdog_prepare_cpu(hotcpu)) | 547 | watchdog_prepare_cpu(hotcpu); |
534 | return NOTIFY_BAD; | ||
535 | break; | 548 | break; |
536 | case CPU_ONLINE: | 549 | case CPU_ONLINE: |
537 | case CPU_ONLINE_FROZEN: | 550 | case CPU_ONLINE_FROZEN: |
538 | if (watchdog_enable(hotcpu)) | 551 | if (watchdog_enabled) |
539 | return NOTIFY_BAD; | 552 | watchdog_enable(hotcpu); |
540 | break; | 553 | break; |
541 | #ifdef CONFIG_HOTPLUG_CPU | 554 | #ifdef CONFIG_HOTPLUG_CPU |
542 | case CPU_UP_CANCELED: | 555 | case CPU_UP_CANCELED: |
@@ -549,6 +562,12 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
549 | break; | 562 | break; |
550 | #endif /* CONFIG_HOTPLUG_CPU */ | 563 | #endif /* CONFIG_HOTPLUG_CPU */ |
551 | } | 564 | } |
565 | |||
566 | /* | ||
567 | * hardlockup and softlockup are not important enough | ||
568 | * to block cpu bring up. Just always succeed and | ||
569 | * rely on printk output to flag problems. | ||
570 | */ | ||
552 | return NOTIFY_OK; | 571 | return NOTIFY_OK; |
553 | } | 572 | } |
554 | 573 | ||
@@ -556,22 +575,16 @@ static struct notifier_block __cpuinitdata cpu_nfb = { | |||
556 | .notifier_call = cpu_callback | 575 | .notifier_call = cpu_callback |
557 | }; | 576 | }; |
558 | 577 | ||
559 | static int __init spawn_watchdog_task(void) | 578 | void __init lockup_detector_init(void) |
560 | { | 579 | { |
561 | void *cpu = (void *)(long)smp_processor_id(); | 580 | void *cpu = (void *)(long)smp_processor_id(); |
562 | int err; | 581 | int err; |
563 | 582 | ||
564 | if (no_watchdog) | ||
565 | return 0; | ||
566 | |||
567 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 583 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
568 | WARN_ON(err == NOTIFY_BAD); | 584 | WARN_ON(notifier_to_errno(err)); |
569 | 585 | ||
570 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 586 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
571 | register_cpu_notifier(&cpu_nfb); | 587 | register_cpu_notifier(&cpu_nfb); |
572 | 588 | ||
573 | atomic_notifier_chain_register(&panic_notifier_list, &panic_block); | 589 | return; |
574 | |||
575 | return 0; | ||
576 | } | 590 | } |
577 | early_initcall(spawn_watchdog_task); | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f77afd939229..0400553f0d04 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -42,9 +42,6 @@ | |||
42 | #include <linux/lockdep.h> | 42 | #include <linux/lockdep.h> |
43 | #include <linux/idr.h> | 43 | #include <linux/idr.h> |
44 | 44 | ||
45 | #define CREATE_TRACE_POINTS | ||
46 | #include <trace/events/workqueue.h> | ||
47 | |||
48 | #include "workqueue_sched.h" | 45 | #include "workqueue_sched.h" |
49 | 46 | ||
50 | enum { | 47 | enum { |
@@ -82,7 +79,9 @@ enum { | |||
82 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ | 79 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ |
83 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ | 80 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ |
84 | 81 | ||
85 | MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ | 82 | MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, |
83 | /* call for help after 10ms | ||
84 | (min two ticks) */ | ||
86 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ | 85 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ |
87 | CREATE_COOLDOWN = HZ, /* time to breath after fail */ | 86 | CREATE_COOLDOWN = HZ, /* time to breath after fail */ |
88 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ | 87 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ |
@@ -252,10 +251,15 @@ struct workqueue_struct *system_wq __read_mostly; | |||
252 | struct workqueue_struct *system_long_wq __read_mostly; | 251 | struct workqueue_struct *system_long_wq __read_mostly; |
253 | struct workqueue_struct *system_nrt_wq __read_mostly; | 252 | struct workqueue_struct *system_nrt_wq __read_mostly; |
254 | struct workqueue_struct *system_unbound_wq __read_mostly; | 253 | struct workqueue_struct *system_unbound_wq __read_mostly; |
254 | struct workqueue_struct *system_freezable_wq __read_mostly; | ||
255 | EXPORT_SYMBOL_GPL(system_wq); | 255 | EXPORT_SYMBOL_GPL(system_wq); |
256 | EXPORT_SYMBOL_GPL(system_long_wq); | 256 | EXPORT_SYMBOL_GPL(system_long_wq); |
257 | EXPORT_SYMBOL_GPL(system_nrt_wq); | 257 | EXPORT_SYMBOL_GPL(system_nrt_wq); |
258 | EXPORT_SYMBOL_GPL(system_unbound_wq); | 258 | EXPORT_SYMBOL_GPL(system_unbound_wq); |
259 | EXPORT_SYMBOL_GPL(system_freezable_wq); | ||
260 | |||
261 | #define CREATE_TRACE_POINTS | ||
262 | #include <trace/events/workqueue.h> | ||
259 | 263 | ||
260 | #define for_each_busy_worker(worker, i, pos, gcwq) \ | 264 | #define for_each_busy_worker(worker, i, pos, gcwq) \ |
261 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ | 265 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ |
@@ -310,25 +314,15 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, | |||
310 | (cpu) < WORK_CPU_NONE; \ | 314 | (cpu) < WORK_CPU_NONE; \ |
311 | (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) | 315 | (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) |
312 | 316 | ||
313 | #ifdef CONFIG_LOCKDEP | ||
314 | /** | ||
315 | * in_workqueue_context() - in context of specified workqueue? | ||
316 | * @wq: the workqueue of interest | ||
317 | * | ||
318 | * Checks lockdep state to see if the current task is executing from | ||
319 | * within a workqueue item. This function exists only if lockdep is | ||
320 | * enabled. | ||
321 | */ | ||
322 | int in_workqueue_context(struct workqueue_struct *wq) | ||
323 | { | ||
324 | return lock_is_held(&wq->lockdep_map); | ||
325 | } | ||
326 | #endif | ||
327 | |||
328 | #ifdef CONFIG_DEBUG_OBJECTS_WORK | 317 | #ifdef CONFIG_DEBUG_OBJECTS_WORK |
329 | 318 | ||
330 | static struct debug_obj_descr work_debug_descr; | 319 | static struct debug_obj_descr work_debug_descr; |
331 | 320 | ||
321 | static void *work_debug_hint(void *addr) | ||
322 | { | ||
323 | return ((struct work_struct *) addr)->func; | ||
324 | } | ||
325 | |||
332 | /* | 326 | /* |
333 | * fixup_init is called when: | 327 | * fixup_init is called when: |
334 | * - an active object is initialized | 328 | * - an active object is initialized |
@@ -400,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) | |||
400 | 394 | ||
401 | static struct debug_obj_descr work_debug_descr = { | 395 | static struct debug_obj_descr work_debug_descr = { |
402 | .name = "work_struct", | 396 | .name = "work_struct", |
397 | .debug_hint = work_debug_hint, | ||
403 | .fixup_init = work_fixup_init, | 398 | .fixup_init = work_fixup_init, |
404 | .fixup_activate = work_fixup_activate, | 399 | .fixup_activate = work_fixup_activate, |
405 | .fixup_free = work_fixup_free, | 400 | .fixup_free = work_fixup_free, |
@@ -604,7 +599,9 @@ static bool keep_working(struct global_cwq *gcwq) | |||
604 | { | 599 | { |
605 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | 600 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); |
606 | 601 | ||
607 | return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; | 602 | return !list_empty(&gcwq->worklist) && |
603 | (atomic_read(nr_running) <= 1 || | ||
604 | gcwq->flags & GCWQ_HIGHPRI_PENDING); | ||
608 | } | 605 | } |
609 | 606 | ||
610 | /* Do we need a new worker? Called from manager. */ | 607 | /* Do we need a new worker? Called from manager. */ |
@@ -674,7 +671,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | |||
674 | { | 671 | { |
675 | struct worker *worker = kthread_data(task); | 672 | struct worker *worker = kthread_data(task); |
676 | 673 | ||
677 | if (likely(!(worker->flags & WORKER_NOT_RUNNING))) | 674 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
678 | atomic_inc(get_gcwq_nr_running(cpu)); | 675 | atomic_inc(get_gcwq_nr_running(cpu)); |
679 | } | 676 | } |
680 | 677 | ||
@@ -700,7 +697,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
700 | struct global_cwq *gcwq = get_gcwq(cpu); | 697 | struct global_cwq *gcwq = get_gcwq(cpu); |
701 | atomic_t *nr_running = get_gcwq_nr_running(cpu); | 698 | atomic_t *nr_running = get_gcwq_nr_running(cpu); |
702 | 699 | ||
703 | if (unlikely(worker->flags & WORKER_NOT_RUNNING)) | 700 | if (worker->flags & WORKER_NOT_RUNNING) |
704 | return NULL; | 701 | return NULL; |
705 | 702 | ||
706 | /* this can only happen on the local cpu */ | 703 | /* this can only happen on the local cpu */ |
@@ -781,7 +778,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | |||
781 | 778 | ||
782 | worker->flags &= ~flags; | 779 | worker->flags &= ~flags; |
783 | 780 | ||
784 | /* if transitioning out of NOT_RUNNING, increment nr_running */ | 781 | /* |
782 | * If transitioning out of NOT_RUNNING, increment nr_running. Note | ||
783 | * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask | ||
784 | * of multiple flags, not a single flag. | ||
785 | */ | ||
785 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) | 786 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) |
786 | if (!(worker->flags & WORKER_NOT_RUNNING)) | 787 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
787 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); | 788 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); |
@@ -945,6 +946,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq, | |||
945 | wake_up_worker(gcwq); | 946 | wake_up_worker(gcwq); |
946 | } | 947 | } |
947 | 948 | ||
949 | /* | ||
950 | * Test whether @work is being queued from another work executing on the | ||
951 | * same workqueue. This is rather expensive and should only be used from | ||
952 | * cold paths. | ||
953 | */ | ||
954 | static bool is_chained_work(struct workqueue_struct *wq) | ||
955 | { | ||
956 | unsigned long flags; | ||
957 | unsigned int cpu; | ||
958 | |||
959 | for_each_gcwq_cpu(cpu) { | ||
960 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
961 | struct worker *worker; | ||
962 | struct hlist_node *pos; | ||
963 | int i; | ||
964 | |||
965 | spin_lock_irqsave(&gcwq->lock, flags); | ||
966 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
967 | if (worker->task != current) | ||
968 | continue; | ||
969 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
970 | /* | ||
971 | * I'm @worker, no locking necessary. See if @work | ||
972 | * is headed to the same workqueue. | ||
973 | */ | ||
974 | return worker->current_cwq->wq == wq; | ||
975 | } | ||
976 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
977 | } | ||
978 | return false; | ||
979 | } | ||
980 | |||
948 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | 981 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, |
949 | struct work_struct *work) | 982 | struct work_struct *work) |
950 | { | 983 | { |
@@ -956,7 +989,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
956 | 989 | ||
957 | debug_work_activate(work); | 990 | debug_work_activate(work); |
958 | 991 | ||
959 | if (WARN_ON_ONCE(wq->flags & WQ_DYING)) | 992 | /* if dying, only works from the same workqueue are allowed */ |
993 | if (unlikely(wq->flags & WQ_DYING) && | ||
994 | WARN_ON_ONCE(!is_chained_work(wq))) | ||
960 | return; | 995 | return; |
961 | 996 | ||
962 | /* determine gcwq to use */ | 997 | /* determine gcwq to use */ |
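Together with is_chained_work(), the relaxed WQ_DYING check above allows exactly one pattern during destroy_workqueue(): a work item already executing on the dying workqueue may still queue further work on that same workqueue, so self-requeueing cleanup chains can drain, while queueing from anywhere else still triggers the WARN. A minimal sketch of such a chain; my_wq and the my_* names are illustrative.

    static struct workqueue_struct *my_wq;   /* created elsewhere, e.g. via alloc_workqueue() */

    static void my_followup_fn(struct work_struct *work)
    {
        /* last step of the chain; queues nothing further */
    }
    static DECLARE_WORK(my_followup, my_followup_fn);

    static void my_main_fn(struct work_struct *work)
    {
        /* runs on my_wq; queueing onto the same workqueue stays legal
         * even after destroy_workqueue() has marked it WQ_DYING */
        queue_work(my_wq, &my_followup);
    }
    static DECLARE_WORK(my_main, my_main_fn);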
@@ -997,6 +1032,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
997 | 1032 | ||
998 | /* gcwq determined, get cwq and queue */ | 1033 | /* gcwq determined, get cwq and queue */ |
999 | cwq = get_cwq(gcwq->cpu, wq); | 1034 | cwq = get_cwq(gcwq->cpu, wq); |
1035 | trace_workqueue_queue_work(cpu, cwq, work); | ||
1000 | 1036 | ||
1001 | BUG_ON(!list_empty(&work->entry)); | 1037 | BUG_ON(!list_empty(&work->entry)); |
1002 | 1038 | ||
@@ -1004,6 +1040,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1004 | work_flags = work_color_to_flags(cwq->work_color); | 1040 | work_flags = work_color_to_flags(cwq->work_color); |
1005 | 1041 | ||
1006 | if (likely(cwq->nr_active < cwq->max_active)) { | 1042 | if (likely(cwq->nr_active < cwq->max_active)) { |
1043 | trace_workqueue_activate_work(work); | ||
1007 | cwq->nr_active++; | 1044 | cwq->nr_active++; |
1008 | worklist = gcwq_determine_ins_pos(gcwq, cwq); | 1045 | worklist = gcwq_determine_ins_pos(gcwq, cwq); |
1009 | } else { | 1046 | } else { |
@@ -1254,8 +1291,14 @@ __acquires(&gcwq->lock) | |||
1254 | return true; | 1291 | return true; |
1255 | spin_unlock_irq(&gcwq->lock); | 1292 | spin_unlock_irq(&gcwq->lock); |
1256 | 1293 | ||
1257 | /* CPU has come up inbetween, retry migration */ | 1294 | /* |
1295 | * We've raced with CPU hot[un]plug. Give it a breather | ||
1296 | * and retry migration. cond_resched() is required here; | ||
1297 | * otherwise, we might deadlock against cpu_stop trying to | ||
1298 | * bring down the CPU on non-preemptive kernel. | ||
1299 | */ | ||
1258 | cpu_relax(); | 1300 | cpu_relax(); |
1301 | cond_resched(); | ||
1259 | } | 1302 | } |
1260 | } | 1303 | } |
1261 | 1304 | ||
@@ -1329,8 +1372,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) | |||
1329 | worker->id = id; | 1372 | worker->id = id; |
1330 | 1373 | ||
1331 | if (!on_unbound_cpu) | 1374 | if (!on_unbound_cpu) |
1332 | worker->task = kthread_create(worker_thread, worker, | 1375 | worker->task = kthread_create_on_node(worker_thread, |
1333 | "kworker/%u:%d", gcwq->cpu, id); | 1376 | worker, |
1377 | cpu_to_node(gcwq->cpu), | ||
1378 | "kworker/%u:%d", gcwq->cpu, id); | ||
1334 | else | 1379 | else |
1335 | worker->task = kthread_create(worker_thread, worker, | 1380 | worker->task = kthread_create(worker_thread, worker, |
1336 | "kworker/u:%d", id); | 1381 | "kworker/u:%d", id); |
@@ -1679,6 +1724,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | |||
1679 | struct work_struct, entry); | 1724 | struct work_struct, entry); |
1680 | struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); | 1725 | struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); |
1681 | 1726 | ||
1727 | trace_workqueue_activate_work(work); | ||
1682 | move_linked_works(work, pos, NULL); | 1728 | move_linked_works(work, pos, NULL); |
1683 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | 1729 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); |
1684 | cwq->nr_active++; | 1730 | cwq->nr_active++; |
@@ -1816,7 +1862,7 @@ __acquires(&gcwq->lock) | |||
1816 | spin_unlock_irq(&gcwq->lock); | 1862 | spin_unlock_irq(&gcwq->lock); |
1817 | 1863 | ||
1818 | work_clear_pending(work); | 1864 | work_clear_pending(work); |
1819 | lock_map_acquire(&cwq->wq->lockdep_map); | 1865 | lock_map_acquire_read(&cwq->wq->lockdep_map); |
1820 | lock_map_acquire(&lockdep_map); | 1866 | lock_map_acquire(&lockdep_map); |
1821 | trace_workqueue_execute_start(work); | 1867 | trace_workqueue_execute_start(work); |
1822 | f(work); | 1868 | f(work); |
@@ -2019,6 +2065,15 @@ repeat: | |||
2019 | move_linked_works(work, scheduled, &n); | 2065 | move_linked_works(work, scheduled, &n); |
2020 | 2066 | ||
2021 | process_scheduled_works(rescuer); | 2067 | process_scheduled_works(rescuer); |
2068 | |||
2069 | /* | ||
2070 | * Leave this gcwq. If keep_working() is %true, notify a | ||
2071 | * regular worker; otherwise, we end up with 0 concurrency | ||
2072 | * and stalling the execution. | ||
2073 | */ | ||
2074 | if (keep_working(gcwq)) | ||
2075 | wake_up_worker(gcwq); | ||
2076 | |||
2022 | spin_unlock_irq(&gcwq->lock); | 2077 | spin_unlock_irq(&gcwq->lock); |
2023 | } | 2078 | } |
2024 | 2079 | ||
@@ -2074,7 +2129,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | |||
2074 | * checks and call back into the fixup functions where we | 2129 | * checks and call back into the fixup functions where we |
2075 | * might deadlock. | 2130 | * might deadlock. |
2076 | */ | 2131 | */ |
2077 | INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); | 2132 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); |
2078 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); | 2133 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); |
2079 | init_completion(&barr->done); | 2134 | init_completion(&barr->done); |
2080 | 2135 | ||
@@ -2326,27 +2381,17 @@ out_unlock: | |||
2326 | } | 2381 | } |
2327 | EXPORT_SYMBOL_GPL(flush_workqueue); | 2382 | EXPORT_SYMBOL_GPL(flush_workqueue); |
2328 | 2383 | ||
2329 | /** | 2384 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, |
2330 | * flush_work - block until a work_struct's callback has terminated | 2385 | bool wait_executing) |
2331 | * @work: the work which is to be flushed | ||
2332 | * | ||
2333 | * Returns false if @work has already terminated. | ||
2334 | * | ||
2335 | * It is expected that, prior to calling flush_work(), the caller has | ||
2336 | * arranged for the work to not be requeued, otherwise it doesn't make | ||
2337 | * sense to use this function. | ||
2338 | */ | ||
2339 | int flush_work(struct work_struct *work) | ||
2340 | { | 2386 | { |
2341 | struct worker *worker = NULL; | 2387 | struct worker *worker = NULL; |
2342 | struct global_cwq *gcwq; | 2388 | struct global_cwq *gcwq; |
2343 | struct cpu_workqueue_struct *cwq; | 2389 | struct cpu_workqueue_struct *cwq; |
2344 | struct wq_barrier barr; | ||
2345 | 2390 | ||
2346 | might_sleep(); | 2391 | might_sleep(); |
2347 | gcwq = get_work_gcwq(work); | 2392 | gcwq = get_work_gcwq(work); |
2348 | if (!gcwq) | 2393 | if (!gcwq) |
2349 | return 0; | 2394 | return false; |
2350 | 2395 | ||
2351 | spin_lock_irq(&gcwq->lock); | 2396 | spin_lock_irq(&gcwq->lock); |
2352 | if (!list_empty(&work->entry)) { | 2397 | if (!list_empty(&work->entry)) { |
@@ -2359,28 +2404,137 @@ int flush_work(struct work_struct *work) | |||
2359 | cwq = get_work_cwq(work); | 2404 | cwq = get_work_cwq(work); |
2360 | if (unlikely(!cwq || gcwq != cwq->gcwq)) | 2405 | if (unlikely(!cwq || gcwq != cwq->gcwq)) |
2361 | goto already_gone; | 2406 | goto already_gone; |
2362 | } else { | 2407 | } else if (wait_executing) { |
2363 | worker = find_worker_executing_work(gcwq, work); | 2408 | worker = find_worker_executing_work(gcwq, work); |
2364 | if (!worker) | 2409 | if (!worker) |
2365 | goto already_gone; | 2410 | goto already_gone; |
2366 | cwq = worker->current_cwq; | 2411 | cwq = worker->current_cwq; |
2367 | } | 2412 | } else |
2413 | goto already_gone; | ||
2368 | 2414 | ||
2369 | insert_wq_barrier(cwq, &barr, work, worker); | 2415 | insert_wq_barrier(cwq, barr, work, worker); |
2370 | spin_unlock_irq(&gcwq->lock); | 2416 | spin_unlock_irq(&gcwq->lock); |
2371 | 2417 | ||
2372 | lock_map_acquire(&cwq->wq->lockdep_map); | 2418 | /* |
2419 | * If @max_active is 1 or rescuer is in use, flushing another work | ||
2420 | * item on the same workqueue may lead to deadlock. Make sure the | ||
2421 | * flusher is not running on the same workqueue by verifying write | ||
2422 | * access. | ||
2423 | */ | ||
2424 | if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) | ||
2425 | lock_map_acquire(&cwq->wq->lockdep_map); | ||
2426 | else | ||
2427 | lock_map_acquire_read(&cwq->wq->lockdep_map); | ||
2373 | lock_map_release(&cwq->wq->lockdep_map); | 2428 | lock_map_release(&cwq->wq->lockdep_map); |
2374 | 2429 | ||
2375 | wait_for_completion(&barr.done); | 2430 | return true; |
2376 | destroy_work_on_stack(&barr.work); | ||
2377 | return 1; | ||
2378 | already_gone: | 2431 | already_gone: |
2379 | spin_unlock_irq(&gcwq->lock); | 2432 | spin_unlock_irq(&gcwq->lock); |
2380 | return 0; | 2433 | return false; |
2434 | } | ||
2435 | |||
2436 | /** | ||
2437 | * flush_work - wait for a work to finish executing the last queueing instance | ||
2438 | * @work: the work to flush | ||
2439 | * | ||
2440 | * Wait until @work has finished execution. This function considers | ||
2441 | * only the last queueing instance of @work. If @work has been | ||
2442 | * enqueued across different CPUs on a non-reentrant workqueue or on | ||
2443 | * multiple workqueues, @work might still be executing on return on | ||
2444 | * some of the CPUs from earlier queueing. | ||
2445 | * | ||
2446 | * If @work was queued only on a non-reentrant, ordered or unbound | ||
2447 | * workqueue, @work is guaranteed to be idle on return if it hasn't | ||
2448 | * been requeued since flush started. | ||
2449 | * | ||
2450 | * RETURNS: | ||
2451 | * %true if flush_work() waited for the work to finish execution, | ||
2452 | * %false if it was already idle. | ||
2453 | */ | ||
2454 | bool flush_work(struct work_struct *work) | ||
2455 | { | ||
2456 | struct wq_barrier barr; | ||
2457 | |||
2458 | if (start_flush_work(work, &barr, true)) { | ||
2459 | wait_for_completion(&barr.done); | ||
2460 | destroy_work_on_stack(&barr.work); | ||
2461 | return true; | ||
2462 | } else | ||
2463 | return false; | ||
2381 | } | 2464 | } |
2382 | EXPORT_SYMBOL_GPL(flush_work); | 2465 | EXPORT_SYMBOL_GPL(flush_work); |
2383 | 2466 | ||
2467 | static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) | ||
2468 | { | ||
2469 | struct wq_barrier barr; | ||
2470 | struct worker *worker; | ||
2471 | |||
2472 | spin_lock_irq(&gcwq->lock); | ||
2473 | |||
2474 | worker = find_worker_executing_work(gcwq, work); | ||
2475 | if (unlikely(worker)) | ||
2476 | insert_wq_barrier(worker->current_cwq, &barr, work, worker); | ||
2477 | |||
2478 | spin_unlock_irq(&gcwq->lock); | ||
2479 | |||
2480 | if (unlikely(worker)) { | ||
2481 | wait_for_completion(&barr.done); | ||
2482 | destroy_work_on_stack(&barr.work); | ||
2483 | return true; | ||
2484 | } else | ||
2485 | return false; | ||
2486 | } | ||
2487 | |||
2488 | static bool wait_on_work(struct work_struct *work) | ||
2489 | { | ||
2490 | bool ret = false; | ||
2491 | int cpu; | ||
2492 | |||
2493 | might_sleep(); | ||
2494 | |||
2495 | lock_map_acquire(&work->lockdep_map); | ||
2496 | lock_map_release(&work->lockdep_map); | ||
2497 | |||
2498 | for_each_gcwq_cpu(cpu) | ||
2499 | ret |= wait_on_cpu_work(get_gcwq(cpu), work); | ||
2500 | return ret; | ||
2501 | } | ||
2502 | |||
2503 | /** | ||
2504 | * flush_work_sync - wait until a work has finished execution | ||
2505 | * @work: the work to flush | ||
2506 | * | ||
2507 | * Wait until @work has finished execution. On return, it's | ||
2508 | * guaranteed that all queueing instances of @work which happened | ||
2509 | * before this function is called are finished. In other words, if | ||
2510 | * @work hasn't been requeued since this function was called, @work is | ||
2511 | * guaranteed to be idle on return. | ||
2512 | * | ||
2513 | * RETURNS: | ||
2514 | * %true if flush_work_sync() waited for the work to finish execution, | ||
2515 | * %false if it was already idle. | ||
2516 | */ | ||
2517 | bool flush_work_sync(struct work_struct *work) | ||
2518 | { | ||
2519 | struct wq_barrier barr; | ||
2520 | bool pending, waited; | ||
2521 | |||
2522 | /* we'll wait for executions separately, queue barr only if pending */ | ||
2523 | pending = start_flush_work(work, &barr, false); | ||
2524 | |||
2525 | /* wait for executions to finish */ | ||
2526 | waited = wait_on_work(work); | ||
2527 | |||
2528 | /* wait for the pending one */ | ||
2529 | if (pending) { | ||
2530 | wait_for_completion(&barr.done); | ||
2531 | destroy_work_on_stack(&barr.work); | ||
2532 | } | ||
2533 | |||
2534 | return pending || waited; | ||
2535 | } | ||
2536 | EXPORT_SYMBOL_GPL(flush_work_sync); | ||
2537 | |||
2384 | /* | 2538 | /* |
2385 | * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, | 2539 | * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, |
2386 | * so this work can't be re-armed in any way. | 2540 | * so this work can't be re-armed in any way. |
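flush_work() and the new flush_work_sync() above differ in scope: the former waits only for the last queueing instance of the work, the latter for every queueing that happened before the call, at the cost of probing every gcwq. A short usage sketch; my_work and my_work_fn are stand-ins.

    #include <linux/workqueue.h>

    static void my_work_fn(struct work_struct *work)
    {
        /* ... deferred action ... */
    }
    static DECLARE_WORK(my_work, my_work_fn);

    static void example_flush(void)
    {
        schedule_work(&my_work);

        /* wait for the last queueing instance only; sufficient when
         * my_work is always queued from a single place/workqueue */
        flush_work(&my_work);

        /* stronger: wait for all queueings made before this call,
         * even if my_work was queued on several CPUs/workqueues */
        flush_work_sync(&my_work);
    }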
@@ -2423,39 +2577,7 @@ static int try_to_grab_pending(struct work_struct *work) | |||
2423 | return ret; | 2577 | return ret; |
2424 | } | 2578 | } |
2425 | 2579 | ||
2426 | static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) | 2580 | static bool __cancel_work_timer(struct work_struct *work, |
2427 | { | ||
2428 | struct wq_barrier barr; | ||
2429 | struct worker *worker; | ||
2430 | |||
2431 | spin_lock_irq(&gcwq->lock); | ||
2432 | |||
2433 | worker = find_worker_executing_work(gcwq, work); | ||
2434 | if (unlikely(worker)) | ||
2435 | insert_wq_barrier(worker->current_cwq, &barr, work, worker); | ||
2436 | |||
2437 | spin_unlock_irq(&gcwq->lock); | ||
2438 | |||
2439 | if (unlikely(worker)) { | ||
2440 | wait_for_completion(&barr.done); | ||
2441 | destroy_work_on_stack(&barr.work); | ||
2442 | } | ||
2443 | } | ||
2444 | |||
2445 | static void wait_on_work(struct work_struct *work) | ||
2446 | { | ||
2447 | int cpu; | ||
2448 | |||
2449 | might_sleep(); | ||
2450 | |||
2451 | lock_map_acquire(&work->lockdep_map); | ||
2452 | lock_map_release(&work->lockdep_map); | ||
2453 | |||
2454 | for_each_gcwq_cpu(cpu) | ||
2455 | wait_on_cpu_work(get_gcwq(cpu), work); | ||
2456 | } | ||
2457 | |||
2458 | static int __cancel_work_timer(struct work_struct *work, | ||
2459 | struct timer_list* timer) | 2581 | struct timer_list* timer) |
2460 | { | 2582 | { |
2461 | int ret; | 2583 | int ret; |
@@ -2472,42 +2594,81 @@ static int __cancel_work_timer(struct work_struct *work, | |||
2472 | } | 2594 | } |
2473 | 2595 | ||
2474 | /** | 2596 | /** |
2475 | * cancel_work_sync - block until a work_struct's callback has terminated | 2597 | * cancel_work_sync - cancel a work and wait for it to finish |
2476 | * @work: the work which is to be flushed | 2598 | * @work: the work to cancel |
2477 | * | 2599 | * |
2478 | * Returns true if @work was pending. | 2600 | * Cancel @work and wait for its execution to finish. This function |
2479 | * | 2601 | * can be used even if the work re-queues itself or migrates to |
2480 | * cancel_work_sync() will cancel the work if it is queued. If the work's | 2602 | * another workqueue. On return from this function, @work is |
2481 | * callback appears to be running, cancel_work_sync() will block until it | 2603 | * guaranteed to be not pending or executing on any CPU. |
2482 | * has completed. | ||
2483 | * | ||
2484 | * It is possible to use this function if the work re-queues itself. It can | ||
2485 | * cancel the work even if it migrates to another workqueue, however in that | ||
2486 | * case it only guarantees that work->func() has completed on the last queued | ||
2487 | * workqueue. | ||
2488 | * | 2604 | * |
2489 | * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not | 2605 | * cancel_work_sync(&delayed_work->work) must not be used for |
2490 | * pending, otherwise it goes into a busy-wait loop until the timer expires. | 2606 | * delayed_work's. Use cancel_delayed_work_sync() instead. |
2491 | * | 2607 | * |
2492 | * The caller must ensure that workqueue_struct on which this work was last | 2608 | * The caller must ensure that the workqueue on which @work was last |
2493 | * queued can't be destroyed before this function returns. | 2609 | * queued can't be destroyed before this function returns. |
2610 | * | ||
2611 | * RETURNS: | ||
2612 | * %true if @work was pending, %false otherwise. | ||
2494 | */ | 2613 | */ |
2495 | int cancel_work_sync(struct work_struct *work) | 2614 | bool cancel_work_sync(struct work_struct *work) |
2496 | { | 2615 | { |
2497 | return __cancel_work_timer(work, NULL); | 2616 | return __cancel_work_timer(work, NULL); |
2498 | } | 2617 | } |
2499 | EXPORT_SYMBOL_GPL(cancel_work_sync); | 2618 | EXPORT_SYMBOL_GPL(cancel_work_sync); |
2500 | 2619 | ||
2501 | /** | 2620 | /** |
2502 | * cancel_delayed_work_sync - reliably kill off a delayed work. | 2621 | * flush_delayed_work - wait for a dwork to finish executing the last queueing |
2503 | * @dwork: the delayed work struct | 2622 | * @dwork: the delayed work to flush |
2623 | * | ||
2624 | * Delayed timer is cancelled and the pending work is queued for | ||
2625 | * immediate execution. Like flush_work(), this function only | ||
2626 | * considers the last queueing instance of @dwork. | ||
2627 | * | ||
2628 | * RETURNS: | ||
2629 | * %true if flush_work() waited for the work to finish execution, | ||
2630 | * %false if it was already idle. | ||
2631 | */ | ||
2632 | bool flush_delayed_work(struct delayed_work *dwork) | ||
2633 | { | ||
2634 | if (del_timer_sync(&dwork->timer)) | ||
2635 | __queue_work(raw_smp_processor_id(), | ||
2636 | get_work_cwq(&dwork->work)->wq, &dwork->work); | ||
2637 | return flush_work(&dwork->work); | ||
2638 | } | ||
2639 | EXPORT_SYMBOL(flush_delayed_work); | ||
2640 | |||
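[Editor's note] A usage sketch for the flush_delayed_work() semantics documented above, e.g. forcing a deferred writeback to run before suspend. sync_fn, sync_dwork and my_prepare_suspend are hypothetical names, not taken from the commit.

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void sync_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(sync_dwork, sync_fn);

static void sync_fn(struct work_struct *work)
{
	/* ... write cached state back to the device ... */
}

static void my_prepare_suspend(void)
{
	/*
	 * Cancel the pending timer, queue the work for immediate
	 * execution and wait for that last queueing instance.
	 */
	if (flush_delayed_work(&sync_dwork))
		pr_debug("sync work had to run before suspend\n");
}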
2641 | /** | ||
2642 | * flush_delayed_work_sync - wait for a dwork to finish | ||
2643 | * @dwork: the delayed work to flush | ||
2644 | * | ||
2645 | * Delayed timer is cancelled and the pending work is queued for | ||
2646 | * execution immediately. Other than timer handling, its behavior | ||
2647 | * is identical to flush_work_sync(). | ||
2648 | * | ||
2649 | * RETURNS: | ||
2650 | * %true if flush_work_sync() waited for the work to finish execution, | ||
2651 | * %false if it was already idle. | ||
2652 | */ | ||
2653 | bool flush_delayed_work_sync(struct delayed_work *dwork) | ||
2654 | { | ||
2655 | if (del_timer_sync(&dwork->timer)) | ||
2656 | __queue_work(raw_smp_processor_id(), | ||
2657 | get_work_cwq(&dwork->work)->wq, &dwork->work); | ||
2658 | return flush_work_sync(&dwork->work); | ||
2659 | } | ||
2660 | EXPORT_SYMBOL(flush_delayed_work_sync); | ||
2661 | |||
2662 | /** | ||
2663 | * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish | ||
2664 | * @dwork: the delayed work cancel | ||
[typo in the new comment: read "the delayed work to cancel"]
2504 | * | 2665 | * |
2505 | * Returns true if @dwork was pending. | 2666 | * This is cancel_work_sync() for delayed works. |
2506 | * | 2667 | * |
2507 | * It is possible to use this function if @dwork rearms itself via queue_work() | 2668 | * RETURNS: |
2508 | * or queue_delayed_work(). See also the comment for cancel_work_sync(). | 2669 | * %true if @dwork was pending, %false otherwise. |
2509 | */ | 2670 | */ |
2510 | int cancel_delayed_work_sync(struct delayed_work *dwork) | 2671 | bool cancel_delayed_work_sync(struct delayed_work *dwork) |
2511 | { | 2672 | { |
2512 | return __cancel_work_timer(&dwork->work, &dwork->timer); | 2673 | return __cancel_work_timer(&dwork->work, &dwork->timer); |
2513 | } | 2674 | } |
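[Editor's note] A sketch of the "rearming delayed work" case that cancel_delayed_work_sync() is documented to handle; watchdog_fn, watchdog_dwork and my_remove are hypothetical names.

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void watchdog_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(watchdog_dwork, watchdog_fn);

static void watchdog_fn(struct work_struct *work)
{
	/* ... check device health ... */
	queue_delayed_work(system_wq, &watchdog_dwork, HZ);	/* rearm every second */
}

static void my_remove(void)
{
	/*
	 * Kills both the timer and the work item; safe even though the
	 * handler rearms itself from within its own callback.
	 */
	cancel_delayed_work_sync(&watchdog_dwork);
}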
@@ -2559,23 +2720,6 @@ int schedule_delayed_work(struct delayed_work *dwork, | |||
2559 | EXPORT_SYMBOL(schedule_delayed_work); | 2720 | EXPORT_SYMBOL(schedule_delayed_work); |
2560 | 2721 | ||
2561 | /** | 2722 | /** |
2562 | * flush_delayed_work - block until a dwork_struct's callback has terminated | ||
2563 | * @dwork: the delayed work which is to be flushed | ||
2564 | * | ||
2565 | * Any timeout is cancelled, and any pending work is run immediately. | ||
2566 | */ | ||
2567 | void flush_delayed_work(struct delayed_work *dwork) | ||
2568 | { | ||
2569 | if (del_timer_sync(&dwork->timer)) { | ||
2570 | __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, | ||
2571 | &dwork->work); | ||
2572 | put_cpu(); | ||
2573 | } | ||
2574 | flush_work(&dwork->work); | ||
2575 | } | ||
2576 | EXPORT_SYMBOL(flush_delayed_work); | ||
2577 | |||
2578 | /** | ||
2579 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay | 2723 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay |
2580 | * @cpu: cpu to use | 2724 | * @cpu: cpu to use |
2581 | * @dwork: job to be done | 2725 | * @dwork: job to be done |
@@ -2592,13 +2736,15 @@ int schedule_delayed_work_on(int cpu, | |||
2592 | EXPORT_SYMBOL(schedule_delayed_work_on); | 2736 | EXPORT_SYMBOL(schedule_delayed_work_on); |
2593 | 2737 | ||
2594 | /** | 2738 | /** |
2595 | * schedule_on_each_cpu - call a function on each online CPU from keventd | 2739 | * schedule_on_each_cpu - execute a function synchronously on each online CPU |
2596 | * @func: the function to call | 2740 | * @func: the function to call |
2597 | * | 2741 | * |
2598 | * Returns zero on success. | 2742 | * schedule_on_each_cpu() executes @func on each online CPU using the |
2599 | * Returns -ve errno on failure. | 2743 | * system workqueue and blocks until all CPUs have completed. |
2600 | * | ||
2601 | * schedule_on_each_cpu() is very slow. | 2744 | * schedule_on_each_cpu() is very slow. |
2745 | * | ||
2746 | * RETURNS: | ||
2747 | * 0 on success, -errno on failure. | ||
2602 | */ | 2748 | */ |
2603 | int schedule_on_each_cpu(work_func_t func) | 2749 | int schedule_on_each_cpu(work_func_t func) |
2604 | { | 2750 | { |
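[Editor's note] A minimal caller of schedule_on_each_cpu() matching the updated kernel-doc above; poke_all_cpus and bump_local_counter are hypothetical, and raw_smp_processor_id() is used only to keep the sketch free of preemption-debug warnings.

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static void bump_local_counter(struct work_struct *work)
{
	/* Runs in process context on one particular online CPU. */
	pr_info("hello from CPU %d\n", raw_smp_processor_id());
}

static int poke_all_cpus(void)
{
	/* Blocks until the function has completed on every online CPU. */
	return schedule_on_each_cpu(bump_local_counter);
}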
@@ -2764,6 +2910,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
2764 | unsigned int cpu; | 2910 | unsigned int cpu; |
2765 | 2911 | ||
2766 | /* | 2912 | /* |
2913 | * Workqueues which may be used during memory reclaim should | ||
2914 | * have a rescuer to guarantee forward progress. | ||
2915 | */ | ||
2916 | if (flags & WQ_MEM_RECLAIM) | ||
2917 | flags |= WQ_RESCUER; | ||
2918 | |||
2919 | /* | ||
2767 | * Unbound workqueues aren't concurrency managed and should be | 2920 | * Unbound workqueues aren't concurrency managed and should be |
2768 | * dispatched to workers immediately. | 2921 | * dispatched to workers immediately. |
2769 | */ | 2922 | */ |
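[Editor's note] The new WQ_MEM_RECLAIM handling above implies a rescuer thread for such queues. A sketch of allocating one for an I/O-completion path; my_io_wq and my_init are hypothetical names.

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_io_wq;

static int __init my_init(void)
{
	/*
	 * A workqueue used on the memory-reclaim (I/O completion) path:
	 * WQ_MEM_RECLAIM makes the core attach a rescuer so at least one
	 * work item can always make forward progress under memory pressure.
	 */
	my_io_wq = alloc_workqueue("my_io", WQ_MEM_RECLAIM, 1);
	if (!my_io_wq)
		return -ENOMEM;
	return 0;
}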
@@ -2828,7 +2981,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
2828 | */ | 2981 | */ |
2829 | spin_lock(&workqueue_lock); | 2982 | spin_lock(&workqueue_lock); |
2830 | 2983 | ||
2831 | if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) | 2984 | if (workqueue_freezing && wq->flags & WQ_FREEZABLE) |
2832 | for_each_cwq_cpu(cpu, wq) | 2985 | for_each_cwq_cpu(cpu, wq) |
2833 | get_cwq(cpu, wq)->max_active = 0; | 2986 | get_cwq(cpu, wq)->max_active = 0; |
2834 | 2987 | ||
@@ -2856,11 +3009,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | |||
2856 | */ | 3009 | */ |
2857 | void destroy_workqueue(struct workqueue_struct *wq) | 3010 | void destroy_workqueue(struct workqueue_struct *wq) |
2858 | { | 3011 | { |
3012 | unsigned int flush_cnt = 0; | ||
2859 | unsigned int cpu; | 3013 | unsigned int cpu; |
2860 | 3014 | ||
3015 | /* | ||
3016 | * Mark @wq dying and drain all pending works. Once WQ_DYING is | ||
3017 | * set, only chain queueing is allowed. IOW, only currently | ||
3018 | * pending or running work items on @wq can queue further work | ||
3019 | * items on it. @wq is flushed repeatedly until it becomes empty. | ||
3020 | * The number of flushes is determined by the depth of chaining and | ||
3021 | * should be relatively small. Whine if it takes too long. | ||
3022 | */ | ||
2861 | wq->flags |= WQ_DYING; | 3023 | wq->flags |= WQ_DYING; |
3024 | reflush: | ||
2862 | flush_workqueue(wq); | 3025 | flush_workqueue(wq); |
2863 | 3026 | ||
3027 | for_each_cwq_cpu(cpu, wq) { | ||
3028 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3029 | |||
3030 | if (!cwq->nr_active && list_empty(&cwq->delayed_works)) | ||
3031 | continue; | ||
3032 | |||
3033 | if (++flush_cnt == 10 || | ||
3034 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | ||
3035 | printk(KERN_WARNING "workqueue %s: flush on " | ||
3036 | "destruction isn't complete after %u tries\n", | ||
3037 | wq->name, flush_cnt); | ||
3038 | goto reflush; | ||
3039 | } | ||
3040 | |||
2864 | /* | 3041 | /* |
2865 | * wq list is used to freeze wq, remove from list after | 3042 | * wq list is used to freeze wq, remove from list after |
2866 | * flushing is complete in case freeze races us. | 3043 | * flushing is complete in case freeze races us. |
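[Editor's note] The WQ_DYING drain above is what makes the usual teardown pattern safe; a sketch (my_wq and my_exit are hypothetical, and the queue is assumed to have been allocated elsewhere).

#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;	/* assumed allocated with alloc_workqueue() */

static void my_exit(void)
{
	/*
	 * Marks the queue dying and flushes it repeatedly: items already
	 * queued (and anything they chain-queue) still run to completion,
	 * but queueing new work from outside @my_wq at this point is a bug.
	 */
	destroy_workqueue(my_wq);
}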
@@ -2916,7 +3093,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | |||
2916 | 3093 | ||
2917 | spin_lock_irq(&gcwq->lock); | 3094 | spin_lock_irq(&gcwq->lock); |
2918 | 3095 | ||
2919 | if (!(wq->flags & WQ_FREEZEABLE) || | 3096 | if (!(wq->flags & WQ_FREEZABLE) || |
2920 | !(gcwq->flags & GCWQ_FREEZING)) | 3097 | !(gcwq->flags & GCWQ_FREEZING)) |
2921 | get_cwq(gcwq->cpu, wq)->max_active = max_active; | 3098 | get_cwq(gcwq->cpu, wq)->max_active = max_active; |
2922 | 3099 | ||
@@ -3166,7 +3343,7 @@ static int __cpuinit trustee_thread(void *__gcwq) | |||
3166 | * want to get it over with ASAP - spam rescuers, wake up as | 3343 | * want to get it over with ASAP - spam rescuers, wake up as |
3167 | * many idlers as necessary and create new ones till the | 3344 | * many idlers as necessary and create new ones till the |
3168 | * worklist is empty. Note that if the gcwq is frozen, there | 3345 | * worklist is empty. Note that if the gcwq is frozen, there |
3169 | * may be frozen works in freezeable cwqs. Don't declare | 3346 | * may be frozen works in freezable cwqs. Don't declare |
3170 | * completion while frozen. | 3347 | * completion while frozen. |
3171 | */ | 3348 | */ |
3172 | while (gcwq->nr_workers != gcwq->nr_idle || | 3349 | while (gcwq->nr_workers != gcwq->nr_idle || |
@@ -3424,9 +3601,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu); | |||
3424 | /** | 3601 | /** |
3425 | * freeze_workqueues_begin - begin freezing workqueues | 3602 | * freeze_workqueues_begin - begin freezing workqueues |
3426 | * | 3603 | * |
3427 | * Start freezing workqueues. After this function returns, all | 3604 | * Start freezing workqueues. After this function returns, all freezable |
3428 | * freezeable workqueues will queue new works to their frozen_works | 3605 | * workqueues will queue new works to their frozen_works list instead of |
3429 | * list instead of gcwq->worklist. | 3606 | * gcwq->worklist. |
3430 | * | 3607 | * |
3431 | * CONTEXT: | 3608 | * CONTEXT: |
3432 | * Grabs and releases workqueue_lock and gcwq->lock's. | 3609 | * Grabs and releases workqueue_lock and gcwq->lock's. |
@@ -3452,7 +3629,7 @@ void freeze_workqueues_begin(void) | |||
3452 | list_for_each_entry(wq, &workqueues, list) { | 3629 | list_for_each_entry(wq, &workqueues, list) { |
3453 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3630 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3454 | 3631 | ||
3455 | if (cwq && wq->flags & WQ_FREEZEABLE) | 3632 | if (cwq && wq->flags & WQ_FREEZABLE) |
3456 | cwq->max_active = 0; | 3633 | cwq->max_active = 0; |
3457 | } | 3634 | } |
3458 | 3635 | ||
@@ -3463,7 +3640,7 @@ void freeze_workqueues_begin(void) | |||
3463 | } | 3640 | } |
3464 | 3641 | ||
3465 | /** | 3642 | /** |
3466 | * freeze_workqueues_busy - are freezeable workqueues still busy? | 3643 | * freeze_workqueues_busy - are freezable workqueues still busy? |
3467 | * | 3644 | * |
3468 | * Check whether freezing is complete. This function must be called | 3645 | * Check whether freezing is complete. This function must be called |
3469 | * between freeze_workqueues_begin() and thaw_workqueues(). | 3646 | * between freeze_workqueues_begin() and thaw_workqueues(). |
@@ -3472,8 +3649,8 @@ void freeze_workqueues_begin(void) | |||
3472 | * Grabs and releases workqueue_lock. | 3649 | * Grabs and releases workqueue_lock. |
3473 | * | 3650 | * |
3474 | * RETURNS: | 3651 | * RETURNS: |
3475 | * %true if some freezeable workqueues are still busy. %false if | 3652 | * %true if some freezable workqueues are still busy. %false if freezing |
3476 | * freezing is complete. | 3653 | * is complete. |
3477 | */ | 3654 | */ |
3478 | bool freeze_workqueues_busy(void) | 3655 | bool freeze_workqueues_busy(void) |
3479 | { | 3656 | { |
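[Editor's note] freeze_workqueues_begin()/freeze_workqueues_busy()/thaw_workqueues() are intended for the PM freezer core, not for drivers. The sketch below is loosely modelled on that usage and assumes CONFIG_FREEZER; freeze_wqs_with_timeout is a hypothetical helper, not kernel code.

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static int freeze_wqs_with_timeout(unsigned int timeout_ms)
{
	freeze_workqueues_begin();

	/* Poll until freezable workqueues have no in-flight work left. */
	while (freeze_workqueues_busy()) {
		if (!timeout_ms--) {
			thaw_workqueues();	/* give up and undo the freeze */
			return -EBUSY;
		}
		msleep(1);
	}
	return 0;
}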
@@ -3493,7 +3670,7 @@ bool freeze_workqueues_busy(void) | |||
3493 | list_for_each_entry(wq, &workqueues, list) { | 3670 | list_for_each_entry(wq, &workqueues, list) { |
3494 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3671 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3495 | 3672 | ||
3496 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | 3673 | if (!cwq || !(wq->flags & WQ_FREEZABLE)) |
3497 | continue; | 3674 | continue; |
3498 | 3675 | ||
3499 | BUG_ON(cwq->nr_active < 0); | 3676 | BUG_ON(cwq->nr_active < 0); |
@@ -3538,7 +3715,7 @@ void thaw_workqueues(void) | |||
3538 | list_for_each_entry(wq, &workqueues, list) { | 3715 | list_for_each_entry(wq, &workqueues, list) { |
3539 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | 3716 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
3540 | 3717 | ||
3541 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | 3718 | if (!cwq || !(wq->flags & WQ_FREEZABLE)) |
3542 | continue; | 3719 | continue; |
3543 | 3720 | ||
3544 | /* restore max_active and repopulate worklist */ | 3721 | /* restore max_active and repopulate worklist */ |
@@ -3612,7 +3789,10 @@ static int __init init_workqueues(void) | |||
3612 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); | 3789 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); |
3613 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, | 3790 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, |
3614 | WQ_UNBOUND_MAX_ACTIVE); | 3791 | WQ_UNBOUND_MAX_ACTIVE); |
3615 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); | 3792 | system_freezable_wq = alloc_workqueue("events_freezable", |
3793 | WQ_FREEZABLE, 0); | ||
3794 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || | ||
3795 | !system_unbound_wq || !system_freezable_wq); | ||
3616 | return 0; | 3796 | return 0; |
3617 | } | 3797 | } |
3618 | early_initcall(init_workqueues); | 3798 | early_initcall(init_workqueues); |
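[Editor's note] The hunk above adds system_freezable_wq. A sketch of a driver queueing to it so periodic work is parked across suspend; battery_poll, battery_dwork and start_polling are hypothetical names.

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void battery_poll(struct work_struct *work)
{
	/* ... read the fuel gauge, schedule the next poll, etc. ... */
}
static DECLARE_DELAYED_WORK(battery_dwork, battery_poll);

static void start_polling(void)
{
	/*
	 * Queued on the freezable system workqueue, so the poll is frozen
	 * during suspend instead of touching sleeping hardware.
	 */
	queue_delayed_work(system_freezable_wq, &battery_dwork, 5 * HZ);
}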